source: trunk/gsdl/src/mgpp/text/text.h@ 2468

Last change on this file since 2468 was 2468, checked in by sjboddie, 23 years ago

Fiddled about with mgpp to get it compiling on Windows under VC++ 6.0. I
still can't get it to compile under VC++ 4.2 because of some weird
behaviour in STLport.

Also tidied up a little and removed some of the old log information
that was scattered about in some of the files.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.6 KB
Line 
1/**************************************************************************
2 *
3 * text.h -- Header file for compression related stuff
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#ifndef H_TEXT
23#define H_TEXT
24
25#include "huffman.h"
26#include <stdio.h>
27
/*****************************************************************************
 *
 * There are several different methods of compressing the text of the
 * database.  The following defines list the different kinds of compression
 * dictionary, one per method of text compression.
 *
 */


/* Complete dictionary: it contains all the fragments that occur in the
   collection, i.e. escapes are not possible. */
#define MG_COMPLETE_DICTIONARY 0


/* Partial dictionary: certain words have been deleted from the dictionary.
   The deleted words have been used to create the frequency Huffman codes
   of the characters.  This dictionary has an escape code and may be used
   to compress novel words, but it may fail if there is a novel character. */
#define MG_PARTIAL_DICTIONARY 1


/* Seed dictionary: it has an escape so that novel words and non-words can
   be coded.  The method for coding the novel words and non-words is
   determined by a dictionary parameter. */
#define MG_SEED_DICTIONARY 2
53
54
55
56
57
58
59
60
61
/*****************************************************************************
 *
 * With a seed dictionary there are several methods for coding the novel
 * words and non-words.  The following defined values specify the different
 * methods of coding.
 *
 */


/* Code novel words and non-words character by character using Huffman codes.
   The Huffman codes for the word and non-word lengths and characters are
   generated from the distribution of lengths and characters in the
   dictionary. */
#define MG_NOVEL_HUFFMAN_CHARS 0


/* NOTE: the value 1 is not assigned to any coding method in this header. */


/* Code novel words using delta codes.  The novel words are stored in an
   auxiliary dictionary which is built by pass two. */
#define MG_NOVEL_DELTA 2


/* Code novel words using a hybrid version of delta.  The novel words are
   stored in an auxiliary dictionary which is built by pass two. */
#define MG_NOVEL_HYBRID 3
86
87
88
89
90
91
/* This specifies the amount of extra space (counted in u_long slots)
   allocated in the compression_dict_header for adding new parameters.
   As new parameters are added this should be decreased. */
#define TEXT_PARAMS 15
96
97
98
99struct compression_dict_header {
100 u_long dict_type;
101 u_long novel_method;
102 u_long params[TEXT_PARAMS];
103 u_long num_words[2];
104 u_long num_word_chars[2];
105 u_long lookback;
106};
107
108
109struct comp_frags_header {
110 huff_data hd;
111 u_long uncompressed_size;
112 u_long huff_words_size[MAX_HUFFCODE_LEN + 1];
113};
114
// BOGUSTEXTLEN is used to replace the ratio
// in the compressed_text_header while preparing
// for UCArray.
#define BOGUSTEXTLEN 1000000
119
// Header of the compressed text: document, word and byte counts for
// the collection.  The member functions are only declared here; their
// definitions live outside this header.
struct compressed_text_header {
  u_long num_of_docs;
  u_long num_of_words;  // number of words in collection
  double num_of_bytes;

  compressed_text_header ();
  void Clear ();

  // Read and Write operate at the current position of f: the caller
  // must seek to the appropriate place before calling either of them.
  // Both report their outcome as a bool.
  bool Read (FILE *f);
  bool Write (FILE *f) const;
};
133
134
/* Compression statistics header: a document count and a byte count. */
struct compression_stats_header {
  u_long num_docs;
  u_long dummy;      /* unused; present only for alignment */
  double num_bytes;
};
140
141
/* Fragment statistics header: the number of fragments and the memory
   recorded for them. */
struct frags_stats_header {
  u_long num_frags;
  u_long mem_for_frags;
};
146
147
/* Header for the fragments of an auxiliary dictionary: the number of
   fragments and the memory recorded for them. */
struct aux_frags_header {
  u_long num_frags;
  u_long mem_for_frags;
};
152
153
154struct auxiliary_dict {
155 aux_frags_header afh[2];
156 u_char *word_data[2];
157 u_char **words[2];
158 int blk_start[2][33], blk_end[2][33]; /* blk_start and blk_end are required
159 for the hybrid methods */
160
161 auxiliary_dict ();
162 ~auxiliary_dict ();
163
164 void Clear ();
165};
166
167
168struct compression_dict {
169 compression_dict_header cdh;
170 comp_frags_header *cfh[2];
171 unsigned long MemForCompDict;
172 u_char ***values[2];
173 u_char *escape[2];
174 huff_data *chars_huff[2];
175 u_long **chars_vals[2];
176 huff_data *lens_huff[2];
177 u_long **lens_vals[2];
178 auxiliary_dict *ad;
179 int fast_loaded;
180
181 compression_dict ();
182 ~compression_dict ();
183
184 void Clear ();
185};
186
187
188#endif
Note: See TracBrowser for help on using the repository browser.