source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mgpp/text/text.h@ 23508

Last change on this file since 23508 was 23508, checked in by sjm84, 13 years ago

Committing 64 bit changes into the branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.9 KB
Line 
1/**************************************************************************
2 *
3 * text.h -- Header file for compression related stuff
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#ifndef H_TEXT
23#define H_TEXT
24
25// some elderly windows compilers need to include <vector> (or something
26// similar) here so that they know what a "bool" is
27#if defined(GSDL_USE_OBJECTSPACE)
28# include <ospace\std\vector>
29#elif defined(GSDL_USE_STL_H)
30# include <vector.h>
31#else
32# include <vector>
33#endif
34
35
36#include "huffman.h"
37#include <stdio.h>
38
39/*****************************************************************************
40 *
41 * There are several different methods of compressing the text of the
42 * database. The following defines list the different methods of text
43 * compression
44 *
45 */
46
47
48/* The dictionary contains all the fragments that occur in the collection
49 i.e. escapes are not possible */
50#define MG_COMPLETE_DICTIONARY 0
51
52
53/* Certain words have been deleted from the dictionary. The words deleted
54 have been used to create the frequency huffman codes of the characters.
55 This dictionary has an escape code and may be used to compress novel
56 words. This dictionary may fail if there is a novel character. */
57#define MG_PARTIAL_DICTIONARY 1
58
59
60/* This dictionary has an escape so that novel words and non-words can be
61 coded. The method for coding the novel words and non-words is determined
62 by a dictionary parameter. */
63#define MG_SEED_DICTIONARY 2
64
65
66
67
68
69
70
71
72
73/*****************************************************************************
74 *
75 * With a seed dictionary there are several methods for coding the novel
76 * words and non-words. The following defined values specify the different
77 * methods of coding.
78 *
79 */
80
81
82/* Code novel words and non-words character by character using huffman codes.
83 The huffman codes for the word and non-word lengths and characters are
84 generated from the distribution of lengths and characters in the
85 dictionary. */
86#define MG_NOVEL_HUFFMAN_CHARS 0
87
88
89/* This method codes novel words using delta codes. The novel words are stored
90 in an auxiliary dictionary which is built by pass two. */
91#define MG_NOVEL_DELTA 2
92
93
94/* This method codes novel words using a hybrid version of delta. The novel
95 words are stored in an auxiliary dictionary which is built by pass two. */
96#define MG_NOVEL_HYBRID 3
97
98
99
100
101
102
103/* This specifies the amount of extra space allocated in the
104 compression_dict_header for adding new parameters. As new
105 parameters are added this should be decreased. */
106#define TEXT_PARAMS 15
107
108
109
110struct compression_dict_header { // embedded as the cdh member of compression_dict
111 mg_u_long dict_type; // presumably one of the MG_*_DICTIONARY values -- TODO confirm
112 mg_u_long novel_method; // presumably one of the MG_NOVEL_* values -- TODO confirm
113 mg_u_long params[TEXT_PARAMS]; // spare space reserved for new parameters (see TEXT_PARAMS)
114 mg_u_long num_words[2]; // two entries; presumably words and non-words -- TODO confirm
115 mg_u_long num_word_chars[2]; // two entries; presumably parallel to num_words -- TODO confirm
116 mg_u_long lookback;
117};
118
119
120struct comp_frags_header { // compression_dict holds two pointers to this type (cfh[2])
121 huff_data hd; // huff_data is not declared in this file (presumably huffman.h)
122 mg_u_long uncompressed_size;
123 mg_u_long huff_words_size[MAX_HUFFCODE_LEN + 1]; // valid indices are 0..MAX_HUFFCODE_LEN inclusive
124};
125
126// BOGUSTEXTLEN is used to replace the ratio
127// in the compressed_text_header while preparing
128// for UCArray
129#define BOGUSTEXTLEN 1000000
130
131struct compressed_text_header { // document/word/byte counts plus stream Read/Write
132 mg_u_long num_of_docs;
133 mg_u_long num_of_words; // number of words in collection
134 double num_of_bytes; // stored as a double, unlike the two integer counts above
135
136 compressed_text_header (); // declared here; definition lives outside this header
137 void Clear (); // declared here; presumably resets the counts -- TODO confirm
138
139 // you must seek to the appropriate place before calling
140 // Read or Write
141 bool Read (FILE *f); // NOTE(review): bool presumably reports success -- confirm in the definition
142 bool Write (FILE *f) const; // const member: does not modify this header
143};
144
145
146struct compression_stats_header { // document count and byte count, with explicit padding
147 mg_u_long num_docs;
148 mg_u_long dummy; // alignment: unused padding ahead of the double below
149 double num_bytes;
150};
151
152
153struct frags_stats_header { // same two fields as aux_frags_header
154 mg_u_long num_frags;
155 mg_u_long mem_for_frags; // presumably bytes of memory needed for the fragments -- TODO confirm
156};
157
158
159struct aux_frags_header { // used as auxiliary_dict::afh[2]; same two fields as frags_stats_header
160 mg_u_long num_frags;
161 mg_u_long mem_for_frags; // presumably bytes of memory needed for the fragments -- TODO confirm
162};
163
164
165struct auxiliary_dict { // pointed to by compression_dict::ad; every member array has two entries
166 aux_frags_header afh[2];
167 u_char *word_data[2]; // NOTE(review): ownership of these buffers is not visible here -- check the destructor
168 u_char **words[2]; // presumably pointers into word_data -- TODO confirm
169 int blk_start[2][33], blk_end[2][33]; /* blk_start and blk_end are required
170 for the hybrid methods */
171
172 auxiliary_dict (); // declared here; definition lives outside this header
173 ~auxiliary_dict (); // declared here; definition lives outside this header
174
175 void Clear (); // declared here; definition lives outside this header
176};
177
178
179struct compression_dict { // groups the dictionary header, per-slot tables and the auxiliary dictionary
180 compression_dict_header cdh; // held by value
181 comp_frags_header *cfh[2]; // the [2] members below presumably pair up as words / non-words -- TODO confirm
182 mg_u_long MemForCompDict;
183 u_char ***values[2];
184 u_char *escape[2]; // presumably only meaningful for dictionaries with an escape code -- TODO confirm
185 huff_data *chars_huff[2]; // huff_data is not declared in this file (presumably huffman.h)
186 mg_u_long **chars_vals[2];
187 huff_data *lens_huff[2];
188 mg_u_long **lens_vals[2];
189 auxiliary_dict *ad; // pointer to the auxiliary_dict struct declared in this header
190 int fast_loaded; // int used as a flag; exact meaning not visible here -- TODO confirm
191
192 compression_dict (); // declared here; definition lives outside this header
193 ~compression_dict (); // NOTE(review): which pointers it frees is not visible here -- confirm in the definition
194
195 void Clear (); // declared here; definition lives outside this header
196};
197
198
199#endif
Note: See TracBrowser for help on using the repository browser.