source: trunk/gsdl/src/mgpp/text/text.h@ 856

Last change on this file since 856 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.6 KB
Line 
1/**************************************************************************
2 *
3 * text.h -- Header file for compression related stuff
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: text.h 856 2000-01-14 02:26:25Z sjboddie $
21 *
22 **************************************************************************/
23
24
25
26#ifndef H_TEXT
27#define H_TEXT
28
29#include "huffman.h"
30#include <stdio.h>
31
32
33
34
35/*****************************************************************************
36 *
37 * There are several different methods of compressing the text of the
38 * database. The following defines list the different methods of text
39 * compression
40 *
41 */
42
43
44/* The dictionary contains all the fragments that occur in the collection
45 i.e. escapes are not possible */
46#define MG_COMPLETE_DICTIONARY 0
47
48
49/* Certain words have been deleted from the dictionary. The words deleted
50 have been used to create the frequency huffman codes of the characters.
51 This dictionary has an escape code and may be used to compress novel
52 words. This dictionary may fail if there is a novel character. */
53#define MG_PARTIAL_DICTIONARY 1
54
55
56/* This dictionary has an escape so that novel words and non-words can be
57 coded. The method for coding the novel words and non-words is determined
58 by a dictionary parameter. */
59#define MG_SEED_DICTIONARY 2
60
61
62
63
64
65
66
67
68
69/*****************************************************************************
70 *
71 * With a seed dictionary there are several methods for coding the novel
72 * words and non-words. The following defined values specify the different
73 * methods of coding.
74 *
75 */
76
77
78/* Code novel words and non-words character by character using huffman codes.
79 The huffman codes for the word and non-word lengths and characters are
80 generated from the distribution of lengths and characters in the
81 dictionary. */
82#define MG_NOVEL_HUFFMAN_CHARS 0
83
84
85/* This method codes novel words using delta codes. The novel words are stored
86 in an auxiliary dictionary which is built by pass two. */
87#define MG_NOVEL_DELTA 2
88
89
90/* This method codes novel words using a hybrid version of delta. The novel
91 words are stored in an auxiliary dictionary which is built by pass two. */
92#define MG_NOVEL_HYBRID 3
93
94
95
96
97
98
99/* This specifies the amount of extra space allocated in the
100 compression_dict_header for adding new parameters. As new
101 parameters are added this should be decreased. */
102#define TEXT_PARAMS 15
103
104
105
/* Header record for a compression dictionary. Every field is a u_long;
   params[] is spare room reserved so that new parameters can be added
   later (shrink TEXT_PARAMS by one for each parameter added). */
106struct compression_dict_header {
107 u_long dict_type; // presumably one of the MG_*_DICTIONARY values -- TODO confirm
108 u_long novel_method; // presumably one of the MG_NOVEL_* values -- TODO confirm
109 u_long params[TEXT_PARAMS]; // reserved space for future parameters
110 u_long num_words[2]; // two entries; presumably non-words/words -- TODO confirm which index is which
111 u_long num_word_chars[2]; // two entries, indexed like num_words (assumed)
112 u_long lookback; // NOTE(review): meaning not evident from this header
113};
114
115
/* Header for one set of compressed fragments: the huffman description
   plus size information. huff_data and MAX_HUFFCODE_LEN are not defined
   in this file (presumably they come from huffman.h). */
116struct comp_frags_header {
117 huff_data hd; // huffman data for this fragment set
118 u_long uncompressed_size; // NOTE(review): units (bytes?) not stated here -- confirm
119 u_long huff_words_size[MAX_HUFFCODE_LEN + 1]; // one slot per code length 0..MAX_HUFFCODE_LEN (assumed)
120};
121
122// BOGUSTEXTLEN is used to replace the ratio
123// in the compressed_text_header while preparing
124// for UCArray
125#define BOGUSTEXTLEN 1000000
126
/* Summary counts for the compressed text: documents, words and bytes.
   The header can be reset with Clear() and transferred to or from a
   stdio stream with Write()/Read(). */
127struct compressed_text_header {
128 u_long num_of_docs; // presumably the number of documents -- confirm
129 u_long num_of_words; // number of words in collection
130 double num_of_bytes; // byte count kept as a double rather than an integer
131
132 compressed_text_header ();
133 void Clear ();
134
135 // you must seek to the appropriate place before calling
136 // Read or Write
137 bool Read (FILE *f); // presumably returns true on success -- TODO confirm in the implementation
138 bool Write (FILE *f) const; // const: does not modify this header
139};
140
141
/* Compression statistics: a document count and a byte count. The dummy
   member exists only for alignment, so that num_bytes is preceded by two
   u_long members. */
142struct compression_stats_header {
143 u_long num_docs; // presumably the number of documents -- confirm
144 u_long dummy; // alignment
145 double num_bytes; // byte count kept as a double
146};
147
148
/* Fragment statistics header: a fragment count and a memory figure.
   It has the same two members, in the same order, as aux_frags_header. */
149struct frags_stats_header {
150 u_long num_frags; // number of fragments
151 u_long mem_for_frags; // presumably memory needed to hold the fragments -- units to be confirmed
152};
153
154
/* Header for the fragments held in an auxiliary_dict (see afh[] there).
   It has the same two members, in the same order, as frags_stats_header. */
155struct aux_frags_header {
156 u_long num_frags; // number of fragments
157 u_long mem_for_frags; // presumably memory needed to hold the fragments -- units to be confirmed
158};
159
160
/* Auxiliary dictionary. Presumably this is the dictionary of novel words
   described for the MG_NOVEL_DELTA and MG_NOVEL_HYBRID methods -- confirm.
   Every member array has a leading dimension of 2; presumably one slot
   each for non-words and words -- TODO confirm which index is which.
   NOTE(review): a destructor is declared and the struct holds raw
   pointers, but no copy constructor or assignment operator is declared
   here. If the destructor releases word_data/words, copying an instance
   would lead to a double free -- verify against the implementation. */
161struct auxiliary_dict {
162 aux_frags_header afh[2]; // one fragment header per slot
163 u_char *word_data[2]; // per-slot byte buffer
164 u_char **words[2]; // per-slot array of u_char pointers (presumably into word_data -- confirm)
165 int blk_start[2][33], blk_end[2][33]; /* blk_start and blk_end are required
166 for the hybrid methods */
167
168 auxiliary_dict ();
169 ~auxiliary_dict ();
170
171 void Clear ();
172};
173
174
/* In-memory compression dictionary: the dictionary header plus pointers
   to the per-slot fragment headers, huffman data and value tables, and an
   optional auxiliary dictionary. Every member array has two entries;
   presumably one each for non-words and words -- TODO confirm which index
   is which.
   NOTE(review): a destructor is declared and the struct holds raw
   pointers, but no copy constructor or assignment operator is declared
   here. If the destructor releases these pointers, copying an instance
   would lead to a double free -- verify against the implementation. */
175struct compression_dict {
176 compression_dict_header cdh; // dictionary header (type, novel method, counts)
177 comp_frags_header *cfh[2]; // per-slot fragment headers
178 unsigned long MemForCompDict; // presumably total memory used by this dictionary -- confirm
179 u_char ***values[2];
180 u_char *escape[2]; // presumably the escape codes mentioned for partial/seed dictionaries -- confirm
181 huff_data *chars_huff[2]; // presumably huffman codes for characters (MG_NOVEL_HUFFMAN_CHARS) -- confirm
182 u_long **chars_vals[2];
183 huff_data *lens_huff[2]; // presumably huffman codes for word/non-word lengths -- confirm
184 u_long **lens_vals[2];
185 auxiliary_dict *ad; // auxiliary dictionary
186 int fast_loaded; // int used as a flag (assumed); set by code not shown here
187
188 compression_dict ();
189 ~compression_dict ();
190
191 void Clear ();
192};
193
194
195#endif
Note: See TracBrowser for help on using the repository browser.