source: trunk/gsdl/src/mgpp/text/text.pass1.cpp@ 879

Last change on this file since 879 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.2 KB
Line 
1/**************************************************************************
2 *
3 * text.pass1.cpp -- Text compression (Pass 1)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * $Id: text.pass1.cpp 856 2000-01-14 02:26:25Z sjboddie $
22 *
23 **************************************************************************/
24
25#include "sysfuncs.h"
26
27#include "memlib.h"
28#include "messages.h"
29#include "huffman.h"
30#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
31
32
33#include "mg_files.h"
34#include "mg.h"
35#include "build.h"
36#include "locallib.h"
37#include "words.h"
38#include "text.h"
39#include "hash.h"
40#include "local_strings.h"
41
42#include "TextEl.h"
43
44
45/*
46 $Log$
47 Revision 1.1 2000/01/14 02:26:23 sjboddie
48 Rodgers new C++ mg
49
50 Revision 1.1 1999/10/11 02:58:37 cs025
51 Base install of MG-PP
52
53 Revision 1.1 1999/08/10 21:18:25 sjboddie
54 renamed mg-1.3d directory mg
55
56 Revision 1.2 1998/12/17 09:12:54 rjmcnab
57
58 Altered mg to process utf-8 encoded Unicode. The main changes
59 are in the parsing of the input, the casefolding, and the stemming.
60
61 Revision 1.1 1998/11/17 09:35:46 rjmcnab
62 *** empty log message ***
63
64 * Revision 1.4 1994/11/25 03:47:47 tes
65 * Committing files before adding the merge stuff.
66 *
67 * Revision 1.3 1994/10/20 03:57:09 tes
68 * I have rewritten the boolean query optimiser and abstracted out the
69 * components of the boolean query.
70 *
71 * Revision 1.2 1994/09/20 04:42:13 tes
72 * For version 1.1
73 *
74 */
75
76
77#define POOL_SIZE 1024*1024
78#define INITIAL_HASH_SIZE 7927
79
80
81
82
83
84
85
86typedef struct hash_rec
87 {
88 unsigned long wcnt; /* word frequency */
89 unsigned long occurance_num;
90 u_char *word;
91 }
92hash_rec;
93
94typedef struct dict_data
95 {
96 hash_rec *HashTable;
97 unsigned long HashSize;
98 unsigned long HashUsed;
99 unsigned long wordnum;
100 unsigned long words_read;
101 unsigned long bytes_diff;
102 huff_data hd;
103 }
104dict_data;
105
106
107
108static unsigned long longestDoc = 0;
109static unsigned long occurance_num = 0;
110static dict_data DictData[2];
111
112static u_char *Pool;
113static int PoolLeft;
114static double inputbytes = 0; /* [RJM 07/97: 4G limit] */
115static unsigned long MaxMemInUse = 0;
116static unsigned long MemInUse = 0;
117static compression_stats_header csh = {0, 0, 0.0}; /* [RJM 07/97: 4G limit] */
118
119
120static void ChangeMem (int Change) {
121 MemInUse += Change;
122 if (MemInUse > MaxMemInUse) MaxMemInUse = MemInUse;
123}
124
125
126int init_text_1 (const TagInfo &/*tagInfo*/, char * /*FileName*/) {
127 int which;
128
129 if (!(Pool = (u_char *) Xmalloc (POOL_SIZE))) {
130 Message ("Unable to allocate memory for pool");
131 return (COMPERROR);
132 }
133 PoolLeft = POOL_SIZE;
134 ChangeMem (POOL_SIZE);
135
136 for (which = 1; which >= 0; which--) {
137 u_char *word;
138 hash_rec *ent;
139 dict_data *dd = &DictData[which];
140
141 dd->wordnum = 0;
142 dd->words_read = 0;
143 dd->bytes_diff = 0;
144 dd->HashSize = INITIAL_HASH_SIZE;
145 dd->HashUsed = 0;
146
147 if (!(dd->HashTable = (hash_rec *) Xmalloc (sizeof (hash_rec) * dd->HashSize))) {
148 Message ("Unable to allocate memory for table");
149 return (COMPERROR);
150 }
151 ChangeMem (sizeof (hash_rec) * dd->HashSize);
152 bzero ((char *) (dd->HashTable), sizeof (hash_rec) * dd->HashSize);
153
154 word = Pool;
155 *Pool++ = '\0';
156 PoolLeft--;
157 {
158 register u_char *wptr;
159 register int hsize = dd->HashSize;
160 register unsigned long hashval, step;
161
162 HASH (hashval, step, word, hsize);
163 wptr = (dd->HashTable + hashval)->word;
164 while (wptr) {
165 hashval += step;
166 if (hashval >= (unsigned long)hsize)
167 hashval -= hsize;
168 wptr = (dd->HashTable + hashval)->word;
169 }
170 ent = dd->HashTable + hashval;
171 }
172 ent->wcnt = 1;
173 ent->word = word;
174 dd->HashUsed = 1;
175 }
176 return (COMPALLOK);
177}
178
179
180static int process_text_element (const u_char *s_in, int l_in) {
181 const u_char *end = s_in + l_in - 1;
182
183 /*
184 ** Alternately parse off words and non-words from the input
185 ** stream beginning with a non-word. Each token is then
186 ** inserted into the set if it does not exist or has it's
187 ** frequency count incremented if it does.
188 */
189
190 bool which = false; // non-word
191 for (; s_in <= end; which = !which) {
192 u_char Word[MAXWORDLEN + 1];
193 dict_data *dd = &DictData[which];
194
195 /* First parse a word or non-word out of the string */
196 if (which) PARSE_WORD (Word, s_in, end);
197 else PARSE_NON_WORD (Word, s_in, end);
198
199 dd->wordnum++;
200 inputbytes += *Word;
201 dd->words_read++;
202
203 /* Search the hash table for Word */
204 {
205 register unsigned long hashval, step;
206 register int hsize = dd->HashSize;
207 HASH (hashval, step, Word, hsize);
208 for (;;) {
209 register u_char *s1;
210 register u_char *s2;
211 register int len;
212 register hash_rec *ent;
213 ent = dd->HashTable + hashval;
214 if (!ent->word) {
215 int len = *Word + 1;
216 if (len > PoolLeft) {
217 if (!(Pool = (u_char *) Xmalloc (POOL_SIZE))) {
218 Message ("Unable to allocate memory for pool");
219 return (COMPERROR);
220 }
221 PoolLeft = POOL_SIZE;
222 ChangeMem (POOL_SIZE);
223 }
224 ent->occurance_num = occurance_num++;
225 ent->wcnt = 1;
226 ent->word = Pool;
227 memcpy (Pool, Word, len);
228 Pool += len;
229 PoolLeft -= len;
230 dd->HashUsed++;
231 dd->bytes_diff += Word[0];
232 break;
233 }
234
235 /* Compare the words */
236 s1 = Word;
237 s2 = ent->word;
238 len = *s1 + 1;
239 for (; len; len--)
240 if (*s1++ != *s2++) break;
241
242 if (len) {
243 hashval = (hashval + step);
244 if (hashval >= (unsigned long)hsize) hashval -= hsize;
245 } else {
246 ent->wcnt++;
247 break;
248 }
249 }
250 }
251
252
253 if (dd->HashUsed >= dd->HashSize >> 1) {
254 hash_rec *ht;
255 unsigned long size;
256 unsigned long i;
257 size = prime (dd->HashSize * 2);
258 if (!(ht = (hash_rec *) Xmalloc (sizeof (hash_rec) * size))) {
259 Message ("Unable to allocate memory for table");
260 return (COMPERROR);
261 }
262 ChangeMem (sizeof (hash_rec) * size);
263 bzero ((char *) ht, sizeof (hash_rec) * size);
264
265 for (i = 0; i < dd->HashSize; i++)
266 if (dd->HashTable[i].word) {
267 register u_char *wptr;
268 register unsigned long hashval, step;
269
270 wptr = dd->HashTable[i].word;
271 HASH (hashval, step, wptr, size);
272 wptr = (ht + hashval)->word;
273 while (wptr) {
274 hashval += step;
275 if (hashval >= size) hashval -= size;
276 wptr = (ht + hashval)->word;
277 }
278 ht[hashval] = dd->HashTable[i];
279 }
280 Xfree (dd->HashTable);
281 ChangeMem (-sizeof (hash_rec) * dd->HashSize);
282 dd->HashTable = ht;
283 dd->HashSize = size;
284 }
285 }
286
287 return COMPALLOK;
288}
289
290
291int process_text_1 (const TagInfo &/*tagInfo*/, const TextElArray &doc) {
292 unsigned long textLen = 0;
293 unsigned long docLen = 0;
294 int retValue;
295
296 // process each text element in this document
297 TextElArray::const_iterator here = doc.begin();
298 TextElArray::const_iterator end = doc.end();
299 while (here != end) {
300 textLen = (*here).text.size();
301 docLen += textLen;
302
303 retValue = process_text_element ((*here).text.begin(), textLen);
304 if (retValue != COMPALLOK) return retValue;
305
306 here++;
307 }
308
309 // get max document length
310 if (docLen > longestDoc) longestDoc = docLen;
311
312 // update header information
313 csh.num_docs++;
314 csh.num_bytes += docLen;
315
316 return COMPALLOK;
317}
318
319
320static int PackHashTable (dict_data * dd) {
321 int s, d;
322 for (s = d = 0; (unsigned int)s < dd->HashSize; s++)
323 if (dd->HashTable[s].word)
324 dd->HashTable[d++] = dd->HashTable[s];
325
326 ChangeMem (-sizeof (hash_rec) * dd->HashSize);
327 ChangeMem (sizeof (hash_rec) * dd->HashUsed);
328
329 if (!(dd->HashTable = (hash_rec *) Xrealloc (dd->HashTable,
330 sizeof (hash_rec) * dd->HashUsed))) {
331 Message ("Out of memory");
332 return COMPERROR;
333 }
334 dd->HashSize = dd->HashUsed;
335 return COMPALLOK;
336}
337
338
339static int ent_comp (const void *s1, const void *s2) {
340 return casecompare (((hash_rec *) s1)->word, ((hash_rec *) s2)->word);
341}
342
343
344static void WriteHashTable (FILE * fp, dict_data * dd) {
345 frags_stats_header fsh;
346 u_long j = 0;
347 u_char *curr;
348
349 if (PackHashTable (dd) == COMPERROR) return;
350
351 qsort (dd->HashTable, dd->HashUsed, sizeof (hash_rec), ent_comp);
352
353 fsh.num_frags = dd->HashSize;
354 fsh.mem_for_frags = dd->HashSize;
355 for (j = 0; j < dd->HashSize; j++)
356 fsh.mem_for_frags += dd->HashTable[j].word[0];
357
358 /* [RPAP - Jan 97: Endian Ordering] */
359 HTONUL(fsh.num_frags);
360 HTONUL(fsh.mem_for_frags);
361
362 fwrite (&fsh, sizeof (fsh), 1, fp);
363
364 for (j = 0; j < dd->HashSize; j++) {
365 curr = dd->HashTable[j].word;
366
367 /* [RPAP - Jan 97: Endian Ordering] */
368 HTONUL(dd->HashTable[j].wcnt);
369 HTONUL(dd->HashTable[j].occurance_num);
370
371 fwrite (&dd->HashTable[j].wcnt, sizeof (dd->HashTable[j].wcnt), 1, fp);
372 fwrite (&dd->HashTable[j].occurance_num,
373 sizeof (dd->HashTable[j].occurance_num), 1, fp);
374
375 /* [RPAP - Jan 97: Endian Ordering] */
376 NTOHUL(dd->HashTable[j].wcnt);
377 NTOHUL(dd->HashTable[j].occurance_num);
378
379 fwrite (curr, sizeof (u_char), curr[0] + 1, fp);
380 }
381}
382
383
384int done_text_1 (const TagInfo &/*tagInfo*/, char *file_name) {
385 char *temp_str;
386 FILE *fp;
387
388 if (!(fp = create_file (file_name, TEXT_STATS_DICT_SUFFIX, "wb",
389 MAGIC_STATS_DICT, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
390 return COMPERROR;
391
392 temp_str = msg_prefix;
393 msg_prefix = "text.pass1";
394
395 /* [RPAP - Jan 97: Endian Ordering] */
396 HTONUL(csh.num_docs);
397 HTOND(csh.num_bytes); /* [RJM 07/97: 4G limit] */
398
399 fwrite (&csh, sizeof (csh), 1, fp);
400
401 /* [RPAP - Jan 97: Endian Ordering] */
402 NTOHUL(csh.num_docs);
403 NTOHD(csh.num_bytes); /* [RJM 07/97: 4G limit] */
404
405 WriteHashTable (fp, &DictData[0]);
406 WriteHashTable (fp, &DictData[1]);
407 msg_prefix = temp_str;
408 return COMPALLOK;
409}
Note: See TracBrowser for help on using the repository browser.