source: trunk/gsdl/packages/mg/src/text/text.pass1.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.4 KB
Line 
1/**************************************************************************
2 *
3 * text.pass1.c -- Text compression (Pass 1)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: text.pass1.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "huffman.h"
29#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
30
31
32#include "mg_files.h"
33#include "mg.h"
34#include "build.h"
35#include "locallib.h"
36#include "words.h"
37#include "text.h"
38#include "hash.h"
39#include "local_strings.h"
40
41
42/*
43 $Log$
44 Revision 1.1 1999/08/10 21:18:25 sjboddie
45 renamed mg-1.3d directory mg
46
47 Revision 1.2 1998/12/17 09:12:54 rjmcnab
48
49 Altered mg to process utf-8 encoded Unicode. The main changes
50 are in the parsing of the input, the casefolding, and the stemming.
51
52 Revision 1.1 1998/11/17 09:35:46 rjmcnab
53 *** empty log message ***
54
55 * Revision 1.4 1994/11/25 03:47:47 tes
56 * Committing files before adding the merge stuff.
57 *
58 * Revision 1.3 1994/10/20 03:57:09 tes
59 * I have rewritten the boolean query optimiser and abstracted out the
60 * components of the boolean query.
61 *
62 * Revision 1.2 1994/09/20 04:42:13 tes
63 * For version 1.1
64 *
65 */
66
67static char *RCSID = "$Id: text.pass1.c 439 1999-08-10 21:23:37Z sjboddie $";
68
69
/*
 * POOL_SIZE          bytes requested for each string pool that holds the
 *                    length-prefixed tokens.
 * INITIAL_HASH_SIZE  number of slots each token hash table starts with.
 *
 * The POOL_SIZE expansion is parenthesized so it behaves as a single
 * value inside any larger expression (e.g. `x / POOL_SIZE`).
 */
#define POOL_SIZE (1024*1024)
#define INITIAL_HASH_SIZE 7927
72
73
74
75
76
77
78
79typedef struct hash_rec
80 {
81 unsigned long wcnt; /* word frequency */
82 unsigned long occurance_num;
83 u_char *word;
84 }
85hash_rec;
86
87typedef struct dict_data
88 {
89 hash_rec *HashTable;
90 unsigned long HashSize;
91 unsigned long HashUsed;
92 unsigned long wordnum;
93 unsigned long words_read;
94 unsigned long bytes_diff;
95 huff_data hd;
96 }
97dict_data;
98
99
100
101static unsigned long LongestDoc = 0;
102static unsigned long occurance_num = 0;
103static dict_data DictData[2];
104
105static u_char *Pool;
106static int PoolLeft;
107static double inputbytes = 0; /* [RJM 07/97: 4G limit] */
108static unsigned long MaxMemInUse = 0;
109static unsigned long MemInUse = 0;
110static compression_stats_header csh =
111{0, 0.0}; /* [RJM 07/97: 4G limit] */
112
113
114static void
115ChangeMem (int Change)
116{
117 MemInUse += Change;
118 if (MemInUse > MaxMemInUse)
119 MaxMemInUse = MemInUse;
120}
121
122
123
124
125int
126init_text_1 (char *FileName)
127{
128 int which;
129
130 if (!(Pool = Xmalloc (POOL_SIZE)))
131 {
132 Message ("Unable to allocate memory for pool");
133 return (COMPERROR);
134 }
135 PoolLeft = POOL_SIZE;
136 ChangeMem (POOL_SIZE);
137
138 for (which = 1; which >= 0; which--)
139 {
140 u_char *word;
141 hash_rec *ent;
142 dict_data *dd = &DictData[which];
143
144 dd->wordnum = 0;
145 dd->words_read = 0;
146 dd->bytes_diff = 0;
147 dd->HashSize = INITIAL_HASH_SIZE;
148 dd->HashUsed = 0;
149
150 if (!(dd->HashTable = Xmalloc (sizeof (hash_rec) * dd->HashSize)))
151 {
152 Message ("Unable to allocate memory for table");
153 return (COMPERROR);
154 }
155 ChangeMem (sizeof (hash_rec) * dd->HashSize);
156 bzero ((char *) (dd->HashTable), sizeof (hash_rec) * dd->HashSize);
157
158 word = Pool;
159 *Pool++ = '\0';
160 PoolLeft--;
161 {
162 register u_char *wptr;
163 register int hsize = dd->HashSize;
164 register unsigned long hashval, step;
165
166 HASH (hashval, step, word, hsize);
167 wptr = (dd->HashTable + hashval)->word;
168 while (wptr)
169 {
170 hashval += step;
171 if (hashval >= hsize)
172 hashval -= hsize;
173 wptr = (dd->HashTable + hashval)->word;
174 }
175 ent = dd->HashTable + hashval;
176 }
177 ent->wcnt = 1;
178 ent->word = word;
179 dd->HashUsed = 1;
180 }
181 return (COMPALLOK);
182}
183
184
185
186
187int
188process_text_1 (u_char * s_in, int l_in)
189{
190 int which;
191 u_char *end = s_in + l_in - 1;
192
193 if (l_in > LongestDoc)
194 LongestDoc = l_in;
195
196 csh.num_docs++;
197 csh.num_bytes += l_in;
198
199 which = inaword (s_in, end);
200 /*
201 ** Alternately parse off words and non-words from the input
202 ** stream beginning with a non-word. Each token is then
203 ** inserted into the set if it does not exist or has it's
204 ** frequency count incremented if it does.
205 */
206 for (; s_in <= end; which = !which)
207 {
208 u_char Word[MAXWORDLEN + 1];
209 dict_data *dd = &DictData[which];
210
211 /* First parse a word or non-word out of the string */
212 if (which)
213 PARSE_WORD (Word, s_in, end);
214 else
215 PARSE_NON_WORD (Word, s_in, end);
216
217 dd->wordnum++;
218 inputbytes += *Word;
219 dd->words_read++;
220
221 /* Search the hash table for Word */
222 {
223 register unsigned long hashval, step;
224 register int hsize = dd->HashSize;
225 HASH (hashval, step, Word, hsize);
226 for (;;)
227 {
228 register u_char *s1;
229 register u_char *s2;
230 register int len;
231 register hash_rec *ent;
232 ent = dd->HashTable + hashval;
233 if (!ent->word)
234 {
235 int len = *Word + 1;
236 if (len > PoolLeft)
237 {
238 if (!(Pool = Xmalloc (POOL_SIZE)))
239 {
240 Message ("Unable to allocate memory for pool");
241 return (COMPERROR);
242 }
243 PoolLeft = POOL_SIZE;
244 ChangeMem (POOL_SIZE);
245 }
246 ent->occurance_num = occurance_num++;
247 ent->wcnt = 1;
248 ent->word = Pool;
249 memcpy (Pool, Word, len);
250 Pool += len;
251 PoolLeft -= len;
252 dd->HashUsed++;
253 dd->bytes_diff += Word[0];
254 break;
255 }
256
257 /* Compare the words */
258 s1 = Word;
259 s2 = ent->word;
260 len = *s1 + 1;
261 for (; len; len--)
262 if (*s1++ != *s2++)
263 break;
264
265 if (len)
266 {
267 hashval = (hashval + step);
268 if (hashval >= hsize)
269 hashval -= hsize;
270 }
271 else
272 {
273 ent->wcnt++;
274 break;
275 }
276 }
277 }
278
279
280 if (dd->HashUsed >= dd->HashSize >> 1)
281 {
282 hash_rec *ht;
283 unsigned long size;
284 unsigned long i;
285 size = prime (dd->HashSize * 2);
286 if (!(ht = Xmalloc (sizeof (hash_rec) * size)))
287 {
288 Message ("Unable to allocate memory for table");
289 return (COMPERROR);
290 }
291 ChangeMem (sizeof (hash_rec) * size);
292 bzero ((char *) ht, sizeof (hash_rec) * size);
293
294 for (i = 0; i < dd->HashSize; i++)
295 if (dd->HashTable[i].word)
296 {
297 register u_char *wptr;
298 register unsigned long hashval, step;
299
300 wptr = dd->HashTable[i].word;
301 HASH (hashval, step, wptr, size);
302 wptr = (ht + hashval)->word;
303 while (wptr)
304 {
305 hashval += step;
306 if (hashval >= size)
307 hashval -= size;
308 wptr = (ht + hashval)->word;
309 }
310 ht[hashval] = dd->HashTable[i];
311 }
312 Xfree (dd->HashTable);
313 ChangeMem (-sizeof (hash_rec) * dd->HashSize);
314 dd->HashTable = ht;
315 dd->HashSize = size;
316
317
318 }
319 }
320 return (COMPALLOK);
321} /* encode */
322
323
324
325static int
326PackHashTable (dict_data * dd)
327{
328 int s, d;
329 for (s = d = 0; s < dd->HashSize; s++)
330 if (dd->HashTable[s].word)
331 dd->HashTable[d++] = dd->HashTable[s];
332 ChangeMem (-sizeof (hash_rec) * dd->HashSize);
333 ChangeMem (sizeof (hash_rec) * dd->HashUsed);
334 if (!(dd->HashTable = Xrealloc (dd->HashTable,
335 sizeof (hash_rec) * dd->HashUsed)))
336 {
337 Message ("Out of memory");
338 return COMPERROR;
339 }
340 dd->HashSize = dd->HashUsed;
341 return COMPALLOK;
342}
343
344
345
346
347
348static int
349ent_comp (const void *s1, const void *s2)
350{
351 return casecompare (((hash_rec *) s1)->word, ((hash_rec *) s2)->word);
352}
353
354
355
356static void
357WriteHashTable (FILE * fp, dict_data * dd)
358{
359 frags_stats_header fsh;
360 u_long j = 0;
361 u_char *curr;
362
363 if (PackHashTable (dd) == COMPERROR)
364 return;
365
366 qsort (dd->HashTable, dd->HashUsed, sizeof (hash_rec), ent_comp);
367
368 fsh.num_frags = dd->HashSize;
369 fsh.mem_for_frags = dd->HashSize;
370 for (j = 0; j < dd->HashSize; j++)
371 fsh.mem_for_frags += dd->HashTable[j].word[0];
372
373 /* [RPAP - Jan 97: Endian Ordering] */
374 HTONUL(fsh.num_frags);
375 HTONUL(fsh.mem_for_frags);
376
377 fwrite (&fsh, sizeof (fsh), 1, fp);
378
379 for (j = 0; j < dd->HashSize; j++)
380 {
381 curr = dd->HashTable[j].word;
382
383 /* [RPAP - Jan 97: Endian Ordering] */
384 HTONUL(dd->HashTable[j].wcnt);
385 HTONUL(dd->HashTable[j].occurance_num);
386
387 fwrite (&dd->HashTable[j].wcnt, sizeof (dd->HashTable[j].wcnt), 1, fp);
388 fwrite (&dd->HashTable[j].occurance_num,
389 sizeof (dd->HashTable[j].occurance_num), 1, fp);
390
391 /* [RPAP - Jan 97: Endian Ordering] */
392 NTOHUL(dd->HashTable[j].wcnt);
393 NTOHUL(dd->HashTable[j].occurance_num);
394
395 fwrite (curr, sizeof (u_char), curr[0] + 1, fp);
396 }
397}
398
399
400int
401done_text_1 (char *file_name)
402{
403 char *temp_str;
404 FILE *fp;
405
406 if (!(fp = create_file (file_name, TEXT_STATS_DICT_SUFFIX, "wb",
407 MAGIC_STATS_DICT, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
408 return COMPERROR;
409
410 temp_str = msg_prefix;
411 msg_prefix = "text.pass1";
412
413 /* [RPAP - Jan 97: Endian Ordering] */
414 HTONUL(csh.num_docs);
415 HTOND(csh.num_bytes); /* [RJM 07/97: 4G limit] */
416
417 fwrite (&csh, sizeof (csh), 1, fp);
418
419 /* [RPAP - Jan 97: Endian Ordering] */
420 NTOHUL(csh.num_docs);
421 NTOHD(csh.num_bytes); /* [RJM 07/97: 4G limit] */
422
423 WriteHashTable (fp, &DictData[0]);
424 WriteHashTable (fp, &DictData[1]);
425 msg_prefix = temp_str;
426 return COMPALLOK;
427} /* done_encode */
Note: See TracBrowser for help on using the repository browser.