source: main/trunk/greenstone2/common-src/indexers/mg/src/text/text.pass1.c@ 25147

Last change on this file since 25147 was 25147, checked in by kjdon, 12 years ago

merged 64_bit_Greenstone branch into trunk, rev 25139

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.9 KB
Line 
1/**************************************************************************
2 *
3 * text.pass1.c -- Text compression (Pass 1)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: text.pass1.c 25147 2012-02-28 00:59:00Z kjdon $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "huffman.h"
29#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
30
31
32#include "mg_files.h"
33#include "mg.h"
34#include "build.h"
35#include "locallib.h"
36#include "words.h"
37#include "text.h"
38#include "hash.h"
39#include "local_strings.h"
40/* for more meaningful messages - jrm21 (gsdl) */
41#ifdef HAVE_STRERROR
42#include <string.h>
43#include <errno.h>
44#endif
45
/* Size in bytes of each string-pool chunk obtained from Xmalloc.
   Parenthesised so the macro expands safely inside larger expressions
   (e.g. division or modulo). */
#define POOL_SIZE (1024*1024)
/* Number of slots in a freshly created hash table. */
#define INITIAL_HASH_SIZE 7927
48
49
50typedef struct hash_rec
51 {
52 mg_u_long wcnt; /* word frequency */
53 mg_u_long occurance_num;
54 u_char *word;
55 }
56hash_rec;
57
58typedef struct dict_data
59 {
60 hash_rec *HashTable;
61 mg_u_long HashSize;
62 mg_u_long HashUsed;
63 mg_u_long wordnum;
64 mg_u_long words_read;
65 mg_u_long bytes_diff;
66 huff_data hd;
67 }
68dict_data;
69
70
71
72static mg_u_long LongestDoc = 0;
73static mg_u_long occurance_num = 0;
74static dict_data DictData[2];
75
76static u_char *Pool;
77static int PoolLeft;
78static double inputbytes = 0; /* [RJM 07/97: 4G limit] */
79static mg_u_long MaxMemInUse = 0;
80static mg_u_long MemInUse = 0;
81static compression_stats_header csh =
82{0, 0.0}; /* [RJM 07/97: 4G limit] */
83
84
85static void
86ChangeMem (int Change)
87{
88 MemInUse += Change;
89 if (MemInUse > MaxMemInUse)
90 MaxMemInUse = MemInUse;
91}
92
93
94
95
96int
97init_text_1 (char *FileName)
98{
99 int which;
100 if (!(Pool = Xmalloc (POOL_SIZE)))
101 {
102 Message ("Unable to allocate memory for pool");
103 return (COMPERROR);
104 }
105 PoolLeft = POOL_SIZE;
106 ChangeMem (POOL_SIZE);
107
108 for (which = 1; which >= 0; which--)
109 {
110 u_char *word;
111 hash_rec *ent;
112 dict_data *dd = &DictData[which];
113
114 dd->wordnum = 0;
115 dd->words_read = 0;
116 dd->bytes_diff = 0;
117 dd->HashSize = INITIAL_HASH_SIZE;
118 dd->HashUsed = 0;
119 if (!(dd->HashTable = Xmalloc (sizeof (hash_rec) * dd->HashSize)))
120 {
121 Message ("Unable to allocate memory for table");
122 return (COMPERROR);
123 }
124 ChangeMem (sizeof (hash_rec) * dd->HashSize);
125 bzero ((char *) (dd->HashTable), sizeof (hash_rec) * dd->HashSize);
126
127 word = Pool;
128 *Pool++ = '\0';
129 PoolLeft--;
130 {
131 register u_char *wptr;
132 register int hsize = dd->HashSize;
133 register mg_u_long hashval, step;
134
135 HASH (hashval, step, word, hsize);
136 wptr = (dd->HashTable + hashval)->word;
137 while (wptr)
138 {
139 hashval += step;
140 if (hashval >= hsize)
141 hashval -= hsize;
142 wptr = (dd->HashTable + hashval)->word;
143 }
144 ent = dd->HashTable + hashval;
145 }
146 ent->wcnt = 1;
147 ent->word = word;
148 dd->HashUsed = 1;
149 }
150 return (COMPALLOK);
151}
152
153
154
155
156int
157process_text_1 (u_char * s_in, int l_in)
158{
159 int which;
160 u_char *end = s_in + l_in - 1;
161 if (l_in > LongestDoc)
162 LongestDoc = l_in;
163
164 csh.num_docs++;
165 csh.num_bytes += l_in;
166
167 which = inaword (s_in, end);
168 /*
169 ** Alternately parse off words and non-words from the input
170 ** stream beginning with a non-word. Each token is then
171 ** inserted into the set if it does not exist or has it's
172 ** frequency count incremented if it does.
173 */
174 for (; s_in <= end; which = !which)
175 {
176 u_char Word[MAXWORDLEN + 1];
177 dict_data *dd = &DictData[which];
178
179 /* First parse a word or non-word out of the string */
180 if (which)
181 PARSE_WORD (Word, s_in, end);
182 else
183 PARSE_NON_WORD (Word, s_in, end);
184
185 dd->wordnum++;
186 inputbytes += *Word;
187 dd->words_read++;
188
189 /* Search the hash table for Word */
190 {
191 register mg_u_long hashval, step;
192 register int hsize = dd->HashSize;
193 HASH (hashval, step, Word, hsize);
194 for (;;)
195 {
196 register u_char *s1;
197 register u_char *s2;
198 register int len;
199 register hash_rec *ent;
200 ent = dd->HashTable + hashval;
201 if (!ent->word)
202 {
203 int len = *Word + 1;
204 if (len > PoolLeft)
205 {
206 if (!(Pool = Xmalloc (POOL_SIZE)))
207 {
208 Message ("Unable to allocate memory for pool");
209 return (COMPERROR);
210 }
211 PoolLeft = POOL_SIZE;
212 ChangeMem (POOL_SIZE);
213 }
214 ent->occurance_num = occurance_num++;
215 ent->wcnt = 1;
216 ent->word = Pool;
217 memcpy (Pool, Word, len);
218 Pool += len;
219 PoolLeft -= len;
220 dd->HashUsed++;
221 dd->bytes_diff += Word[0];
222 break;
223 }
224 /* Compare the words */
225 s1 = Word;
226 s2 = ent->word;
227 len = *s1 + 1;
228 for (; len; len--)
229 if (*s1++ != *s2++)
230 break;
231
232 if (len)
233 {
234 hashval = (hashval + step);
235 if (hashval >= hsize)
236 hashval -= hsize;
237 }
238 else
239 {
240 ent->wcnt++;
241 break;
242 }
243 }
244 }
245
246
247 if (dd->HashUsed >= dd->HashSize >> 1)
248 {
249 hash_rec *ht;
250 mg_u_long size;
251 mg_u_long i;
252 size = prime (dd->HashSize * 2);
253 if (!(ht = Xmalloc (sizeof (hash_rec) * size)))
254 {
255 Message ("Unable to allocate memory for table");
256 return (COMPERROR);
257 }
258 ChangeMem (sizeof (hash_rec) * size);
259 bzero ((char *) ht, sizeof (hash_rec) * size);
260
261 for (i = 0; i < dd->HashSize; i++)
262 if (dd->HashTable[i].word)
263 {
264 register u_char *wptr;
265 register mg_u_long hashval, step;
266
267 wptr = dd->HashTable[i].word;
268 HASH (hashval, step, wptr, size);
269 wptr = (ht + hashval)->word;
270 while (wptr)
271 {
272 hashval += step;
273 if (hashval >= size)
274 hashval -= size;
275 wptr = (ht + hashval)->word;
276 }
277 ht[hashval] = dd->HashTable[i];
278 }
279 Xfree (dd->HashTable);
280 ChangeMem (-sizeof (hash_rec) * dd->HashSize);
281 dd->HashTable = ht;
282 dd->HashSize = size;
283
284
285 }
286 }
287 return (COMPALLOK);
288} /* encode */
289
290
291
292static int
293PackHashTable (dict_data * dd)
294{
295 int s, d;
296 for (s = d = 0; s < dd->HashSize; s++)
297 if (dd->HashTable[s].word)
298 dd->HashTable[d++] = dd->HashTable[s];
299 ChangeMem (-sizeof (hash_rec) * dd->HashSize);
300 ChangeMem (sizeof (hash_rec) * dd->HashUsed);
301 if (!(dd->HashTable = Xrealloc (dd->HashTable,
302 sizeof (hash_rec) * dd->HashUsed)))
303 {
304 Message ("Out of memory");
305 return COMPERROR;
306 }
307 dd->HashSize = dd->HashUsed;
308 return COMPALLOK;
309}
310
311
312
313
314
315static int
316ent_comp (const void *s1, const void *s2)
317{
318 return casecompare (((hash_rec *) s1)->word, ((hash_rec *) s2)->word);
319}
320
321
322
323static void
324WriteHashTable (FILE * fp, dict_data * dd)
325{
326 frags_stats_header fsh;
327 mg_u_long j = 0;
328 u_char *curr;
329
330 if (PackHashTable (dd) == COMPERROR)
331 return;
332
333 qsort (dd->HashTable, dd->HashUsed, sizeof (hash_rec), ent_comp);
334
335 fsh.num_frags = dd->HashSize;
336 fsh.mem_for_frags = dd->HashSize;
337 for (j = 0; j < dd->HashSize; j++)
338 fsh.mem_for_frags += dd->HashTable[j].word[0];
339
340 /* [RPAP - Jan 97: Endian Ordering] */
341 HTONUL(fsh.num_frags);
342 HTONUL(fsh.mem_for_frags);
343
344 fwrite (&fsh, sizeof (fsh), 1, fp);
345
346 for (j = 0; j < dd->HashSize; j++)
347 {
348 curr = dd->HashTable[j].word;
349
350 /* [RPAP - Jan 97: Endian Ordering] */
351 HTONUL(dd->HashTable[j].wcnt);
352 HTONUL(dd->HashTable[j].occurance_num);
353
354 fwrite (&dd->HashTable[j].wcnt, sizeof (dd->HashTable[j].wcnt), 1, fp);
355 fwrite (&dd->HashTable[j].occurance_num,
356 sizeof (dd->HashTable[j].occurance_num), 1, fp);
357
358 /* [RPAP - Jan 97: Endian Ordering] */
359 NTOHUL(dd->HashTable[j].wcnt);
360 NTOHUL(dd->HashTable[j].occurance_num);
361
362 fwrite (curr, sizeof (u_char), curr[0] + 1, fp);
363 }
364}
365
366
367int
368done_text_1 (char *file_name)
369{
370 char *temp_str;
371 FILE *fp;
372
373 if (!(fp = create_file (file_name, TEXT_STATS_DICT_SUFFIX, "wb",
374 MAGIC_STATS_DICT, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
375 {
376 fprintf(stderr,"Couldn't create file %s%s:%s\n",
377 file_name, TEXT_STATS_DICT_SUFFIX,
378#if defined(HAVE_STRERROR) || defined(__WIN32__)
379 strerror(errno)
380#else
381 " "
382#endif
383 );
384 return COMPERROR;
385 }
386
387 temp_str = msg_prefix;
388 msg_prefix = "text.pass1";
389
390 /* [RPAP - Jan 97: Endian Ordering] */
391 HTONUL(csh.num_docs);
392 HTOND(csh.num_bytes); /* [RJM 07/97: 4G limit] */
393
394 fwrite (&csh, sizeof (csh), 1, fp);
395
396 /* [RPAP - Jan 97: Endian Ordering] */
397 NTOHUL(csh.num_docs);
398 NTOHD(csh.num_bytes); /* [RJM 07/97: 4G limit] */
399
400 WriteHashTable (fp, &DictData[0]);
401 WriteHashTable (fp, &DictData[1]);
402 msg_prefix = temp_str;
403 fclose(fp);
404 return COMPALLOK;
405} /* done_encode */
Note: See TracBrowser for help on using the repository browser.