source: trunk/indexers/mgpp/text/text.pass1.cpp@ 9786

Last change on this file since 9786 was 9786, checked in by kjdon, 19 years ago

closed the file handle in done_text

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.8 KB
Line 
1/**************************************************************************
2 *
3 * text.pass1.cpp -- Text compression (Pass 1)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23// need this to avoid bizarre compiler problems under VC++ 6.0
24#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
25# include <iostream>
26#endif
27
28#include "sysfuncs.h"
29#include "memlib.h"
30#include "messages.h"
31#include "huffman.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33#include "mg_files.h"
34#include "mg.h"
35#include "build.h"
36#include "locallib.h"
37#include "words.h"
38#include "text.h"
39#include "hash.h"
40#include "local_strings.h"
41#include "TextEl.h"
42
43/* for more meaningful messages - jrm21 (gsdl) */
44#ifdef HAVE_STRERROR
45#include <string.h>
46#include <errno.h>
47#endif
48
49
/* Size in bytes of each string pool block handed out by Xmalloc.  The
 * expansion is parenthesised so the macro keeps its value when it is used
 * inside a larger expression (e.g. `x % POOL_SIZE`). */
#define POOL_SIZE (1024*1024)
/* Number of slots each dictionary hash table starts out with. */
#define INITIAL_HASH_SIZE 7927
52
53
54typedef struct hash_rec
55 {
56 unsigned long wcnt; /* word frequency */
57 unsigned long occurance_num;
58 u_char *word;
59 }
60hash_rec;
61
62typedef struct dict_data
63 {
64 hash_rec *HashTable;
65 unsigned long HashSize;
66 unsigned long HashUsed;
67 unsigned long wordnum;
68 unsigned long words_read;
69 unsigned long bytes_diff;
70 huff_data hd;
71 }
72dict_data;
73
74
75
76static unsigned long longestDoc = 0;
77static unsigned long occurance_num = 0;
78static dict_data DictData[2];
79
80static u_char *Pool;
81static int PoolLeft;
82static double inputbytes = 0; /* [RJM 07/97: 4G limit] */
83static unsigned long MaxMemInUse = 0;
84static unsigned long MemInUse = 0;
85static compression_stats_header csh = {0, 0, 0.0}; /* [RJM 07/97: 4G limit] */
86
87
88static void ChangeMem (int Change) {
89 MemInUse += Change;
90 if (MemInUse > MaxMemInUse) MaxMemInUse = MemInUse;
91}
92
93
94int init_text_1 (const TagInfo &/*tagInfo*/, char * /*FileName*/) {
95 int which;
96
97 if (!(Pool = (u_char *) Xmalloc (POOL_SIZE))) {
98 Message ("Unable to allocate memory for pool");
99 return (COMPERROR);
100 }
101 PoolLeft = POOL_SIZE;
102 ChangeMem (POOL_SIZE);
103
104 for (which = 1; which >= 0; --which) {
105 u_char *word;
106 hash_rec *ent;
107 dict_data *dd = &DictData[which];
108
109 dd->wordnum = 0;
110 dd->words_read = 0;
111 dd->bytes_diff = 0;
112 dd->HashSize = INITIAL_HASH_SIZE;
113 dd->HashUsed = 0;
114
115 if (!(dd->HashTable = (hash_rec *) Xmalloc (sizeof (hash_rec) * dd->HashSize))) {
116 Message ("Unable to allocate memory for table");
117 return (COMPERROR);
118 }
119 ChangeMem (sizeof (hash_rec) * dd->HashSize);
120 memset (dd->HashTable,'\0' , sizeof (hash_rec) * dd->HashSize);
121
122 word = Pool;
123 *Pool++ = '\0';
124 --PoolLeft;
125 {
126 register u_char *wptr;
127 register int hsize = dd->HashSize;
128 register unsigned long hashval, step;
129
130 HASH (hashval, step, word, hsize);
131 wptr = (dd->HashTable + hashval)->word;
132 while (wptr) {
133 hashval += step;
134 if (hashval >= (unsigned long)hsize)
135 hashval -= hsize;
136 wptr = (dd->HashTable + hashval)->word;
137 }
138 ent = dd->HashTable + hashval;
139 }
140 ent->wcnt = 1;
141 ent->word = word;
142 dd->HashUsed = 1;
143 }
144 return (COMPALLOK);
145}
146
147
148static int process_text_element (const u_char *s_in, int l_in) {
149 const u_char *end = s_in + l_in - 1;
150
151 /*
152 ** Alternately parse off words and non-words from the input
153 ** stream beginning with a non-word. Each token is then
154 ** inserted into the set if it does not exist or has it's
155 ** frequency count incremented if it does.
156 */
157
158 bool which = false; // non-word
159 for (; s_in <= end; which = !which) {
160 u_char Word[MAXWORDLEN + 1];
161 dict_data *dd = &DictData[which];
162
163 /* First parse a word or non-word out of the string */
164 if (which) PARSE_WORD (Word, s_in, end);
165 else PARSE_NON_WORD (Word, s_in, end);
166
167 ++dd->wordnum;
168 inputbytes += *Word;
169 ++dd->words_read;
170
171 /* Search the hash table for Word */
172 {
173 register unsigned long hashval, step;
174 register int hsize = dd->HashSize;
175 HASH (hashval, step, Word, hsize);
176 for (;;) {
177 register u_char *s1;
178 register u_char *s2;
179 register int len;
180 register hash_rec *ent;
181 ent = dd->HashTable + hashval;
182 if (!ent->word) {
183 int len = *Word + 1;
184 if (len > PoolLeft) {
185 if (!(Pool = (u_char *) Xmalloc (POOL_SIZE))) {
186 Message ("Unable to allocate memory for pool");
187 return (COMPERROR);
188 }
189 PoolLeft = POOL_SIZE;
190 ChangeMem (POOL_SIZE);
191 }
192 ent->occurance_num = occurance_num++;
193 ent->wcnt = 1;
194 ent->word = Pool;
195 memcpy (Pool, Word, len);
196 Pool += len;
197 PoolLeft -= len;
198 ++dd->HashUsed;
199 dd->bytes_diff += Word[0];
200 break;
201 }
202
203 /* Compare the words */
204 s1 = Word;
205 s2 = ent->word;
206 len = *s1 + 1;
207 for (; len; --len)
208 if (*s1++ != *s2++) break;
209
210 if (len) {
211 hashval = (hashval + step);
212 if (hashval >= (unsigned long)hsize) hashval -= hsize;
213 } else {
214 ++ent->wcnt;
215 break;
216 }
217 }
218 }
219
220
221 if (dd->HashUsed >= dd->HashSize >> 1) {
222 hash_rec *ht;
223 unsigned long size;
224 unsigned long i;
225 size = prime (dd->HashSize * 2);
226 if (!(ht = (hash_rec *) Xmalloc (sizeof (hash_rec) * size))) {
227 Message ("Unable to allocate memory for table");
228 return (COMPERROR);
229 }
230 ChangeMem (sizeof (hash_rec) * size);
231 memset (ht, '\0', sizeof (hash_rec) * size);
232
233 for (i = 0; i < dd->HashSize; ++i)
234 if (dd->HashTable[i].word) {
235 register u_char *wptr;
236 register unsigned long hashval, step;
237
238 wptr = dd->HashTable[i].word;
239 HASH (hashval, step, wptr, size);
240 wptr = (ht + hashval)->word;
241 while (wptr) {
242 hashval += step;
243 if (hashval >= size) hashval -= size;
244 wptr = (ht + hashval)->word;
245 }
246 ht[hashval] = dd->HashTable[i];
247 }
248 Xfree (dd->HashTable);
249 ChangeMem (-sizeof (hash_rec) * dd->HashSize);
250 dd->HashTable = ht;
251 dd->HashSize = size;
252 }
253 }
254
255 return COMPALLOK;
256}
257
258
259int process_text_1 (const TagInfo &/*tagInfo*/, const TextElArray &doc) {
260 unsigned long textLen = 0;
261 unsigned long docLen = 0;
262 int retValue;
263
264 // process each text element in this document
265 TextElArray::const_iterator here = doc.begin();
266 TextElArray::const_iterator end = doc.end();
267 while (here != end) {
268 textLen = (*here).text.size();
269 docLen += textLen;
270
271 retValue = process_text_element (&(here->text[0]), textLen);
272 if (retValue != COMPALLOK) return retValue;
273
274 ++here;
275 }
276
277 // get max document length
278 if (docLen > longestDoc) longestDoc = docLen;
279
280 // update header information
281 ++csh.num_docs;
282 csh.num_bytes += docLen;
283
284 return COMPALLOK;
285}
286
287
288static int PackHashTable (dict_data * dd) {
289 int s, d;
290 for (s = d = 0; (unsigned int)s < dd->HashSize; ++s)
291 if (dd->HashTable[s].word)
292 dd->HashTable[d++] = dd->HashTable[s];
293
294 ChangeMem (-sizeof (hash_rec) * dd->HashSize);
295 ChangeMem (sizeof (hash_rec) * dd->HashUsed);
296
297 if (!(dd->HashTable = (hash_rec *) Xrealloc (dd->HashTable,
298 sizeof (hash_rec) * dd->HashUsed))) {
299 Message ("Out of memory");
300 return COMPERROR;
301 }
302 dd->HashSize = dd->HashUsed;
303 return COMPALLOK;
304}
305
306
307static int ent_comp (const void *s1, const void *s2) {
308 return casecompare (((hash_rec *) s1)->word, ((hash_rec *) s2)->word);
309}
310
311
312static void WriteHashTable (FILE * fp, dict_data * dd) {
313 frags_stats_header fsh;
314 u_long j = 0;
315 u_char *curr;
316
317 if (PackHashTable (dd) == COMPERROR) return;
318
319 qsort (dd->HashTable, dd->HashUsed, sizeof (hash_rec), ent_comp);
320
321 fsh.num_frags = dd->HashSize;
322 fsh.mem_for_frags = dd->HashSize;
323 for (j = 0; j < dd->HashSize; ++j)
324 fsh.mem_for_frags += dd->HashTable[j].word[0];
325
326 /* [RPAP - Jan 97: Endian Ordering] */
327 HTONUL(fsh.num_frags);
328 HTONUL(fsh.mem_for_frags);
329
330 fwrite (&fsh, sizeof (fsh), 1, fp);
331
332 for (j = 0; j < dd->HashSize; ++j) {
333 curr = dd->HashTable[j].word;
334
335 /* [RPAP - Jan 97: Endian Ordering] */
336 HTONUL(dd->HashTable[j].wcnt);
337 HTONUL(dd->HashTable[j].occurance_num);
338
339 fwrite (&dd->HashTable[j].wcnt, sizeof (dd->HashTable[j].wcnt), 1, fp);
340 fwrite (&dd->HashTable[j].occurance_num,
341 sizeof (dd->HashTable[j].occurance_num), 1, fp);
342
343 /* [RPAP - Jan 97: Endian Ordering] */
344 NTOHUL(dd->HashTable[j].wcnt);
345 NTOHUL(dd->HashTable[j].occurance_num);
346
347 fwrite (curr, sizeof (u_char), curr[0] + 1, fp);
348 }
349}
350
351
352int done_text_1 (const TagInfo &/*tagInfo*/, char *file_name) {
353 char *temp_str;
354 FILE *fp;
355
356 if (!(fp = create_file (file_name, TEXT_STATS_DICT_SUFFIX, "wb",
357 MAGIC_STATS_DICT, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
358 {
359 fprintf(stderr,"Couldn't create file %s%s:%s\n",
360 file_name, TEXT_STATS_DICT_SUFFIX,
361#if defined(HAVE_STRERROR) || defined(__WIN32__)
362 strerror(errno)
363#else
364 " "
365#endif
366 );
367 return COMPERROR;
368 }
369
370 temp_str = msg_prefix;
371 msg_prefix = "text.pass1";
372
373 /* [RPAP - Jan 97: Endian Ordering] */
374 HTONUL(csh.num_docs);
375 HTOND(csh.num_bytes); /* [RJM 07/97: 4G limit] */
376
377 fwrite (&csh, sizeof (csh), 1, fp);
378
379 /* [RPAP - Jan 97: Endian Ordering] */
380 NTOHUL(csh.num_docs);
381 NTOHD(csh.num_bytes); /* [RJM 07/97: 4G limit] */
382
383 WriteHashTable (fp, &DictData[0]);
384 WriteHashTable (fp, &DictData[1]);
385 msg_prefix = temp_str;
386 fclose(fp);
387 return COMPALLOK;
388}
Note: See TracBrowser for help on using the repository browser.