source: trunk/gsdl/src/mgpp/text/text.pass2.cpp@ 856

Last change on this file since 856 was 856, checked in by sjboddie, 24 years ago

Rodger's new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.6 KB
Line 
1/**************************************************************************
2 *
3 * text.pass2.cpp -- Text compression (Pass 2)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * $Id: text.pass2.cpp 856 2000-01-14 02:26:25Z sjboddie $
22 *
23 **************************************************************************/
24
25
26#include "sysfuncs.h"
27
28#include "memlib.h"
29#include "messages.h"
30#include "local_strings.h"
31#include "bitio_m_stdio.h"
32#include "huffman.h"
33#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
34
35#include "mg.h"
36#include "mg_files.h"
37#include "build.h"
38#include "words.h"
39#include "text.h"
40#include "hash.h"
41#include "locallib.h"
42#include "comp_dict.h"
43
44#include "FText.h"
45
46
47/*
48 $Log$
49 Revision 1.1 2000/01/14 02:26:24 sjboddie
50 Rodgers new C++ mg
51
52 Revision 1.1 1999/10/11 02:58:38 cs025
53 Base install of MG-PP
54
55 Revision 1.1 1999/08/10 21:18:25 sjboddie
56 renamed mg-1.3d directory mg
57
58 Revision 1.2 1998/12/17 09:12:54 rjmcnab
59
60 Altered mg to process utf-8 encoded Unicode. The main changes
61 are in the parsing of the input, the casefolding, and the stemming.
62
63 Revision 1.1 1998/11/17 09:35:47 rjmcnab
64 *** empty log message ***
65
66 * Revision 1.3 1994/10/20 03:57:10 tes
67 * I have rewritten the boolean query optimiser and abstracted out the
68 * components of the boolean query.
69 *
70 * Revision 1.2 1994/09/20 04:42:14 tes
71 * For version 1.1
72 *
73 */
74
// Size in bytes of each string-pool chunk used to store novel words.
#define POOL_SIZE 1024*256

// One chunk in a growable chain of string pools.  Novel words are copied
// into pool[] length-prefixed; ptr is the next free byte and left counts
// the unused bytes remaining in this chunk.
struct char_pool {
  struct char_pool *next;   // next chunk in the chain (NULL at the tail)
  u_long left;              // bytes still free in this chunk
  u_char *ptr;              // first free byte within pool[]
  u_char pool[POOL_SIZE];   // raw storage for length-prefixed words
};

// One slot of a novel-word hash table: the word (a pointer into a
// char_pool, length-prefixed) and the order in which it was first seen.
struct novel_hash_rec {
  u_long ordinal_num;       // 1-based ordinal of first appearance
  u_char *word;             // NULL marks an empty slot
};


// Initial (prime) number of slots in each novel-word hash table.
#define INITIAL_HASH_SIZE 7927

// Open-addressed hash table of words that are absent from the
// compression dictionary; one instance is kept per word type (see nht).
struct novel_hash_table {
  novel_hash_rec *HashTable;      // slot array, grown by rehashing when half full
  u_long HashSize, HashUsed;      // capacity and number of occupied slots
  char_pool *first_pool;          // head of the string-pool chain
  char_pool *pool;                // chunk currently being filled
  u_long next_num, binary_start;  // next ordinal to assign; binary_start only initialised here -- TODO confirm use elsewhere
  novel_hash_rec **code_to_nhr;   // set to NULL in this pass; presumably filled during decompression -- verify
};


static FILE *text;                     // compressed text output file
static stdio_bitio_buffer textOutBuf;  // bit-level encoder attached to `text'
static u_long headLen;                 // bytes of header (magic + cth) preceding the bit stream

static novel_hash_table nht[2];        // novel words: [0] = non-words, [1] = words

// Block boundaries for the MG_NOVEL_HYBRID ordinal coding, per word type.
int blk_start[2][33], blk_end[2][33];

// 0 = non-word, 1 = word
unsigned char whichWordType = 0;

// Per-level-tag compressed-text pointers collected during this pass.
CompressTextInfoMap textInfoMap;
114
115
116static char_pool *new_pool (char_pool * pool) {
117 char_pool *p = (char_pool *) Xmalloc (sizeof (char_pool));
118 if (!p) FatalError (1, "Unable to allocate memory for pool");
119
120 if (pool) pool->next = p;
121 p->next = NULL;
122 p->left = POOL_SIZE;
123 p->ptr = p->pool;
124 return p;
125}
126
127
128int init_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
129 char path[512];
130
131 // load the compression dictionary
132 if (LoadCompressionDictionary (make_name (fileName, TEXT_DICT_SUFFIX,
133 path)) == COMPERROR)
134 return COMPERROR;
135
136 // create the compressed text file and attach it to the text buffer
137 if (!(text = create_file (fileName, TEXT_SUFFIX, "w+b",
138 MAGIC_TEXT, MG_MESSAGE)))
139 return COMPERROR;
140 textOutBuf.attachFile (text);
141 textOutBuf.encodeStart ();
142
143 // write out the compressed text header
144 bzero ((char *) &cth, sizeof (cth));
145 if (fwrite (&cth, sizeof (cth), 1, text) != 1)
146 return COMPERROR;
147
148 // the length of the compressed header is the size of the
149 // magic number plus the size of the compressed text header
150 headLen = sizeof (u_long) + sizeof (cth);
151
152 // start off with non-word
153 whichWordType = 0;
154
155 // clear the text pointers
156 textInfoMap.erase (textInfoMap.begin(), textInfoMap.end());
157
158 // set up the hash table of novel words
159 int i;
160 if (cdh.novel_method != MG_NOVEL_HUFFMAN_CHARS)
161 for (i = 0; i <= 1; i++)
162 {
163 nht[i].HashSize = INITIAL_HASH_SIZE;
164 nht[i].HashTable = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * nht[i].HashSize);
165 bzero ((char *) nht[i].HashTable,
166 sizeof (novel_hash_rec) * nht[i].HashSize);
167 nht[i].HashUsed = 0;
168 nht[i].HashSize = INITIAL_HASH_SIZE;
169 nht[i].pool = nht[i].first_pool = new_pool (NULL);
170 nht[i].next_num = 1;
171 nht[i].binary_start = 1;
172 nht[i].code_to_nhr = NULL;
173 if (cdh.novel_method == MG_NOVEL_HYBRID)
174 {
175 int num;
176 num = 1;
177 blk_start[i][0] = 0;
178 blk_end[i][0] = cdh.num_words[i] - 1;
179 while (num < 33)
180 {
181 blk_start[i][num] = blk_end[i][num - 1] + 1;
182 blk_end[i][num] = blk_start[i][num] +
183 (blk_end[i][num - 1] - blk_start[i][num - 1]) * 2;
184 num++;
185 }
186 }
187 }
188
189 return (COMPALLOK);
190}
191
192
/* Compress l_in bytes of text starting at s_in onto the given bit
 * buffer, alternating non-word / word parses.  `which' selects the
 * current token type (0 = non-word, 1 = word) and is flipped after
 * every token; it is passed by reference so the parse phase carries
 * across successive calls.  `numWords' is incremented once per word
 * token.  Returns COMPALLOK, or COMPERROR when an unknown word is met
 * under a complete dictionary or memory runs out while growing the
 * novel-word table.  Relies on the globals ht, cdh, lens_codes/huff and
 * char_codes/huff declared in the compression-dictionary headers.
 */
static int compress_text (bitio_buffer &buffer, const u_char *s_in, 
			  int l_in, unsigned char &which,
			  unsigned long &numWords) {
  const u_char *end = s_in + l_in - 1;

  for (; s_in <= end; which = !which) {
    u_char Word[MAXWORDLEN + 1];  // length-prefixed: Word[0] is the length
    int res;

    if (which) numWords++;

    /* First parse a word or non-word out of the string */
    if (which) PARSE_WORD (Word, s_in, end);
    else PARSE_NON_WORD (Word, s_in, end);

    /* Search the hash table for Word */
    if (ht[which]) {
      register unsigned long hashval, step;
      register int tsize = ht[which]->size;
      register u_char **wptr;
      HASH (hashval, step, Word, tsize);
      for (;;) {
	register u_char *s1;
	register u_char *s2;
	register int len;
	wptr = ht[which]->table[hashval];
	if (wptr == NULL) {
	  // empty slot reached: the word is not in the dictionary
	  res = COMPERROR;
	  break;
	}

	/* Compare the words (length byte first, then the characters) */
	s1 = Word;
	s2 = *wptr;
	len = *s1 + 1;
	for (; len; len--)
	  if (*s1++ != *s2++) break;

	if (len) {
	  // mismatch: open addressing, probe onward by the second hash
	  hashval += step;
	  if (hashval >= (unsigned long)tsize) hashval -= tsize;

	} else {
	  // match: the code is the word's index in the dictionary array
	  res = ht[which]->table[hashval] - ht[which]->words;
	  break;
	}
      }
    }
    else
      res = COMPERROR;
    /* Check that the word was found in the dictionary */
    if (res == COMPERROR)
      {
	if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
	  {
	    // a complete dictionary must contain every word: hard error
	    Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	    return (COMPERROR);
	  }
	if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
	  {
	    // emit the escape code, then spell the word out with the
	    // length and character Huffman codes
	    u_long i;
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    buffer.huff_encode (Word[0], lens_codes[which], lens_huff[which].clens, NULL);
	    for (i = 0; i < Word[0]; i++)
	      buffer.huff_encode (Word[i + 1], char_codes[which],
				  char_huff[which].clens, NULL);
	  }
	if (cdh.dict_type == MG_SEED_DICTIONARY)
	  {
	    // emit the escape code, then the novel word in whichever
	    // representation the header selected
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    switch (cdh.novel_method)
	      {
	      case MG_NOVEL_HUFFMAN_CHARS:
		{
		  // spell the word out character by character
		  u_long i;
		  buffer.huff_encode (Word[0], lens_codes[which],
				      lens_huff[which].clens, NULL);
		  for (i = 0; i < Word[0]; i++)
		    buffer.huff_encode (Word[i + 1], char_codes[which],
					char_huff[which].clens, NULL);
		}
		break;
	      case MG_NOVEL_DELTA:
	      case MG_NOVEL_HYBRID:
		{
		  // look the word up in (or insert it into) the
		  // novel-word hash table to obtain its ordinal number
		  register unsigned long hashval, step;
		  register novel_hash_table *h = &nht[which];
		  register int hsize = h->HashSize;
		  register novel_hash_rec *ent;
		  HASH (hashval, step, Word, hsize);
		  for (;;)
		    {
		      register u_char *s1, *s2;
		      register int len;
		      ent = h->HashTable + hashval;
		      if (!ent->word)
			{
			  // free slot: brand-new novel word -- copy it
			  // into the string pool and assign the next
			  // ordinal
			  int len = *Word + 1;
			  if ((unsigned)len > h->pool->left)
			    h->pool = new_pool (h->pool);
			  ent->word = h->pool->ptr;
			  ent->ordinal_num = h->next_num++;
			  memcpy (h->pool->ptr, Word, len);
			  h->pool->ptr += len;
			  h->pool->left -= len;
			  h->HashUsed++;
			  break;
			}
		      /* Compare the words */
		      s1 = Word;
		      s2 = ent->word;
		      len = *s1 + 1;
		      for (; len; len--)
			if (*s1++ != *s2++)
			  break;

		      if (!len)
			break;  // word already seen: reuse its ordinal

		      hashval = (hashval + step);
		      if (hashval >= (unsigned)hsize) hashval -= hsize;
		    }

		  // encode the ordinal by the selected method
		  switch (cdh.novel_method) {
		  case MG_NOVEL_DELTA:
		    buffer.delta_encode (ent->ordinal_num, NULL);
		    break;
		  case MG_NOVEL_HYBRID:
		    // find the doubling block k that contains ordinal j,
		    // then emit gamma(k+1) plus j's offset in the block
		    int k = 0;
		    int j = ent->ordinal_num - 1;
		    while (j > blk_end[which][k])
		      k++;
		    assert (j - blk_start[which][k] + 1 >= 1 &&
			    j - blk_start[which][k] + 1 <=
			    blk_end[which][k] - blk_start[which][k] + 1);

		    buffer.gamma_encode (k + 1, NULL);
		    buffer.binary_encode (j - blk_start[which][k] + 1,
					  blk_end[which][k] - blk_start[which][k] + 1,
					  NULL);
		    break;
		  }

		  // grow the table once it is at least half full, to a
		  // prime roughly double the current size
		  if (h->HashUsed >= h->HashSize >> 1)
		    {
		      novel_hash_rec *ht;
		      unsigned long size;
		      unsigned long i;
		      size = prime (h->HashSize * 2);
		      if (!(ht = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * size)))
			{
			  Message ("Unable to allocate memory for table");
			  return (COMPERROR);
			}
		      bzero ((char *) ht, sizeof (novel_hash_rec) * size);

		      // re-insert every occupied slot into the new table
		      for (i = 0; i < h->HashSize; i++)
			if (h->HashTable[i].word)
			  {
			    register u_char *wptr;
			    register unsigned long hashval, step;

			    wptr = h->HashTable[i].word;
			    HASH (hashval, step, wptr, size);
			    wptr = (ht + hashval)->word;
			    while (wptr)
			      {
				hashval += step;
				if (hashval >= size)
				  hashval -= size;
				wptr = (ht + hashval)->word;
			      }
			    ht[hashval] = h->HashTable[i];
			  }
		      Xfree (h->HashTable);
		      h->HashTable = ht;
		      h->HashSize = size;
		    }
		}
		break;
	      }
	  }
      }
    else
      {
	// known word: emit its Huffman code
	buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
      }
  }

  return COMPALLOK;
}
392
393
394int process_text_2 (const TagInfo &tagInfo, const TextElArray &doc) {
395 // we need to remember where the last text element ends
396 unsigned char endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
397 unsigned long endPos = textOutBuf.GetBytesWritten () + headLen;
398
399 // process each text element
400 TextElArray::const_iterator here = doc.begin();
401 TextElArray::const_iterator end = doc.end();
402 while (here != end) {
403 // remember current place in compressed text
404 unsigned char startWhich = whichWordType;
405 unsigned char startBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
406 unsigned long startPos = textOutBuf.GetBytesWritten () + headLen;
407
408 // compress the text
409 if (compress_text (textOutBuf,
410 (*here).text.begin(),
411 (*here).text.size(),
412 whichWordType,
413 cth.num_of_words) != COMPALLOK)
414 return COMPERROR;
415
416 endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
417 endPos = textOutBuf.GetBytesWritten () + headLen;
418
419 // update the pointers if this is a level tag
420 if (tagInfo.levelTags.find ((*here).tagName) != tagInfo.levelTags.end()) {
421 if ((*here).elType == OpenTagE) {
422 // set the start of the document
423 textInfoMap[(*here).tagName].SetStart (startPos, startBit, startWhich);
424
425 } else if ((*here).elType == CloseTagE) {
426 // set the end of the document
427 textInfoMap[(*here).tagName].SetEnd (endPos, endBit);
428
429 } else {
430 // TextE, nothing needs to be done
431 }
432 }
433
434 cth.num_of_bytes += (*here).text.size();
435 here++;
436 }
437
438 // close off any unclosed tags
439 CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
440 CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
441 while (tiHere != tiEnd) {
442 if ((*tiHere).second.inDoc) (*tiHere).second.SetEnd (endPos, endBit);
443 tiHere++;
444 }
445
446 // we've processed one more document
447 cth.num_of_docs++;
448
449 return COMPALLOK;
450}
451
452
453static int write_aux_dict (char *fileName) {
454 int i;
455 FILE *aux;
456 if (!(aux = create_file (fileName, TEXT_DICT_AUX_SUFFIX, "wb",
457 MAGIC_AUX_DICT, MG_MESSAGE)))
458 return COMPERROR;
459
460 for (i = 0; i <= 1; i++)
461 {
462 aux_frags_header afh;
463 char_pool *cp;
464
465 afh.num_frags = nht[i].HashUsed;
466 afh.mem_for_frags = 0;
467 for (cp = nht[i].first_pool; cp; cp = cp->next)
468 afh.mem_for_frags += POOL_SIZE - cp->left;
469
470 /* [RPAP - Jan 97: Endian Ordering] */
471 HTONUL(afh.num_frags);
472 HTONUL(afh.mem_for_frags);
473
474 fwrite (&afh, sizeof (afh), 1, aux);
475
476 for (cp = nht[i].first_pool; cp; cp = cp->next)
477 fwrite (cp->pool, POOL_SIZE - cp->left, sizeof (u_char), aux);
478 }
479 fclose (aux);
480 return COMPALLOK;
481}
482
483
/* Estimate, in bytes, how large the auxiliary dictionary would be if
 * its novel-word fragments were Huffman coded, by tallying length and
 * character frequencies over every string pool.
 *
 * NOTE(review): `aux_compressed' and `total_uncomp' are locals that are
 * never read after the loop, so the function currently computes the
 * estimate and discards it -- presumably leftover from mg's statistics
 * reporting; confirm before removing the call.
 */
static void estimate_compressed_aux_dict (void) {
  int i;
  u_long aux_compressed = 0, total_uncomp = 0;
  // one pass per word type (0 = non-word, 1 = word)
  for (i = 0; i <= 1; i++)
    {
      int j;
      long chars[256], fchars[256];   // character counts: raw / floored
      long lens[16], flens[16];       // word-length counts: raw / floored
      char_pool *cp;
      bzero ((char *) chars, sizeof (chars));
      bzero ((char *) lens, sizeof (lens));
      // walk every pool chunk; fragments are stored length-prefixed
      for (cp = nht[i].first_pool; cp; cp = cp->next)
	{
	  u_char *buf = cp->pool;
	  while (buf != cp->ptr)
	    {
	      int len = *buf++;
	      lens[len]++;
	      total_uncomp += len + 4;
	      for (; len; len--)
		chars[*buf++]++;
	    }
	}
      // floor unseen-but-possible symbols at frequency 1 so the
      // Huffman size calculation stays well defined
      for (j = 0; j < 256; j++)
	if (!chars[j] && PESINAWORD (j) == i)
	  fchars[j] = 1;
	else
	  fchars[j] = chars[j];
      for (j = 0; j < 16; j++)
	if (!lens[j])
	  flens[j] = 1;
	else
	  flens[j] = lens[j];

      // Huffman-coded size of the length and character streams, bits -> bytes
      aux_compressed += (long unsigned int) ((Calculate_Huffman_Size (16, flens, lens) +
			      Calculate_Huffman_Size (256, fchars, chars)) / 8);

    }
}
523
524
525
526static bool WriteTextIdx (char *fileName, FTextLevel &textLevels) {
527 // create the text index
528 FILE *textIdxFile;
529 if (!(textIdxFile = create_file (fileName, TEXT_IDX_SUFFIX, "w+b",
530 MAGIC_TEXI, MG_MESSAGE)))
531 return false;
532
533 // output each level
534 CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
535 CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
536 while (tiHere != tiEnd) {
537 // remember the level information
538 TextLevelInfo levelInfo;
539 levelInfo.levelTag = (*tiHere).first;
540 levelInfo.textIdxPtr = ftell (textIdxFile);
541 levelInfo.numEntries = (*tiHere).second.docPtrs.size();
542 textLevels.levelInfo[levelInfo.levelTag] = levelInfo;
543
544 // write out the index array for this level
545 if (!WriteTextIdxArray (textIdxFile, (*tiHere).second.docPtrs))
546 return false;
547
548 tiHere++;
549 }
550
551 // close the file
552 fclose (textIdxFile);
553
554 return true;
555}
556
557
558bool WriteTextLevels (char *fileName, const FTextLevel &textLevels) {
559 // cout << textLevels;
560
561 // create the text index
562 FILE *textLevelFile;
563 if (!(textLevelFile = create_file (fileName, TEXT_LEVEL_SUFFIX, "w+b",
564 MAGIC_TEXT_LEVELS, MG_MESSAGE)))
565 return false;
566
567 // output each level
568 if (!textLevels.Write (textLevelFile)) return false;
569
570 // close the file
571 fclose (textLevelFile);
572
573 return true;
574}
575
576
577int done_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
578 // write out any remaining bits
579 textOutBuf.encodeDone();
580
581 // write out the compressed text header (with updated statistics)
582 if (fseek (text, sizeof (u_long), SEEK_SET) == -1 || !cth.Write (text))
583 return COMPERROR;
584 fclose (text);
585
586 // write out the text index file and get the level information
587 FTextLevel textLevels;
588 if (!WriteTextIdx (fileName, textLevels)) return COMPERROR;
589
590 // write out the text level information
591 if (!WriteTextLevels (fileName, textLevels)) return COMPERROR;
592
593 // write out the auxiliary dictionary
594 if (cdh.dict_type != MG_COMPLETE_DICTIONARY &&
595 (cdh.novel_method == MG_NOVEL_DELTA ||
596 cdh.novel_method == MG_NOVEL_HYBRID)) {
597 if (write_aux_dict (fileName) == COMPERROR) return COMPERROR;
598 estimate_compressed_aux_dict ();
599
600 } else {
601 unlink (make_name (fileName, TEXT_DICT_AUX_SUFFIX, NULL));
602 }
603
604 return (COMPALLOK);
605}
Note: See TracBrowser for help on using the repository browser.