source: trunk/indexers/mgpp/text/text.pass2.cpp@ 8692

Last change on this file since 8692 was 8692, checked in by kjdon, 19 years ago

Added the changes from Emanuel Dejanu (Simple Words) - mostly efficiency changes. For example, changing i++ to ++i, delete xxx to delete []xxx, some stuff to do with UCArrays...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.2 KB
Line 
1/**************************************************************************
2 *
3 * text.pass2.cpp -- Text compression (Pass 2)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__)
24#pragma warning(disable:4786)
25#endif
26
27// need this to avoid bizarre compiler problems under VC++ 6.0
28#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
29# include <iostream>
30#endif
31
32#if defined (__WIN32__)
33# include <stdio.h>
34# define unlink _unlink
35#endif
36
37#include "sysfuncs.h"
38#include "memlib.h"
39#include "messages.h"
40#include "local_strings.h"
41#include "bitio_m_stdio.h"
42#include "huffman.h"
43#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
44#include "mg.h"
45#include "mg_files.h"
46#include "build.h"
47#include "words.h"
48#include "text.h"
49#include "hash.h"
50#include "locallib.h"
51#include "comp_dict.h"
52#include "FText.h"
53
54#define POOL_SIZE 1024*256
55
56
57struct char_pool {
58 struct char_pool *next;
59 u_long left;
60 u_char *ptr;
61 u_char pool[POOL_SIZE];
62};
63
64struct novel_hash_rec {
65 u_long ordinal_num;
66 u_char *word;
67};
68
69
70#define INITIAL_HASH_SIZE 7927
71
72struct novel_hash_table {
73 novel_hash_rec *HashTable;
74 u_long HashSize, HashUsed;
75 char_pool *first_pool;
76 char_pool *pool;
77 u_long next_num, binary_start;
78 novel_hash_rec **code_to_nhr;
79};
80
81
82static FILE *text;
83static stdio_bitio_buffer textOutBuf;
84static u_long headLen;
85
86static novel_hash_table nht[2];
87
88int blk_start[2][33], blk_end[2][33];
89
90// 0 = non-word, 1 = word
91unsigned char whichWordType = 0;
92
93CompressTextInfoMap textInfoMap;
94
95
96static char_pool *new_pool (char_pool * pool) {
97 char_pool *p = (char_pool *) Xmalloc (sizeof (char_pool));
98 if (!p) FatalError (1, "Unable to allocate memory for pool");
99
100 if (pool) pool->next = p;
101 p->next = NULL;
102 p->left = POOL_SIZE;
103 p->ptr = p->pool;
104 return p;
105}
106
107
108int init_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
109 char path[512];
110
111 // load the compression dictionary
112 if (LoadCompressionDictionary (make_name (fileName, TEXT_DICT_SUFFIX,
113 path)) == COMPERROR)
114 return COMPERROR;
115
116 // create the compressed text file and attach it to the text buffer
117 if (!(text = create_file (fileName, TEXT_SUFFIX, "w+b",
118 MAGIC_TEXT, MG_MESSAGE)))
119 return COMPERROR;
120 textOutBuf.attachFile (text);
121 textOutBuf.encodeStart ();
122
123 // write out the compressed text header
124 memset (&cth, '\0', sizeof (cth));
125 if (fwrite (&cth, sizeof (cth), 1, text) != 1)
126 return COMPERROR;
127
128 // the length of the compressed header is the size of the
129 // magic number plus the size of the compressed text header
130 headLen = sizeof (u_long) + sizeof (cth);
131
132 // start off with non-word
133 whichWordType = 0;
134
135 // clear the text pointers
136 textInfoMap.erase (textInfoMap.begin(), textInfoMap.end());
137
138 // set up the hash table of novel words
139 int i;
140 if (cdh.novel_method != MG_NOVEL_HUFFMAN_CHARS)
141 for (i = 0; i <= 1; ++i)
142 {
143 nht[i].HashSize = INITIAL_HASH_SIZE;
144 nht[i].HashTable = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * nht[i].HashSize);
145 memset (nht[i].HashTable, '\0',
146 sizeof (novel_hash_rec) * nht[i].HashSize);
147 nht[i].HashUsed = 0;
148 nht[i].HashSize = INITIAL_HASH_SIZE;
149 nht[i].pool = nht[i].first_pool = new_pool (NULL);
150 nht[i].next_num = 1;
151 nht[i].binary_start = 1;
152 nht[i].code_to_nhr = NULL;
153 if (cdh.novel_method == MG_NOVEL_HYBRID)
154 {
155 int num;
156 num = 1;
157 blk_start[i][0] = 0;
158 blk_end[i][0] = cdh.num_words[i] - 1;
159 while (num < 33)
160 {
161 blk_start[i][num] = blk_end[i][num - 1] + 1;
162 blk_end[i][num] = blk_start[i][num] +
163 (blk_end[i][num - 1] - blk_start[i][num - 1]) * 2;
164 ++num;
165 }
166 }
167 }
168
169 return (COMPALLOK);
170}
171
172
/* Compress a run of raw text into the output bit buffer.
 *
 * buffer   -- bit-level output buffer the codes are emitted into
 * s_in     -- start of the text bytes to compress
 * l_in     -- number of bytes at s_in
 * which    -- current parse mode (0 = non-word, 1 = word); it flips on
 *             every iteration and is a reference so the alternation
 *             carries across successive calls
 * numWords -- incremented once per word parsed (reference, cumulative)
 *
 * Known words are emitted as Huffman codes from the pass-1 dictionary;
 * unknown ("novel") words are handled according to cdh.dict_type and
 * cdh.novel_method.  Returns COMPALLOK, or COMPERROR if an unknown word
 * is met under MG_COMPLETE_DICTIONARY.
 */
static int compress_text (bitio_buffer &buffer, const u_char *s_in,
			  int l_in, unsigned char &which,
			  unsigned long &numWords) {
  const u_char *end = s_in + l_in - 1;

  // words and non-words strictly alternate in the MG text model
  for (; s_in <= end; which = !which) {
    u_char Word[MAXWORDLEN + 1];   // length-prefixed: Word[0] = length
    int res;

    if (which) ++numWords;

    /* First parse a word or non-word out of the string */
    if (which) PARSE_WORD (Word, s_in, end);
    else PARSE_NON_WORD (Word, s_in, end);

    /* Search the hash table for Word */
    if (ht[which]) {
      register unsigned long hashval, step;
      register int tsize = ht[which]->size;
      register u_char **wptr;
      HASH (hashval, step, Word, tsize);
      // open addressing with double hashing: an empty slot proves the
      // word is not in the dictionary
      for (;;) {
	register u_char *s1;
	register u_char *s2;
	register int len;
	wptr = ht[which]->table[hashval];
	if (wptr == NULL) {
	  res = COMPERROR;
	  break;
	}

	/* Compare the words (the leading length byte participates too) */
	s1 = Word;
	s2 = *wptr;
	len = *s1 + 1;
	for (; len; --len)
	  if (*s1++ != *s2++) break;

	if (len) {
	  // mismatch: advance to the next probe slot, wrapping around
	  hashval += step;
	  if (hashval >= (unsigned long)tsize) hashval -= tsize;

	} else {
	  // found: the code is the word's ordinal in the words array
	  res = ht[which]->table[hashval] - ht[which]->words;
	  break;
	}
      }
    }
    else
      res = COMPERROR;
    /* Check that the word was found in the dictionary */
    if (res == COMPERROR)
      {
	if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
	  {
	    // a complete dictionary must contain every word: hard error
	    Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	    return (COMPERROR);
	  }
	if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
	  {
	    // emit the escape code, then spell the word out with the
	    // length and character Huffman models
	    u_long i;
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;   // escape symbol
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    buffer.huff_encode (Word[0], lens_codes[which], lens_huff[which].clens, NULL);
	    for (i = 0; i < Word[0]; ++i)
	      buffer.huff_encode (Word[i + 1], char_codes[which],
				  char_huff[which].clens, NULL);
	  }
	if (cdh.dict_type == MG_SEED_DICTIONARY)
	  {
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;   // escape symbol
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    switch (cdh.novel_method)
	      {
	      case MG_NOVEL_HUFFMAN_CHARS:
		{
		  // spell the novel word character by character
		  u_long i;
		  buffer.huff_encode (Word[0], lens_codes[which],
				      lens_huff[which].clens, NULL);
		  for (i = 0; i < Word[0]; ++i)
		    buffer.huff_encode (Word[i + 1], char_codes[which],
					char_huff[which].clens, NULL);
		}
		break;
	      case MG_NOVEL_DELTA:
	      case MG_NOVEL_HYBRID:
		{
		  // look the word up in (or add it to) the novel-word
		  // hash table, then encode its ordinal number
		  register unsigned long hashval, step;
		  register novel_hash_table *h = &nht[which];
		  register int hsize = h->HashSize;
		  register novel_hash_rec *ent;
		  HASH (hashval, step, Word, hsize);
		  for (;;)
		    {
		      register u_char *s1, *s2;
		      register int len;
		      ent = h->HashTable + hashval;
		      if (!ent->word)
			{
			  // empty slot: first sighting of this word --
			  // copy it into the string pool and number it
			  int len = *Word + 1;
			  if ((unsigned)len > h->pool->left)
			    h->pool = new_pool (h->pool);
			  ent->word = h->pool->ptr;
			  ent->ordinal_num = h->next_num++;
			  memcpy (h->pool->ptr, Word, len);
			  h->pool->ptr += len;
			  h->pool->left -= len;
			  ++h->HashUsed;
			  break;
			}
		      /* Compare the words */
		      s1 = Word;
		      s2 = ent->word;
		      len = *s1 + 1;
		      for (; len; --len)
			if (*s1++ != *s2++)
			  break;

		      if (!len)
			break;   // match: reuse existing ordinal

		      hashval = (hashval + step);
		      if (hashval >= (unsigned)hsize) hashval -= hsize;
		    }

		  switch (cdh.novel_method) {
		  case MG_NOVEL_DELTA:
		    buffer.delta_encode (ent->ordinal_num, NULL);
		    break;
		  case MG_NOVEL_HYBRID:
		    // find the doubling block containing the ordinal,
		    // then emit block number (gamma) + offset (binary)
		    int k = 0;
		    int j = ent->ordinal_num - 1;
		    while (j > blk_end[which][k])
		      ++k;
		    assert (j - blk_start[which][k] + 1 >= 1 &&
			    j - blk_start[which][k] + 1 <=
			    blk_end[which][k] - blk_start[which][k] + 1);

		    buffer.gamma_encode (k + 1, NULL);
		    buffer.binary_encode (j - blk_start[which][k] + 1,
					  blk_end[which][k] - blk_start[which][k] + 1,
					  NULL);
		    break;
		  }

		  // grow the hash table once it is half full, rehashing
		  // every stored entry into the larger table
		  if (h->HashUsed >= h->HashSize >> 1)
		    {
		      novel_hash_rec *ht;
		      unsigned long size;
		      unsigned long i;
		      size = prime (h->HashSize * 2);
		      if (!(ht = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * size)))
			{
			  Message ("Unable to allocate memory for table");
			  return (COMPERROR);
			}
		      memset (ht, '\0', sizeof (novel_hash_rec) * size);

		      for (i = 0; i < h->HashSize; ++i)
			if (h->HashTable[i].word)
			  {
			    register u_char *wptr;
			    register unsigned long hashval, step;

			    wptr = h->HashTable[i].word;
			    HASH (hashval, step, wptr, size);
			    // linear probe with the new step until an
			    // empty slot is found in the new table
			    wptr = (ht + hashval)->word;
			    while (wptr)
			      {
				hashval += step;
				if (hashval >= size)
				  hashval -= size;
				wptr = (ht + hashval)->word;
			      }
			    ht[hashval] = h->HashTable[i];
			  }
		      Xfree (h->HashTable);
		      h->HashTable = ht;
		      h->HashSize = size;
		    }
		}
		break;
	      }
	  }
      }
    else
      {
	// dictionary hit: emit the word's Huffman code directly
	buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
      }
  }

  return COMPALLOK;
}
372
373
374int process_text_2 (const TagInfo &tagInfo, const TextElArray &doc) {
375 // we need to remember where the last text element ends
376 unsigned char endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
377 unsigned long endPos = textOutBuf.GetBytesWritten () + headLen;
378
379 // process each text element
380 TextElArray::const_iterator here = doc.begin();
381 TextElArray::const_iterator end = doc.end();
382 while (here != end) {
383 // remember current place in compressed text
384 unsigned char startWhich = whichWordType;
385 unsigned char startBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
386 unsigned long startPos = textOutBuf.GetBytesWritten () + headLen;
387
388 // compress the text
389 if (compress_text (textOutBuf,
390 &(here->text[0]),
391 (*here).text.size(),
392 whichWordType,
393 cth.num_of_words) != COMPALLOK)
394 return COMPERROR;
395
396 endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
397 endPos = textOutBuf.GetBytesWritten () + headLen;
398
399 // update the pointers if this is a level tag
400 if (tagInfo.levelTags.find ((*here).tagName) != tagInfo.levelTags.end()) {
401 if ((*here).elType == OpenTagE) {
402 // set the start of the document
403 textInfoMap[(*here).tagName].SetStart (startPos, startBit, startWhich);
404
405 } else if ((*here).elType == CloseTagE) {
406 // set the end of the document
407 textInfoMap[(*here).tagName].SetEnd (endPos, endBit);
408
409 } else {
410 // TextE, nothing needs to be done
411 }
412 }
413
414 cth.num_of_bytes += (*here).text.size();
415 ++here;
416 }
417
418 // close off any unclosed tags
419 CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
420 CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
421 while (tiHere != tiEnd) {
422 if ((*tiHere).second.inDoc) (*tiHere).second.SetEnd (endPos, endBit);
423 ++tiHere;
424 }
425
426 // we've processed one more document
427 ++cth.num_of_docs;
428
429 return COMPALLOK;
430}
431
432
433static int write_aux_dict (char *fileName) {
434 int i;
435 FILE *aux;
436 if (!(aux = create_file (fileName, TEXT_DICT_AUX_SUFFIX, "wb",
437 MAGIC_AUX_DICT, MG_MESSAGE)))
438 return COMPERROR;
439
440 for (i = 0; i <= 1; ++i)
441 {
442 aux_frags_header afh;
443 char_pool *cp;
444
445 afh.num_frags = nht[i].HashUsed;
446 afh.mem_for_frags = 0;
447 for (cp = nht[i].first_pool; cp; cp = cp->next)
448 afh.mem_for_frags += POOL_SIZE - cp->left;
449
450 /* [RPAP - Jan 97: Endian Ordering] */
451 HTONUL(afh.num_frags);
452 HTONUL(afh.mem_for_frags);
453
454 fwrite (&afh, sizeof (afh), 1, aux);
455
456 for (cp = nht[i].first_pool; cp; cp = cp->next)
457 fwrite (cp->pool, POOL_SIZE - cp->left, sizeof (u_char), aux);
458 }
459 fclose (aux);
460 return COMPALLOK;
461}
462
463
// Estimate the Huffman-compressed size of the auxiliary dictionary by
// building character and length frequency tables over every stored
// novel-word fragment.
// NOTE(review): both accumulators (aux_compressed, total_uncomp) are
// local and discarded on return, so this function currently has no
// observable effect -- presumably the totals were once reported as
// statistics; confirm before removing or wiring up output.
static void estimate_compressed_aux_dict (void) {
  int i;
  u_long aux_compressed = 0, total_uncomp = 0;
  for (i = 0; i <= 1; ++i)
    {
      int j;
      long chars[256], fchars[256];
      long lens[16], flens[16];
      char_pool *cp;
      memset (chars, '\0', sizeof (chars));
      memset (lens, '\0', sizeof (lens));
      // walk every pool's used region; each fragment is a length byte
      // followed by that many characters
      for (cp = nht[i].first_pool; cp; cp = cp->next)
	{
	  u_char *buf = cp->pool;
	  while (buf != cp->ptr)
	    {
	      int len = *buf++;
	      ++lens[len];
	      total_uncomp += len + 4;
	      for (; len; --len)
		++chars[*buf++];
	    }
	}
      // give unseen symbols of the matching class a pseudo-count of 1
      // so the Huffman model can assign every symbol a code
      for (j = 0; j < 256; ++j)
	if (!chars[j] && PESINAWORD (j) == i)
	  fchars[j] = 1;
	else
	  fchars[j] = chars[j];
      for (j = 0; j < 16; ++j)
	if (!lens[j])
	  flens[j] = 1;
	else
	  flens[j] = lens[j];

      // estimated bytes = (length-model bits + char-model bits) / 8
      aux_compressed += (long unsigned int) ((Calculate_Huffman_Size (16, flens, lens) +
					      Calculate_Huffman_Size (256, fchars, chars)) / 8);

    }
}
503
504
505
506static bool WriteTextIdx (char *fileName, FTextLevel &textLevels) {
507 // create the text index
508 FILE *textIdxFile;
509 if (!(textIdxFile = create_file (fileName, TEXT_IDX_SUFFIX, "w+b",
510 MAGIC_TEXI, MG_MESSAGE)))
511 return false;
512
513 // output each level
514 CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
515 CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
516 while (tiHere != tiEnd) {
517 // remember the level information
518 TextLevelInfo levelInfo;
519 levelInfo.levelTag = (*tiHere).first;
520 levelInfo.textIdxPtr = ftell (textIdxFile);
521 levelInfo.numEntries = (*tiHere).second.docPtrs.size();
522 textLevels.levelInfo[levelInfo.levelTag] = levelInfo;
523
524 // write out the index array for this level
525 if (!WriteTextIdxArray (textIdxFile, (*tiHere).second.docPtrs))
526 return false;
527
528 ++tiHere;
529 }
530
531 // close the file
532 fclose (textIdxFile);
533
534 return true;
535}
536
537
538bool WriteTextLevels (char *fileName, const FTextLevel &textLevels) {
539 // cout << textLevels;
540
541 // create the text index
542 FILE *textLevelFile;
543 if (!(textLevelFile = create_file (fileName, TEXT_LEVEL_SUFFIX, "w+b",
544 MAGIC_TEXT_LEVELS, MG_MESSAGE)))
545 return false;
546
547 // output each level
548 if (!textLevels.Write (textLevelFile)) return false;
549
550 // close the file
551 fclose (textLevelFile);
552
553 return true;
554}
555
556
557int done_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
558 // write out any remaining bits
559 textOutBuf.encodeDone();
560
561 // write out the compressed text header (with updated statistics)
562 if (fseek (text, sizeof (u_long), SEEK_SET) == -1 || !cth.Write (text))
563 return COMPERROR;
564 fclose (text);
565
566 // write out the text index file and get the level information
567 FTextLevel textLevels;
568 if (!WriteTextIdx (fileName, textLevels)) return COMPERROR;
569
570 // write out the text level information
571 if (!WriteTextLevels (fileName, textLevels)) return COMPERROR;
572
573 // write out the auxiliary dictionary
574 if (cdh.dict_type != MG_COMPLETE_DICTIONARY &&
575 (cdh.novel_method == MG_NOVEL_DELTA ||
576 cdh.novel_method == MG_NOVEL_HYBRID)) {
577 if (write_aux_dict (fileName) == COMPERROR) return COMPERROR;
578 estimate_compressed_aux_dict ();
579
580 } else {
581 unlink (make_name (fileName, TEXT_DICT_AUX_SUFFIX, NULL));
582 }
583
584 return (COMPALLOK);
585}
Note: See TracBrowser for help on using the repository browser.