source: trunk/mgpp/text/text.pass2.cpp@ 3365

Last change on this file since 3365 was 3365, checked in by kjdon, 22 years ago

Initial revision

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.1 KB
Line 
1/**************************************************************************
2 *
3 * text.pass2.cpp -- Text compression (Pass 2)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23// need this to avoid bizarre compiler problems under VC++ 6.0
24#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
25# include <iostream>
26#endif
27
28#if defined (__WIN32__)
29# include <stdio.h>
30# define unlink _unlink
31#endif
32
33#include "sysfuncs.h"
34#include "memlib.h"
35#include "messages.h"
36#include "local_strings.h"
37#include "bitio_m_stdio.h"
38#include "huffman.h"
39#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
40#include "mg.h"
41#include "mg_files.h"
42#include "build.h"
43#include "words.h"
44#include "text.h"
45#include "hash.h"
46#include "locallib.h"
47#include "comp_dict.h"
48#include "FText.h"
49
50#define POOL_SIZE 1024*256
51
52
// Fixed-size bump-allocation arena for the length-prefixed strings of
// novel (out-of-dictionary) words.  Pools are chained into a list as
// each one fills up (see new_pool()).
struct char_pool {
  struct char_pool *next;    // next pool in the chain, NULL for the last one
  u_long left;               // bytes still unused in pool[]
  u_char *ptr;               // first free byte within pool[]
  u_char pool[POOL_SIZE];    // the backing storage itself
};
59
// One novel (out-of-dictionary) word together with the ordinal number it
// was assigned when first encountered during pass 2.
struct novel_hash_rec {
  u_long ordinal_num;    // 1-based order of first appearance (used for delta/hybrid coding)
  u_char *word;          // length-prefixed string, stored inside a char_pool
};
64
65
// initial (prime) number of slots in a novel-word hash table; the table
// doubles (to the next prime) once it becomes half full
#define INITIAL_HASH_SIZE 7927

// Open-addressing hash table of novel words for one word type
// (non-word or word).  Strings live in the chained char_pools.
struct novel_hash_table {
  novel_hash_rec *HashTable;        // the slot array (empty slot <=> word == NULL)
  u_long HashSize, HashUsed;        // slot count / occupied slots
  char_pool *first_pool;            // head of the string-pool chain (for later traversal)
  char_pool *pool;                  // current pool that new strings are carved from
  u_long next_num, binary_start;    // next ordinal to hand out; binary_start unused in this file — verify elsewhere
  novel_hash_rec **code_to_nhr;     // NOTE(review): only ever set to NULL here — presumably used by other passes; confirm
};
76
77
// the compressed text output file and the bit-level buffer attached to it
static FILE *text;
static stdio_bitio_buffer textOutBuf;
// bytes preceding the compressed text proper (magic number + header)
static u_long headLen;

// novel-word tables, one per word type: [0] = non-words, [1] = words
static novel_hash_table nht[2];

// per-word-type block boundaries used by the MG_NOVEL_HYBRID coding scheme
int blk_start[2][33], blk_end[2][33];

// 0 = non-word, 1 = word
unsigned char whichWordType = 0;

// maps a level tag name to the compressed-text positions of its documents
CompressTextInfoMap textInfoMap;
90
91
92static char_pool *new_pool (char_pool * pool) {
93 char_pool *p = (char_pool *) Xmalloc (sizeof (char_pool));
94 if (!p) FatalError (1, "Unable to allocate memory for pool");
95
96 if (pool) pool->next = p;
97 p->next = NULL;
98 p->left = POOL_SIZE;
99 p->ptr = p->pool;
100 return p;
101}
102
103
104int init_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
105 char path[512];
106
107 // load the compression dictionary
108 if (LoadCompressionDictionary (make_name (fileName, TEXT_DICT_SUFFIX,
109 path)) == COMPERROR)
110 return COMPERROR;
111
112 // create the compressed text file and attach it to the text buffer
113 if (!(text = create_file (fileName, TEXT_SUFFIX, "w+b",
114 MAGIC_TEXT, MG_MESSAGE)))
115 return COMPERROR;
116 textOutBuf.attachFile (text);
117 textOutBuf.encodeStart ();
118
119 // write out the compressed text header
120 memset (&cth, '\0', sizeof (cth));
121 if (fwrite (&cth, sizeof (cth), 1, text) != 1)
122 return COMPERROR;
123
124 // the length of the compressed header is the size of the
125 // magic number plus the size of the compressed text header
126 headLen = sizeof (u_long) + sizeof (cth);
127
128 // start off with non-word
129 whichWordType = 0;
130
131 // clear the text pointers
132 textInfoMap.erase (textInfoMap.begin(), textInfoMap.end());
133
134 // set up the hash table of novel words
135 int i;
136 if (cdh.novel_method != MG_NOVEL_HUFFMAN_CHARS)
137 for (i = 0; i <= 1; i++)
138 {
139 nht[i].HashSize = INITIAL_HASH_SIZE;
140 nht[i].HashTable = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * nht[i].HashSize);
141 memset (nht[i].HashTable, '\0',
142 sizeof (novel_hash_rec) * nht[i].HashSize);
143 nht[i].HashUsed = 0;
144 nht[i].HashSize = INITIAL_HASH_SIZE;
145 nht[i].pool = nht[i].first_pool = new_pool (NULL);
146 nht[i].next_num = 1;
147 nht[i].binary_start = 1;
148 nht[i].code_to_nhr = NULL;
149 if (cdh.novel_method == MG_NOVEL_HYBRID)
150 {
151 int num;
152 num = 1;
153 blk_start[i][0] = 0;
154 blk_end[i][0] = cdh.num_words[i] - 1;
155 while (num < 33)
156 {
157 blk_start[i][num] = blk_end[i][num - 1] + 1;
158 blk_end[i][num] = blk_start[i][num] +
159 (blk_end[i][num - 1] - blk_start[i][num - 1]) * 2;
160 num++;
161 }
162 }
163 }
164
165 return (COMPALLOK);
166}
167
168
// Compress the byte range s_in[0 .. l_in-1] into `buffer', alternating
// between non-word and word tokens.  `which' carries the token-type state
// across calls (0 = non-word, 1 = word) and on return holds the type of
// the NEXT expected token; `numWords' is incremented once per word token.
// Dictionary words are emitted as huffman codes; out-of-dictionary words
// are handled according to cdh.dict_type / cdh.novel_method.
// Returns COMPALLOK, or COMPERROR when a word is missing from a
// MG_COMPLETE_DICTIONARY.
static int compress_text (bitio_buffer &buffer, const u_char *s_in,
			  int l_in, unsigned char &which,
			  unsigned long &numWords) {
  const u_char *end = s_in + l_in - 1;

  for (; s_in <= end; which = !which) {
    u_char Word[MAXWORDLEN + 1];  // length-prefixed token: Word[0] = length
    int res;

    if (which) numWords++;

    /* First parse a word or non-word out of the string */
    if (which) PARSE_WORD (Word, s_in, end);
    else PARSE_NON_WORD (Word, s_in, end);

    /* Search the hash table for Word */
    if (ht[which]) {
      // open addressing with a double-hashing probe step
      register unsigned long hashval, step;
      register int tsize = ht[which]->size;
      register u_char **wptr;
      HASH (hashval, step, Word, tsize);
      for (;;) {
	register u_char *s1;
	register u_char *s2;
	register int len;
	wptr = ht[which]->table[hashval];
	if (wptr == NULL) {
	  // empty slot: the word is not in the dictionary
	  res = COMPERROR;
	  break;
	}

	/* Compare the words (length byte plus characters) */
	s1 = Word;
	s2 = *wptr;
	len = *s1 + 1;
	for (; len; len--)
	  if (*s1++ != *s2++) break;

	if (len) {
	  // mismatch: advance to the next probe slot (wrapping around)
	  hashval += step;
	  if (hashval >= (unsigned long)tsize) hashval -= tsize;

	} else {
	  // match: the dictionary code is the word's index in the table
	  res = ht[which]->table[hashval] - ht[which]->words;
	  break;
	}
      }
    }
    else
      res = COMPERROR;
    /* Check that the word was found in the dictionary */
    if (res == COMPERROR)
      {
	if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
	  {
	    // a complete dictionary must contain every word: hard error
	    Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	    return (COMPERROR);
	  }
	if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
	  {
	    // emit the escape code (last huffman code), then spell the
	    // word out with the length and character models
	    u_long i;
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    buffer.huff_encode (Word[0], lens_codes[which], lens_huff[which].clens, NULL);
	    for (i = 0; i < Word[0]; i++)
	      buffer.huff_encode (Word[i + 1], char_codes[which],
				  char_huff[which].clens, NULL);
	  }
	if (cdh.dict_type == MG_SEED_DICTIONARY)
	  {
	    // emit the escape code, then encode the novel word according
	    // to the configured novel method
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    switch (cdh.novel_method)
	      {
	      case MG_NOVEL_HUFFMAN_CHARS:
		{
		  // spell the word character by character, as for a
		  // partial dictionary
		  u_long i;
		  buffer.huff_encode (Word[0], lens_codes[which],
				      lens_huff[which].clens, NULL);
		  for (i = 0; i < Word[0]; i++)
		    buffer.huff_encode (Word[i + 1], char_codes[which],
					char_huff[which].clens, NULL);
		}
		break;
	      case MG_NOVEL_DELTA:
	      case MG_NOVEL_HYBRID:
		{
		  // look the word up in (or add it to) the novel-word
		  // hash table, then encode its ordinal number
		  register unsigned long hashval, step;
		  register novel_hash_table *h = &nht[which];
		  register int hsize = h->HashSize;
		  register novel_hash_rec *ent;
		  HASH (hashval, step, Word, hsize);
		  for (;;)
		    {
		      register u_char *s1, *s2;
		      register int len;
		      ent = h->HashTable + hashval;
		      if (!ent->word)
			{
			  // empty slot: first sighting of this word --
			  // copy it into the string pool and assign the
			  // next ordinal (note: this `len' intentionally
			  // shadows the outer one)
			  int len = *Word + 1;
			  if ((unsigned)len > h->pool->left)
			    h->pool = new_pool (h->pool);
			  ent->word = h->pool->ptr;
			  ent->ordinal_num = h->next_num++;
			  memcpy (h->pool->ptr, Word, len);
			  h->pool->ptr += len;
			  h->pool->left -= len;
			  h->HashUsed++;
			  break;
			}
		      /* Compare the words */
		      s1 = Word;
		      s2 = ent->word;
		      len = *s1 + 1;
		      for (; len; len--)
			if (*s1++ != *s2++)
			  break;

		      if (!len)
			break;

		      hashval = (hashval + step);
		      if (hashval >= (unsigned)hsize) hashval -= hsize;
		    }

		  switch (cdh.novel_method) {
		  case MG_NOVEL_DELTA:
		    buffer.delta_encode (ent->ordinal_num, NULL);
		    break;
		  case MG_NOVEL_HYBRID:
		    // hybrid coding: gamma-encode the block number, then
		    // binary-encode the offset within that block
		    int k = 0;
		    int j = ent->ordinal_num - 1;
		    while (j > blk_end[which][k])
		      k++;
		    assert (j - blk_start[which][k] + 1 >= 1 &&
			    j - blk_start[which][k] + 1 <=
			    blk_end[which][k] - blk_start[which][k] + 1);

		    buffer.gamma_encode (k + 1, NULL);
		    buffer.binary_encode (j - blk_start[which][k] + 1,
					  blk_end[which][k] - blk_start[which][k] + 1,
					  NULL);
		    break;
		  }

		  // grow the table (next prime past double) once it is
		  // half full, rehashing every stored entry
		  if (h->HashUsed >= h->HashSize >> 1)
		    {
		      novel_hash_rec *ht;
		      unsigned long size;
		      unsigned long i;
		      size = prime (h->HashSize * 2);
		      if (!(ht = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * size)))
			{
			  Message ("Unable to allocate memory for table");
			  return (COMPERROR);
			}
		      memset (ht, '\0', sizeof (novel_hash_rec) * size);

		      for (i = 0; i < h->HashSize; i++)
			if (h->HashTable[i].word)
			  {
			    register u_char *wptr;
			    register unsigned long hashval, step;

			    wptr = h->HashTable[i].word;
			    HASH (hashval, step, wptr, size);
			    // linear re-probe until a free slot is found
			    wptr = (ht + hashval)->word;
			    while (wptr)
			      {
				hashval += step;
				if (hashval >= size)
				  hashval -= size;
				wptr = (ht + hashval)->word;
			      }
			    ht[hashval] = h->HashTable[i];
			  }
		      Xfree (h->HashTable);
		      h->HashTable = ht;
		      h->HashSize = size;
		    }
		}
		break;
	      }
	  }
      }
    else
      {
	// the word is in the dictionary: emit its huffman code
	buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
      }
  }

  return COMPALLOK;
}
368
369
// Compress one document (an array of text elements) into the output
// buffer, recording for every level tag where its documents start and
// end in the compressed text (byte position + bit within that byte +
// word-type phase).  Returns COMPALLOK or COMPERROR.
int process_text_2 (const TagInfo &tagInfo, const TextElArray &doc) {
  // we need to remember where the last text element ends
  unsigned char endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
  unsigned long endPos = textOutBuf.GetBytesWritten () + headLen;

  // process each text element
  TextElArray::const_iterator here = doc.begin();
  TextElArray::const_iterator end = doc.end();
  while (here != end) {
    // remember current place in compressed text (before this element)
    unsigned char startWhich = whichWordType;
    unsigned char startBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
    unsigned long startPos = textOutBuf.GetBytesWritten () + headLen;

    // compress the text
    if (compress_text (textOutBuf,
		       &(here->text[0]),
		       (*here).text.size(),
		       whichWordType,
		       cth.num_of_words) != COMPALLOK)
      return COMPERROR;

    endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
    endPos = textOutBuf.GetBytesWritten () + headLen;

    // update the pointers if this is a level tag
    if (tagInfo.levelTags.find ((*here).tagName) != tagInfo.levelTags.end()) {
      if ((*here).elType == OpenTagE) {
	// set the start of the document
	textInfoMap[(*here).tagName].SetStart (startPos, startBit, startWhich);

      } else if ((*here).elType == CloseTagE) {
	// set the end of the document
	textInfoMap[(*here).tagName].SetEnd (endPos, endBit);

      } else {
	// TextE, nothing needs to be done
      }
    }

    cth.num_of_bytes += (*here).text.size();
    here++;
  }

  // close off any unclosed tags at the final end position
  CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
  CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
  while (tiHere != tiEnd) {
    if ((*tiHere).second.inDoc) (*tiHere).second.SetEnd (endPos, endBit);
    tiHere++;
  }

  // we've processed one more document
  cth.num_of_docs++;

  return COMPALLOK;
}
427
428
429static int write_aux_dict (char *fileName) {
430 int i;
431 FILE *aux;
432 if (!(aux = create_file (fileName, TEXT_DICT_AUX_SUFFIX, "wb",
433 MAGIC_AUX_DICT, MG_MESSAGE)))
434 return COMPERROR;
435
436 for (i = 0; i <= 1; i++)
437 {
438 aux_frags_header afh;
439 char_pool *cp;
440
441 afh.num_frags = nht[i].HashUsed;
442 afh.mem_for_frags = 0;
443 for (cp = nht[i].first_pool; cp; cp = cp->next)
444 afh.mem_for_frags += POOL_SIZE - cp->left;
445
446 /* [RPAP - Jan 97: Endian Ordering] */
447 HTONUL(afh.num_frags);
448 HTONUL(afh.mem_for_frags);
449
450 fwrite (&afh, sizeof (afh), 1, aux);
451
452 for (cp = nht[i].first_pool; cp; cp = cp->next)
453 fwrite (cp->pool, POOL_SIZE - cp->left, sizeof (u_char), aux);
454 }
455 fclose (aux);
456 return COMPALLOK;
457}
458
459
// Estimate how large the auxiliary dictionary would be if its length
// bytes and characters were huffman compressed, by building frequency
// tables over the pooled novel-word strings.
// NOTE(review): both aux_compressed and total_uncomp are computed but
// never stored, returned or printed — presumably leftover from a stats
// report in the original MG code; confirm before relying on this.
static void estimate_compressed_aux_dict (void) {
  int i;
  u_long aux_compressed = 0, total_uncomp = 0;
  for (i = 0; i <= 1; i++)
    {
      int j;
      long chars[256], fchars[256];
      long lens[16], flens[16];
      char_pool *cp;
      memset (chars, '\0', sizeof (chars));
      memset (lens, '\0', sizeof (lens));
      // walk every length-prefixed entry in every pool, tallying
      // length-byte and character frequencies
      for (cp = nht[i].first_pool; cp; cp = cp->next)
	{
	  u_char *buf = cp->pool;
	  while (buf != cp->ptr)
	    {
	      int len = *buf++;
	      lens[len]++;
	      total_uncomp += len + 4;
	      for (; len; len--)
		chars[*buf++]++;
	    }
	}
      // smooth zero frequencies so every symbol that could occur for
      // this word type gets a code (frequency 1)
      for (j = 0; j < 256; j++)
	if (!chars[j] && PESINAWORD (j) == i)
	  fchars[j] = 1;
	else
	  fchars[j] = chars[j];
      for (j = 0; j < 16; j++)
	if (!lens[j])
	  flens[j] = 1;
	else
	  flens[j] = lens[j];

      // huffman sizes are in bits; divide by 8 for bytes
      aux_compressed += (long unsigned int) ((Calculate_Huffman_Size (16, flens, lens) +
					      Calculate_Huffman_Size (256, fchars, chars)) / 8);

    }
}
499
500
501
502static bool WriteTextIdx (char *fileName, FTextLevel &textLevels) {
503 // create the text index
504 FILE *textIdxFile;
505 if (!(textIdxFile = create_file (fileName, TEXT_IDX_SUFFIX, "w+b",
506 MAGIC_TEXI, MG_MESSAGE)))
507 return false;
508
509 // output each level
510 CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
511 CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
512 while (tiHere != tiEnd) {
513 // remember the level information
514 TextLevelInfo levelInfo;
515 levelInfo.levelTag = (*tiHere).first;
516 levelInfo.textIdxPtr = ftell (textIdxFile);
517 levelInfo.numEntries = (*tiHere).second.docPtrs.size();
518 textLevels.levelInfo[levelInfo.levelTag] = levelInfo;
519
520 // write out the index array for this level
521 if (!WriteTextIdxArray (textIdxFile, (*tiHere).second.docPtrs))
522 return false;
523
524 tiHere++;
525 }
526
527 // close the file
528 fclose (textIdxFile);
529
530 return true;
531}
532
533
534bool WriteTextLevels (char *fileName, const FTextLevel &textLevels) {
535 // cout << textLevels;
536
537 // create the text index
538 FILE *textLevelFile;
539 if (!(textLevelFile = create_file (fileName, TEXT_LEVEL_SUFFIX, "w+b",
540 MAGIC_TEXT_LEVELS, MG_MESSAGE)))
541 return false;
542
543 // output each level
544 if (!textLevels.Write (textLevelFile)) return false;
545
546 // close the file
547 fclose (textLevelFile);
548
549 return true;
550}
551
552
// Finish pass 2: flush the bit buffer, rewrite the compressed text
// header with the final statistics, and write the text index, level
// information and (when applicable) auxiliary dictionary files.
// Returns COMPALLOK or COMPERROR.
int done_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
  // write out any remaining bits
  textOutBuf.encodeDone();

  // write out the compressed text header (with updated statistics);
  // seek past the magic number written by create_file in init_text_2
  if (fseek (text, sizeof (u_long), SEEK_SET) == -1 || !cth.Write (text))
    return COMPERROR;
  fclose (text);

  // write out the text index file and get the level information
  FTextLevel textLevels;
  if (!WriteTextIdx (fileName, textLevels)) return COMPERROR;

  // write out the text level information
  if (!WriteTextLevels (fileName, textLevels)) return COMPERROR;

  // write out the auxiliary dictionary (only needed when novel words
  // are being assigned ordinals rather than spelled out)
  if (cdh.dict_type != MG_COMPLETE_DICTIONARY &&
      (cdh.novel_method == MG_NOVEL_DELTA ||
       cdh.novel_method == MG_NOVEL_HYBRID)) {
    if (write_aux_dict (fileName) == COMPERROR) return COMPERROR;
    estimate_compressed_aux_dict ();

  } else {
    // no aux dictionary wanted: remove any stale one from a prior run
    unlink (make_name (fileName, TEXT_DICT_AUX_SUFFIX, NULL));
  }

  return (COMPALLOK);
}
Note: See TracBrowser for help on using the repository browser.