source: trunk/gsdl/src/mgpp/text/text.pass2.cpp@ 2554

Last change on this file since 2554 was 2541, checked in by jrm21, 23 years ago

1) replaced obsolete bzero() calls with memset() calls.
2) Makefile.in fixed for C++ variables (CXX instead of CPP), and GSDLOS is now set from configure, so files are installed in the right place.

3) text.pass1.cpp now prints an error message if index file can't be created.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.1 KB
Line 
1/**************************************************************************
2 *
3 * text.pass2.cpp -- Text compression (Pass 2)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23// need this to avoid bizarre compiler problems under VC++ 6.0
24#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
25# include <iostream>
26#endif
27
28#if defined (__WIN32__)
29# include <stdio.h>
30# define unlink _unlink
31#endif
32
33#include "sysfuncs.h"
34#include "memlib.h"
35#include "messages.h"
36#include "local_strings.h"
37#include "bitio_m_stdio.h"
38#include "huffman.h"
39#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
40#include "mg.h"
41#include "mg_files.h"
42#include "build.h"
43#include "words.h"
44#include "text.h"
45#include "hash.h"
46#include "locallib.h"
47#include "comp_dict.h"
48#include "FText.h"
49
50#define POOL_SIZE 1024*256
51
52
53struct char_pool {
54 struct char_pool *next;
55 u_long left;
56 u_char *ptr;
57 u_char pool[POOL_SIZE];
58};
59
60struct novel_hash_rec {
61 u_long ordinal_num;
62 u_char *word;
63};
64
65
66#define INITIAL_HASH_SIZE 7927
67
68struct novel_hash_table {
69 novel_hash_rec *HashTable;
70 u_long HashSize, HashUsed;
71 char_pool *first_pool;
72 char_pool *pool;
73 u_long next_num, binary_start;
74 novel_hash_rec **code_to_nhr;
75};
76
77
78static FILE *text;
79static stdio_bitio_buffer textOutBuf;
80static u_long headLen;
81
82static novel_hash_table nht[2];
83
84int blk_start[2][33], blk_end[2][33];
85
86// 0 = non-word, 1 = word
87unsigned char whichWordType = 0;
88
89CompressTextInfoMap textInfoMap;
90
91
92static char_pool *new_pool (char_pool * pool) {
93 char_pool *p = (char_pool *) Xmalloc (sizeof (char_pool));
94 if (!p) FatalError (1, "Unable to allocate memory for pool");
95
96 if (pool) pool->next = p;
97 p->next = NULL;
98 p->left = POOL_SIZE;
99 p->ptr = p->pool;
100 return p;
101}
102
103
104int init_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
105 char path[512];
106
107 // load the compression dictionary
108 if (LoadCompressionDictionary (make_name (fileName, TEXT_DICT_SUFFIX,
109 path)) == COMPERROR)
110 return COMPERROR;
111
112 // create the compressed text file and attach it to the text buffer
113 if (!(text = create_file (fileName, TEXT_SUFFIX, "w+b",
114 MAGIC_TEXT, MG_MESSAGE)))
115 return COMPERROR;
116 textOutBuf.attachFile (text);
117 textOutBuf.encodeStart ();
118
119 // write out the compressed text header
120 memset (&cth, '\0', sizeof (cth));
121 if (fwrite (&cth, sizeof (cth), 1, text) != 1)
122 return COMPERROR;
123
124 // the length of the compressed header is the size of the
125 // magic number plus the size of the compressed text header
126 headLen = sizeof (u_long) + sizeof (cth);
127
128 // start off with non-word
129 whichWordType = 0;
130
131 // clear the text pointers
132 textInfoMap.erase (textInfoMap.begin(), textInfoMap.end());
133
134 // set up the hash table of novel words
135 int i;
136 if (cdh.novel_method != MG_NOVEL_HUFFMAN_CHARS)
137 for (i = 0; i <= 1; i++)
138 {
139 nht[i].HashSize = INITIAL_HASH_SIZE;
140 nht[i].HashTable = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * nht[i].HashSize);
141 memset (nht[i].HashTable, '\0',
142 sizeof (novel_hash_rec) * nht[i].HashSize);
143 nht[i].HashUsed = 0;
144 nht[i].HashSize = INITIAL_HASH_SIZE;
145 nht[i].pool = nht[i].first_pool = new_pool (NULL);
146 nht[i].next_num = 1;
147 nht[i].binary_start = 1;
148 nht[i].code_to_nhr = NULL;
149 if (cdh.novel_method == MG_NOVEL_HYBRID)
150 {
151 int num;
152 num = 1;
153 blk_start[i][0] = 0;
154 blk_end[i][0] = cdh.num_words[i] - 1;
155 while (num < 33)
156 {
157 blk_start[i][num] = blk_end[i][num - 1] + 1;
158 blk_end[i][num] = blk_start[i][num] +
159 (blk_end[i][num - 1] - blk_start[i][num - 1]) * 2;
160 num++;
161 }
162 }
163 }
164
165 return (COMPALLOK);
166}
167
168
// Compress one run of raw text (l_in bytes starting at s_in) into the
// output bit buffer, alternating between non-word (which == 0) and word
// (which == 1) tokens.  `which` carries parser state across calls so a
// run split over several text elements stays in sync; `numWords` is
// incremented for every word token encoded.
// Returns COMPALLOK, or COMPERROR when an unknown word is met and the
// dictionary type is MG_COMPLETE_DICTIONARY (or on rehash allocation
// failure).
static int compress_text (bitio_buffer &buffer, const u_char *s_in,
			  int l_in, unsigned char &which,
			  unsigned long &numWords) {
  const u_char *end = s_in + l_in - 1;

  // each iteration consumes one token and flips word/non-word mode
  for (; s_in <= end; which = !which) {
    u_char Word[MAXWORDLEN + 1];   // length-prefixed: Word[0] = length
    int res;

    if (which) numWords++;

    /* First parse a word or non-word out of the string */
    if (which) PARSE_WORD (Word, s_in, end);
    else PARSE_NON_WORD (Word, s_in, end);

    /* Search the hash table for Word */
    if (ht[which]) {
      register unsigned long hashval, step;
      register int tsize = ht[which]->size;
      register u_char **wptr;
      HASH (hashval, step, Word, tsize);
      // open-addressing probe: advance by `step` until the word or an
      // empty slot is found
      for (;;) {
	register u_char *s1;
	register u_char *s2;
	register int len;
	wptr = ht[which]->table[hashval];
	if (wptr == NULL) {
	  // empty slot -- word is not in the compression dictionary
	  res = COMPERROR;
	  break;
	}

	/* Compare the words */
	s1 = Word;
	s2 = *wptr;
	len = *s1 + 1;   // compare the length byte plus the characters
	for (; len; len--)
	  if (*s1++ != *s2++) break;

	if (len) {
	  // mismatch -- move to the next probe slot (wrap mod tsize)
	  hashval += step;
	  if (hashval >= (unsigned long)tsize) hashval -= tsize;

	} else {
	  // found: res is the word's index in the dictionary word array
	  res = ht[which]->table[hashval] - ht[which]->words;
	  break;
	}
      }
    }
    else
      res = COMPERROR;
    /* Check that the word was found in the dictionary */
    if (res == COMPERROR)
      {
	if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
	  {
	    Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	    return (COMPERROR);
	  }
	if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
	  {
	    // emit the escape code, then spell the word out using the
	    // length and character Huffman models
	    u_long i;
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;   // escape symbol
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    buffer.huff_encode (Word[0], lens_codes[which], lens_huff[which].clens, NULL);
	    for (i = 0; i < Word[0]; i++)
	      buffer.huff_encode (Word[i + 1], char_codes[which],
				  char_huff[which].clens, NULL);
	  }
	if (cdh.dict_type == MG_SEED_DICTIONARY)
	  {
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;   // escape symbol
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    switch (cdh.novel_method)
	      {
	      case MG_NOVEL_HUFFMAN_CHARS:
		{
		  // spell the novel word out character by character
		  u_long i;
		  buffer.huff_encode (Word[0], lens_codes[which],
				      lens_huff[which].clens, NULL);
		  for (i = 0; i < Word[0]; i++)
		    buffer.huff_encode (Word[i + 1], char_codes[which],
					char_huff[which].clens, NULL);
		}
		break;
	      case MG_NOVEL_DELTA:
	      case MG_NOVEL_HYBRID:
		{
		  // look the word up in (or insert it into) the
		  // novel-word hash table; each novel word is assigned
		  // an ordinal number on first sight
		  register unsigned long hashval, step;
		  register novel_hash_table *h = &nht[which];
		  register int hsize = h->HashSize;
		  register novel_hash_rec *ent;
		  HASH (hashval, step, Word, hsize);
		  for (;;)
		    {
		      register u_char *s1, *s2;
		      register int len;
		      ent = h->HashTable + hashval;
		      if (!ent->word)
			{
			  // empty slot: insert the new word, copying
			  // its length-prefixed bytes into the pool
			  int len = *Word + 1;
			  if ((unsigned)len > h->pool->left)
			    h->pool = new_pool (h->pool);
			  ent->word = h->pool->ptr;
			  ent->ordinal_num = h->next_num++;
			  memcpy (h->pool->ptr, Word, len);
			  h->pool->ptr += len;
			  h->pool->left -= len;
			  h->HashUsed++;
			  break;
			}
		      /* Compare the words */
		      s1 = Word;
		      s2 = ent->word;
		      len = *s1 + 1;
		      for (; len; len--)
			if (*s1++ != *s2++)
			  break;

		      if (!len)
			break;   // found an existing entry

		      hashval = (hashval + step);
		      if (hashval >= (unsigned)hsize) hashval -= hsize;
		    }

		  // encode the ordinal number of the novel word
		  switch (cdh.novel_method) {
		  case MG_NOVEL_DELTA:
		    buffer.delta_encode (ent->ordinal_num, NULL);
		    break;
		  case MG_NOVEL_HYBRID:
		    // find the block k containing ordinal j, then emit
		    // k+1 as a gamma code and the in-block offset in
		    // binary, bounded by the block size
		    int k = 0;
		    int j = ent->ordinal_num - 1;
		    while (j > blk_end[which][k])
		      k++;
		    assert (j - blk_start[which][k] + 1 >= 1 &&
			    j - blk_start[which][k] + 1 <=
			    blk_end[which][k] - blk_start[which][k] + 1);

		    buffer.gamma_encode (k + 1, NULL);
		    buffer.binary_encode (j - blk_start[which][k] + 1,
					  blk_end[which][k] - blk_start[which][k] + 1,
					  NULL);
		    break;
		  }

		  // grow the hash table once it reaches half full
		  if (h->HashUsed >= h->HashSize >> 1)
		    {
		      novel_hash_rec *ht;
		      unsigned long size;
		      unsigned long i;
		      size = prime (h->HashSize * 2);
		      if (!(ht = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * size)))
			{
			  Message ("Unable to allocate memory for table");
			  return (COMPERROR);
			}
		      memset (ht, '\0', sizeof (novel_hash_rec) * size);

		      // re-insert every existing entry into the larger
		      // table (the pool strings themselves stay put)
		      for (i = 0; i < h->HashSize; i++)
			if (h->HashTable[i].word)
			  {
			    register u_char *wptr;
			    register unsigned long hashval, step;

			    wptr = h->HashTable[i].word;
			    HASH (hashval, step, wptr, size);
			    wptr = (ht + hashval)->word;
			    while (wptr)
			      {
				hashval += step;
				if (hashval >= size)
				  hashval -= size;
				wptr = (ht + hashval)->word;
			      }
			    ht[hashval] = h->HashTable[i];
			  }
		      Xfree (h->HashTable);
		      h->HashTable = ht;
		      h->HashSize = size;
		    }
		}
		break;
	      }
	  }
      }
    else
      {
	// word found in the dictionary: emit its Huffman code
	buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
      }
  }

  return COMPALLOK;
}
368
369
// Compress every text element of one document, recording for each
// level tag where the document's compressed text starts and ends
// (byte position, bit within the byte, and word/non-word parser state),
// and updating the global statistics in cth.
// Returns COMPALLOK on success, COMPERROR if compression fails.
int process_text_2 (const TagInfo &tagInfo, const TextElArray &doc) {
  // we need to remember where the last text element ends
  unsigned char endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
  unsigned long endPos = textOutBuf.GetBytesWritten () + headLen;

  // process each text element
  TextElArray::const_iterator here = doc.begin();
  TextElArray::const_iterator end = doc.end();
  while (here != end) {
    // remember current place in compressed text
    unsigned char startWhich = whichWordType;
    unsigned char startBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
    unsigned long startPos = textOutBuf.GetBytesWritten () + headLen;

    // compress the text (whichWordType is updated in place so the
    // word/non-word alternation continues across elements)
    if (compress_text (textOutBuf,
		       (*here).text.begin(),
		       (*here).text.size(),
		       whichWordType,
		       cth.num_of_words) != COMPALLOK)
      return COMPERROR;

    endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
    endPos = textOutBuf.GetBytesWritten () + headLen;

    // update the pointers if this is a level tag
    if (tagInfo.levelTags.find ((*here).tagName) != tagInfo.levelTags.end()) {
      if ((*here).elType == OpenTagE) {
	// set the start of the document
	textInfoMap[(*here).tagName].SetStart (startPos, startBit, startWhich);

      } else if ((*here).elType == CloseTagE) {
	// set the end of the document
	textInfoMap[(*here).tagName].SetEnd (endPos, endBit);

      } else {
	// TextE, nothing needs to be done
      }
    }

    cth.num_of_bytes += (*here).text.size();
    here++;
  }

  // close off any unclosed tags: any level still marked as inside a
  // document gets the final end position of this document
  CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
  CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
  while (tiHere != tiEnd) {
    if ((*tiHere).second.inDoc) (*tiHere).second.SetEnd (endPos, endBit);
    tiHere++;
  }

  // we've processed one more document
  cth.num_of_docs++;

  return COMPALLOK;
}
427
428
429static int write_aux_dict (char *fileName) {
430 int i;
431 FILE *aux;
432 if (!(aux = create_file (fileName, TEXT_DICT_AUX_SUFFIX, "wb",
433 MAGIC_AUX_DICT, MG_MESSAGE)))
434 return COMPERROR;
435
436 for (i = 0; i <= 1; i++)
437 {
438 aux_frags_header afh;
439 char_pool *cp;
440
441 afh.num_frags = nht[i].HashUsed;
442 afh.mem_for_frags = 0;
443 for (cp = nht[i].first_pool; cp; cp = cp->next)
444 afh.mem_for_frags += POOL_SIZE - cp->left;
445
446 /* [RPAP - Jan 97: Endian Ordering] */
447 HTONUL(afh.num_frags);
448 HTONUL(afh.mem_for_frags);
449
450 fwrite (&afh, sizeof (afh), 1, aux);
451
452 for (cp = nht[i].first_pool; cp; cp = cp->next)
453 fwrite (cp->pool, POOL_SIZE - cp->left, sizeof (u_char), aux);
454 }
455 fclose (aux);
456 return COMPALLOK;
457}
458
459
// Estimate the Huffman-compressed size of the auxiliary dictionary.
// Walks the novel-word pools for both word types, gathering character
// and word-length frequencies, then sizes a Huffman coding of each.
// NOTE(review): both accumulators (aux_compressed, total_uncomp) are
// computed and then discarded -- the function returns void and has no
// other side effect in this file.  Presumably reporting code existed
// here once; confirm before relying on this function for anything.
static void estimate_compressed_aux_dict (void) {
  int i;
  u_long aux_compressed = 0, total_uncomp = 0;
  for (i = 0; i <= 1; i++)
    {
      int j;
      long chars[256], fchars[256];   // raw / smoothed character counts
      long lens[16], flens[16];       // raw / smoothed length counts
      char_pool *cp;
      memset (chars, '\0', sizeof (chars));
      memset (lens, '\0', sizeof (lens));
      // scan every length-prefixed fragment stored in the pools
      for (cp = nht[i].first_pool; cp; cp = cp->next)
	{
	  u_char *buf = cp->pool;
	  while (buf != cp->ptr)
	    {
	      int len = *buf++;
	      lens[len]++;   // assumes fragment length < 16 -- TODO confirm bound
	      total_uncomp += len + 4;   // +4 overhead per fragment; origin unclear -- verify
	      for (; len; len--)
		chars[*buf++]++;
	    }
	}
      // give every symbol that could legitimately occur a nonzero
      // frequency so the Huffman size calculation is well defined
      for (j = 0; j < 256; j++)
	if (!chars[j] && PESINAWORD (j) == i)
	  fchars[j] = 1;
	else
	  fchars[j] = chars[j];
      for (j = 0; j < 16; j++)
	if (!lens[j])
	  flens[j] = 1;
	else
	  flens[j] = lens[j];

      // model built from smoothed counts, sized over the real counts;
      // divide by 8 to convert bits to bytes
      aux_compressed += (long unsigned int) ((Calculate_Huffman_Size (16, flens, lens) +
					     Calculate_Huffman_Size (256, fchars, chars)) / 8);

    }
}
499
500
501
502static bool WriteTextIdx (char *fileName, FTextLevel &textLevels) {
503 // create the text index
504 FILE *textIdxFile;
505 if (!(textIdxFile = create_file (fileName, TEXT_IDX_SUFFIX, "w+b",
506 MAGIC_TEXI, MG_MESSAGE)))
507 return false;
508
509 // output each level
510 CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
511 CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
512 while (tiHere != tiEnd) {
513 // remember the level information
514 TextLevelInfo levelInfo;
515 levelInfo.levelTag = (*tiHere).first;
516 levelInfo.textIdxPtr = ftell (textIdxFile);
517 levelInfo.numEntries = (*tiHere).second.docPtrs.size();
518 textLevels.levelInfo[levelInfo.levelTag] = levelInfo;
519
520 // write out the index array for this level
521 if (!WriteTextIdxArray (textIdxFile, (*tiHere).second.docPtrs))
522 return false;
523
524 tiHere++;
525 }
526
527 // close the file
528 fclose (textIdxFile);
529
530 return true;
531}
532
533
534bool WriteTextLevels (char *fileName, const FTextLevel &textLevels) {
535 // cout << textLevels;
536
537 // create the text index
538 FILE *textLevelFile;
539 if (!(textLevelFile = create_file (fileName, TEXT_LEVEL_SUFFIX, "w+b",
540 MAGIC_TEXT_LEVELS, MG_MESSAGE)))
541 return false;
542
543 // output each level
544 if (!textLevels.Write (textLevelFile)) return false;
545
546 // close the file
547 fclose (textLevelFile);
548
549 return true;
550}
551
552
// Finish pass 2: flush the remaining bits of compressed text, rewrite
// the (now complete) text header, then write the text index, the level
// file and, when novel words were collected, the auxiliary dictionary.
// Returns COMPALLOK on success, COMPERROR on any failure.
int done_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
  // write out any remaining bits
  textOutBuf.encodeDone();

  // write out the compressed text header (with updated statistics);
  // seek past the leading magic number to overwrite the placeholder
  // header written by init_text_2
  if (fseek (text, sizeof (u_long), SEEK_SET) == -1 || !cth.Write (text))
    return COMPERROR;
  fclose (text);

  // write out the text index file and get the level information
  FTextLevel textLevels;
  if (!WriteTextIdx (fileName, textLevels)) return COMPERROR;

  // write out the text level information
  if (!WriteTextLevels (fileName, textLevels)) return COMPERROR;

  // write out the auxiliary dictionary (only kept for the novel-word
  // methods that assign ordinals rather than spelling words out)
  if (cdh.dict_type != MG_COMPLETE_DICTIONARY &&
      (cdh.novel_method == MG_NOVEL_DELTA ||
       cdh.novel_method == MG_NOVEL_HYBRID)) {
    if (write_aux_dict (fileName) == COMPERROR) return COMPERROR;
    estimate_compressed_aux_dict ();

  } else {
    // no aux dictionary applies -- remove any stale one on disk
    unlink (make_name (fileName, TEXT_DICT_AUX_SUFFIX, NULL));
  }

  return (COMPALLOK);
}
Note: See TracBrowser for help on using the repository browser.