source: trunk/gsdl/src/mgpp/text/TextGet.cpp@ 2928

Last change on this file since 2928 was 2928, checked in by jrm21, 22 years ago

replaced bzero and bcopy with memset and memcpy in the src, even though it was
already done in the headers, just to make the code a bit clearer.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.8 KB
Line 
1/**************************************************************************
2 *
3 * TextGet.cpp -- Decompressing the text
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22// need this to avoid bizarre compiler problems under VC++ 6.0
23#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
24# include <iostream>
25#endif
26
27#include "TextGet.h"
28#include "mg_files.h"
29#include "netorder.h"
30#include "mg_errors.h"
31#include "locallib.h"
32#include "words.h"
33#include "local_strings.h"
34#include "bitio_m_stdio.h"
35
// Selects which pair of Huffman tables Load_Comp_HuffData fills in:
// the word-length tables ("lengths") or the character tables ("chars").
// The original wrote `typedef enum huff_type {...};' -- a typedef with
// no declarator, which many compilers warn about; a plain enum
// declaration is exactly equivalent here (the code only ever uses the
// tag name `huff_type').
enum huff_type {lengths, chars};
38
/* Load the auxiliary (novel-word) dictionary from text_aux_dict.
 *
 * Reads two fragment tables (one per parity of the compressed text
 * stream), building for each an array of pointers into a contiguous
 * block of length-prefixed word fragments.  For MG_NOVEL_HYBRID
 * dictionaries it also precomputes the 33 block start/end boundaries
 * used by the hybrid binary/gamma decoder in GetDocText.
 *
 * Returns the newly allocated auxiliary_dict, or NULL with mg_errno set
 * to MG_NOMEM on allocation failure.
 *
 * NOTE(review): on the allocation-failure paths below, member arrays
 * allocated in earlier iterations (word_data[i] / words[i]) appear to
 * leak when `delete ad' runs -- unless auxiliary_dict's destructor
 * frees them, which is not visible from this file; confirm before
 * changing.  fread() results are also unchecked, so a truncated file
 * yields garbage rather than an error. */
static auxiliary_dict *LoadAuxDict (compression_dict &cd, FILE *text_aux_dict) {
  auxiliary_dict *ad;
  int i;

  if (!(ad = new auxiliary_dict))
    {
      mg_errno = MG_NOMEM;
      return (NULL);
    }

  /* zero everything so partially-filled dictionaries are detectable */
  memset (ad, '\0', sizeof (*ad));

  for (i = 0; i <= 1; i++)
    {
      int j;
      u_char *pos;

      /* per-parity fragment header: count and total byte size */
      fread (&ad->afh[i], sizeof (aux_frags_header), 1, text_aux_dict);

      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL(ad->afh[i].num_frags);
      NTOHUL(ad->afh[i].mem_for_frags);

      if (!(ad->word_data[i] = new u_char[ad->afh[i].mem_for_frags]))
        {
          mg_errno = MG_NOMEM;
          delete ad;
          return (NULL);
        }
      if (!(ad->words[i] = new u_char* [ad->afh[i].num_frags]))
        {
          mg_errno = MG_NOMEM;
          delete ad;
          return (NULL);
        }

      /* the fragments themselves: a packed sequence of
         <length byte><length bytes of text> records */
      fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char),
             text_aux_dict);

      /* index each fragment: first byte of a record is its length,
         so the next record starts *pos + 1 bytes further on */
      pos = ad->word_data[i];
      for (j = 0; j < (int)ad->afh[i].num_frags; j++)
        {
          ad->words[i][j] = pos;
          pos += *pos + 1;
        }
      if (cd.cdh.novel_method == MG_NOVEL_HYBRID)
        {
          /* hybrid coding partitions the novel words into 33 blocks
             whose sizes grow geometrically (each block is twice the
             span of the previous one, plus one) */
          int num;
          num = 1;
          ad->blk_start[i][0] = 0;
          ad->blk_end[i][0] = cd.cdh.num_words[i] - 1;
          while (num < 33)
            {
              ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1;
              ad->blk_end[i][num] = ad->blk_start[i][num] +
                (ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2;
              num++;
            }
        }
    }
  return (ad);
}
101
102
103static u_char ***ReadInWords (FILE *dict, compression_dict &cd,
104 comp_frags_header *cfh, u_char **escape) {
105 int i, lookback;
106 int ptrs_reqd = 0;
107 int mem_reqd = 0;
108 int num_set[MAX_HUFFCODE_LEN + 1];
109 u_char *next_word[MAX_HUFFCODE_LEN + 1];
110 u_char **vals;
111 u_char ***values;
112 u_char word[MAXWORDLEN + 1];
113 u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1];
114
115 lookback = cd.cdh.lookback;
116
117 for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++) {
118 ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
119 mem_reqd += cfh->huff_words_size[i];
120 }
121
122 if (!(vals = new u_char* [ptrs_reqd]))
123 return (NULL);
124
125 if (!(values = new u_char** [MAX_HUFFCODE_LEN + 1]))
126 return (NULL);
127
128 if (!(next_word[0] = new u_char[mem_reqd]))
129 return (NULL);
130
131 cd.MemForCompDict += ptrs_reqd * sizeof (*vals) +
132 (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) +
133 mem_reqd;
134
135 values[0] = vals;
136 values[0][0] = next_word[0];
137 for (i = 1; i <= cfh->hd.maxcodelen; i++)
138 {
139 int next_start = (values[i - 1] - vals) +
140 ((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback);
141 values[i] = &vals[next_start];
142 next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1];
143 values[i][0] = next_word[i];
144 }
145
146 memset (num_set, '\0', sizeof (num_set));
147
148 for (i = 0; i < cfh->hd.num_codes; i++)
149 {
150 register int val, copy;
151 register int len = cfh->hd.clens[i];
152 val = getc (dict);
153 copy = (val >> 4) & 0xf;
154 val &= 0xf;
155
156 fread (word + copy + 1, sizeof (u_char), val, dict);
157 *word = val + copy;
158
159 if ((num_set[len] & ((1 << lookback) - 1)) == 0)
160 {
161 values[len][num_set[len] >> lookback] = next_word[len];
162 memcpy (next_word[len], word, *word + 1);
163 if (escape && i == cfh->hd.num_codes - 1)
164 *escape = next_word[len];
165 next_word[len] += *word + 1;
166 }
167 else
168 {
169 copy = prefixlen (last_word[len], word);
170 memcpy (next_word[len] + 1, word + copy + 1, *word - copy);
171 *next_word[len] = (copy << 4) + (*word - copy);
172 if (escape && i == cfh->hd.num_codes - 1)
173 *escape = next_word[len];
174 next_word[len] += (*word - copy) + 1;
175 }
176 memcpy (last_word[len], word, *word + 1);
177 num_set[len]++;
178 }
179 if (cfh->hd.clens)
180 delete cfh->hd.clens;
181 cfh->hd.clens = NULL;
182 return values;
183}
184
185static int Load_Comp_HuffData(compression_dict &cd, int which, FILE *dict,
186 huff_type type) {
187 huff_data * hd;
188 u_long ** vals;
189
190 if (!(hd = new huff_data))
191 return 1;
192 cd.MemForCompDict += sizeof (huff_data);
193 if (Read_Huffman_Data (dict, hd, &cd.MemForCompDict, NULL) == -1)
194 return 2;
195 if (!(vals = Generate_Huffman_Vals (hd, &cd.MemForCompDict)))
196 return 3;
197 if (hd->clens)
198 delete hd->clens;
199 hd->clens = NULL;
200 if (type == chars)
201 {
202 cd.chars_huff[which] = hd;
203 cd.chars_vals[which] = vals;
204 }
205 else
206 {
207 cd.lens_huff[which] = hd;
208 cd.lens_vals[which] = vals;
209 }
210
211 return 0;
212}
213
214static int Load_Comp_FragsHeader(compression_dict &cd, int which, int getEscape,
215 FILE *dict) {
216 if (!(cd.cfh[which] = new comp_frags_header))
217 return 1;
218 cd.MemForCompDict += sizeof (*cd.cfh[which]);
219 if (Read_cfh (dict, cd.cfh[which], &cd.MemForCompDict, NULL) == -1)
220 return 2;
221
222 if (!(cd.values[which] = ReadInWords (dict, cd, cd.cfh[which],
223 getEscape == 0 ? NULL : &cd.escape[which])))
224 return 3;
225
226 return 0;
227}
228
229static bool LoadSlowCompDict (FILE *dict, FILE *aux_dict, compression_dict &cd) {
230 if (dict == NULL) return false;
231
232 int which;
233
234 memset (&cd, '\0', sizeof (compression_dict));
235
236 cd.MemForCompDict = sizeof (compression_dict);
237
238 if (Read_cdh (dict, &cd.cdh, &cd.MemForCompDict, NULL) == -1)
239 return false;
240
241 for (which = 0; which < 2; which++)
242 switch (cd.cdh.dict_type)
243 {
244 case MG_COMPLETE_DICTIONARY:
245 {
246 if (Load_Comp_FragsHeader(cd, which, 0, dict) != 0)
247 return false;
248 cd.escape[which] = NULL;
249
250 }
251 break;
252 case MG_PARTIAL_DICTIONARY:
253 {
254 if (cd.cdh.num_words[which])
255 {
256 if (Load_Comp_FragsHeader(cd, which, 1, dict) != 0)
257 return false;
258 }
259
260 if (Load_Comp_HuffData(cd, which, dict, chars) != 0)
261 return false;
262
263 if (Load_Comp_HuffData(cd, which, dict, lengths) != 0)
264 return false;
265 }
266 break;
267 case MG_SEED_DICTIONARY:
268 {
269 if (cd.cdh.num_words[which])
270 {
271 if (Load_Comp_FragsHeader(cd, which, 1, dict) != 0)
272 return false;
273 }
274 switch (cd.cdh.novel_method)
275 {
276 case MG_NOVEL_HUFFMAN_CHARS:
277 if (Load_Comp_HuffData(cd, which, dict, chars) != 0)
278 return false;
279
280 if (Load_Comp_HuffData(cd, which, dict, lengths) != 0)
281 return false;
282 break;
283 case MG_NOVEL_DELTA:
284 break;
285 case MG_NOVEL_HYBRID:
286 break;
287 }
288 break;
289 }
290 }
291
292 if (cd.cdh.novel_method == MG_NOVEL_DELTA ||
293 cd.cdh.novel_method == MG_NOVEL_HYBRID)
294 {
295 if (!aux_dict)
296 {
297 mg_errno = MG_NOFILE;
298 cd.Clear();
299 return false;
300 }
301
302 if (!(cd.ad = LoadAuxDict (cd, aux_dict)))
303 {
304 cd.Clear();
305 return false;
306 }
307 }
308
309 mg_errno = MG_NOERROR;
310
311 cd.fast_loaded = 0;
312
313 return true;
314}
315
316
317
/* Index of the pointer-sized slot containing `p' within the fast-dict
   image starting at `base'. */
#define WORDNO(p, base) ((((char*)(p))-((char*)(base)))/sizeof(u_char*))
/* Test the fixup bitmap: is the slot holding `p' a serialized pointer
   that must be rebased?  NOTE: relies on `fixup' and `cd' variables in
   the caller's scope (LoadFastCompDict). */
#define IS_FIXUP(p) ((fixup[WORDNO(p,cd)/8] & (1<<(WORDNO(p,cd) & 7))) != 0)
320
321// fast loading really needs to be totally re-writen. "Unloading" the
322// text data will currently cause a crash because memory is being
323// deleted multiple times (and probably a zillion other reasons).
324static bool LoadFastCompDict (FILE *text_fast_comp_dict, compression_dict &_cd) {
325 if (text_fast_comp_dict == NULL) return false;
326
327 u_long *p, *end;
328 u_char *fixup;
329 u_long mem;
330 u_long fixup_mem;
331 int i; /* [RPAP - Jan 97: Endian Ordering] */
332
333 fread (&mem, sizeof (mem), 1, text_fast_comp_dict);
334 NTOHUL(mem); /* [RPAP - Jan 97: Endian Ordering] */
335 fread (&fixup_mem, sizeof (fixup_mem), 1, text_fast_comp_dict);
336 NTOHUL(fixup_mem); /* [RPAP - Jan 97: Endian Ordering] */
337
338 compression_dict *cd;
339 if (!(cd = (compression_dict *)malloc (mem))) {
340 mg_errno = MG_NOMEM;
341 return false;
342 }
343
344 end = (u_long *) (((u_char *) cd) + mem);
345 fread (cd, sizeof (u_char), mem, text_fast_comp_dict);
346
347 if (!(fixup = new u_char[fixup_mem]))
348 {
349 mg_errno = MG_NOMEM;
350 return false;
351 }
352
353 fread (fixup, fixup_mem, sizeof (u_char), text_fast_comp_dict);
354
355 for (p = (u_long *) cd; (u_long) p < (u_long) end; p++)
356 if (IS_FIXUP (p))
357 {
358 NTOHUL(*p); /* [RPAP - Jan 97: Endian Ordering] */
359 *p = *p + (u_long) cd;
360 }
361
362 /* [RPAP - Jan 97: Endian Ordering] */
363 /* cdh */
364 NTOHUL(cd->cdh.dict_type);
365 NTOHUL(cd->cdh.novel_method);
366 for (i = 0; i < TEXT_PARAMS; i++)
367 NTOHUL(cd->cdh.params[i]);
368 NTOHUL(cd->cdh.num_words[0]);
369 NTOHUL(cd->cdh.num_words[1]);
370 NTOHUL(cd->cdh.num_word_chars[0]);
371 NTOHUL(cd->cdh.num_word_chars[1]);
372 NTOHUL(cd->cdh.lookback);
373 /* cfh */
374 for (i = 0; i <= 1; i++)
375 {
376 int j;
377
378 NTOHSI(cd->cfh[i]->hd.num_codes);
379 NTOHSI(cd->cfh[i]->hd.mincodelen);
380 NTOHSI(cd->cfh[i]->hd.maxcodelen);
381 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
382 {
383 NTOHSI(cd->cfh[i]->hd.lencount[j]);
384 NTOHUL(cd->cfh[i]->hd.min_code[j]);
385 }
386 NTOHUL(cd->cfh[i]->uncompressed_size);
387 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
388 NTOHUL(cd->cfh[i]->huff_words_size[j]);
389 }
390 NTOHUL(cd->MemForCompDict);
391 /* ad */
392 if (cd->cdh.novel_method == MG_NOVEL_DELTA ||
393 cd->cdh.novel_method == MG_NOVEL_HYBRID)
394 for (i = 0; i <= 1; i++)
395 {
396 int j;
397
398 NTOHUL(cd->ad->afh[i].num_frags);
399 NTOHUL(cd->ad->afh[i].mem_for_frags);
400 for (j = 0; j < 33; j++)
401 {
402 NTOHSI(cd->ad->blk_start[i][j]);
403 NTOHSI(cd->ad->blk_end[i][j]);
404 }
405 }
406 NTOHSI(cd->fast_loaded);
407
408 delete fixup;
409
410 // the whole fast comp dict is a bit of a hack so I don't
411 // feel too bad about the next line :-) -- Rodger.
412 _cd = *cd;
413
414 return true;
415}
416
417
418static bool LoadCompDict (FILE *compDictFile,
419 FILE *auxDictFile,
420 FILE *fastCompDictFile,
421 compression_dict &cd) {
422 // see if we have a fast loading compression dictionary
423 if (fastCompDictFile != NULL)
424 return LoadFastCompDict (fastCompDictFile, cd);
425
426 // slow compression dictionary
427 return LoadSlowCompDict (compDictFile, auxDictFile, cd);
428}
429
430
431// try to open the dictionary files and load the dictionary
432static bool OpenLoadCompDict (char *textname, compression_dict &cd) {
433 FILE *compDictFile = NULL;
434 FILE *auxDictFile = NULL;
435 FILE *fastCompDictFile = NULL;
436
437 fastCompDictFile = open_file (textname, TEXT_DICT_FAST_SUFFIX,
438 "rb", MAGIC_FAST_DICT, MG_CONTINUE);
439
440 if (fastCompDictFile == NULL) {
441 compDictFile = open_file (textname, TEXT_DICT_SUFFIX,
442 "rb", MAGIC_DICT, MG_MESSAGE);
443 auxDictFile = open_file (textname, TEXT_DICT_AUX_SUFFIX,
444 "rb", MAGIC_AUX_DICT, MG_CONTINUE);
445 }
446
447 bool res = LoadCompDict (compDictFile, auxDictFile, fastCompDictFile, cd);
448
449 if (compDictFile != NULL) fclose (compDictFile);
450 if (auxDictFile != NULL) fclose (auxDictFile);
451 if (fastCompDictFile != NULL) fclose (fastCompDictFile);
452
453 return res;
454}
455
456static bool LoadLevels (char *textname, FTextLevel &levels) {
457 FILE *levelFile = NULL;
458
459 // open the text level file
460 levelFile = open_file (textname, TEXT_LEVEL_SUFFIX,
461 "rb", MAGIC_TEXT_LEVELS, MG_CONTINUE);
462 if (levelFile == NULL) return false;
463
464 // seek to the appropriate place and read the level information
465 bool res = ((fseek (levelFile, sizeof (u_long), SEEK_SET) == 0) &&
466 levels.Read (levelFile));
467
468 // close the file
469 fclose (levelFile);
470
471 return res;
472}
473
474
475TextData::TextData () {
476 // put file pointers in known state first
477 textFile = NULL;
478 textIdxFile = NULL;
479 Clear ();
480}
481
482void TextData::Clear () {
483 cd.Clear();
484 textFile = NULL;
485 textIdxFile = NULL;
486 cth.Clear();
487 levels.Clear();
488}
489
/* Load everything needed to decompress text for collection `textname'
 * under `basepath': the compression dictionary, the compressed-text
 * file and its index, the compressed-text header, and the level table.
 *
 * Returns true on success.  NOTE(review): on a failure part-way
 * through, files opened so far remain open and partially-loaded state
 * remains in this object -- presumably the caller is expected to call
 * UnloadData(); confirm before relying on it. */
bool TextData::LoadData (char *basepath, char *textname) {

  if (textname[0] == '\0') return false;

  // set the basepath
  set_basepath(basepath);

  // load the compression dictionary
  if (!OpenLoadCompDict (textname, cd)) return false;

  // open the compressed text and text index file
  textFile = open_file (textname, TEXT_SUFFIX, "rb", MAGIC_TEXT, MG_CONTINUE);
  if (textFile == NULL) return false;

  textIdxFile = open_file (textname, TEXT_IDX_SUFFIX, "rb", MAGIC_TEXI, MG_CONTINUE);
  if (textIdxFile == NULL) return false;

  // read in the compressed text header (skipping the magic number)
  if ((fseek (textFile, sizeof (u_long), SEEK_SET) != 0) || !cth.Read (textFile))
    return false;

  // read in the level information
  if (!LoadLevels (textname, levels)) return false;

  return true;
}
516
517bool TextData::UnloadData () {
518 // close any open files
519 if (textFile != NULL) {
520 fclose (textFile);
521 textFile = NULL;
522 }
523 if (textIdxFile != NULL) {
524 fclose (textIdxFile);
525 textIdxFile = NULL;
526 }
527
528 // do general clear
529 Clear ();
530
531 return true;
532}
533
534
535bool GetDocIdx (TextData &td, const UCArray &docLevel,
536 unsigned long docNum, TextIdx &docIdx) {
537 // make sure the text index file was opened successfully
538 if (td.textIdxFile == NULL) return false;
539
540 // read in the index
541 TextLevelInfo &levelInfo = td.levels.levelInfo[docLevel];
542 if (!docIdx.Read (td.textIdxFile, levelInfo, docNum)) return false;
543
544 return true;
545}
546
547
548
549
/* Decode one canonical Huffman code from the bit stream.
 * `mcodes' is the min_code table; on exit (len) holds the code length
 * and (code) the code's rank within that length.  Relies on a
 * `stdio_bitio_buffer buffer' being in scope at the expansion site.
 * BUG FIX: the trailing semicolon after while(0) has been removed --
 * it defeated the do{}while(0) single-statement idiom and would break
 * an expansion such as `if (x) MY_HUFF_DECODE(...); else ...'. */
#define MY_HUFF_DECODE(len, code, mcodes)                               \
  do {                                                                  \
    register unsigned long *__min_code = (mcodes);                      \
    register unsigned long *__mclen = __min_code;                       \
    register unsigned long __code = 0;                                  \
    do                                                                  \
      {                                                                 \
        __code += __code + buffer.bit();                                \
      }                                                                 \
    while (__code < *++__mclen);                                        \
    (len) = __mclen - __min_code;                                       \
    (code) = __code - *__mclen;                                         \
  } while (0)
563
564
/* Decompress document `docNum' at level `docLevel' into docText.
 *
 * Looks up the document's bit range in the text index, seeks the
 * compressed-text bit stream there, and alternately decodes words and
 * non-words (the stream parity flips each token via `which') until the
 * document's bit budget is exhausted.  Dictionary words are Huffman
 * decoded and reconstructed from their front-coded storage; the escape
 * code switches to one of the novel-word methods (spelled-out Huffman
 * chars, delta-coded, or hybrid-coded auxiliary-dictionary lookup).
 *
 * Returns false only if the index lookup fails. */
bool GetDocText (TextData &td, const UCArray &docLevel,
                 unsigned long docNum, UCArray &docText) {
  // erase the current text
  docText.erase (docText.begin(), docText.end());

  // look up the information about this document
  TextIdx docIdx;
  if (!GetDocIdx (td, docLevel, docNum, docIdx)) return false;

  // do seek to appropriate position
  stdio_bitio_buffer buffer (td.textFile);
  buffer.seek (docIdx.start.byte, docIdx.start.bit);

  // decompress the document
  compression_dict &cd = td.cd;
  auxiliary_dict *ad = cd.ad;
  int which = docIdx.which;
  // total number of compressed bits in [start, end), counting partial bytes
  unsigned long num_bits = (docIdx.end.byte*8+(8-docIdx.end.bit)) -
    (docIdx.start.byte*8+(8-docIdx.start.bit));
  unsigned long bits = 0;

  // keep decoding bits until enough bits have been decoded
  while (bits < num_bits) {
    register unsigned code, len;
    register int r;
    register u_char *t, *b = NULL;
    u_char word[MAXWORDLEN + 1];

    if (cd.cfh[which]) {
      // decode one dictionary code for the current stream parity
      MY_HUFF_DECODE (len, code, cd.cfh[which]->hd.min_code);
      bits += len;

      // split the code rank into a bucket index and an offset within
      // the bucket (2^lookback entries per bucket, front-coded)
      r = code & ((1 << cd.cdh.lookback) - 1);
      t = cd.values[which][len][code >> cd.cdh.lookback];

      /* step through from base pointer */
      // each stored entry is <prefix-len:4><suffix-len:4><suffix>;
      // replay r entries to reconstruct the word's prefix state in `word'
      b = word + 1;
      while (r--) {
        register int copy = *t >> 4;
        memcpy (word + copy + 1, t + 1, *t & 0xf);
        word[0] = copy + (*t & 0xf);
        t += ((*t) & 0xf) + 1;
      }
    } else t = NULL;

    if (t == cd.escape[which]) {
      // escape code: the token is a novel word, encoded per novel_method
      switch (cd.cdh.novel_method) {
      case MG_NOVEL_HUFFMAN_CHARS:
        {
          // novel word spelled out: Huffman-coded length, then chars
          int len, i;
          int c;
          len = buffer.huff_decode(cd.lens_huff[which]->min_code,
                                   cd.lens_vals[which], &bits);
          for (i = 0; i < len; i++) {
            c = buffer.huff_decode(cd.chars_huff[which]->min_code,
                                   cd.chars_vals[which], &bits);
            docText.push_back (c);
          }
        }
        break;
      case MG_NOVEL_DELTA:
      case MG_NOVEL_HYBRID:
        {
          // novel word referenced by index into the auxiliary dictionary
          int idx = 0, len;
          u_char *base;
          switch (cd.cdh.novel_method)
            {
            case MG_NOVEL_DELTA:
              {
                // 1-based delta code -> 0-based fragment index
                idx = buffer.delta_decode (&bits);
                idx--;
              }
              break;
            case MG_NOVEL_HYBRID:
              {
                // gamma-coded block number, then a binary code sized to
                // that block's span (blocks precomputed in LoadAuxDict)
                int k;
                k = buffer.gamma_decode (&bits);
                k--;
                idx = buffer.binary_decode(ad->blk_end[which][k] -
                                           ad->blk_start[which][k] + 1,
                                           &bits);
                idx += ad->blk_start[which][k] - 1;
              }
              break;
            }
          // copy the length-prefixed fragment into the output
          base = ad->words[which][idx];
          len = *base++;
          for (; len; len--)
            {
              docText.push_back (*base++);
            }
        }
        break;
      }
    }
    else
      {
        /* copy over the matching prefix */
        // b walks the reconstructed prefix in `word'; t points at the
        // final stored entry <prefix-len:4><suffix-len:4><suffix>
        r = (*t >> 4);
        while (r--) {
          docText.push_back (*b++);
        }

        /* and the stored suffix */
        r = ((*t) & 0xf);
        while (r--) {
          docText.push_back (*++t);
        }
      }
    // alternate word / non-word streams
    which = !which;
  }

  buffer.done();

  return true;
}
681
Note: See TracBrowser for help on using the repository browser.