source: branches/New_Config_Format-branch/gsdl/src/mgpp/text/TextGet.cpp@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago

merged changes to trunk into New_Config_Format branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.7 KB
Line 
1/**************************************************************************
2 *
3 * TextGet.cpp -- Decompressing the text
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: TextGet.cpp 1279 2000-07-12 22:21:53Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "TextGet.h"
25#include "mg_files.h"
26#include "netorder.h"
27#include "mg_errors.h"
28#include "locallib.h"
29#include "words.h"
30#include "local_strings.h"
31#include "bitio_m_stdio.h"
32
// Selects which pair of Huffman tables Load_Comp_HuffData fills in:
// `chars' stores into chars_huff/chars_vals, anything else (`lengths')
// stores into lens_huff/lens_vals.
// (The former `typedef' had no declarator, so it was ignored by the
// compiler and only produced a warning.)
enum huff_type {lengths, chars};
34
35
36static auxiliary_dict *LoadAuxDict (compression_dict &cd, FILE *text_aux_dict) {
37 auxiliary_dict *ad;
38 int i;
39
40 if (!(ad = new auxiliary_dict))
41 {
42 mg_errno = MG_NOMEM;
43 return (NULL);
44 }
45
46 bzero ((char *) ad, sizeof (*ad));
47
48 for (i = 0; i <= 1; i++)
49 {
50 int j;
51 u_char *pos;
52
53 fread (&ad->afh[i], sizeof (aux_frags_header), 1, text_aux_dict);
54
55 /* [RPAP - Jan 97: Endian Ordering] */
56 NTOHUL(ad->afh[i].num_frags);
57 NTOHUL(ad->afh[i].mem_for_frags);
58
59 if (!(ad->word_data[i] = new u_char[ad->afh[i].mem_for_frags]))
60 {
61 mg_errno = MG_NOMEM;
62 delete ad;
63 return (NULL);
64 }
65 if (!(ad->words[i] = new u_char* [ad->afh[i].num_frags]))
66 {
67 mg_errno = MG_NOMEM;
68 delete ad;
69 return (NULL);
70 }
71
72 fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char),
73 text_aux_dict);
74
75 pos = ad->word_data[i];
76 for (j = 0; j < (int)ad->afh[i].num_frags; j++)
77 {
78 ad->words[i][j] = pos;
79 pos += *pos + 1;
80 }
81 if (cd.cdh.novel_method == MG_NOVEL_HYBRID)
82 {
83 int num;
84 num = 1;
85 ad->blk_start[i][0] = 0;
86 ad->blk_end[i][0] = cd.cdh.num_words[i] - 1;
87 while (num < 33)
88 {
89 ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1;
90 ad->blk_end[i][num] = ad->blk_start[i][num] +
91 (ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2;
92 num++;
93 }
94 }
95 }
96 return (ad);
97}
98
99
100static u_char ***ReadInWords (FILE *dict, compression_dict &cd,
101 comp_frags_header *cfh, u_char **escape) {
102 int i, lookback;
103 int ptrs_reqd = 0;
104 int mem_reqd = 0;
105 int num_set[MAX_HUFFCODE_LEN + 1];
106 u_char *next_word[MAX_HUFFCODE_LEN + 1];
107 u_char **vals;
108 u_char ***values;
109 u_char word[MAXWORDLEN + 1];
110 u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1];
111
112 lookback = cd.cdh.lookback;
113
114 for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++) {
115 ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
116 mem_reqd += cfh->huff_words_size[i];
117 }
118
119 if (!(vals = new u_char* [ptrs_reqd]))
120 return (NULL);
121
122 if (!(values = new u_char** [MAX_HUFFCODE_LEN + 1]))
123 return (NULL);
124
125 if (!(next_word[0] = new u_char[mem_reqd]))
126 return (NULL);
127
128 cd.MemForCompDict += ptrs_reqd * sizeof (*vals) +
129 (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) +
130 mem_reqd;
131
132 values[0] = vals;
133 values[0][0] = next_word[0];
134 for (i = 1; i <= cfh->hd.maxcodelen; i++)
135 {
136 int next_start = (values[i - 1] - vals) +
137 ((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback);
138 values[i] = &vals[next_start];
139 next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1];
140 values[i][0] = next_word[i];
141 }
142
143 bzero ((char *) num_set, sizeof (num_set));
144
145 for (i = 0; i < cfh->hd.num_codes; i++)
146 {
147 register int val, copy;
148 register int len = cfh->hd.clens[i];
149 val = getc (dict);
150 copy = (val >> 4) & 0xf;
151 val &= 0xf;
152
153 fread (word + copy + 1, sizeof (u_char), val, dict);
154 *word = val + copy;
155
156 if ((num_set[len] & ((1 << lookback) - 1)) == 0)
157 {
158 values[len][num_set[len] >> lookback] = next_word[len];
159 memcpy (next_word[len], word, *word + 1);
160 if (escape && i == cfh->hd.num_codes - 1)
161 *escape = next_word[len];
162 next_word[len] += *word + 1;
163 }
164 else
165 {
166 copy = prefixlen (last_word[len], word);
167 memcpy (next_word[len] + 1, word + copy + 1, *word - copy);
168 *next_word[len] = (copy << 4) + (*word - copy);
169 if (escape && i == cfh->hd.num_codes - 1)
170 *escape = next_word[len];
171 next_word[len] += (*word - copy) + 1;
172 }
173 memcpy (last_word[len], word, *word + 1);
174 num_set[len]++;
175 }
176 if (cfh->hd.clens)
177 delete cfh->hd.clens;
178 cfh->hd.clens = NULL;
179 return values;
180}
181
182static int Load_Comp_HuffData(compression_dict &cd, int which, FILE *dict,
183 huff_type type) {
184 huff_data * hd;
185 u_long ** vals;
186
187 if (!(hd = new huff_data))
188 return 1;
189 cd.MemForCompDict += sizeof (huff_data);
190 if (Read_Huffman_Data (dict, hd, &cd.MemForCompDict, NULL) == -1)
191 return 2;
192 if (!(vals = Generate_Huffman_Vals (hd, &cd.MemForCompDict)))
193 return 3;
194 if (hd->clens)
195 delete hd->clens;
196 hd->clens = NULL;
197 if (type == chars)
198 {
199 cd.chars_huff[which] = hd;
200 cd.chars_vals[which] = vals;
201 }
202 else
203 {
204 cd.lens_huff[which] = hd;
205 cd.lens_vals[which] = vals;
206 }
207
208 return 0;
209}
210
211static int Load_Comp_FragsHeader(compression_dict &cd, int which, int getEscape,
212 FILE *dict) {
213 if (!(cd.cfh[which] = new comp_frags_header))
214 return 1;
215 cd.MemForCompDict += sizeof (*cd.cfh[which]);
216 if (Read_cfh (dict, cd.cfh[which], &cd.MemForCompDict, NULL) == -1)
217 return 2;
218
219 if (!(cd.values[which] = ReadInWords (dict, cd, cd.cfh[which],
220 getEscape == 0 ? NULL : &cd.escape[which])))
221 return 3;
222
223 return 0;
224}
225
226static bool LoadSlowCompDict (FILE *dict, FILE *aux_dict, compression_dict &cd) {
227 if (dict == NULL) return false;
228
229 int which;
230
231 bzero ((char *) &cd, sizeof (compression_dict));
232
233 cd.MemForCompDict = sizeof (compression_dict);
234
235 if (Read_cdh (dict, &cd.cdh, &cd.MemForCompDict, NULL) == -1)
236 return false;
237
238 for (which = 0; which < 2; which++)
239 switch (cd.cdh.dict_type)
240 {
241 case MG_COMPLETE_DICTIONARY:
242 {
243 if (Load_Comp_FragsHeader(cd, which, 0, dict) != 0)
244 return false;
245 cd.escape[which] = NULL;
246
247 }
248 break;
249 case MG_PARTIAL_DICTIONARY:
250 {
251 if (cd.cdh.num_words[which])
252 {
253 if (Load_Comp_FragsHeader(cd, which, 1, dict) != 0)
254 return false;
255 }
256
257 if (Load_Comp_HuffData(cd, which, dict, chars) != 0)
258 return false;
259
260 if (Load_Comp_HuffData(cd, which, dict, lengths) != 0)
261 return false;
262 }
263 break;
264 case MG_SEED_DICTIONARY:
265 {
266 if (cd.cdh.num_words[which])
267 {
268 if (Load_Comp_FragsHeader(cd, which, 1, dict) != 0)
269 return false;
270 }
271 switch (cd.cdh.novel_method)
272 {
273 case MG_NOVEL_HUFFMAN_CHARS:
274 if (Load_Comp_HuffData(cd, which, dict, chars) != 0)
275 return false;
276
277 if (Load_Comp_HuffData(cd, which, dict, lengths) != 0)
278 return false;
279 break;
280 case MG_NOVEL_DELTA:
281 break;
282 case MG_NOVEL_HYBRID:
283 break;
284 }
285 break;
286 }
287 }
288
289 if (cd.cdh.novel_method == MG_NOVEL_DELTA ||
290 cd.cdh.novel_method == MG_NOVEL_HYBRID)
291 {
292 if (!aux_dict)
293 {
294 mg_errno = MG_NOFILE;
295 cd.Clear();
296 return false;
297 }
298
299 if (!(cd.ad = LoadAuxDict (cd, aux_dict)))
300 {
301 cd.Clear();
302 return false;
303 }
304 }
305
306 mg_errno = MG_NOERROR;
307
308 cd.fast_loaded = 0;
309
310 return true;
311}
312
313
314
/* index of the pointer-sized word at address p, counted from base */
#define WORDNO(p, base) \
  ((((char *) (p)) - ((char *) (base))) / sizeof (u_char *))

/* true when the bit for word p is set in the fixup bitmap; relies on the
   variables `fixup' (the bitmap) and `cd' (the base address) being in
   scope where the macro is used */
#define IS_FIXUP(p) \
  (((fixup[WORDNO (p, cd) >> 3] >> (WORDNO (p, cd) & 7)) & 1) != 0)
317
318// fast loading really needs to be totally re-writen. "Unloading" the
319// text data will currently cause a crash because memory is being
320// deleted multiple times (and probably a zillion other reasons).
321static bool LoadFastCompDict (FILE *text_fast_comp_dict, compression_dict &_cd) {
322 if (text_fast_comp_dict == NULL) return false;
323
324 u_long *p, *end;
325 u_char *fixup;
326 u_long mem;
327 u_long fixup_mem;
328 int i; /* [RPAP - Jan 97: Endian Ordering] */
329
330 fread (&mem, sizeof (mem), 1, text_fast_comp_dict);
331 NTOHUL(mem); /* [RPAP - Jan 97: Endian Ordering] */
332 fread (&fixup_mem, sizeof (fixup_mem), 1, text_fast_comp_dict);
333 NTOHUL(fixup_mem); /* [RPAP - Jan 97: Endian Ordering] */
334
335 compression_dict *cd;
336 if (!(cd = (compression_dict *)malloc (mem))) {
337 mg_errno = MG_NOMEM;
338 return false;
339 }
340
341 end = (u_long *) (((u_char *) cd) + mem);
342 fread (cd, sizeof (u_char), mem, text_fast_comp_dict);
343
344 if (!(fixup = new u_char[fixup_mem]))
345 {
346 mg_errno = MG_NOMEM;
347 return false;
348 }
349
350 fread (fixup, fixup_mem, sizeof (u_char), text_fast_comp_dict);
351
352 for (p = (u_long *) cd; (u_long) p < (u_long) end; p++)
353 if (IS_FIXUP (p))
354 {
355 NTOHUL(*p); /* [RPAP - Jan 97: Endian Ordering] */
356 *p = *p + (u_long) cd;
357 }
358
359 /* [RPAP - Jan 97: Endian Ordering] */
360 /* cdh */
361 NTOHUL(cd->cdh.dict_type);
362 NTOHUL(cd->cdh.novel_method);
363 for (i = 0; i < TEXT_PARAMS; i++)
364 NTOHUL(cd->cdh.params[i]);
365 NTOHUL(cd->cdh.num_words[0]);
366 NTOHUL(cd->cdh.num_words[1]);
367 NTOHUL(cd->cdh.num_word_chars[0]);
368 NTOHUL(cd->cdh.num_word_chars[1]);
369 NTOHUL(cd->cdh.lookback);
370 /* cfh */
371 for (i = 0; i <= 1; i++)
372 {
373 int j;
374
375 NTOHSI(cd->cfh[i]->hd.num_codes);
376 NTOHSI(cd->cfh[i]->hd.mincodelen);
377 NTOHSI(cd->cfh[i]->hd.maxcodelen);
378 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
379 {
380 NTOHSI(cd->cfh[i]->hd.lencount[j]);
381 NTOHUL(cd->cfh[i]->hd.min_code[j]);
382 }
383 NTOHUL(cd->cfh[i]->uncompressed_size);
384 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
385 NTOHUL(cd->cfh[i]->huff_words_size[j]);
386 }
387 NTOHUL(cd->MemForCompDict);
388 /* ad */
389 if (cd->cdh.novel_method == MG_NOVEL_DELTA ||
390 cd->cdh.novel_method == MG_NOVEL_HYBRID)
391 for (i = 0; i <= 1; i++)
392 {
393 int j;
394
395 NTOHUL(cd->ad->afh[i].num_frags);
396 NTOHUL(cd->ad->afh[i].mem_for_frags);
397 for (j = 0; j < 33; j++)
398 {
399 NTOHSI(cd->ad->blk_start[i][j]);
400 NTOHSI(cd->ad->blk_end[i][j]);
401 }
402 }
403 NTOHSI(cd->fast_loaded);
404
405 delete fixup;
406
407 // the whole fast comp dict is a bit of a hack so I don't
408 // feel too bad about the next line :-) -- Rodger.
409 _cd = *cd;
410
411 return true;
412}
413
414
415static bool LoadCompDict (FILE *compDictFile,
416 FILE *auxDictFile,
417 FILE *fastCompDictFile,
418 compression_dict &cd) {
419 // see if we have a fast loading compression dictionary
420 if (fastCompDictFile != NULL)
421 return LoadFastCompDict (fastCompDictFile, cd);
422
423 // slow compression dictionary
424 return LoadSlowCompDict (compDictFile, auxDictFile, cd);
425}
426
427
428// try to open the dictionary files and load the dictionary
429static bool OpenLoadCompDict (char *textname, compression_dict &cd) {
430 FILE *compDictFile = NULL;
431 FILE *auxDictFile = NULL;
432 FILE *fastCompDictFile = NULL;
433
434 fastCompDictFile = open_file (textname, TEXT_DICT_FAST_SUFFIX,
435 "rb", MAGIC_FAST_DICT, MG_CONTINUE);
436
437 if (fastCompDictFile == NULL) {
438 compDictFile = open_file (textname, TEXT_DICT_SUFFIX,
439 "rb", MAGIC_DICT, MG_MESSAGE);
440 auxDictFile = open_file (textname, TEXT_DICT_AUX_SUFFIX,
441 "rb", MAGIC_AUX_DICT, MG_CONTINUE);
442 }
443
444 bool res = LoadCompDict (compDictFile, auxDictFile, fastCompDictFile, cd);
445
446 if (compDictFile != NULL) fclose (compDictFile);
447 if (auxDictFile != NULL) fclose (auxDictFile);
448 if (fastCompDictFile != NULL) fclose (fastCompDictFile);
449
450 return res;
451}
452
453static bool LoadLevels (char *textname, FTextLevel &levels) {
454 FILE *levelFile = NULL;
455
456 // open the text level file
457 levelFile = open_file (textname, TEXT_LEVEL_SUFFIX,
458 "rb", MAGIC_TEXT_LEVELS, MG_CONTINUE);
459 if (levelFile == NULL) return false;
460
461 // seek to the appropriate place and read the level information
462 bool res = ((fseek (levelFile, sizeof (u_long), SEEK_SET) == 0) &&
463 levels.Read (levelFile));
464
465 // close the file
466 fclose (levelFile);
467
468 return res;
469}
470
471
472TextData::TextData () {
473 // put file pointers in known state first
474 textFile = NULL;
475 textIdxFile = NULL;
476 Clear ();
477}
478
479void TextData::Clear () {
480 cd.Clear();
481 textFile = NULL;
482 textIdxFile = NULL;
483 cth.Clear();
484 levels.Clear();
485}
486
487bool TextData::LoadData (char *basepath, char *textname) {
488
489 if (textname[0] == '\0') return false;
490
491 // set the basepath
492 set_basepath(basepath);
493
494 // load the compression dictionary
495 if (!OpenLoadCompDict (textname, cd)) return false;
496
497 // open the compressed text and text index file
498 textFile = open_file (textname, TEXT_SUFFIX, "rb", MAGIC_TEXT, MG_CONTINUE);
499 if (textFile == NULL) return false;
500
501 textIdxFile = open_file (textname, TEXT_IDX_SUFFIX, "rb", MAGIC_TEXI, MG_CONTINUE);
502 if (textIdxFile == NULL) return false;
503
504 // read in the compressed text header
505 if ((fseek (textFile, sizeof (u_long), SEEK_SET) != 0) || !cth.Read (textFile))
506 return false;
507
508 // read in the level information
509 if (!LoadLevels (textname, levels)) return false;
510
511 return true;
512}
513
514bool TextData::UnloadData () {
515 // close any open files
516 if (textFile != NULL) {
517 fclose (textFile);
518 textFile = NULL;
519 }
520 if (textIdxFile != NULL) {
521 fclose (textIdxFile);
522 textIdxFile = NULL;
523 }
524
525 // do general clear
526 Clear ();
527
528 return true;
529}
530
531
532bool GetDocIdx (TextData &td, const UCArray &docLevel,
533 unsigned long docNum, TextIdx &docIdx) {
534 // make sure the text index file was opened successfully
535 if (td.textIdxFile == NULL) return false;
536
537 // read in the index
538 TextLevelInfo &levelInfo = td.levels.levelInfo[docLevel];
539 if (!docIdx.Read (td.textIdxFile, levelInfo, docNum)) return false;
540
541 return true;
542}
543
544
545
546
/*
 * MY_HUFF_DECODE -- decode one Huffman code from `buffer' (which must be
 * in scope and provide bit()).
 *
 * Bits are read one at a time and accumulated until the value is no
 * longer below the next entry of mcodes.  len receives the number of
 * bits read and code the accumulated value minus that entry.
 *
 * The macro expands to a single statement without a trailing semicolon,
 * so it can be used like a function call, including between `if' and
 * `else'.
 */
#define MY_HUFF_DECODE(len, code, mcodes)                               \
  do {                                                                  \
    unsigned long *mhd_base_ = (mcodes);                                \
    unsigned long *mhd_cur_ = mhd_base_;                                \
    unsigned long mhd_acc_ = 0;                                         \
    do                                                                  \
      {                                                                 \
        mhd_acc_ += mhd_acc_ + buffer.bit();                            \
      }                                                                 \
    while (mhd_acc_ < *++mhd_cur_);                                     \
    (len) = mhd_cur_ - mhd_base_;                                       \
    (code) = mhd_acc_ - *mhd_cur_;                                      \
  } while (0)
560
561
562bool GetDocText (TextData &td, const UCArray &docLevel,
563 unsigned long docNum, UCArray &docText) {
564 // erase the current text
565 docText.erase (docText.begin(), docText.end());
566
567 // look up the information about this document
568 TextIdx docIdx;
569 if (!GetDocIdx (td, docLevel, docNum, docIdx)) return false;
570
571 // do seek to appropriate position
572 stdio_bitio_buffer buffer (td.textFile);
573 buffer.seek (docIdx.start.byte, docIdx.start.bit);
574
575 // decompress the document
576 compression_dict &cd = td.cd;
577 auxiliary_dict *ad = cd.ad;
578 int which = docIdx.which;
579 unsigned long num_bits = (docIdx.end.byte*8+(8-docIdx.end.bit)) -
580 (docIdx.start.byte*8+(8-docIdx.start.bit));
581 unsigned long bits = 0;
582
583 // keep decoding bits until enough bits have been decoded
584 while (bits < num_bits) {
585 register unsigned code, len;
586 register int r;
587 register u_char *t, *b = NULL;
588 u_char word[MAXWORDLEN + 1];
589
590 if (cd.cfh[which]) {
591 MY_HUFF_DECODE (len, code, cd.cfh[which]->hd.min_code);
592 bits += len;
593
594 r = code & ((1 << cd.cdh.lookback) - 1);
595 t = cd.values[which][len][code >> cd.cdh.lookback];
596
597 /* step through from base pointer */
598 b = word + 1;
599 while (r--) {
600 register int copy = *t >> 4;
601 memcpy (word + copy + 1, t + 1, *t & 0xf);
602 word[0] = copy + (*t & 0xf);
603 t += ((*t) & 0xf) + 1;
604 }
605 } else t = NULL;
606
607 if (t == cd.escape[which]) {
608 switch (cd.cdh.novel_method) {
609 case MG_NOVEL_HUFFMAN_CHARS:
610 {
611 int len, i;
612 int c;
613 len = buffer.huff_decode(cd.lens_huff[which]->min_code,
614 cd.lens_vals[which], &bits);
615 for (i = 0; i < len; i++) {
616 c = buffer.huff_decode(cd.chars_huff[which]->min_code,
617 cd.chars_vals[which], &bits);
618 docText.push_back (c);
619 }
620 }
621 break;
622 case MG_NOVEL_DELTA:
623 case MG_NOVEL_HYBRID:
624 {
625 int idx = 0, len;
626 u_char *base;
627 switch (cd.cdh.novel_method)
628 {
629 case MG_NOVEL_DELTA:
630 {
631 idx = buffer.delta_decode (&bits);
632 idx--;
633 }
634 break;
635 case MG_NOVEL_HYBRID:
636 {
637 int k;
638 k = buffer.gamma_decode (&bits);
639 k--;
640 idx = buffer.binary_decode(ad->blk_end[which][k] -
641 ad->blk_start[which][k] + 1,
642 &bits);
643 idx += ad->blk_start[which][k] - 1;
644 }
645 break;
646 }
647 base = ad->words[which][idx];
648 len = *base++;
649 for (; len; len--)
650 {
651 docText.push_back (*base++);
652 }
653 }
654 break;
655 }
656 }
657 else
658 {
659 /* copy over the matching prefix */
660 r = (*t >> 4);
661 while (r--) {
662 docText.push_back (*b++);
663 }
664
665 /* and the stored suffix */
666 r = ((*t) & 0xf);
667 while (r--) {
668 docText.push_back (*++t);
669 }
670 }
671 which = !which;
672 }
673
674 buffer.done();
675
676 return true;
677}
678
Note: See TracBrowser for help on using the repository browser.