source: trunk/gsdl/src/mgpp/text/text_get.cpp@ 711

Last change on this file since 711 was 711, checked in by cs025, 25 years ago

Changes to eradicate Xmalloc

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 26.6 KB
Line 
1/**************************************************************************
2 *
3 * text_get.c -- Function for reading documents from the compressed text
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: text_get.cpp 711 1999-10-17 23:43:31Z cs025 $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "filestats.h"
28#include "timing.h"
29#include "messages.h"
30#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
31
32#include "huffman.h"
33#include "bitio_m_abstract.h"
34#include "bitio_m_mem.h"
35#include "bitio_m.h"
36#include "bitio_stdio.h"
37#include "huffman_stdio.h"
38
39#include "mg.h"
40#include "invf.h"
41#include "text.h"
42#include "lists.h"
43#include "backend.h"
44#include "text_get.h"
45#include "locallib.h"
46#include "words.h"
47#include "mg_errors.h"
48#include "local_strings.h"
49#include "DocEntry.h"
50
51/*
52 $Log$
53 Revision 1.2 1999/10/17 23:43:30 cs025
54 Changes to eradicate Xmalloc
55
56 Revision 1.1 1999/10/11 02:58:39 cs025
57 Base install of MG-PP
58
59 Revision 1.1 1999/08/10 21:18:26 sjboddie
60 renamed mg-1.3d directory mg
61
62 Revision 1.1 1998/11/17 09:35:48 rjmcnab
63 *** empty log message ***
64
65 * Revision 1.3 1994/10/20 03:57:11 tes
66 * I have rewritten the boolean query optimiser and abstracted out the
67 * components of the boolean query.
68 *
69 * Revision 1.2 1994/09/20 04:42:15 tes
70 * For version 1.1
71 *
72 */
73
/* Selects which pair of Huffman tables in a compression_dict a freshly
 * loaded huff_data is stored into: the "lens" tables or the "chars" tables
 * (see Load_Comp_HuffData). */
enum huff_type { lengths, chars };
75
/* Revision-control identification string for this file.  Declared const
 * because it points at a string literal. */
static const char *RCSID = "$Id: text_get.cpp 711 1999-10-17 23:43:31Z cs025 $";
77
78
79
80
81
82/* FetchDocStart ()
83 * Reads into DocEnt the starting position of the document in the *.text file
84 * Where the first document is document number 1
85 * It returns the true weight of the document.
86 */
87
88
89
90
91static double
92FetchDocStartLev1 (text_data * td, u_long DN,
93 u_long * seek_pos, u_long * len)
94{
95 unsigned long data[2];
96 /* [TS:Sep/94] Fixed up the seek call to give the correct offset */
97 Fseek (td->TextIdxFile,
98 sizeof (unsigned long) * (DN - 1) + /* the doc offsets */
99 sizeof (unsigned long) + /* the magic number */
100 sizeof (compressed_text_header), /* the header */
101 0);
102 Fread ((char *) &data, sizeof (data), 1, td->TextIdxFile);
103
104 /* [RPAP - Jan 97: Endian Ordering] */
105 NTOHUL(data[0]);
106 NTOHUL(data[1]);
107
108 *seek_pos = data[0];
109 *len = data[1] - data[0];
110 return (1.0);
111}
112
113#define MG_PAGE_SIZE 2048
114
115static int
116LoadIdx (text_data * td, unsigned long DN)
117{
118 if (!td->idx_data)
119 {
120 td->idx_data = new index_data[MG_PAGE_SIZE];
121 if (!td->idx_data)
122 FatalError (1, "Out of memory in FDSL2");
123 }
124
125 if (td->current_pos == -1 || DN >= td->current_pos + MG_PAGE_SIZE - 1 ||
126 DN < td->current_pos)
127 {
128 int i, num; /* [RPAP - Jan 97: Endian Ordering] */
129
130 long rn = (long) DN - (MG_PAGE_SIZE >> 1);
131 if (rn < 1)
132 rn = 1;
133
134 Fseek (td->TextIdxWgtFile, (sizeof (unsigned long) + sizeof (float)) *
135 (rn - 1) + sizeof (unsigned long), 0);
136
137 num = Fread ((char *) td->idx_data, sizeof (*(td->idx_data)), MG_PAGE_SIZE, /* [RPAP - Jan 97: Endian Ordering] */
138 td->TextIdxWgtFile);
139
140 /* [RPAP - Jan 97: Endian Ordering] */
141 for (i = 0; i < num; i++)
142 {
143 NTOHUL(td->idx_data[i].Start);
144 NTOHF(td->idx_data[i].Weight);
145 }
146
147 td->current_pos = rn;
148 }
149 return DN - td->current_pos;
150}
151
152static double
153FDSL2 (text_data * td, unsigned long DN, unsigned long *Pos)
154{
155 unsigned long pos = LoadIdx (td, DN);
156 *Pos = td->idx_data[pos].Start;
157 return (td->idx_data[pos].Weight);
158}
159
160
161static double
162FetchDocStartLev2 (text_data * td, u_long DN,
163 u_long * seek_pos, u_long * len)
164{
165 double Weight;
166 unsigned long s1, s2;
167
168 Weight = FDSL2 (td, DN, &s1);
169 do
170 {
171 DN++;
172 FDSL2 (td, DN, &s2);
173 }
174 while (s2 == s1);
175 *seek_pos = s1;
176 *len = s2 - s1;
177 return (Weight);
178}
179
180
181
182
183double
184FetchDocStart (query_data * qd, u_long DN, u_long * seek_pos, u_long * len)
185{
186 qd->text_idx_lookups++;
187
188 if (qd->td->TextIdxWgtFile)
189 return FetchDocStartLev2 (qd->td, DN, seek_pos, len);
190 else
191 return FetchDocStartLev1 (qd->td, DN, seek_pos, len);
192}
193
194unsigned long
195FetchInitialParagraph (text_data * td, unsigned long ParaNum)
196{
197 if (td->TextIdxWgtFile)
198 {
199 unsigned long pos;
200 unsigned long start;
201 int PN = ParaNum - 1;
202 pos = LoadIdx (td, ParaNum);
203 start = td->idx_data[pos].Start;
204 while (PN > 0)
205 {
206 pos = LoadIdx (td, PN);
207 if (td->idx_data[pos].Start != start)
208 return PN + 1;
209 PN--;
210 }
211 return PN + 1;
212 }
213 else
214 return ParaNum;
215}
216
217
218
219/* FetchCompressed ()
220 * Reads into buffer DocBuff the compressed form of document DocNum.
221 * Where the first document is document number 1
222 */
223int
224FetchCompressed (query_data * qd, char **DocBuff, DocEntry * DocEnt)
225{
226 if (!DocEnt->SeekPos)
227 DocEnt->FetchStart(qd);
228 // FetchDocStart (qd, DocEnt->DocNum, &DocEnt->SeekPos, &DocEnt->Len);
229 if (!(*DocBuff = new char[DocEnt->Len]))
230 return (-1);
231
232 if (Fseek (qd->td->TextFile, DocEnt->SeekPos, 0) == -1)
233 FatalError (1, "Error when seeking into text file");
234#if 0
235 printf ("Loading compressed text %d %d\n", DocEnt->SeekPos, DocEnt->Len);
236#endif
237 if (Fread (*DocBuff, 1, DocEnt->Len, qd->td->TextFile) != DocEnt->Len)
238 FatalError (1, "Error when reading data");
239
240 return (DocEnt->Len);
241
242}
243
244
245text_data *
246LoadTextData (File * text, File * text_idx_wgt, File * text_idx)
247{
248 text_data *td;
249
250 if (!(td = new text_data))
251 {
252 mg_errno = MG_NOMEM;
253 return (NULL);
254 }
255
256 td->TextFile = text;
257 td->TextIdxWgtFile = text_idx_wgt;
258 td->TextIdxFile = text_idx;
259 td->current_pos = -1;
260 td->idx_data = NULL;
261 Fread (&td->cth, sizeof (td->cth), 1, text);
262
263 /* [RPAP - Jan 97: Endian Ordering] */
264 NTOHUL(td->cth.num_of_docs);
265 NTOHD(td->cth.num_of_bytes); /* [RJM 07/97: 4G limit] */
266 NTOHUL(td->cth.num_of_words);
267 NTOHUL(td->cth.length_of_longest_doc);
268 NTOHD(td->cth.ratio);
269
270 return (td);
271}
272
273
274void
275FreeTextData (text_data * td)
276{
277 if (td)
278 {
279 if (td->idx_data)
280 delete td->idx_data;
281 delete td;
282 }
283}
284
285
286static int
287pts_comp (const void *A, const void *B)
288{
289 const DocEntry *const *a = (DocEntry **) A;
290 const DocEntry *const *b = (DocEntry **) B;
291 return (*a)->DocNum - (*b)->DocNum;
292}
293
294
295
296
297int
298GetPosLens (query_data * qd, DocEntry * Docs, int num)
299{
300 DocEntry **pts;
301 int i, j;
302 if (!(pts = new (DocEntry *)[num]))
303 {
304 mg_errno = MG_NOMEM;
305 return (-1);
306 }
307 for (i = j = 0; i < num; i++, Docs++)
308 if (!Docs->SeekPos)
309 pts[j++] = Docs;
310
311 if (j)
312 {
313 qsort (pts, j, sizeof (DocEntry *), pts_comp);
314 for (i = 0; i < j; i++)
315 pts[i]->FetchStart(qd);
316 // FetchDocStart (qd, pts[i]->DocNum, &pts[i]->SeekPos, &pts[i]->Len);
317 }
318
319 delete pts;
320 return (0);
321}
322
323
324
325
326
327int
328LoadBuffers (query_data * qd, DocEntry * Docs, int max_mem, int num)
329{
330 DocEntry **pts;
331 int i, j;
332 int mem;
333
334 if (!num)
335 return (0);
336 if (!(pts = new (DocEntry *)[num]))
337 {
338 mg_errno = MG_NOMEM;
339 return (-1);
340 }
341
342 mem = i = 0;
343 do
344 {
345 pts[i] = Docs;
346 mem += Docs->Len;
347 i++;
348 Docs++;
349 }
350 while (i < num && mem < max_mem);
351 if (i > 1)
352 qsort (pts, i, sizeof (DocEntry *), pts_comp);
353 for (j = 0; j < i; j++)
354 {
355 if (FetchCompressed (qd, &pts[j]->CompTextBuffer, pts[j]) == -1)
356 return (-1);
357 ChangeMemInUse (qd, pts[j]->Len);
358 }
359
360 delete pts;
361
362 return (i);
363}
364
365
366/**
367 *
368 * GRB: Function removed 21/09/99 - wasn't being used; in any case used DocEntry_FreeTextBuffers instead
369 *
370 *
371void
372FreeBuffers (query_data * qd, DocEntry * Docs, int num)
373{
374 int i;
375
376 for (i = 0; i < num; i++, Docs++)
377 if (Docs->CompTextBuffer)
378 {
379 delete Docs->CompTextBuffer;
380 Docs->CompTextBuffer = NULL;
381 ChangeMemInUse (qd, -Docs->Len);
382 }
383
384}
385*/
386
387
388/****************************************************************************/
389
390static void
391FreeAuxDict (auxiliary_dict * ad)
392{
393 if (!ad)
394 return;
395 if (ad->word_data[0])
396 delete ad->word_data[0];
397 if (ad->word_data[1])
398 delete ad->word_data[1];
399 if (ad->words[0])
400 delete ad->words[0];
401 if (ad->words[1])
402 delete ad->words[1];
403 delete ad;
404}
405
406static auxiliary_dict *
407LoadAuxDict (compression_dict * cd, File * text_aux_dict)
408{
409 auxiliary_dict *ad;
410 int i;
411
412 if (!(ad = new auxiliary_dict))
413 {
414 mg_errno = MG_NOMEM;
415 return (NULL);
416 }
417
418 bzero ((char *) ad, sizeof (*ad));
419
420 for (i = 0; i <= 1; i++)
421 {
422 int j;
423 u_char *pos;
424
425 Fread (&ad->afh[i], sizeof (aux_frags_header), 1, text_aux_dict);
426
427 /* [RPAP - Jan 97: Endian Ordering] */
428 NTOHUL(ad->afh[i].num_frags);
429 NTOHUL(ad->afh[i].mem_for_frags);
430
431 if (!(ad->word_data[i] = new u_char[ad->afh[i].mem_for_frags]))
432 {
433 mg_errno = MG_NOMEM;
434 FreeAuxDict (ad);
435 return (NULL);
436 }
437 if (!(ad->words[i] = new (u_char *)[ad->afh[i].num_frags]))
438 {
439 mg_errno = MG_NOMEM;
440 FreeAuxDict (ad);
441 return (NULL);
442 }
443
444 Fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char),
445 text_aux_dict);
446
447 pos = ad->word_data[i];
448 for (j = 0; j < ad->afh[i].num_frags; j++)
449 {
450 ad->words[i][j] = pos;
451 pos += *pos + 1;
452 }
453 if (cd->cdh.novel_method == MG_NOVEL_HYBRID ||
454 cd->cdh.novel_method == MG_NOVEL_HYBRID_MTF)
455 {
456 int num;
457 num = 1;
458 ad->blk_start[i][0] = 0;
459 ad->blk_end[i][0] = cd->cdh.num_words[i] - 1;
460 while (num < 33)
461 {
462 ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1;
463 ad->blk_end[i][num] = ad->blk_start[i][num] +
464 (ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2;
465 num++;
466 }
467 }
468 }
469 return (ad);
470}
471
472
473
474
475
476
477static u_char ***
478ReadInWords (File * dict, compression_dict * cd,
479 comp_frags_header * cfh, u_char ** escape)
480{
481 int i, lookback;
482 int ptrs_reqd = 0;
483 int mem_reqd = 0;
484 int num_set[MAX_HUFFCODE_LEN + 1];
485 u_char *next_word[MAX_HUFFCODE_LEN + 1];
486 u_char **vals;
487 u_char ***values;
488 u_char word[MAXWORDLEN + 1];
489 u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1];
490
491 lookback = cd->cdh.lookback;
492
493 for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++)
494 {
495 ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
496 mem_reqd += cfh->huff_words_size[i];
497 }
498
499 if (!(vals = new (u_char *)[ptrs_reqd]))
500 return (NULL);
501
502 if (!(values = new (u_char **)[MAX_HUFFCODE_LEN + 1]))
503 return (NULL);
504
505 if (!(next_word[0] = new u_char[mem_reqd]))
506 return (NULL);
507
508 cd->MemForCompDict += ptrs_reqd * sizeof (*vals) +
509 (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) +
510 mem_reqd;
511
512 values[0] = vals;
513 values[0][0] = next_word[0];
514 for (i = 1; i <= cfh->hd.maxcodelen; i++)
515 {
516 int next_start = (values[i - 1] - vals) +
517 ((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback);
518 values[i] = &vals[next_start];
519 next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1];
520 values[i][0] = next_word[i];
521 }
522
523 bzero ((char *) num_set, sizeof (num_set));
524
525 for (i = 0; i < cfh->hd.num_codes; i++)
526 {
527 register int val, copy;
528 register int len = cfh->hd.clens[i];
529 val = Getc (dict);
530 copy = (val >> 4) & 0xf;
531 val &= 0xf;
532
533 Fread (word + copy + 1, sizeof (u_char), val, dict);
534 *word = val + copy;
535
536 if ((num_set[len] & ((1 << lookback) - 1)) == 0)
537 {
538 values[len][num_set[len] >> lookback] = next_word[len];
539 memcpy (next_word[len], word, *word + 1);
540 if (escape && i == cfh->hd.num_codes - 1)
541 *escape = next_word[len];
542 next_word[len] += *word + 1;
543 }
544 else
545 {
546 copy = prefixlen (last_word[len], word);
547 memcpy (next_word[len] + 1, word + copy + 1, *word - copy);
548 *next_word[len] = (copy << 4) + (*word - copy);
549 if (escape && i == cfh->hd.num_codes - 1)
550 *escape = next_word[len];
551 next_word[len] += (*word - copy) + 1;
552 }
553 memcpy (last_word[len], word, *word + 1);
554 num_set[len]++;
555 }
556 if (cfh->hd.clens)
557 delete cfh->hd.clens;
558 cfh->hd.clens = NULL;
559 return values;
560}
561
562int Load_Comp_HuffData(compression_dict *cd, int which, File *dict,
563 huff_type type)
564{
565 huff_data * hd;
566 u_long ** vals;
567
568 if (!(hd = new huff_data))
569 return 1;
570 cd->MemForCompDict += sizeof (huff_data);
571 if (F_Read_Huffman_Data (dict, hd, &cd->MemForCompDict, NULL) == -1)
572 return 2;
573 if (!(vals = Generate_Huffman_Vals (hd, &cd->MemForCompDict)))
574 return 3;
575 if (hd->clens)
576 delete hd->clens;
577 hd->clens = NULL;
578 if (type == chars)
579 {
580 cd->chars_huff[which] = hd;
581 cd->chars_vals[which] = vals;
582 }
583 else
584 {
585 cd->lens_huff[which] = hd;
586 cd->lens_vals[which] = vals;
587 }
588
589 return 0;
590}
591
592int Load_Comp_FragsHeader(compression_dict *cd, int which, int getEscape, File *dict)
593{
594 if (!(cd->cfh[which] = new comp_frags_header))
595 return 1;
596 cd->MemForCompDict += sizeof (*cd->cfh[which]);
597 if (F_Read_cfh (dict, cd->cfh[which], &cd->MemForCompDict, NULL) == -1)
598 return 2;
599
600 if (!(cd->values[which] = ReadInWords (dict, cd, cd->cfh[which],
601 getEscape == 0 ? NULL : &cd->escape[which])))
602 return 3;
603 return 0;
604}
605
606static compression_dict *
607Load_Comp_Dict (File * dict, File * aux_dict)
608{
609 int which;
610 compression_dict *cd;
611
612 if (!(cd = new compression_dict))
613 {
614 mg_errno = MG_NOMEM;
615 return (NULL);
616 }
617
618 bzero ((char *) cd, sizeof (compression_dict));
619
620 cd->MemForCompDict = sizeof (compression_dict);
621
622 if (F_Read_cdh (dict, &cd->cdh, &cd->MemForCompDict, NULL) == -1)
623 return NULL;
624
625 for (which = 0; which < 2; which++)
626 switch (cd->cdh.dict_type)
627 {
628 case MG_COMPLETE_DICTIONARY:
629 {
630 if (Load_Comp_FragsHeader(cd, which, 0, dict) != 0)
631 return NULL;
632 /*
633 if (!(cd->cfh[which] = (comp_frags_header *) Xmalloc (sizeof (*cd->cfh[which]))))
634 return NULL;
635 cd->MemForCompDict += sizeof (*cd->cfh[which]);
636 if (F_Read_cfh (dict, cd->cfh[which], &cd->MemForCompDict, NULL) == -1)
637 return NULL;
638
639 if (!(cd->values[which] = ReadInWords (dict, cd, cd->cfh[which],
640 NULL)))
641 return NULL;
642 */
643 cd->escape[which] = NULL;
644
645 }
646 break;
647 case MG_PARTIAL_DICTIONARY:
648 {
649 huff_data *hd;
650 u_long **vals;
651 if (cd->cdh.num_words[which])
652 {
653 if (Load_Comp_FragsHeader(cd, which, 1, dict) != 0)
654 return NULL;
655 /*
656 if (!(cd->cfh[which] = (comp_frags_header *) Xmalloc (sizeof (*cd->cfh[which]))))
657 return NULL;
658 cd->MemForCompDict += sizeof (*cd->cfh[which]);
659 if (F_Read_cfh (dict, cd->cfh[which], &cd->MemForCompDict, NULL) == -1)
660 return NULL;
661
662 if (!(cd->values[which] = ReadInWords (dict, cd, cd->cfh[which],
663 &cd->escape[which])))
664 return NULL;
665 */
666 }
667 if (Load_Comp_HuffData(cd, which, dict, chars) != 0)
668 return NULL;
669 /*
670 if (!(hd = (huff_data *) Xmalloc (sizeof (huff_data))))
671 return NULL;
672 cd->MemForCompDict += sizeof (huff_data);
673 if (F_Read_Huffman_Data (dict, hd, &cd->MemForCompDict, NULL) == -1)
674 return NULL;
675 if (!(vals = Generate_Huffman_Vals (hd, &cd->MemForCompDict)))
676 return NULL;
677 if (hd->clens)
678 Xfree (hd->clens);
679 hd->clens = NULL;
680 cd->chars_huff[which] = hd;
681 cd->chars_vals[which] = vals;
682 */
683 if (Load_Comp_HuffData(cd, which, dict, lengths) != 0)
684 return NULL;
685 /*
686 if (!(hd = (huff_data *) Xmalloc (sizeof (huff_data))))
687 return NULL;
688 cd->MemForCompDict += sizeof (huff_data);
689 if (F_Read_Huffman_Data (dict, hd, &cd->MemForCompDict, NULL) == -1)
690 return NULL;
691 if (!(vals = Generate_Huffman_Vals (hd, &cd->MemForCompDict)))
692 return NULL;
693 cd->lens_huff[which] = hd;
694 cd->lens_vals[which] = vals;
695 if (hd->clens)
696 Xfree (hd->clens);
697 hd->clens = NULL;
698 */
699 }
700 break;
701 case MG_SEED_DICTIONARY:
702 {
703 huff_data *hd;
704 u_long **vals;
705 if (cd->cdh.num_words[which])
706 {
707 if (Load_Comp_FragsHeader(cd, which, 1, dict))
708 return NULL;
709 /*
710 if (!(cd->cfh[which] = (comp_frags_header *) Xmalloc (sizeof (*cd->cfh[which]))))
711 return NULL;
712 cd->MemForCompDict += sizeof (*cd->cfh[which]);
713 if (F_Read_cfh (dict, cd->cfh[which], &cd->MemForCompDict, NULL) == -1)
714 return NULL;
715
716 if (!(cd->values[which] = ReadInWords (dict, cd, cd->cfh[which],
717 &cd->escape[which])))
718 return NULL;
719 */
720 }
721 switch (cd->cdh.novel_method)
722 {
723 case MG_NOVEL_HUFFMAN_CHARS:
724 if (Load_Comp_HuffData(cd, which, dict, chars) != 0)
725 return NULL;
726 /*
727 if (!(hd = (huff_data *) Xmalloc (sizeof (huff_data))))
728 return NULL;
729 cd->MemForCompDict += sizeof (huff_data);
730 if (F_Read_Huffman_Data (dict, hd, &cd->MemForCompDict,
731 NULL) == -1)
732 return NULL;
733 if (!(vals = Generate_Huffman_Vals (hd, &cd->MemForCompDict)))
734 return NULL;
735 cd->chars_huff[which] = hd;
736 cd->chars_vals[which] = vals;
737 if (hd->clens)
738 Xfree (hd->clens);
739 hd->clens = NULL;
740 */
741
742 if (Load_Comp_HuffData(cd, which, dict, lengths) != 0)
743 return NULL;
744 /*
745 if (!(hd = (huff_data *) Xmalloc (sizeof (huff_data))))
746 return NULL;
747 cd->MemForCompDict += sizeof (huff_data);
748 if (F_Read_Huffman_Data (dict, hd, &cd->MemForCompDict
749 ,NULL) == -1)
750 return NULL;
751 if (!(vals = Generate_Huffman_Vals (hd, &cd->MemForCompDict)))
752 return NULL;
753 cd->lens_huff[which] = hd;
754 cd->lens_vals[which] = vals;
755 if (hd->clens)
756 Xfree (hd->clens);
757 hd->clens = NULL;
758 */
759 break;
760 case MG_NOVEL_BINARY:
761 break;
762 case MG_NOVEL_DELTA:
763 break;
764 case MG_NOVEL_HYBRID:
765 break;
766 case MG_NOVEL_HYBRID_MTF:
767 break;
768 }
769 break;
770 }
771 }
772
773 if (cd->cdh.novel_method == MG_NOVEL_BINARY ||
774 cd->cdh.novel_method == MG_NOVEL_DELTA ||
775 cd->cdh.novel_method == MG_NOVEL_HYBRID ||
776 cd->cdh.novel_method == MG_NOVEL_HYBRID_MTF)
777 {
778 if (!aux_dict)
779 {
780 mg_errno = MG_NOFILE;
781 FreeCompDict (cd);
782 return (NULL);
783 }
784
785 if (!(cd->ad = LoadAuxDict (cd, aux_dict)))
786 {
787 FreeCompDict (cd);
788 return (NULL);
789 }
790 }
791
792
793 mg_errno = MG_NOERROR;
794
795 cd->fast_loaded = 0;
796 return (cd);
797}
798
799#define WORDNO(p, base) ((((char*)(p))-((char*)(base)))/sizeof(u_char*))
800
801#define IS_FIXUP(p) ((fixup[WORDNO(p,cd)/8] & (1<<(WORDNO(p, cd) & 7))) != 0)
802
803
804static compression_dict *
805Load_Fast_Comp_Dict (File * text_fast_comp_dict)
806{
807 compression_dict *cd;
808 u_long *p, *end;
809 u_char *fixup;
810 u_long mem;
811 u_long fixup_mem;
812 int i; /* [RPAP - Jan 97: Endian Ordering] */
813
814 Fread (&mem, sizeof (mem), 1, text_fast_comp_dict);
815 NTOHUL(mem); /* [RPAP - Jan 97: Endian Ordering] */
816 Fread (&fixup_mem, sizeof (fixup_mem), 1, text_fast_comp_dict);
817 NTOHUL(fixup_mem); /* [RPAP - Jan 97: Endian Ordering] */
818 if (!(cd = (compression_dict *) Xmalloc (mem)))
819 {
820 mg_errno = MG_NOMEM;
821 return (NULL);
822 }
823
824 end = (u_long *) (((u_char *) cd) + mem);
825 Fread (cd, sizeof (u_char), mem, text_fast_comp_dict);
826
827 if (!(fixup = new u_char[fixup_mem]))
828 {
829 mg_errno = MG_NOMEM;
830 return (NULL);
831 }
832
833 Fread (fixup, fixup_mem, sizeof (u_char), text_fast_comp_dict);
834
835 for (p = (u_long *) cd; (u_long) p < (u_long) end; p++)
836 if (IS_FIXUP (p))
837 {
838 NTOHUL(*p); /* [RPAP - Jan 97: Endian Ordering] */
839 *p = *p + (u_long) cd;
840 }
841
842 /* [RPAP - Jan 97: Endian Ordering] */
843 /* cdh */
844 NTOHUL(cd->cdh.dict_type);
845 NTOHUL(cd->cdh.novel_method);
846 for (i = 0; i < TEXT_PARAMS; i++)
847 NTOHUL(cd->cdh.params[i]);
848 NTOHUL(cd->cdh.num_words[0]);
849 NTOHUL(cd->cdh.num_words[1]);
850 NTOHUL(cd->cdh.num_word_chars[0]);
851 NTOHUL(cd->cdh.num_word_chars[1]);
852 NTOHUL(cd->cdh.lookback);
853 /* cfh */
854 for (i = 0; i <= 1; i++)
855 {
856 int j;
857
858 NTOHSI(cd->cfh[i]->hd.num_codes);
859 NTOHSI(cd->cfh[i]->hd.mincodelen);
860 NTOHSI(cd->cfh[i]->hd.maxcodelen);
861 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
862 {
863 NTOHSI(cd->cfh[i]->hd.lencount[j]);
864 NTOHUL(cd->cfh[i]->hd.min_code[j]);
865 }
866 NTOHUL(cd->cfh[i]->uncompressed_size);
867 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
868 NTOHUL(cd->cfh[i]->huff_words_size[j]);
869 }
870 NTOHUL(cd->MemForCompDict);
871 /* ad */
872 if (cd->cdh.novel_method == MG_NOVEL_BINARY ||
873 cd->cdh.novel_method == MG_NOVEL_DELTA ||
874 cd->cdh.novel_method == MG_NOVEL_HYBRID ||
875 cd->cdh.novel_method == MG_NOVEL_HYBRID_MTF)
876 for (i = 0; i <= 1; i++)
877 {
878 int j;
879
880 NTOHUL(cd->ad->afh[i].num_frags);
881 NTOHUL(cd->ad->afh[i].mem_for_frags);
882 for (j = 0; j < 33; j++)
883 {
884 NTOHSI(cd->ad->blk_start[i][j]);
885 NTOHSI(cd->ad->blk_end[i][j]);
886 }
887 }
888 NTOHSI(cd->fast_loaded);
889
890 delete fixup;
891 return (cd);
892}
893
894
895
896
897compression_dict *
898LoadCompDict (File * text_comp_dict,
899 File * text_aux_dict,
900 File * text_fast_comp_dict)
901{
902 return text_fast_comp_dict ?
903 Load_Fast_Comp_Dict (text_fast_comp_dict) :
904 Load_Comp_Dict (text_comp_dict, text_aux_dict);
905}
906
907
908
909
910void
911FreeCompDict (compression_dict * cd)
912{
913 int which;
914
915 if (cd->fast_loaded)
916 {
917 delete cd;
918 return;
919 }
920 for (which = 0; which < 2; which++)
921 {
922 if (cd->cfh[which])
923 delete cd->cfh[which];
924 if (cd->chars_huff[which])
925 delete cd->chars_huff[which];
926 if (cd->lens_huff[which])
927 delete cd->lens_huff[which];
928 if (cd->values[which])
929 {
930 delete (cd->values[which][0][0]); /* [RJM 07/98: Memory Leak] */
931 delete cd->values[which][0];
932 delete (cd->values[which]);
933 }
934 if (cd->chars_vals[which])
935 {
936 delete cd->chars_vals[which][0];
937 delete cd->chars_vals[which];
938 }
939 if (cd->lens_vals[which])
940 {
941 delete cd->lens_vals[which][0];
942 delete cd->lens_vals[which];
943 }
944 }
945 if (cd->ad)
946 FreeAuxDict (cd->ad);
947 delete cd;
948}
949
950
951
952
953
/* MY_HUFF_DECODE (len, code, mcodes)
 * Reads bits from the bit reader `buffer` that is in scope at the point of
 * use, shifting each one into a running value, until that value is no
 * longer smaller than the entry of mcodes for the number of bits read.
 * On exit len is the number of bits read and code is the running value
 * minus that entry.
 *
 * Expands to a single statement; the caller supplies the semicolon.
 */
#define MY_HUFF_DECODE(len, code, mcodes)				\
  do {									\
    unsigned long *mhd_min_code_ = (mcodes);				\
    unsigned long *mhd_mclen_ = mhd_min_code_;				\
    unsigned long mhd_code_ = 0;					\
    do									\
      {									\
	mhd_code_ += mhd_code_ + buffer.bit();				\
      }									\
    while (mhd_code_ < *++mhd_mclen_);					\
    (len) = mhd_mclen_ - mhd_min_code_;					\
    (code) = mhd_code_ - *mhd_mclen_;					\
  } while(0)
967
968
969/*#define DUMPDOC */
970
/* Capacity, per dictionary half, of the table that records the swaps made
 * by the MG_NOVEL_HYBRID_MTF method while one text is decoded. */
971#define MAX_SWAPS 10000
972
/* DecodeText ()
 * Decodes the l_in bytes at s_in with the compression dictionary cd, writes
 * the decoded bytes to s_out and stores their count in *l_out.
 *
 * The first bit of the input selects the dictionary half (`which`) used for
 * the first token; after every token the other half is used.  A token is
 * either a dictionary word, rebuilt from its front-coded entry, or - when
 * the decoded entry is cd->escape[which], or when that half has no word
 * table - a novel word decoded according to cd->cdh.novel_method.
 *
 * Always returns COMPALLOK.
 *
 * NOTE(review): nothing here limits the writes through s_out; presumably
 * the caller sizes it for the longest document - confirm.
 * NOTE(review): l_in == 0, or a last input byte of 0, is not handled by the
 * bit-count computation below - confirm callers never pass such input.
 */
973int
974DecodeText (compression_dict * cd,
975 u_char * s_in, int l_in, u_char * s_out, int *l_out)
976{
977 auxiliary_dict *ad = cd->ad;
978 int which;
979 u_long num_bits, bits;
980 u_char *ptr = s_out;
/* running total of (*l_out + 1) over all calls; not read in this function */
981 static int num = 0;
982 u_long binary_start[2];
983 int novels_used[2];
984 int swaps[2][MAX_SWAPS];
985 novels_used[0] = novels_used[1] = 0;
986
/* Work out how many bits of the input carry data: everything up to and
 * including the lowest set bit of the last byte is excluded. */
987 {
988 unsigned char bf = s_in[l_in - 1];
989 num_bits = 1;
990 while ((bf & 1) != 1)
991 {
992 num_bits++;
993 bf >>= 1;
994 }
995 num_bits = l_in * 8 - num_bits;
996 }
997
998 DECODE_START (s_in, l_in)
999
/* the first bit picks the dictionary half for the first token */
1000 which = buffer.bit();
1001 bits = 1;
1002
/* the binary method stores one delta-coded starting value per half */
1003 if (cd->cdh.novel_method == MG_NOVEL_BINARY)
1004 {
1005 binary_start[0] = buffer.delta_decode (&bits);
1006 binary_start[1] = buffer.delta_decode (&bits);
1007 }
1008
1009
/* one token per iteration, until all data bits have been consumed */
1010 while (bits < num_bits)
1011 {
1012 register unsigned code, len;
1013 register int r;
1014 register u_char *t, *b = NULL;
1015 u_char word[MAXWORDLEN + 1];
1016
1017#ifdef DUMPDOC
1018 printf ("\n%d %d ", bits, num_bits);
1019#endif
/* With a word table: decode a code, locate the fully stored word of its
 * group, then replay r front-coded entries into `word` so that `word`
 * holds the word preceding the wanted entry t. */
1020 if (cd->cfh[which])
1021 {
1022 MY_HUFF_DECODE (len, code, cd->cfh[which]->hd.min_code);
1023 bits += len;
1024
1025 r = code & ((1 << cd->cdh.lookback) - 1);
1026 t = cd->values[which][len][code >> cd->cdh.lookback];
1027
1028 /* step through from base pointer */
1029 b = word + 1;
1030 while (r--)
1031 {
1032 register int copy = *t >> 4;
1033 memcpy (word + copy + 1, t + 1, *t & 0xf);
1034 word[0] = copy + (*t & 0xf);
1035 t += ((*t) & 0xf) + 1;
1036 }
1037 }
1038 else
1039 t = NULL;
/* the escape entry (or the absence of a word table) means a novel word */
1040 if (t == cd->escape[which])
1041 {
1042 switch (cd->cdh.novel_method)
1043 {
1044 case MG_NOVEL_HUFFMAN_CHARS:
1045 {
/* a Huffman-coded length followed by that many Huffman-coded characters */
1046 int len, i;
1047 int c;
1048 len = buffer.huff_decode(cd->lens_huff[which]->min_code,
1049 cd->lens_vals[which], &bits);
1050 for (i = 0; i < len; i++)
1051 {
1052 c = buffer.huff_decode(cd->chars_huff[which]->min_code,
1053 cd->chars_vals[which], &bits);
1054 *ptr++ = c;
1055 }
1056 }
1057 break;
1058 case MG_NOVEL_BINARY:
1059 case MG_NOVEL_DELTA:
1060 case MG_NOVEL_HYBRID:
1061 case MG_NOVEL_HYBRID_MTF:
1062 {
/* these methods decode an index idx into ad->words[which] */
1063 int idx = 0, len;
1064 u_char *base;
1065 switch (cd->cdh.novel_method)
1066 {
1067 case MG_NOVEL_BINARY:
1068 {
1069 idx = buffer.binary_decode(binary_start[which], &bits);
1070 if (idx == binary_start[which])
1071 binary_start[which]++;
1072 idx--;
1073 }
1074 break;
1075 case MG_NOVEL_DELTA:
1076 {
1077 idx = buffer.delta_decode (&bits);
1078 idx--;
1079 }
1080 break;
1081 case MG_NOVEL_HYBRID:
1082 {
/* gamma-coded block number, then a binary-coded offset inside the block */
1083 int k;
1084 k = buffer.gamma_decode (&bits);
1085 k--;
1086 idx = buffer.binary_decode(ad->blk_end[which][k] -
1087 ad->blk_start[which][k] + 1,
1088 &bits);
1089 idx += ad->blk_start[which][k] - 1;
1090 }
1091 break;
1092 case MG_NOVEL_HYBRID_MTF:
1093 {
1094 int k;
1095 k = buffer.gamma_decode (&bits);
1096 k--;
1097 idx = buffer.binary_decode(ad->blk_end[which][k] -
1098 ad->blk_start[which][k] + 1,
1099 &bits);
1100 idx += ad->blk_start[which][k] - 1;
/* A word not used before in this text is swapped into the next free
 * front slot; the swap is recorded so it can be undone afterwards.
 * NOTE(review): novels_used[which] is not checked against MAX_SWAPS. */
1101 if (idx >= novels_used[which])
1102 {
1103 u_char *temp;
1104 temp = ad->words[which][idx];
1105 ad->words[which][idx] =
1106 ad->words[which][novels_used[which]];
1107 ad->words[which][novels_used[which]] = temp;
1108 swaps[which][novels_used[which]] = idx;
1109 idx = novels_used[which]++;
1110 }
1111 }
1112 break;
1113 }
/* copy the length-prefixed fragment to the output */
1114 base = ad->words[which][idx];
1115 len = *base++;
1116#ifdef DUMPDOC
1117 printf ("[[");
1118#endif
1119 for (; len; len--)
1120 {
1121 *ptr++ = *base++;
1122#ifdef DUMPDOC
1123 putchar (*(base - 1));
1124#endif
1125 }
1126#ifdef DUMPDOC
1127 printf ("]]");
1128#endif
1129 }
1130 break;
1131 }
1132 }
1133 else
1134 {
1135 /* copy over the matching prefix */
1136 r = (*t >> 4);
1137 while (r--)
1138#ifndef DUMPDOC
1139 *ptr++ = *b++;
1140#else
1141 {
1142 *ptr = *b++;
1143 putchar (*ptr);
1144 ptr++;
1145 }
1146#endif
1147
1148 /* and the stored suffix */
1149 r = ((*t) & 0xf);
1150 while (r--)
1151#ifndef DUMPDOC
1152 *ptr++ = *++t;
1153#else
1154 {
1155 *ptr = *++t;
1156 putchar (*ptr);
1157 ptr++;
1158 }
1159#endif
1160 }
/* the next token comes from the other dictionary half */
1161 which = !which;
1162 }
1163
1164 DECODE_DONE
1165
1166 * l_out = ptr - s_out;
1167 num += *l_out + 1;
1168
/* undo the move-to-front swaps in reverse order so that ad->words is back
 * in its original order for the next text */
1169 if (cd->cdh.novel_method == MG_NOVEL_HYBRID_MTF)
1170 for (which = 0; which <= 1; which++)
1171 for (novels_used[which]--; novels_used[which] >= 0; novels_used[which]--)
1172 {
1173 int a = novels_used[which];
1174 int b = swaps[which][novels_used[which]];
1175 u_char *temp;
1176 temp = ad->words[which][a];
1177 ad->words[which][a] = ad->words[which][b];
1178 ad->words[which][b] = temp;
1179 }
1180 return (COMPALLOK);
1181}
Note: See TracBrowser for help on using the repository browser.