source: main/tags/2.80/indexers/mg/src/text/text_get.c@ 24541

Last change on this file since 24541 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 24.4 KB
Line 
1/**************************************************************************
2 *
3 * text_get.c -- Function for reading documents from the compressed text
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: text_get.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "filestats.h"
28#include "timing.h"
29#include "messages.h"
30#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
31
32#include "huffman.h"
33#include "bitio_m_mem.h"
34#include "bitio_m.h"
35#include "bitio_stdio.h"
36#include "huffman_stdio.h"
37
38#include "mg.h"
39#include "invf.h"
40#include "text.h"
41#include "lists.h"
42#include "backend.h"
43#include "text_get.h"
44#include "locallib.h"
45#include "words.h"
46#include "mg_errors.h"
47#include "local_strings.h"
48
49
50/*
51 $Log$
52 Revision 1.1 2003/02/20 21:18:24 mdewsnip
53 Addition of MG package for search and retrieval
54
55 Revision 1.1 1999/08/10 21:18:26 sjboddie
56 renamed mg-1.3d directory mg
57
58 Revision 1.1 1998/11/17 09:35:48 rjmcnab
59 *** empty log message ***
60
61 * Revision 1.3 1994/10/20 03:57:11 tes
62 * I have rewritten the boolean query optimiser and abstracted out the
63 * components of the boolean query.
64 *
65 * Revision 1.2 1994/09/20 04:42:15 tes
66 * For version 1.1
67 *
68 */
69
/* Revision-control $Id$ keyword string for this file.  It points at a
   string literal, so both the pointer and the characters are const.  */
static const char *const RCSID = "$Id: text_get.c 3745 2003-02-20 21:20:24Z mdewsnip $";
71
72
73
74
75
/* FetchDocStart ()
 * Looks up where a document starts in the *.text file and how long it is,
 * storing the results through the seek_pos and len arguments.
 * The first document is document number 1.
 * It returns the weight of the document (1.0 when no weight file is used).
 */
81
82
83
84
85static double
86FetchDocStartLev1 (text_data * td, u_long DN,
87 u_long * seek_pos, u_long * len)
88{
89 unsigned long data[2];
90 /* [TS:Sep/94] Fixed up the seek call to give the correct offset */
91 Fseek (td->TextIdxFile,
92 sizeof (unsigned long) * (DN - 1) + /* the doc offsets */
93 sizeof (unsigned long) + /* the magic number */
94 sizeof (compressed_text_header), /* the header */
95 0);
96 Fread ((char *) &data, sizeof (data), 1, td->TextIdxFile);
97
98 /* [RPAP - Jan 97: Endian Ordering] */
99 NTOHUL(data[0]);
100 NTOHUL(data[1]);
101
102 *seek_pos = data[0];
103 *len = data[1] - data[0];
104 return (1.0);
105}
106
107#define MG_PAGE_SIZE 2048
108
109static int
110LoadIdx (text_data * td, unsigned long DN)
111{
112 if (!td->idx_data)
113 {
114 td->idx_data = Xmalloc (sizeof (*(td->idx_data)) * MG_PAGE_SIZE);
115 if (!td->idx_data)
116 FatalError (1, "Out of memory in FDSL2");
117 }
118 if (td->current_pos == -1 || DN >= td->current_pos + MG_PAGE_SIZE - 1 ||
119 DN < td->current_pos)
120 {
121 int i, num; /* [RPAP - Jan 97: Endian Ordering] */
122
123 long rn = (long) DN - (MG_PAGE_SIZE >> 1);
124 if (rn < 1)
125 rn = 1;
126 Fseek (td->TextIdxWgtFile, (sizeof (unsigned long) + sizeof (float)) *
127 (rn - 1) + sizeof (unsigned long), 0);
128 num = Fread ((char *) td->idx_data, sizeof (*(td->idx_data)), MG_PAGE_SIZE, /* [RPAP - Jan 97: Endian Ordering] */
129 td->TextIdxWgtFile);
130
131 /* [RPAP - Jan 97: Endian Ordering] */
132 for (i = 0; i < num; i++)
133 {
134 NTOHUL(td->idx_data[i].Start);
135 NTOHF(td->idx_data[i].Weight);
136 }
137
138 td->current_pos = rn;
139 }
140 return DN - td->current_pos;
141}
142
143static double
144FDSL2 (text_data * td, unsigned long DN, unsigned long *Pos)
145{
146 unsigned long pos = LoadIdx (td, DN);
147 *Pos = td->idx_data[pos].Start;
148 return (td->idx_data[pos].Weight);
149}
150
151
152static double
153FetchDocStartLev2 (text_data * td, u_long DN,
154 u_long * seek_pos, u_long * len)
155{
156 double Weight;
157 unsigned long s1, s2;
158 Weight = FDSL2 (td, DN, &s1);
159 do
160 {
161 DN++;
162 FDSL2 (td, DN, &s2);
163 }
164 while (s2 == s1);
165 *seek_pos = s1;
166 *len = s2 - s1;
167 return (Weight);
168}
169
170
171
172
173double
174FetchDocStart (query_data * qd, u_long DN, u_long * seek_pos, u_long * len)
175{
176 qd->text_idx_lookups++;
177 if (qd->td->TextIdxWgtFile)
178 return FetchDocStartLev2 (qd->td, DN, seek_pos, len);
179 else
180 return FetchDocStartLev1 (qd->td, DN, seek_pos, len);
181}
182
183unsigned long
184FetchInitialParagraph (text_data * td, unsigned long ParaNum)
185{
186 if (td->TextIdxWgtFile)
187 {
188 unsigned long pos;
189 unsigned long start;
190 int PN = ParaNum - 1;
191 pos = LoadIdx (td, ParaNum);
192 start = td->idx_data[pos].Start;
193 while (PN > 0)
194 {
195 pos = LoadIdx (td, PN);
196 if (td->idx_data[pos].Start != start)
197 return PN + 1;
198 PN--;
199 }
200 return PN + 1;
201 }
202 else
203 return ParaNum;
204}
205
206
207
208/* FetchCompressed ()
209 * Reads into buffer DocBuff the compressed form of document DocNum.
210 * Where the first document is document number 1
211 */
212int
213FetchCompressed (query_data * qd, char **DocBuff, DocEntry * DocEnt)
214{
215 if (!DocEnt->SeekPos)
216 FetchDocStart (qd, DocEnt->DocNum, &DocEnt->SeekPos, &DocEnt->Len);
217 if (!(*DocBuff = Xmalloc (DocEnt->Len)))
218 return (-1);
219
220 if (Fseek (qd->td->TextFile, DocEnt->SeekPos, 0) == -1)
221 FatalError (1, "Error when seeking into text file");
222#if 0
223 printf ("Loading compressed text %d %d\n", DocEnt->SeekPos, DocEnt->Len);
224#endif
225 if (Fread (*DocBuff, 1, DocEnt->Len, qd->td->TextFile) != DocEnt->Len)
226 FatalError (1, "Error when reading data");
227
228 return (DocEnt->Len);
229
230}
231
232
233text_data *
234LoadTextData (File * text, File * text_idx_wgt, File * text_idx)
235{
236 text_data *td;
237
238 if (!(td = Xmalloc (sizeof (text_data))))
239 {
240 mg_errno = MG_NOMEM;
241 return (NULL);
242 }
243
244 td->TextFile = text;
245 td->TextIdxWgtFile = text_idx_wgt;
246 td->TextIdxFile = text_idx;
247 td->current_pos = -1;
248 td->idx_data = NULL;
249 Fread (&td->cth, sizeof (td->cth), 1, text);
250
251 /* [RPAP - Jan 97: Endian Ordering] */
252 NTOHUL(td->cth.num_of_docs);
253 NTOHD(td->cth.num_of_bytes); /* [RJM 07/97: 4G limit] */
254 NTOHUL(td->cth.num_of_words);
255 NTOHUL(td->cth.length_of_longest_doc);
256 NTOHD(td->cth.ratio);
257
258 return (td);
259}
260
261
262void
263FreeTextData (text_data * td)
264{
265 if (td)
266 {
267 if (td->idx_data)
268 Xfree (td->idx_data);
269 Xfree (td);
270 }
271}
272
273
274static int
275pts_comp (const void *A, const void *B)
276{
277 const DocEntry *const *a = A;
278 const DocEntry *const *b = B;
279 return (*a)->DocNum - (*b)->DocNum;
280}
281
282
283
284
285int
286GetPosLens (query_data * qd, DocEntry * Docs, int num)
287{
288 DocEntry **pts;
289 int i, j;
290 if (!(pts = Xmalloc (num * sizeof (DocEntry *))))
291 {
292 mg_errno = MG_NOMEM;
293 return (-1);
294 }
295 for (i = j = 0; i < num; i++, Docs++)
296 if (!Docs->SeekPos)
297 pts[j++] = Docs;
298
299 if (j)
300 {
301 qsort (pts, j, sizeof (DocEntry *), pts_comp);
302 for (i = 0; i < j; i++)
303 FetchDocStart (qd, pts[i]->DocNum, &pts[i]->SeekPos, &pts[i]->Len);
304 }
305
306 Xfree (pts);
307 return (0);
308}
309
310
311
312
313
314int
315LoadBuffers (query_data * qd, DocEntry * Docs, int max_mem, int num)
316{
317 DocEntry **pts;
318 int i, j;
319 int mem;
320
321 if (!num)
322 return (0);
323 if (!(pts = Xmalloc (num * sizeof (DocEntry *))))
324 {
325 mg_errno = MG_NOMEM;
326 return (-1);
327 }
328
329 mem = i = 0;
330 do
331 {
332 pts[i] = Docs;
333 mem += Docs->Len;
334 i++;
335 Docs++;
336 }
337 while (i < num && mem < max_mem);
338 if (i > 1)
339 qsort (pts, i, sizeof (DocEntry *), pts_comp);
340 for (j = 0; j < i; j++)
341 {
342 if (FetchCompressed (qd, &pts[j]->CompTextBuffer, pts[j]) == -1)
343 return (-1);
344 ChangeMemInUse (qd, pts[j]->Len);
345 }
346
347 Xfree (pts);
348
349 return (i);
350}
351
352
353
354
355
356void
357FreeBuffers (query_data * qd, DocEntry * Docs, int num)
358{
359 int i;
360 for (i = 0; i < num; i++, Docs++)
361 if (Docs->CompTextBuffer)
362 {
363 Xfree (Docs->CompTextBuffer);
364 Docs->CompTextBuffer = NULL;
365 ChangeMemInUse (qd, -Docs->Len);
366 }
367}
368
369
370
371/****************************************************************************/
372
373static void
374FreeAuxDict (auxiliary_dict * ad)
375{
376 if (!ad)
377 return;
378 if (ad->word_data[0])
379 Xfree (ad->word_data[0]);
380 if (ad->word_data[1])
381 Xfree (ad->word_data[1]);
382 if (ad->words[0])
383 Xfree (ad->words[0]);
384 if (ad->words[1])
385 Xfree (ad->words[1]);
386 Xfree (ad);
387}
388
389static auxiliary_dict *
390LoadAuxDict (compression_dict * cd, File * text_aux_dict)
391{
392 auxiliary_dict *ad;
393 int i;
394
395 if (!(ad = Xmalloc (sizeof (auxiliary_dict))))
396 {
397 mg_errno = MG_NOMEM;
398 return (NULL);
399 }
400
401 bzero ((char *) ad, sizeof (*ad));
402
403 for (i = 0; i <= 1; i++)
404 {
405 int j;
406 u_char *pos;
407
408 Fread (&ad->afh[i], sizeof (aux_frags_header), 1, text_aux_dict);
409
410 /* [RPAP - Jan 97: Endian Ordering] */
411 NTOHUL(ad->afh[i].num_frags);
412 NTOHUL(ad->afh[i].mem_for_frags);
413
414 if (!(ad->word_data[i] = Xmalloc (ad->afh[i].mem_for_frags)))
415 {
416 mg_errno = MG_NOMEM;
417 FreeAuxDict (ad);
418 return (NULL);
419 }
420 if (!(ad->words[i] = Xmalloc (ad->afh[i].num_frags * sizeof (u_char *))))
421 {
422 mg_errno = MG_NOMEM;
423 FreeAuxDict (ad);
424 return (NULL);
425 }
426
427 Fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char),
428 text_aux_dict);
429
430 pos = ad->word_data[i];
431 for (j = 0; j < ad->afh[i].num_frags; j++)
432 {
433 ad->words[i][j] = pos;
434 pos += *pos + 1;
435 }
436 if (cd->cdh.novel_method == MG_NOVEL_HYBRID ||
437 cd->cdh.novel_method == MG_NOVEL_HYBRID_MTF)
438 {
439 int num;
440 num = 1;
441 ad->blk_start[i][0] = 0;
442 ad->blk_end[i][0] = cd->cdh.num_words[i] - 1;
443 while (num < 33)
444 {
445 ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1;
446 ad->blk_end[i][num] = ad->blk_start[i][num] +
447 (ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2;
448 num++;
449 }
450 }
451 }
452 return (ad);
453}
454
455
456
457
458
459
460static u_char ***
461ReadInWords (File * dict, compression_dict * cd,
462 comp_frags_header * cfh, u_char ** escape)
463{
464 int i, lookback;
465 int ptrs_reqd = 0;
466 int mem_reqd = 0;
467 int num_set[MAX_HUFFCODE_LEN + 1];
468 u_char *next_word[MAX_HUFFCODE_LEN + 1];
469 u_char **vals;
470 u_char ***values;
471 u_char word[MAXWORDLEN + 1];
472 u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1];
473
474 lookback = cd->cdh.lookback;
475
476 for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++)
477 {
478 ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
479 mem_reqd += cfh->huff_words_size[i];
480 }
481
482 if (!(vals = Xmalloc (ptrs_reqd * sizeof (*vals))))
483 return (NULL);
484
485 if (!(values = Xmalloc ((MAX_HUFFCODE_LEN + 1) * sizeof (u_char **))))
486 return (NULL);
487
488 if (!(next_word[0] = Xmalloc (mem_reqd))) return (NULL);
489
490 cd->MemForCompDict += ptrs_reqd * sizeof (*vals) +
491 (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) +
492 mem_reqd;
493
494 values[0] = vals;
495 values[0][0] = next_word[0];
496 for (i = 1; i <= cfh->hd.maxcodelen; i++)
497 {
498 int next_start = (values[i - 1] - vals) +
499 ((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback);
500 values[i] = &vals[next_start];
501 next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1];
502 values[i][0] = next_word[i];
503 }
504
505 bzero ((char *) num_set, sizeof (num_set));
506
507 for (i = 0; i < cfh->hd.num_codes; i++)
508 {
509 register int val, copy;
510 register int len = cfh->hd.clens[i];
511 val = Getc (dict);
512 copy = (val >> 4) & 0xf;
513 val &= 0xf;
514
515 Fread (word + copy + 1, sizeof (u_char), val, dict);
516 *word = val + copy;
517
518 if ((num_set[len] & ((1 << lookback) - 1)) == 0)
519 {
520 values[len][num_set[len] >> lookback] = next_word[len];
521 memcpy (next_word[len], word, *word + 1);
522 if (escape && i == cfh->hd.num_codes - 1)
523 *escape = next_word[len];
524 next_word[len] += *word + 1;
525 }
526 else
527 {
528 copy = prefixlen (last_word[len], word);
529 memcpy (next_word[len] + 1, word + copy + 1, *word - copy);
530 *next_word[len] = (copy << 4) + (*word - copy);
531 if (escape && i == cfh->hd.num_codes - 1)
532 *escape = next_word[len];
533 next_word[len] += (*word - copy) + 1;
534 }
535 memcpy (last_word[len], word, *word + 1);
536 num_set[len]++;
537 }
538 if (cfh->hd.clens)
539 Xfree (cfh->hd.clens);
540 cfh->hd.clens = NULL;
541 return values;
542}
543
544
545static compression_dict *
546Load_Comp_Dict (File * dict, File * aux_dict)
547{
548 int which;
549 compression_dict *cd;
550
551 if (!(cd = Xmalloc (sizeof (compression_dict))))
552 {
553 mg_errno = MG_NOMEM;
554 return (NULL);
555 }
556
557 bzero ((char *) cd, sizeof (compression_dict));
558
559 cd->MemForCompDict = sizeof (compression_dict);
560
561 if (F_Read_cdh (dict, &cd->cdh, &cd->MemForCompDict, NULL) == -1)
562 return NULL;
563
564 for (which = 0; which < 2; which++)
565 switch (cd->cdh.dict_type)
566 {
567 case MG_COMPLETE_DICTIONARY:
568 {
569 if (!(cd->cfh[which] = Xmalloc (sizeof (*cd->cfh[which]))))
570 return NULL;
571 cd->MemForCompDict += sizeof (*cd->cfh[which]);
572 if (F_Read_cfh (dict, cd->cfh[which], &cd->MemForCompDict, NULL) == -1)
573 return NULL;
574
575 if (!(cd->values[which] = ReadInWords (dict, cd, cd->cfh[which],
576 NULL)))
577 return NULL;
578 cd->escape[which] = NULL;
579
580 }
581 break;
582 case MG_PARTIAL_DICTIONARY:
583 {
584 huff_data *hd;
585 u_long **vals;
586 if (cd->cdh.num_words[which])
587 {
588 if (!(cd->cfh[which] = Xmalloc (sizeof (*cd->cfh[which]))))
589 return NULL;
590 cd->MemForCompDict += sizeof (*cd->cfh[which]);
591 if (F_Read_cfh (dict, cd->cfh[which], &cd->MemForCompDict, NULL) == -1)
592 return NULL;
593
594 if (!(cd->values[which] = ReadInWords (dict, cd, cd->cfh[which],
595 &cd->escape[which])))
596 return NULL;
597 }
598 if (!(hd = Xmalloc (sizeof (huff_data))))
599 return NULL;
600 cd->MemForCompDict += sizeof (huff_data);
601 if (F_Read_Huffman_Data (dict, hd, &cd->MemForCompDict, NULL) == -1)
602 return NULL;
603 if (!(vals = Generate_Huffman_Vals (hd, &cd->MemForCompDict)))
604 return NULL;
605 if (hd->clens)
606 Xfree (hd->clens);
607 hd->clens = NULL;
608 cd->chars_huff[which] = hd;
609 cd->chars_vals[which] = vals;
610 if (!(hd = Xmalloc (sizeof (huff_data))))
611 return NULL;
612 cd->MemForCompDict += sizeof (huff_data);
613 if (F_Read_Huffman_Data (dict, hd, &cd->MemForCompDict, NULL) == -1)
614 return NULL;
615 if (!(vals = Generate_Huffman_Vals (hd, &cd->MemForCompDict)))
616 return NULL;
617 cd->lens_huff[which] = hd;
618 cd->lens_vals[which] = vals;
619 if (hd->clens)
620 Xfree (hd->clens);
621 hd->clens = NULL;
622 }
623 break;
624 case MG_SEED_DICTIONARY:
625 {
626 huff_data *hd;
627 u_long **vals;
628 if (cd->cdh.num_words[which])
629 {
630 if (!(cd->cfh[which] = Xmalloc (sizeof (*cd->cfh[which]))))
631 return NULL;
632 cd->MemForCompDict += sizeof (*cd->cfh[which]);
633 if (F_Read_cfh (dict, cd->cfh[which], &cd->MemForCompDict, NULL) == -1)
634 return NULL;
635
636 if (!(cd->values[which] = ReadInWords (dict, cd, cd->cfh[which],
637 &cd->escape[which])))
638 return NULL;
639 }
640 switch (cd->cdh.novel_method)
641 {
642 case MG_NOVEL_HUFFMAN_CHARS:
643 if (!(hd = Xmalloc (sizeof (huff_data))))
644 return NULL;
645 cd->chars_huff[which] = hd;
646 cd->MemForCompDict += sizeof (huff_data);
647 if (F_Read_Huffman_Data (dict, hd, &cd->MemForCompDict,
648 NULL) == -1)
649 return NULL;
650 if (!(vals = Generate_Huffman_Vals (hd, &cd->MemForCompDict)))
651 return NULL;
652 cd->chars_vals[which] = vals;
653 if (hd->clens)
654 Xfree (hd->clens);
655 hd->clens = NULL;
656 if (!(hd = Xmalloc (sizeof (huff_data))))
657 return NULL;
658 cd->MemForCompDict += sizeof (huff_data);
659 cd->lens_huff[which] = hd;
660 if (F_Read_Huffman_Data (dict, hd, &cd->MemForCompDict
661 ,NULL) == -1)
662 return NULL;
663 if (!(vals = Generate_Huffman_Vals (hd, &cd->MemForCompDict)))
664 return NULL;
665 cd->lens_vals[which] = vals;
666 if (hd->clens)
667 Xfree (hd->clens);
668 hd->clens = NULL;
669 break;
670 case MG_NOVEL_BINARY:
671 break;
672 case MG_NOVEL_DELTA:
673 break;
674 case MG_NOVEL_HYBRID:
675 break;
676 case MG_NOVEL_HYBRID_MTF:
677 break;
678 }
679 break;
680 }
681 }
682
683 if (cd->cdh.novel_method == MG_NOVEL_BINARY ||
684 cd->cdh.novel_method == MG_NOVEL_DELTA ||
685 cd->cdh.novel_method == MG_NOVEL_HYBRID ||
686 cd->cdh.novel_method == MG_NOVEL_HYBRID_MTF)
687 {
688 if (!aux_dict)
689 {
690 mg_errno = MG_NOFILE;
691 FreeCompDict (cd);
692 return (NULL);
693 }
694
695 if (!(cd->ad = LoadAuxDict (cd, aux_dict)))
696 {
697 FreeCompDict (cd);
698 return (NULL);
699 }
700 }
701
702
703 mg_errno = MG_NOERROR;
704
705 cd->fast_loaded = 0;
706 return (cd);
707}
708
709#define WORDNO(p, base) ((((char*)(p))-((char*)(base)))/sizeof(u_char*))
710
711#define IS_FIXUP(p) ((fixup[WORDNO(p,cd)/8] & (1<<(WORDNO(p, cd) & 7))) != 0)
712
713
714static compression_dict *
715Load_Fast_Comp_Dict (File * text_fast_comp_dict)
716{
717 compression_dict *cd;
718 u_long *p, *end;
719 u_char *fixup;
720 u_long mem;
721 u_long fixup_mem;
722 int i; /* [RPAP - Jan 97: Endian Ordering] */
723
724 Fread (&mem, sizeof (mem), 1, text_fast_comp_dict);
725 NTOHUL(mem); /* [RPAP - Jan 97: Endian Ordering] */
726 Fread (&fixup_mem, sizeof (fixup_mem), 1, text_fast_comp_dict);
727 NTOHUL(fixup_mem); /* [RPAP - Jan 97: Endian Ordering] */
728 if (!(cd = Xmalloc (mem)))
729 {
730 mg_errno = MG_NOMEM;
731 return (NULL);
732 }
733
734 end = (u_long *) (((u_char *) cd) + mem);
735 Fread (cd, sizeof (u_char), mem, text_fast_comp_dict);
736
737 if (!(fixup = Xmalloc (fixup_mem)))
738 {
739 mg_errno = MG_NOMEM;
740 return (NULL);
741 }
742
743 Fread (fixup, fixup_mem, sizeof (u_char), text_fast_comp_dict);
744
745 for (p = (u_long *) cd; (u_long) p < (u_long) end; p++)
746 if (IS_FIXUP (p))
747 {
748 NTOHUL(*p); /* [RPAP - Jan 97: Endian Ordering] */
749 *p = *p + (u_long) cd;
750 }
751
752 /* [RPAP - Jan 97: Endian Ordering] */
753 /* cdh */
754 NTOHUL(cd->cdh.dict_type);
755 NTOHUL(cd->cdh.novel_method);
756 for (i = 0; i < TEXT_PARAMS; i++)
757 NTOHUL(cd->cdh.params[i]);
758 NTOHUL(cd->cdh.num_words[0]);
759 NTOHUL(cd->cdh.num_words[1]);
760 NTOHUL(cd->cdh.num_word_chars[0]);
761 NTOHUL(cd->cdh.num_word_chars[1]);
762 NTOHUL(cd->cdh.lookback);
763 /* cfh */
764 for (i = 0; i <= 1; i++)
765 {
766 int j;
767
768 NTOHSI(cd->cfh[i]->hd.num_codes);
769 NTOHSI(cd->cfh[i]->hd.mincodelen);
770 NTOHSI(cd->cfh[i]->hd.maxcodelen);
771 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
772 {
773 NTOHSI(cd->cfh[i]->hd.lencount[j]);
774 NTOHUL(cd->cfh[i]->hd.min_code[j]);
775 }
776 NTOHUL(cd->cfh[i]->uncompressed_size);
777 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
778 NTOHUL(cd->cfh[i]->huff_words_size[j]);
779 }
780 NTOHUL(cd->MemForCompDict);
781 /* ad */
782 if (cd->cdh.novel_method == MG_NOVEL_BINARY ||
783 cd->cdh.novel_method == MG_NOVEL_DELTA ||
784 cd->cdh.novel_method == MG_NOVEL_HYBRID ||
785 cd->cdh.novel_method == MG_NOVEL_HYBRID_MTF)
786 for (i = 0; i <= 1; i++)
787 {
788 int j;
789
790 NTOHUL(cd->ad->afh[i].num_frags);
791 NTOHUL(cd->ad->afh[i].mem_for_frags);
792 for (j = 0; j < 33; j++)
793 {
794 NTOHSI(cd->ad->blk_start[i][j]);
795 NTOHSI(cd->ad->blk_end[i][j]);
796 }
797 }
798 NTOHSI(cd->fast_loaded);
799
800 Xfree (fixup);
801 return (cd);
802}
803
804
805
806
807compression_dict *
808LoadCompDict (File * text_comp_dict,
809 File * text_aux_dict,
810 File * text_fast_comp_dict)
811{
812 return text_fast_comp_dict ?
813 Load_Fast_Comp_Dict (text_fast_comp_dict) :
814 Load_Comp_Dict (text_comp_dict, text_aux_dict);
815}
816
817
818
819
820void
821FreeCompDict (compression_dict * cd)
822{
823 int which;
824 if (cd->fast_loaded)
825 {
826 Xfree (cd);
827 return;
828 }
829 for (which = 0; which < 2; which++)
830 {
831 if (cd->cfh[which])
832 Xfree (cd->cfh[which]);
833 if (cd->chars_huff[which])
834 Xfree (cd->chars_huff[which]);
835 if (cd->lens_huff[which])
836 Xfree (cd->lens_huff[which]);
837 if (cd->values[which])
838 {
839 Xfree (cd->values[which][0][0]); /* [RJM 07/98: Memory Leak] */
840 Xfree (cd->values[which][0]);
841 Xfree (cd->values[which]);
842 }
843 if (cd->chars_vals[which])
844 {
845 Xfree (cd->chars_vals[which][0]);
846 Xfree (cd->chars_vals[which]);
847 }
848 if (cd->lens_vals[which])
849 {
850 Xfree (cd->lens_vals[which][0]);
851 Xfree (cd->lens_vals[which]);
852 }
853 }
854 if (cd->ad)
855 FreeAuxDict (cd->ad);
856 Xfree (cd);
857}
858
859
860
861
862
/* MY_HUFF_DECODE (len, code, mcodes)
 * Decodes one Huffman code from the bit stream read by DECODE_ADD.
 * Bits are appended one at a time until the accumulated value is no
 * longer below the minimum code of the current length (mcodes[length],
 * lengths counted from 1).  `len` receives the code length and `code` the
 * accumulated value minus that minimum code.
 *
 * The expansion deliberately has no trailing semicolon, so the macro
 * behaves like a single statement (for instance between `if` and `else`).
 */
#define MY_HUFF_DECODE(len, code, mcodes)				\
  do {									\
    register unsigned long *__min_code = (mcodes);			\
    register unsigned long *__mclen = __min_code;			\
    register unsigned long __code = 0;					\
    do									\
      {									\
        DECODE_ADD(__code);						\
      }									\
    while (__code < *++__mclen);					\
    (len) = __mclen - __min_code;					\
    (code) = __code - *__mclen;						\
  } while(0)
876
877
878/*#define DUMPDOC */
879
880#define MAX_SWAPS 10000
881
882int
883DecodeText (compression_dict * cd,
884 u_char * s_in, int l_in, u_char * s_out, int *l_out)
885{
886 auxiliary_dict *ad = cd->ad;
887 int which;
888 u_long num_bits, bits;
889 u_char *ptr = s_out;
890 static int num = 0;
891 u_long binary_start[2];
892 int novels_used[2];
893 int swaps[2][MAX_SWAPS];
894 novels_used[0] = novels_used[1] = 0;
895
896 {
897 unsigned char bf = s_in[l_in - 1];
898 num_bits = 1;
899 while ((bf & 1) != 1)
900 {
901 num_bits++;
902 bf >>= 1;
903 }
904 num_bits = l_in * 8 - num_bits;
905 }
906
907 DECODE_START (s_in, l_in)
908
909 which = DECODE_BIT;
910 bits = 1;
911
912 if (cd->cdh.novel_method == MG_NOVEL_BINARY)
913 {
914 DELTA_DECODE_L (binary_start[0], bits);
915 DELTA_DECODE_L (binary_start[1], bits);
916 }
917
918
919 while (bits < num_bits)
920 {
921 register unsigned code, len;
922 register int r;
923 register u_char *t, *b = NULL;
924 u_char word[MAXWORDLEN + 1];
925
926#ifdef DUMPDOC
927 printf ("\n%d %d ", bits, num_bits);
928#endif
929 if (cd->cfh[which])
930 {
931 MY_HUFF_DECODE (len, code, cd->cfh[which]->hd.min_code);
932 bits += len;
933
934 r = code & ((1 << cd->cdh.lookback) - 1);
935 t = cd->values[which][len][code >> cd->cdh.lookback];
936
937 /* step through from base pointer */
938 b = word + 1;
939 while (r--)
940 {
941 register int copy = *t >> 4;
942 memcpy (word + copy + 1, t + 1, *t & 0xf);
943 word[0] = copy + (*t & 0xf);
944 t += ((*t) & 0xf) + 1;
945 }
946 }
947 else
948 t = NULL;
949 if (t == cd->escape[which])
950 {
951 switch (cd->cdh.novel_method)
952 {
953 case MG_NOVEL_HUFFMAN_CHARS:
954 {
955 int len, i;
956 int c;
957 HUFF_DECODE_L (len, cd->lens_huff[which]->min_code,
958 cd->lens_vals[which], bits);
959 for (i = 0; i < len; i++)
960 {
961 HUFF_DECODE_L (c, cd->chars_huff[which]->min_code,
962 cd->chars_vals[which], bits);
963 *ptr++ = c;
964 }
965 }
966 break;
967 case MG_NOVEL_BINARY:
968 case MG_NOVEL_DELTA:
969 case MG_NOVEL_HYBRID:
970 case MG_NOVEL_HYBRID_MTF:
971 {
972 int idx = 0, len;
973 u_char *base;
974 switch (cd->cdh.novel_method)
975 {
976 case MG_NOVEL_BINARY:
977 {
978 BINARY_DECODE_L (idx, binary_start[which], bits);
979 if (idx == binary_start[which])
980 binary_start[which]++;
981 idx--;
982 }
983 break;
984 case MG_NOVEL_DELTA:
985 {
986 DELTA_DECODE_L (idx, bits);
987 idx--;
988 }
989 break;
990 case MG_NOVEL_HYBRID:
991 {
992 int k;
993 GAMMA_DECODE_L (k, bits);
994 k--;
995 BINARY_DECODE_L (idx,
996 ad->blk_end[which][k] -
997 ad->blk_start[which][k] + 1, bits);
998 idx += ad->blk_start[which][k] - 1;
999 }
1000 break;
1001 case MG_NOVEL_HYBRID_MTF:
1002 {
1003 int k;
1004 GAMMA_DECODE_L (k, bits);
1005 k--;
1006 BINARY_DECODE_L (idx,
1007 ad->blk_end[which][k] -
1008 ad->blk_start[which][k] + 1, bits);
1009 idx += ad->blk_start[which][k] - 1;
1010 if (idx >= novels_used[which])
1011 {
1012 u_char *temp;
1013 temp = ad->words[which][idx];
1014 ad->words[which][idx] =
1015 ad->words[which][novels_used[which]];
1016 ad->words[which][novels_used[which]] = temp;
1017 swaps[which][novels_used[which]] = idx;
1018 idx = novels_used[which]++;
1019 }
1020 }
1021 break;
1022 }
1023 base = ad->words[which][idx];
1024 len = *base++;
1025#ifdef DUMPDOC
1026 printf ("[[");
1027#endif
1028 for (; len; len--)
1029 {
1030 *ptr++ = *base++;
1031#ifdef DUMPDOC
1032 putchar (*(base - 1));
1033#endif
1034 }
1035#ifdef DUMPDOC
1036 printf ("]]");
1037#endif
1038 }
1039 break;
1040 }
1041 }
1042 else
1043 {
1044 /* copy over the matching prefix */
1045 r = (*t >> 4);
1046 while (r--)
1047#ifndef DUMPDOC
1048 *ptr++ = *b++;
1049#else
1050 {
1051 *ptr = *b++;
1052 putchar (*ptr);
1053 ptr++;
1054 }
1055#endif
1056
1057 /* and the stored suffix */
1058 r = ((*t) & 0xf);
1059 while (r--)
1060#ifndef DUMPDOC
1061 *ptr++ = *++t;
1062#else
1063 {
1064 *ptr = *++t;
1065 putchar (*ptr);
1066 ptr++;
1067 }
1068#endif
1069 }
1070 which = !which;
1071 }
1072
1073 DECODE_DONE
1074
1075 * l_out = ptr - s_out;
1076 num += *l_out + 1;
1077
1078 if (cd->cdh.novel_method == MG_NOVEL_HYBRID_MTF)
1079 for (which = 0; which <= 1; which++)
1080 for (novels_used[which]--; novels_used[which] >= 0; novels_used[which]--)
1081 {
1082 int a = novels_used[which];
1083 int b = swaps[which][novels_used[which]];
1084 u_char *temp;
1085 temp = ad->words[which][a];
1086 ad->words[which][a] = ad->words[which][b];
1087 ad->words[which][b] = temp;
1088 }
1089 return (COMPALLOK);
1090}
Note: See TracBrowser for help on using the repository browser.