source: trunk/gsdl/src/mgpp/text/mgpp_fast_comp_dict.cpp@ 2928

Last change on this file since 2928 was 2928, checked in by jrm21, 22 years ago

replaced bzero and bcopy with memset and memcpy in the src, even though it was
already done in the headers, just to make the code a bit clearer.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.9 KB
Line 
1/**************************************************************************
2 *
3 * mgpp_fast_comp_dict.cpp -- Program to generate a fast compression dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24
25// need this to avoid bizarre compiler problems under VC++ 6.0
26#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
27# include <iostream>
28#endif
29
30#if defined (__WIN32__)
31# include "getopt_old.h"
32#else
33# include <unistd.h>
34#endif
35
36#include "sysfuncs.h"
37#include "huffman.h"
38#include "messages.h"
39#include "memlib.h"
40#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
41
42#include "local_strings.h"
43#include "mg.h"
44#include "text.h"
45#include "invf.h"
46#include "mg_files.h"
47#include "locallib.h"
48#include "words.h"
49
50
51
52#define ALIGN_SIZE(p,t) (((p) + (sizeof(t)-1)) & (~(sizeof(t)-1)))
53
54#define WORDNO(p, base) ((((char*)(p))-((char*)(base)))/sizeof(u_char*))
55#define FIXUP(p) (fixup[WORDNO(p,buffer)/8] |= (1<<(WORDNO(p, buffer) & 7)))
56
57#define IS_FIXUP(p) ((fixup[WORDNO(p,buffer)/8] & (1<<(WORDNO(p, buffer) & 7))) != 0)
58
59#define FIXUP_VALS(vals) do { \
60 int i; \
61 for (i=0; i < MAX_HUFFCODE_LEN+1; i++) \
62 FIXUP(&vals[i]); \
63 } while(0)
64
65
66
67static u_long mem_for_comp_dict (char *filename);
68static void load_comp_dict (char *filename);
69static void save_fast_dict (char *filename);
70static void unfixup_buffer (void);
71
72
73static void *buffer;
74static void *cur;
75static u_char *fixup;
76static u_long mem, fixup_mem;
77
78int main (int argc, char **argv)
79{
80 int ch;
81 char *filename = "";
82 opterr = 0;
83 msg_prefix = argv[0];
84 while ((ch = getopt (argc, argv, "f:d:h")) != -1)
85 switch (ch)
86 {
87 case 'f': /* input file */
88 filename = optarg;
89 break;
90 case 'd':
91 set_basepath (optarg);
92 break;
93 case 'h':
94 case '?':
95 fprintf (stderr, "usage: %s [-f input_file] [-d data directory] [-h]\n",
96 argv[0]);
97 exit (1);
98 }
99
100 mem = mem_for_comp_dict (filename);
101
102 fixup_mem = (ALIGN_SIZE (mem, u_char *) / sizeof (u_char *) + 7) / 8;
103
104 cur = buffer = Xmalloc (mem);
105 memset (buffer, 0, mem);
106 fixup = (u_char *) Xmalloc (fixup_mem);
107 memset (fixup, 0, fixup_mem);
108
109#ifndef SILENT
110 Message ("Estimated memory for fast_dict %u", mem);
111 Message ("Estimated memory for fixups %u", fixup_mem);
112#endif
113
114 load_comp_dict (filename);
115
116#ifndef SILENT
117 Message ("Actual memory for fast_dict %u", (char *) cur - (char *) buffer);
118#endif
119
120 if ((u_long) cur > (u_long) buffer + mem)
121 FatalError (1, "The buffer was not big enough for the dictionary");
122
123 {
124 /* [RPAP - Jan 97: Endian Ordering] */
125 compression_dict *cd = (compression_dict *) buffer;
126 int i, which;
127
128 /* cfh */
129 for (which = 0; which <= 1; which++)
130 {
131 int j;
132
133 HTONSI(cd->cfh[which]->hd.num_codes);
134 HTONSI(cd->cfh[which]->hd.mincodelen);
135 HTONSI(cd->cfh[which]->hd.maxcodelen);
136 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
137 {
138 HTONSI(cd->cfh[which]->hd.lencount[j]);
139 HTONUL(cd->cfh[which]->hd.min_code[j]);
140 }
141 HTONUL(cd->cfh[which]->uncompressed_size);
142 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
143 HTONUL(cd->cfh[which]->huff_words_size[j]);
144 }
145 HTONUL(cd->MemForCompDict);
146 /* ad */
147 if (cd->cdh.novel_method == MG_NOVEL_DELTA ||
148 cd->cdh.novel_method == MG_NOVEL_HYBRID)
149 for (which = 0; which <= 1; which++)
150 {
151 int j;
152
153 HTONUL(cd->ad->afh[which].num_frags);
154 HTONUL(cd->ad->afh[which].mem_for_frags);
155 for (j = 0; j < 33; j++)
156 {
157 HTONSI(cd->ad->blk_start[which][j]);
158 HTONSI(cd->ad->blk_end[which][j]);
159 }
160 }
161 /* cdh */
162 HTONUL(cd->cdh.dict_type);
163 HTONUL(cd->cdh.novel_method);
164 for (i = 0; i < TEXT_PARAMS; i++)
165 HTONUL(cd->cdh.params[which]);
166 HTONUL(cd->cdh.num_words[0]);
167 HTONUL(cd->cdh.num_words[1]);
168 HTONUL(cd->cdh.num_word_chars[0]);
169 HTONUL(cd->cdh.num_word_chars[1]);
170 HTONUL(cd->cdh.lookback);
171 HTONSI(cd->fast_loaded);
172 }
173
174 unfixup_buffer ();
175
176 save_fast_dict (filename);
177
178 return (0);
179}
180
181
182
183static void
184unfixup_buffer ()
185{
186 u_long *p;
187 for (p = (u_long *) buffer; (u_long) p < (u_long) cur; p++)
188 {
189 if (IS_FIXUP (p))
190 *p = *p - (u_long) buffer;
191 }
192}
193
194
195
196
197static u_long
198mem_for_aux_dict (compression_dict_header * /*cdh*/, char *filename)
199{
200 int i;
201 u_long mem = sizeof (auxiliary_dict);
202 FILE *aux;
203
204 aux = open_file (filename, TEXT_DICT_AUX_SUFFIX, "rb",
205 MAGIC_AUX_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
206
207 for (i = 0; i <= 1; i++)
208 {
209 aux_frags_header afh;
210 fread (&afh, sizeof (afh), 1, aux);
211 NTOHUL(afh.num_frags); /* [RPAP - Jan 97: Endian Ordering] */
212 NTOHUL(afh.mem_for_frags); /* [RPAP - Jan 97: Endian Ordering] */
213 mem += afh.num_frags * sizeof (u_char *);
214 mem = ALIGN_SIZE (mem + afh.mem_for_frags, u_char *);
215 fseek (aux, afh.mem_for_frags, SEEK_CUR);
216 }
217 fclose (aux);
218
219 return mem;
220}
221
222
223
224static u_long
225mem_for_words (FILE * dict, compression_dict_header * cdh,
226 comp_frags_header * cfh)
227{
228 u_long mem = 0;
229 long i, lookback;
230 int ptrs_reqd = 0;
231 int mem_reqd = 0;
232
233 lookback = cdh->lookback;
234
235 for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++)
236 {
237 ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
238 mem_reqd += cfh->huff_words_size[i];
239 }
240
241 mem += ptrs_reqd * sizeof (u_char *);
242 mem += (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **);
243 mem += mem_reqd;
244
245 for (i = 0; i < cfh->hd.num_codes; i++)
246 {
247 register int val;
248 for (val = getc (dict) & 0xf; val; val--)
249 getc (dict);
250 }
251 return ALIGN_SIZE (mem, u_char *);
252}
253
254static u_long mem_skip_hd(FILE *dict, u_long mem)
255{ huff_data hd;
256
257 mem += sizeof (hd);
258 Read_Huffman_Data (dict, &hd, NULL, NULL);
259 if (hd.clens)
260 delete hd.clens;
261 mem += hd.num_codes * sizeof (unsigned long);
262 mem += (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *);
263 return mem;
264}
265
266static u_long mem_read_cfh(FILE *dict, compression_dict_header *cdh, comp_frags_header *cfh,
267 u_long mem)
268{
269 mem += sizeof (comp_frags_header);
270 Read_cfh (dict, cfh, NULL, NULL);
271
272 /* Don't need to count the space for the clens of the huffman data */
273
274 mem += mem_for_words (dict, cdh, cfh);
275 if (cfh->hd.clens)
276 delete cfh->hd.clens;
277
278 return mem;
279}
280
281
282static u_long
283mem_for_comp_dict (char *filename)
284{
285 u_long mem = sizeof (compression_dict);
286 compression_dict_header cdh;
287 comp_frags_header cfh;
288 FILE *dict;
289 int which;
290 int i; /* [RPAP - Jan 97: Endian Ordering] */
291
292 dict = open_file (filename, TEXT_DICT_SUFFIX, "rb",
293 MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
294
295 fread (&cdh, sizeof (cdh), 1, dict);
296 /* [RPAP - Jan 97: Endian Ordering] */
297 NTOHUL(cdh.dict_type);
298 NTOHUL(cdh.novel_method);
299 for (i = 0; i < TEXT_PARAMS; i++)
300 NTOHUL(cdh.params[i]);
301 NTOHUL(cdh.num_words[0]);
302 NTOHUL(cdh.num_words[1]);
303 NTOHUL(cdh.num_word_chars[0]);
304 NTOHUL(cdh.num_word_chars[1]);
305 NTOHUL(cdh.lookback);
306
307 for (which = 0; which < 2; which++)
308 switch (cdh.dict_type)
309 {
310 case MG_COMPLETE_DICTIONARY:
311 {
312 mem = mem_read_cfh(dict, &cdh, &cfh, mem);
313 }
314 break;
315 case MG_PARTIAL_DICTIONARY:
316 {
317 // huff_data hd;
318 if (cdh.num_words[which])
319 {
320 mem = mem_read_cfh(dict, &cdh, &cfh, mem);
321 }
322
323 mem = mem_skip_hd(dict, mem);
324 mem = mem_skip_hd(dict, mem);
325 }
326 break;
327 case MG_SEED_DICTIONARY:
328 {
329 // huff_data hd;
330 if (cdh.num_words[which])
331 {
332 mem = mem_read_cfh(dict, &cdh, &cfh, mem);
333 }
334 switch (cdh.novel_method)
335 {
336 case MG_NOVEL_HUFFMAN_CHARS:
337 mem = mem_skip_hd(dict, mem);
338 mem = mem_skip_hd(dict, mem);
339 break;
340 case MG_NOVEL_DELTA:
341 break;
342 case MG_NOVEL_HYBRID:
343 break;
344 }
345 break;
346 }
347 }
348 fclose (dict);
349
350 if (cdh.novel_method == MG_NOVEL_DELTA ||
351 cdh.novel_method == MG_NOVEL_HYBRID)
352 mem += mem_for_aux_dict (&cdh, filename);
353
354 return ALIGN_SIZE (mem, u_char *);
355}
356
357
358
359void *
360getmem (u_long size, int align)
361{
362 void *res;
363 cur = (void *) (((u_long) cur + (align - 1)) & (~(align - 1)));
364 res = cur;
365 cur = (char *) cur + size;
366 if ((u_long) cur > (u_long)buffer + mem)
367 FatalError (1, "The buffer was not big enough for the dictionary");
368 return res;
369}
370
371
372
373
374
375
376
377
378
379
380
381static auxiliary_dict *
382LoadAuxDict (compression_dict_header * cdh,
383 char *filename)
384{
385 auxiliary_dict *ad;
386 int i;
387 FILE *aux;
388
389 aux = open_file (filename, TEXT_DICT_AUX_SUFFIX, "rb",
390 MAGIC_AUX_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
391
392 ad = (auxiliary_dict *) getmem (sizeof (auxiliary_dict), sizeof (u_char *));
393
394 for (i = 0; i <= 1; i++)
395 {
396 unsigned int j;
397 u_char *pos;
398
399 fread (&ad->afh[i], sizeof (aux_frags_header), 1, aux);
400
401 /* [RPAP - Jan 97: Endian Ordering] */
402 NTOHUL(ad->afh[i].num_frags);
403 NTOHUL(ad->afh[i].mem_for_frags);
404
405 ad->word_data[i] = (u_char *) getmem (ad->afh[i].mem_for_frags, 1);
406 FIXUP (&ad->word_data[i]);
407
408 ad->words[i] = (u_char **) getmem (ad->afh[i].num_frags * sizeof (u_char *),
409 sizeof (u_char *));
410 FIXUP (&ad->words[i]);
411
412 fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char), aux);
413
414 pos = ad->word_data[i];
415 for (j = 0; j < ad->afh[i].num_frags; j++)
416 {
417 ad->words[i][j] = pos;
418 FIXUP (&ad->words[i][j]);
419 pos += *pos + 1;
420 }
421 if (cdh->novel_method == MG_NOVEL_HYBRID)
422 {
423 int num;
424 num = 1;
425 ad->blk_start[i][0] = 0;
426 ad->blk_end[i][0] = cdh->num_words[i] - 1;
427 while (num < 33)
428 {
429 ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1;
430 ad->blk_end[i][num] = ad->blk_start[i][num] +
431 (ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2;
432 num++;
433 }
434 }
435 }
436 fclose (aux);
437 return (ad);
438}
439
440
441
442
443
444static u_char ***
445ReadInWords (FILE * dict, compression_dict * cd,
446 comp_frags_header * cfh, u_char ** escape)
447{
448 int i, lookback;
449 int ptrs_reqd = 0;
450 int mem_reqd = 0;
451 int num_set[MAX_HUFFCODE_LEN + 1];
452 u_char *next_word[MAX_HUFFCODE_LEN + 1];
453 u_char **vals;
454 u_char ***values;
455 u_char word[MAXWORDLEN + 1];
456 u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1];
457
458 lookback = cd->cdh.lookback;
459
460 for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++)
461 {
462 ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
463 mem_reqd += cfh->huff_words_size[i];
464 }
465
466 vals = (u_char **) getmem (ptrs_reqd * sizeof (*vals), sizeof (u_char *));
467
468 values = (u_char ***) getmem ((MAX_HUFFCODE_LEN + 1) * sizeof (u_char **), sizeof (u_char **));
469
470 next_word[0] = (u_char *) getmem (mem_reqd, sizeof (char));
471
472 cd->MemForCompDict += ptrs_reqd * sizeof (*vals) +
473 (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) +
474 mem_reqd;
475
476 values[0] = vals;
477 FIXUP (&values[0]);
478 values[0][0] = next_word[0];
479 FIXUP (&values[0][0]);
480 for (i = 1; i <= cfh->hd.maxcodelen; i++)
481 {
482 int next_start = (values[i - 1] - vals) +
483 ((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback);
484 values[i] = &vals[next_start];
485 FIXUP (&values[i]);
486 next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1];
487 values[i][0] = next_word[i];
488 FIXUP (&values[i][0]);
489 }
490
491 memset (num_set, '\0', sizeof (num_set));
492
493 for (i = 0; i < cfh->hd.num_codes; i++)
494 {
495 register int val, copy;
496 register int len = cfh->hd.clens[i];
497 val = getc (dict);
498 copy = (val >> 4) & 0xf;
499 val &= 0xf;
500
501 fread (word + copy + 1, sizeof (u_char), val, dict);
502 *word = val + copy;
503
504 if ((num_set[len] & ((1 << lookback) - 1)) == 0)
505 {
506 values[len][num_set[len] >> lookback] = next_word[len];
507 FIXUP (&values[len][num_set[len] >> lookback]);
508 memcpy (next_word[len], word, *word + 1);
509 if (escape && i == cfh->hd.num_codes - 1)
510 {
511 *escape = next_word[len];
512 FIXUP (escape);
513 }
514 next_word[len] += *word + 1;
515 }
516 else
517 {
518 copy = prefixlen (last_word[len], word);
519 memcpy (next_word[len] + 1, word + copy + 1, *word - copy);
520 *next_word[len] = (copy << 4) + (*word - copy);
521 if (escape && i == cfh->hd.num_codes - 1)
522 {
523 *escape = next_word[len];
524 FIXUP (escape);
525 }
526 next_word[len] += (*word - copy) + 1;
527 }
528 memcpy (last_word[len], word, *word + 1);
529 num_set[len]++;
530 }
531 if (cfh->hd.clens)
532 delete cfh->hd.clens;
533 cfh->hd.clens = NULL;
534 return values;
535}
536
537
538static unsigned long **
539Generate_Fast_Huffman_Vals (huff_data * data, u_long * mem)
540{
541 int i;
542 unsigned long *fcode[MAX_HUFFCODE_LEN + 1];
543 unsigned long **values;
544 unsigned long *vals;
545
546 if (!data)
547 return (NULL);
548 vals = (unsigned long *) getmem (data->num_codes * sizeof (*vals), sizeof (long *));
549 values = (unsigned long **) getmem ((MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *),
550 sizeof (long *));
551
552 memset (values, '\0', (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *));
553
554 if (mem)
555 *mem += data->num_codes * sizeof (*vals) +
556 (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *);
557
558 fcode[0] = values[0] = &vals[0];
559 FIXUP (&values[0]);
560 for (i = 1; i <= data->maxcodelen; i++)
561 {
562 fcode[i] = values[i] = &vals[(values[i - 1] - vals) + data->lencount[i - 1]];
563 FIXUP (&values[i]);
564 }
565
566 for (i = 0; i < data->num_codes; i++)
567 if (data->clens[i])
568 *fcode[(int) (data->clens[i])]++ = i;
569 return (values);
570}
571
572
573static void load_read_huffman(FILE *dict, compression_dict *cd, int which)
574{
575 huff_data *hd;
576 u_long **vals;
577
578 hd = (huff_data *) getmem (sizeof (huff_data), sizeof (char *));
579 cd->MemForCompDict += sizeof (huff_data);
580 Read_Huffman_Data (dict, hd, &cd->MemForCompDict, NULL);
581 vals = Generate_Fast_Huffman_Vals (hd, &cd->MemForCompDict);
582 cd->chars_huff[which] = hd;
583 FIXUP (&cd->chars_huff[which]);
584 cd->chars_vals[which] = vals;
585 FIXUP (&cd->chars_vals[which]);
586 if (hd->clens)
587 delete hd->clens;
588 hd->clens = NULL;
589}
590
591static void load_read_words(FILE *dict, compression_dict *cd, int which, int fixescape)
592{
593 cd->cfh[which] = (comp_frags_header *) getmem (sizeof (*cd->cfh[which]), sizeof (u_char *));
594 cd->MemForCompDict += sizeof (*cd->cfh[which]);
595 Read_cfh (dict, cd->cfh[which], &cd->MemForCompDict, NULL);
596
597 cd->values[which] = ReadInWords (dict, cd, cd->cfh[which], NULL);
598 FIXUP (&cd->cfh[which]);
599 FIXUP (&cd->values[which]);
600 if (fixescape)
601 { FIXUP(&cd->escape[which]);
602 }
603 else
604 {
605 cd->escape[which] = NULL;
606 }
607}
608
609static void
610load_comp_dict (char *filename)
611{
612 FILE *dict;
613 int which;
614 compression_dict *cd;
615
616 cd = (compression_dict *) getmem (sizeof (compression_dict), sizeof (u_char *));
617 cd->fast_loaded = 1;
618
619 dict = open_file (filename, TEXT_DICT_SUFFIX, "rb",
620 MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
621
622 Read_cdh (dict, &cd->cdh, NULL, NULL);
623
624 for (which = 0; which < 2; which++)
625 switch (cd->cdh.dict_type)
626 {
627 case MG_COMPLETE_DICTIONARY:
628 {
629 load_read_words(dict, cd, which, 0);
630 }
631 break;
632 case MG_PARTIAL_DICTIONARY:
633 {
634 if (cd->cdh.num_words[which])
635 {
636 load_read_words(dict, cd, which, 1);
637 }
638
639 load_read_huffman(dict, cd, which);
640 load_read_huffman(dict, cd, which);
641 }
642 break;
643 case MG_SEED_DICTIONARY:
644 {
645 if (cd->cdh.num_words[which])
646 {
647 load_read_words(dict, cd, which, 1);
648 }
649 switch (cd->cdh.novel_method)
650 {
651 case MG_NOVEL_HUFFMAN_CHARS:
652 load_read_huffman(dict, cd, which);
653 load_read_huffman(dict, cd, which);
654 break;
655 case MG_NOVEL_DELTA:
656 break;
657 case MG_NOVEL_HYBRID:
658 break;
659 }
660 break;
661 }
662 }
663 fclose (dict);
664
665
666 if (cd->cdh.novel_method == MG_NOVEL_DELTA ||
667 cd->cdh.novel_method == MG_NOVEL_HYBRID)
668 {
669 cd->ad = LoadAuxDict (&cd->cdh, filename);
670 FIXUP (&cd->ad);
671 }
672}
673
674static void
675save_fast_dict (char *filename)
676{
677 FILE *fdict;
678
679 fdict = create_file (filename, TEXT_DICT_FAST_SUFFIX, "wb",
680 MAGIC_FAST_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
681
682 {
683 u_long *p;
684 for (p = (u_long *) buffer; (u_long) p < (u_long) cur; p++)
685 {
686 if (IS_FIXUP (p))
687 HTONUL(*p);
688 }
689 }
690
691 /* [RPAP - Jan 97: Endian Ordering] */
692 HTONUL(mem);
693 HTONUL(fixup_mem);
694
695 fwrite (&mem, sizeof (mem), 1, fdict);
696 fwrite (&fixup_mem, sizeof (fixup_mem), 1, fdict);
697
698 /* [RPAP - Jan 97: Endian Ordering] */
699 NTOHUL(mem);
700 NTOHUL(fixup_mem);
701
702 fwrite (buffer, sizeof (u_char), mem, fdict);
703 fwrite (fixup, sizeof (u_char), fixup_mem, fdict);
704
705 fclose (fdict);
706}
Note: See TracBrowser for help on using the repository browser.