source: trunk/gsdl/src/mgpp/text/mgpp_fast_comp_dict.cpp@ 2557

Last change on this file since 2557 was 2557, checked in by sjboddie, 23 years ago

getopt.h was renamed getopt_old.h

  • Property svn:keywords set to Author Date Id Revision
File size: 16.9 KB
/**************************************************************************
 *
 * mgpp_fast_comp_dict.cpp -- Program to generate a fast compression dictionary
 * Copyright (C) 1994  Neil Sharman
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#define _XOPEN_SOURCE 1
#define _XOPEN_SOURCE_EXTENDED 1

// need this to avoid bizarre compiler problems under VC++ 6.0
#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
# include <iostream>
#endif

#if defined (__WIN32__)
# include "getopt_old.h"
#else
# include <unistd.h>
#endif

#include "sysfuncs.h"
#include "huffman.h"
#include "messages.h"
#include "memlib.h"
#include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */

#include "local_strings.h"
#include "mg.h"
#include "text.h"
#include "invf.h"
#include "mg_files.h"
#include "locallib.h"
#include "words.h"



#define ALIGN_SIZE(p,t) (((p) + (sizeof(t)-1)) & (~(sizeof(t)-1)))

#define WORDNO(p, base) ((((char*)(p))-((char*)(base)))/sizeof(u_char*))
#define FIXUP(p) (fixup[WORDNO(p,buffer)/8] |= (1<<(WORDNO(p, buffer) & 7)))

#define IS_FIXUP(p) ((fixup[WORDNO(p,buffer)/8] & (1<<(WORDNO(p, buffer) & 7))) != 0)

#define FIXUP_VALS(vals) do { \
    int i; \
    for (i=0; i < MAX_HUFFCODE_LEN+1; i++) \
      FIXUP(&vals[i]); \
  } while(0)
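
/*
 * The fast dictionary is built as one contiguous image in `buffer'.  Every
 * pointer-sized word of that image which actually holds a pointer is marked
 * in the `fixup' bit vector: WORDNO gives the word's index within the
 * buffer, and FIXUP sets bit (index & 7) of fixup[index / 8].  For example,
 * a pointer stored at word index 20 sets bit 4 of fixup[2].  Before the
 * image is written out, unfixup_buffer() turns each marked word from an
 * absolute address into an offset from `buffer', so the code that later
 * maps the fast dictionary back in can (presumably) relocate the pointers
 * by adding its own base address.
 */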



static u_long mem_for_comp_dict (char *filename);
static void load_comp_dict (char *filename);
static void save_fast_dict (char *filename);
static void unfixup_buffer (void);


static void *buffer;
static void *cur;
static u_char *fixup;
static u_long mem, fixup_mem;

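/*
 * Overall flow (see main below): a first pass over the text dictionary
 * estimates how much memory the fast dictionary image needs
 * (mem_for_comp_dict), a single buffer of that size is allocated and filled
 * by a second pass (load_comp_dict), the header fields are converted to
 * network byte order, the pointers are turned into offsets
 * (unfixup_buffer), and the image plus its fixup bitmap are written out
 * (save_fast_dict).  getmem() below hands out pieces of the buffer, with
 * `cur' acting as the allocation cursor.
 */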
int main (int argc, char **argv)
{
  int ch;
  char *filename = "";
  opterr = 0;
  msg_prefix = argv[0];
  while ((ch = getopt (argc, argv, "f:d:h")) != -1)
    switch (ch)
      {
      case 'f':  /* input file */
        filename = optarg;
        break;
      case 'd':
        set_basepath (optarg);
        break;
      case 'h':
      case '?':
        fprintf (stderr, "usage: %s [-f input_file] [-d data directory] [-h]\n",
                 argv[0]);
        exit (1);
      }

  mem = mem_for_comp_dict (filename);

  fixup_mem = (ALIGN_SIZE (mem, u_char *) / sizeof (u_char *) + 7) / 8;

  cur = buffer = Xmalloc (mem);
  memset (buffer, 0, mem);
  fixup = (u_char *) Xmalloc (fixup_mem);
  memset (fixup, 0, fixup_mem);

#ifndef SILENT
  Message ("Estimated memory for fast_dict %u", mem);
  Message ("Estimated memory for fixups %u", fixup_mem);
#endif

  load_comp_dict (filename);

#ifndef SILENT
  Message ("Actual memory for fast_dict %u", (char *) cur - (char *) buffer);
#endif

  if ((u_long) cur > (u_long) buffer + mem)
    FatalError (1, "The buffer was not big enough for the dictionary");

  {
    /* [RPAP - Jan 97: Endian Ordering] */
    compression_dict *cd = (compression_dict *) buffer;
    int i, which;

    /* cfh */
    for (which = 0; which <= 1; which++)
      {
        int j;

        HTONSI(cd->cfh[which]->hd.num_codes);
        HTONSI(cd->cfh[which]->hd.mincodelen);
        HTONSI(cd->cfh[which]->hd.maxcodelen);
        for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
          {
            HTONSI(cd->cfh[which]->hd.lencount[j]);
            HTONUL(cd->cfh[which]->hd.min_code[j]);
          }
        HTONUL(cd->cfh[which]->uncompressed_size);
        for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
          HTONUL(cd->cfh[which]->huff_words_size[j]);
      }
    HTONUL(cd->MemForCompDict);
    /* ad */
    if (cd->cdh.novel_method == MG_NOVEL_DELTA ||
        cd->cdh.novel_method == MG_NOVEL_HYBRID)
      for (which = 0; which <= 1; which++)
        {
          int j;

          HTONUL(cd->ad->afh[which].num_frags);
          HTONUL(cd->ad->afh[which].mem_for_frags);
          for (j = 0; j < 33; j++)
            {
              HTONSI(cd->ad->blk_start[which][j]);
              HTONSI(cd->ad->blk_end[which][j]);
            }
        }
    /* cdh */
    HTONUL(cd->cdh.dict_type);
    HTONUL(cd->cdh.novel_method);
    for (i = 0; i < TEXT_PARAMS; i++)
      HTONUL(cd->cdh.params[i]);
    HTONUL(cd->cdh.num_words[0]);
    HTONUL(cd->cdh.num_words[1]);
    HTONUL(cd->cdh.num_word_chars[0]);
    HTONUL(cd->cdh.num_word_chars[1]);
    HTONUL(cd->cdh.lookback);
    HTONSI(cd->fast_loaded);
  }

  unfixup_buffer ();

  save_fast_dict (filename);

  return (0);
}



static void
unfixup_buffer ()
{
  u_long *p;
  for (p = (u_long *) buffer; (u_long) p < (u_long) cur; p++)
    {
      if (IS_FIXUP (p))
        *p = *p - (u_long) buffer;
    }
}



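/*
 * First-pass size estimate for the auxiliary (novel word) dictionary:
 * reads the two aux_frags_headers from the auxiliary dictionary file
 * (TEXT_DICT_AUX_SUFFIX) and adds up the space needed for the fragment
 * pointer arrays and the fragment text itself.
 */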
static u_long
mem_for_aux_dict (compression_dict_header * /*cdh*/, char *filename)
{
  int i;
  u_long mem = sizeof (auxiliary_dict);
  FILE *aux;

  aux = open_file (filename, TEXT_DICT_AUX_SUFFIX, "rb",
                   MAGIC_AUX_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  for (i = 0; i <= 1; i++)
    {
      aux_frags_header afh;
      fread (&afh, sizeof (afh), 1, aux);
      NTOHUL(afh.num_frags);       /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL(afh.mem_for_frags);   /* [RPAP - Jan 97: Endian Ordering] */
      mem += afh.num_frags * sizeof (u_char *);
      mem = ALIGN_SIZE (mem + afh.mem_for_frags, u_char *);
      fseek (aux, afh.mem_for_frags, SEEK_CUR);
    }
  fclose (aux);

  return mem;
}


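/*
 * Computes the memory that ReadInWords() will later need for one
 * comp_frags_header: the word pointers (one per (1 << lookback) words of
 * each code length), the table of per-code-length pointer arrays, and the
 * front-coded word text.  It also skips over the word entries in the
 * dictionary file so the caller is left positioned at the next section.
 */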
static u_long
mem_for_words (FILE * dict, compression_dict_header * cdh,
               comp_frags_header * cfh)
{
  u_long mem = 0;
  long i, lookback;
  int ptrs_reqd = 0;
  int mem_reqd = 0;

  lookback = cdh->lookback;

  for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++)
    {
      ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
      mem_reqd += cfh->huff_words_size[i];
    }

  mem += ptrs_reqd * sizeof (u_char *);
  mem += (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **);
  mem += mem_reqd;

  for (i = 0; i < cfh->hd.num_codes; i++)
    {
      register int val;
      for (val = getc (dict) & 0xf; val; val--)
        getc (dict);
    }
  return ALIGN_SIZE (mem, u_char *);
}

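/*
 * Helpers for the size-estimating pass: mem_skip_hd() reads and discards a
 * huff_data block while adding the space its fast-lookup tables will take,
 * and mem_read_cfh() does the same for a comp_frags_header plus its words.
 */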
static u_long mem_skip_hd(FILE *dict, u_long mem)
{
  huff_data hd;

  mem += sizeof (hd);
  Read_Huffman_Data (dict, &hd, NULL, NULL);
  if (hd.clens)
    delete hd.clens;
  mem += hd.num_codes * sizeof (unsigned long);
  mem += (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *);
  return mem;
}

static u_long mem_read_cfh(FILE *dict, compression_dict_header *cdh, comp_frags_header *cfh,
                           u_long mem)
{
  mem += sizeof (comp_frags_header);
  Read_cfh (dict, cfh, NULL, NULL);

  /* Don't need to count the space for the clens of the huffman data */

  mem += mem_for_words (dict, cdh, cfh);
  if (cfh->hd.clens)
    delete cfh->hd.clens;

  return mem;
}


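/*
 * First pass over the text dictionary file: reads the
 * compression_dict_header and then walks the same complete / partial / seed
 * sections that load_comp_dict() will build, accumulating the total number
 * of bytes the fast-dictionary image will occupy.
 */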
static u_long
mem_for_comp_dict (char *filename)
{
  u_long mem = sizeof (compression_dict);
  compression_dict_header cdh;
  comp_frags_header cfh;
  FILE *dict;
  int which;
  int i;  /* [RPAP - Jan 97: Endian Ordering] */

  dict = open_file (filename, TEXT_DICT_SUFFIX, "rb",
                    MAGIC_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  fread (&cdh, sizeof (cdh), 1, dict);
  /* [RPAP - Jan 97: Endian Ordering] */
  NTOHUL(cdh.dict_type);
  NTOHUL(cdh.novel_method);
  for (i = 0; i < TEXT_PARAMS; i++)
    NTOHUL(cdh.params[i]);
  NTOHUL(cdh.num_words[0]);
  NTOHUL(cdh.num_words[1]);
  NTOHUL(cdh.num_word_chars[0]);
  NTOHUL(cdh.num_word_chars[1]);
  NTOHUL(cdh.lookback);

  for (which = 0; which < 2; which++)
    switch (cdh.dict_type)
      {
      case MG_COMPLETE_DICTIONARY:
        {
          mem = mem_read_cfh(dict, &cdh, &cfh, mem);
        }
        break;
      case MG_PARTIAL_DICTIONARY:
        {
          // huff_data hd;
          if (cdh.num_words[which])
            {
              mem = mem_read_cfh(dict, &cdh, &cfh, mem);
            }

          mem = mem_skip_hd(dict, mem);
          mem = mem_skip_hd(dict, mem);
        }
        break;
      case MG_SEED_DICTIONARY:
        {
          // huff_data hd;
          if (cdh.num_words[which])
            {
              mem = mem_read_cfh(dict, &cdh, &cfh, mem);
            }
          switch (cdh.novel_method)
            {
            case MG_NOVEL_HUFFMAN_CHARS:
              mem = mem_skip_hd(dict, mem);
              mem = mem_skip_hd(dict, mem);
              break;
            case MG_NOVEL_DELTA:
              break;
            case MG_NOVEL_HYBRID:
              break;
            }
          break;
        }
      }
  fclose (dict);

  if (cdh.novel_method == MG_NOVEL_DELTA ||
      cdh.novel_method == MG_NOVEL_HYBRID)
    mem += mem_for_aux_dict (&cdh, filename);

  return ALIGN_SIZE (mem, u_char *);
}



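/*
 * Bump allocator over the single preallocated image: rounds `cur' up to the
 * requested alignment, returns the current position, and advances `cur' by
 * `size'.  For example, getmem (sizeof (huff_data), sizeof (char *)) hands
 * back a pointer-aligned slot inside `buffer' for a huff_data structure.
 */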
void *
getmem (u_long size, int align)
{
  void *res;
  cur = (void *) (((u_long) cur + (align - 1)) & (~(align - 1)));
  res = cur;
  cur = (char *) cur + size;
  if ((u_long) cur > (u_long) buffer + mem)
    FatalError (1, "The buffer was not big enough for the dictionary");
  return res;
}



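/*
 * Loads the auxiliary dictionary (novel-word fragments) into the image: for
 * each of the two lexicons it reads the fragment text, builds the
 * per-fragment pointer array (each fragment is a length byte followed by
 * its characters), and for MG_NOVEL_HYBRID fills in the blk_start/blk_end
 * tables of geometrically growing blocks.
 */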
static auxiliary_dict *
LoadAuxDict (compression_dict_header * cdh,
             char *filename)
{
  auxiliary_dict *ad;
  int i;
  FILE *aux;

  aux = open_file (filename, TEXT_DICT_AUX_SUFFIX, "rb",
                   MAGIC_AUX_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  ad = (auxiliary_dict *) getmem (sizeof (auxiliary_dict), sizeof (u_char *));

  for (i = 0; i <= 1; i++)
    {
      unsigned int j;
      u_char *pos;

      fread (&ad->afh[i], sizeof (aux_frags_header), 1, aux);

      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL(ad->afh[i].num_frags);
      NTOHUL(ad->afh[i].mem_for_frags);

      ad->word_data[i] = (u_char *) getmem (ad->afh[i].mem_for_frags, 1);
      FIXUP (&ad->word_data[i]);

      ad->words[i] = (u_char **) getmem (ad->afh[i].num_frags * sizeof (u_char *),
                                         sizeof (u_char *));
      FIXUP (&ad->words[i]);

      fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char), aux);

      pos = ad->word_data[i];
      for (j = 0; j < ad->afh[i].num_frags; j++)
        {
          ad->words[i][j] = pos;
          FIXUP (&ad->words[i][j]);
          pos += *pos + 1;
        }
      if (cdh->novel_method == MG_NOVEL_HYBRID)
        {
          int num;
          num = 1;
          ad->blk_start[i][0] = 0;
          ad->blk_end[i][0] = cdh->num_words[i] - 1;
          while (num < 33)
            {
              ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1;
              ad->blk_end[i][num] = ad->blk_start[i][num] +
                (ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2;
              num++;
            }
        }
    }
  fclose (aux);
  return (ad);
}



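/*
 * Reads the front-coded word list for one comp_frags_header into the image.
 * Each entry in the dictionary file is a byte whose high nibble is the
 * number of characters shared with the previous word and whose low nibble
 * is the length of the new suffix, followed by the suffix characters.
 * Every (1 << lookback)-th word of a given code length is stored fully
 * expanded and gets a pointer in values[len]; the words in between stay
 * front-coded.  If `escape' is non-NULL it is left pointing at the last
 * word in the list.
 */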
static u_char ***
ReadInWords (FILE * dict, compression_dict * cd,
             comp_frags_header * cfh, u_char ** escape)
{
  int i, lookback;
  int ptrs_reqd = 0;
  int mem_reqd = 0;
  int num_set[MAX_HUFFCODE_LEN + 1];
  u_char *next_word[MAX_HUFFCODE_LEN + 1];
  u_char **vals;
  u_char ***values;
  u_char word[MAXWORDLEN + 1];
  u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1];

  lookback = cd->cdh.lookback;

  for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++)
    {
      ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
      mem_reqd += cfh->huff_words_size[i];
    }

  vals = (u_char **) getmem (ptrs_reqd * sizeof (*vals), sizeof (u_char *));

  values = (u_char ***) getmem ((MAX_HUFFCODE_LEN + 1) * sizeof (u_char **), sizeof (u_char **));

  next_word[0] = (u_char *) getmem (mem_reqd, sizeof (char));

  cd->MemForCompDict += ptrs_reqd * sizeof (*vals) +
    (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) +
    mem_reqd;

  values[0] = vals;
  FIXUP (&values[0]);
  values[0][0] = next_word[0];
  FIXUP (&values[0][0]);
  for (i = 1; i <= cfh->hd.maxcodelen; i++)
    {
      int next_start = (values[i - 1] - vals) +
        ((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback);
      values[i] = &vals[next_start];
      FIXUP (&values[i]);
      next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1];
      values[i][0] = next_word[i];
      FIXUP (&values[i][0]);
    }

  bzero ((char *) num_set, sizeof (num_set));

  for (i = 0; i < cfh->hd.num_codes; i++)
    {
      register int val, copy;
      register int len = cfh->hd.clens[i];
      val = getc (dict);
      copy = (val >> 4) & 0xf;
      val &= 0xf;

      fread (word + copy + 1, sizeof (u_char), val, dict);
      *word = val + copy;

      if ((num_set[len] & ((1 << lookback) - 1)) == 0)
        {
          values[len][num_set[len] >> lookback] = next_word[len];
          FIXUP (&values[len][num_set[len] >> lookback]);
          memcpy (next_word[len], word, *word + 1);
          if (escape && i == cfh->hd.num_codes - 1)
            {
              *escape = next_word[len];
              FIXUP (escape);
            }
          next_word[len] += *word + 1;
        }
      else
        {
          copy = prefixlen (last_word[len], word);
          memcpy (next_word[len] + 1, word + copy + 1, *word - copy);
          *next_word[len] = (copy << 4) + (*word - copy);
          if (escape && i == cfh->hd.num_codes - 1)
            {
              *escape = next_word[len];
              FIXUP (escape);
            }
          next_word[len] += (*word - copy) + 1;
        }
      memcpy (last_word[len], word, *word + 1);
      num_set[len]++;
    }
  if (cfh->hd.clens)
    delete cfh->hd.clens;
  cfh->hd.clens = NULL;
  return values;
}


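/*
 * Builds the fast decode tables for a character-level Huffman code: `vals'
 * lists the symbol numbers ordered by code length, and values[len] points
 * at the first symbol whose code is `len' bits long, so a decoder can look
 * a symbol up directly from its code length and rank.
 */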
static unsigned long **
Generate_Fast_Huffman_Vals (huff_data * data, u_long * mem)
{
  int i;
  unsigned long *fcode[MAX_HUFFCODE_LEN + 1];
  unsigned long **values;
  unsigned long *vals;

  if (!data)
    return (NULL);
  vals = (unsigned long *) getmem (data->num_codes * sizeof (*vals), sizeof (long *));
  values = (unsigned long **) getmem ((MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *),
                                      sizeof (long *));

  bzero ((char *) values, (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *));

  if (mem)
    *mem += data->num_codes * sizeof (*vals) +
      (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *);

  fcode[0] = values[0] = &vals[0];
  FIXUP (&values[0]);
  for (i = 1; i <= data->maxcodelen; i++)
    {
      fcode[i] = values[i] = &vals[(values[i - 1] - vals) + data->lencount[i - 1]];
      FIXUP (&values[i]);
    }

  for (i = 0; i < data->num_codes; i++)
    if (data->clens[i])
      *fcode[(int) (data->clens[i])]++ = i;
  return (values);
}


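/*
 * Reads one huff_data block (the character-level codes used for novel
 * words) into the image and attaches it, together with its fast decode
 * tables, to cd->chars_huff[which] and cd->chars_vals[which].
 */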
static void load_read_huffman(FILE *dict, compression_dict *cd, int which)
{
  huff_data *hd;
  u_long **vals;

  hd = (huff_data *) getmem (sizeof (huff_data), sizeof (char *));
  cd->MemForCompDict += sizeof (huff_data);
  Read_Huffman_Data (dict, hd, &cd->MemForCompDict, NULL);
  vals = Generate_Fast_Huffman_Vals (hd, &cd->MemForCompDict);
  cd->chars_huff[which] = hd;
  FIXUP (&cd->chars_huff[which]);
  cd->chars_vals[which] = vals;
  FIXUP (&cd->chars_vals[which]);
  if (hd->clens)
    delete hd->clens;
  hd->clens = NULL;
}

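/*
 * Reads one comp_frags_header and its word list into the image, recording
 * the resulting tables in cd->cfh[which] and cd->values[which].  The escape
 * slot is marked for fixup when fixescape is set and cleared otherwise.
 */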
static void load_read_words(FILE *dict, compression_dict *cd, int which, int fixescape)
{
  cd->cfh[which] = (comp_frags_header *) getmem (sizeof (*cd->cfh[which]), sizeof (u_char *));
  cd->MemForCompDict += sizeof (*cd->cfh[which]);
  Read_cfh (dict, cd->cfh[which], &cd->MemForCompDict, NULL);

  cd->values[which] = ReadInWords (dict, cd, cd->cfh[which],
                                   fixescape ? &cd->escape[which] : NULL);
  FIXUP (&cd->cfh[which]);
  FIXUP (&cd->values[which]);
  if (fixescape)
    {
      FIXUP (&cd->escape[which]);
    }
  else
    {
      cd->escape[which] = NULL;
    }
}

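/*
 * Second pass: reads the text dictionary again and builds the fast
 * dictionary image inside `buffer', following the same complete / partial /
 * seed structure that mem_for_comp_dict() sized.  cd->fast_loaded is set to
 * 1, presumably so the saved image identifies itself as a fast dictionary.
 */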
static void
load_comp_dict (char *filename)
{
  FILE *dict;
  int which;
  compression_dict *cd;

  cd = (compression_dict *) getmem (sizeof (compression_dict), sizeof (u_char *));
  cd->fast_loaded = 1;

  dict = open_file (filename, TEXT_DICT_SUFFIX, "rb",
                    MAGIC_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  Read_cdh (dict, &cd->cdh, NULL, NULL);

  for (which = 0; which < 2; which++)
    switch (cd->cdh.dict_type)
      {
      case MG_COMPLETE_DICTIONARY:
        {
          load_read_words(dict, cd, which, 0);
        }
        break;
      case MG_PARTIAL_DICTIONARY:
        {
          if (cd->cdh.num_words[which])
            {
              load_read_words(dict, cd, which, 1);
            }

          load_read_huffman(dict, cd, which);
          load_read_huffman(dict, cd, which);
        }
        break;
      case MG_SEED_DICTIONARY:
        {
          if (cd->cdh.num_words[which])
            {
              load_read_words(dict, cd, which, 1);
            }
          switch (cd->cdh.novel_method)
            {
            case MG_NOVEL_HUFFMAN_CHARS:
              load_read_huffman(dict, cd, which);
              load_read_huffman(dict, cd, which);
              break;
            case MG_NOVEL_DELTA:
              break;
            case MG_NOVEL_HYBRID:
              break;
            }
          break;
        }
      }
  fclose (dict);


  if (cd->cdh.novel_method == MG_NOVEL_DELTA ||
      cd->cdh.novel_method == MG_NOVEL_HYBRID)
    {
      cd->ad = LoadAuxDict (&cd->cdh, filename);
      FIXUP (&cd->ad);
    }
}

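/*
 * Writes the fast dictionary file: every word marked in the fixup bitmap is
 * converted to network byte order, then the image size, the fixup bitmap
 * size, the image itself and the bitmap are written out in that order.
 */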
static void
save_fast_dict (char *filename)
{
  FILE *fdict;

  fdict = create_file (filename, TEXT_DICT_FAST_SUFFIX, "wb",
                       MAGIC_FAST_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  {
    u_long *p;
    for (p = (u_long *) buffer; (u_long) p < (u_long) cur; p++)
      {
        if (IS_FIXUP (p))
          HTONUL(*p);
      }
  }

  /* [RPAP - Jan 97: Endian Ordering] */
  HTONUL(mem);
  HTONUL(fixup_mem);

  fwrite (&mem, sizeof (mem), 1, fdict);
  fwrite (&fixup_mem, sizeof (fixup_mem), 1, fdict);

  /* [RPAP - Jan 97: Endian Ordering] */
  NTOHUL(mem);
  NTOHUL(fixup_mem);

  fwrite (buffer, sizeof (u_char), mem, fdict);
  fwrite (fixup, sizeof (u_char), fixup_mem, fdict);

  fclose (fdict);
}