source: trunk/gsdl/src/mgpp/text/TextGet.cpp@ 2928

Last change on this file since 2928 was 2928, checked in by jrm21, 22 years ago

replaced bzero and bcopy with memset and memcpy in the src, even though it was
already done in the headers, just to make the code a bit clearer.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.8 KB
Line 
1/**************************************************************************
2 *
3 * TextGet.cpp -- Decompressing the text
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22// need this to avoid bizarre compiler problems under VC++ 6.0
23#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
24# include <iostream>
25#endif
26
27#include "TextGet.h"
28#include "mg_files.h"
29#include "netorder.h"
30#include "mg_errors.h"
31#include "locallib.h"
32#include "words.h"
33#include "local_strings.h"
34#include "bitio_m_stdio.h"
35
// Selects which pair of Huffman tables Load_Comp_HuffData fills in:
// the word-length tables ("lengths") or the character tables ("chars").
// The original wrote `typedef enum huff_type {...};' -- a typedef with
// no declarator, which many compilers warn about; a plain enum
// declaration is exactly equivalent here (the code only ever uses the
// tag name `huff_type').
enum huff_type {lengths, chars};
38
/* Load the auxiliary (novel-word) dictionary from text_aux_dict.
 *
 * Reads two fragment tables (one per parity of the compressed text
 * stream), building for each an array of pointers into a contiguous
 * block of length-prefixed word fragments.  For MG_NOVEL_HYBRID
 * dictionaries it also precomputes the 33 block start/end boundaries
 * used by the hybrid binary/gamma decoder in GetDocText.
 *
 * Returns the newly allocated auxiliary_dict, or NULL with mg_errno set
 * to MG_NOMEM on allocation failure.
 *
 * NOTE(review): on the allocation-failure paths below, member arrays
 * allocated in earlier iterations (word_data[i] / words[i]) appear to
 * leak when `delete ad' runs -- unless auxiliary_dict's destructor
 * frees them, which is not visible from this file; confirm before
 * changing.  fread() results are also unchecked, so a truncated file
 * yields garbage rather than an error. */
static auxiliary_dict *LoadAuxDict (compression_dict &cd, FILE *text_aux_dict) {
  auxiliary_dict *ad;
  int i;

  if (!(ad = new auxiliary_dict))
    {
      mg_errno = MG_NOMEM;
      return (NULL);
    }

  /* zero everything so partially-filled dictionaries are detectable */
  memset (ad, '\0', sizeof (*ad));

  for (i = 0; i <= 1; i++)
    {
      int j;
      u_char *pos;

      /* per-parity fragment header: count and total byte size */
      fread (&ad->afh[i], sizeof (aux_frags_header), 1, text_aux_dict);

      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL(ad->afh[i].num_frags);
      NTOHUL(ad->afh[i].mem_for_frags);

      if (!(ad->word_data[i] = new u_char[ad->afh[i].mem_for_frags]))
        {
          mg_errno = MG_NOMEM;
          delete ad;
          return (NULL);
        }
      if (!(ad->words[i] = new u_char* [ad->afh[i].num_frags]))
        {
          mg_errno = MG_NOMEM;
          delete ad;
          return (NULL);
        }

      /* the fragments themselves: a packed sequence of
         <length byte><length bytes of text> records */
      fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char),
             text_aux_dict);

      /* index each fragment: first byte of a record is its length,
         so the next record starts *pos + 1 bytes further on */
      pos = ad->word_data[i];
      for (j = 0; j < (int)ad->afh[i].num_frags; j++)
        {
          ad->words[i][j] = pos;
          pos += *pos + 1;
        }
      if (cd.cdh.novel_method == MG_NOVEL_HYBRID)
        {
          /* hybrid coding partitions the novel words into 33 blocks
             whose sizes grow geometrically (each block is twice the
             span of the previous one, plus one) */
          int num;
          num = 1;
          ad->blk_start[i][0] = 0;
          ad->blk_end[i][0] = cd.cdh.num_words[i] - 1;
          while (num < 33)
            {
              ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1;
              ad->blk_end[i][num] = ad->blk_start[i][num] +
                (ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2;
              num++;
            }
        }
    }
  return (ad);
}
101
102
103static u_char ***ReadInWords (FILE *dict, compression_dict &cd,
104 comp_frags_header *cfh, u_char **escape) {
105 int i, lookback;
106 int ptrs_reqd = 0;
107 int mem_reqd = 0;
108 int num_set[MAX_HUFFCODE_LEN + 1];
109 u_char *next_word[MAX_HUFFCODE_LEN + 1];
110 u_char **vals;
111 u_char ***values;
112 u_char word[MAXWORDLEN + 1];
113 u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1];
114
115 lookback = cd.cdh.lookback;
116
117 for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++) {
118 ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
119 mem_reqd += cfh->huff_words_size[i];
120 }
121
122 if (!(vals = new u_char* [ptrs_reqd]))
123 return (NULL);
124
125 if (!(values = new u_char** [MAX_HUFFCODE_LEN + 1]))
126 return (NULL);
127
128 if (!(next_word[0] = new u_char[mem_reqd]))
129 return (NULL);
130
131 cd.MemForCompDict += ptrs_reqd * sizeof (*vals) +
132 (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) +
133 mem_reqd;
134
135 values[0] = vals;
136 values[0][0] = next_word[0];
137 for (i = 1; i <= cfh->hd.maxcodelen; i++)
138 {
139 int next_start = (values[i - 1] - vals) +
140 ((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback);
141 values[i] = &vals[next_start];
142 next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1];
143 values[i][0] = next_word[i];
144 }
145
146 memset (num_set, '\0', sizeof (num_set));
147
148 for (i = 0; i < cfh->hd.num_codes; i++)
149 {
150 register int val, copy;
151 register int len = cfh->hd.clens[i];
152 val = getc (dict);
153 copy = (val >> 4) & 0xf;
154 val &= 0xf;
155
156 fread (word + copy + 1, sizeof (u_char), val, dict);
157 *word = val + copy;
158
159 if ((num_set[len] & ((1 << lookback) - 1)) == 0)
160 {
161 values[len][num_set[len] >> lookback] = next_word[len];
162 memcpy (next_word[len], word, *word + 1);
163 if (escape && i == cfh->hd.num_codes - 1)
164 *escape = next_word[len];
165 next_word[len] += *word + 1;
166 }
167 else
168 {
169 copy = prefixlen (last_word[len], word);
170 memcpy (next_word[len] + 1, word + copy + 1, *word - copy);
171 *next_word[len] = (copy << 4) + (*word - copy);
172 if (escape && i == cfh->hd.num_codes - 1)
173 *escape = next_word[len];
174 next_word[len] += (*word - copy) + 1;
175 }
176 memcpy (last_word[len], word, *word + 1);
177 num_set[len]++;
178 }
179 if (cfh->hd.clens)
180 delete cfh->hd.clens;
181 cfh->hd.clens = NULL;
182 return values;
183}
184
185static int Load_Comp_HuffData(compression_dict &cd, int which, FILE *dict,
186 huff_type type) {
187 huff_data * hd;
188 u_long ** vals;
189
190 if (!(hd = new huff_data))
191 return 1;
192 cd.MemForCompDict += sizeof (huff_data);
193 if (Read_Huffman_Data (dict, hd, &cd.MemForCompDict, NULL) == -1)
194 return 2;
195 if (!(vals = Generate_Huffman_Vals (hd, &cd.MemForCompDict)))
196 return 3;
197 if (hd->clens)
198 delete hd->clens;
199 hd->clens = NULL;
200 if (type == chars)
201 {
202 cd.chars_huff[which] = hd;
203 cd.chars_vals[which] = vals;
204 }
205 else
206 {
207 cd.lens_huff[which] = hd;
208 cd.lens_vals[which] = vals;
209 }
210
211 return 0;
212}
213
214static int Load_Comp_FragsHeader(compression_dict &cd, int which, int getEscape,
215 FILE *dict) {
216 if (!(cd.cfh[which] = new comp_frags_header))
217 return 1;
218 cd.MemForCompDict += sizeof (*cd.cfh[which]);
219 if (Read_cfh (dict, cd.cfh[which], &cd.MemForCompDict, NULL) == -1)
220 return 2;
221
222 if (!(cd.values[which] = ReadInWords (dict, cd, cd.cfh[which],
223 getEscape == 0 ? NULL : &cd.escape[which])))
224 return 3;
225
226 return 0;
227}
228
229static bool LoadSlowCompDict (FILE *dict, FILE *aux_dict, compression_dict &cd) {
230 if (dict == NULL) return false;
231
232 int which;
233
234 memset (&cd, '\0', sizeof (compression_dict));
235
236 cd.MemForCompDict = sizeof (compression_dict);
237
238 if (Read_cdh (dict, &cd.cdh, &cd.MemForCompDict, NULL) == -1)
239 return false;
240
241 for (which = 0; which < 2; which++)
242 switch (cd.cdh.dict_type)
243 {
244 case MG_COMPLETE_DICTIONARY:
245 {
246 if (Load_Comp_FragsHeader(cd, which, 0, dict) != 0)
247 return false;
248 cd.escape[which] = NULL;
249
250 }
251 break;
252 case MG_PARTIAL_DICTIONARY:
253 {
254 if (cd.cdh.num_words[which])
255 {
256 if (Load_Comp_FragsHeader(cd, which, 1, dict) != 0)
257 return false;
258 }
259
260 if (Load_Comp_HuffData(cd, which, dict, chars) != 0)
261 return false;
262
263 if (Load_Comp_HuffData(cd, which, dict, lengths) != 0)
264 return false;
265 }
266 break;
267 case MG_SEED_DICTIONARY:
268 {
269 if (cd.cdh.num_words[which])
270 {
271 if (Load_Comp_FragsHeader(cd, which, 1, dict) != 0)
272 return false;
273 }
274 switch (cd.cdh.novel_method)
275 {
276 case MG_NOVEL_HUFFMAN_CHARS:
277 if (Load_Comp_HuffData(cd, which, dict, chars) != 0)
278 return false;
279
280 if (Load_Comp_HuffData(cd, which, dict, lengths) != 0)
281 return false;
282 break;
283 case MG_NOVEL_DELTA:
284 break;
285 case MG_NOVEL_HYBRID:
286 break;
287 }
288 break;
289 }
290 }
291
292 if (cd.cdh.novel_method == MG_NOVEL_DELTA ||
293 cd.cdh.novel_method == MG_NOVEL_HYBRID)
294 {
295 if (!aux_dict)
296 {
297 mg_errno = MG_NOFILE;
298 cd.Clear();
299 return false;
300 }
301
302 if (!(cd.ad = LoadAuxDict (cd, aux_dict)))
303 {
304 cd.Clear();
305 return false;
306 }
307 }
308
309 mg_errno = MG_NOERROR;
310
311 cd.fast_loaded = 0;
312
313 return true;
314}
315
316
317
/* Index of the pointer-sized slot containing `p' within the fast-dict
   image starting at `base'. */
#define WORDNO(p, base) ((((char*)(p))-((char*)(base)))/sizeof(u_char*))
/* Test the fixup bitmap: is the slot holding `p' a serialized pointer
   that must be rebased?  NOTE: relies on `fixup' and `cd' variables in
   the caller's scope (LoadFastCompDict). */
#define IS_FIXUP(p) ((fixup[WORDNO(p,cd)/8] & (1<<(WORDNO(p,cd) & 7))) != 0)
320
321// fast loading really needs to be totally re-writen. "Unloading" the
322// text data will currently cause a crash because memory is being
323// deleted multiple times (and probably a zillion other reasons).
324static bool LoadFastCompDict (FILE *text_fast_comp_dict, compression_dict &_cd) {
325 if (text_fast_comp_dict == NULL) return false;
326
327 u_long *p, *end;
328 u_char *fixup;
329 u_long mem;
330 u_long fixup_mem;
331 int i; /* [RPAP - Jan 97: Endian Ordering] */
332
333 fread (&mem, sizeof (mem), 1, text_fast_comp_dict);
334 NTOHUL(mem); /* [RPAP - Jan 97: Endian Ordering] */
335 fread (&fixup_mem, sizeof (fixup_mem), 1, text_fast_comp_dict);
336 NTOHUL(fixup_mem); /* [RPAP - Jan 97: Endian Ordering] */
337
338 compression_dict *cd;
339 if (!(cd = (compression_dict *)malloc (mem))) {
340 mg_errno = MG_NOMEM;
341 return false;
342 }
343
344 end = (u_long *) (((u_char *) cd) + mem);
345 fread (cd, sizeof (u_char), mem, text_fast_comp_dict);
346
347 if (!(fixup = new u_char[fixup_mem]))
348 {
349 mg_errno = MG_NOMEM;
350 return false;
351 }
352
353 fread (fixup, fixup_mem, sizeof (u_char), text_fast_comp_dict);
354
355 for (p = (u_long *) cd; (u_long) p < (u_long) end; p++)
356 if (IS_FIXUP (p))
357 {
358 NTOHUL(*p); /* [RPAP - Jan 97: Endian Ordering] */
359 *p = *p + (u_long) cd;
360 }
361
362 /* [RPAP - Jan 97: Endian Ordering] */
363 /* cdh */
364 NTOHUL(cd->cdh.dict_type);
365 NTOHUL(cd->cdh.novel_method);
366 for (i = 0; i < TEXT_PARAMS; i++)
367 NTOHUL(cd->cdh.params[i]);
368 NTOHUL(cd->cdh.num_words[0]);
369 NTOHUL(cd->cdh.num_words[1]);
370 NTOHUL(cd->cdh.num_word_chars[0]);
371 NTOHUL(cd->cdh.num_word_chars[1]);
372 NTOHUL(cd->cdh.lookback);
373 /* cfh */
374 for (i = 0; i <= 1; i++)
375 {
376 int j;
377
378 NTOHSI(cd->cfh[i]->hd.num_codes);
379 NTOHSI(cd->cfh[i]->hd.mincodelen);
380 NTOHSI(cd->cfh[i]->hd.maxcodelen);
381 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
382 {
383 NTOHSI(cd->cfh[i]->hd.lencount[j]);
384 NTOHUL(cd->cfh[i]->hd.min_code[j]);
385 }
386 NTOHUL(cd->cfh[i]->uncompressed_size);
387 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
388 NTOHUL(cd->cfh[i]->huff_words_size[j]);
389 }
390 NTOHUL(cd->MemForCompDict);
391 /* ad */
392 if (cd->cdh.novel_method == MG_NOVEL_DELTA ||
393 cd->cdh.novel_method == MG_NOVEL_HYBRID)
394 for (i = 0; i <= 1; i++)
395 {
396 int j;
397
398 NTOHUL(cd->ad->afh[i].num_frags);
399 NTOHUL(cd->ad->afh[i].mem_for_frags);
400 for (j = 0; j < 33; j++)
401 {
402 NTOHSI(cd->ad->blk_start[i][j]);
403 NTOHSI(cd->ad->blk_end[i][j]);
404 }
405 }
406 NTOHSI(cd->fast_loaded);
407
408 delete fixup;
409
410 // the whole fast comp dict is a bit of a hack so I don't
411 // feel too bad about the next line :-) -- Rodger.
412 _cd = *cd;
413
414 return true;
415}
416
417
418static bool LoadCompDict (FILE *compDictFile,
419 FILE *auxDictFile,
420 FILE *fastCompDictFile,
421 compression_dict &cd) {
422 // see if we have a fast loading compression dictionary
423 if (fastCompDictFile != NULL)
424 return LoadFastCompDict (fastCompDictFile, cd);
425
426 // slow compression dictionary
427 return LoadSlowCompDict (compDictFile, auxDictFile, cd);
428}
429
430
431// try to open the dictionary files and load the dictionary
432static bool OpenLoadCompDict (char *textname, compression_dict &cd) {
433 FILE *compDictFile = NULL;
434 FILE *auxDictFile = NULL;
435 FILE *fastCompDictFile = NULL;
436
437 fastCompDictFile = open_file (textname, TEXT_DICT_FAST_SUFFIX,
438 "rb", MAGIC_FAST_DICT, MG_CONTINUE);
439
440 if (fastCompDictFile == NULL) {
441 compDictFile = open_file (textname, TEXT_DICT_SUFFIX,
442 "rb", MAGIC_DICT, MG_MESSAGE);
443 auxDictFile = open_file (textname, TEXT_DICT_AUX_SUFFIX,
444 "rb", MAGIC_AUX_DICT, MG_CONTINUE);
445 }
446
447 bool res = LoadCompDict (compDictFile, auxDictFile, fastCompDictFile, cd);
448
449 if (compDictFile != NULL) fclose (compDictFile);
450 if (auxDictFile != NULL) fclose (auxDictFile);
451 if (fastCompDictFile != NULL) fclose (fastCompDictFile);
452
453 return res;
454}
455
456static bool LoadLevels (char *textname, FTextLevel &levels) {
457 FILE *levelFile = NULL;
458
459 // open the text level file
460 levelFile = open_file (textname, TEXT_LEVEL_SUFFIX,
461 "rb", MAGIC_TEXT_LEVELS, MG_CONTINUE);
462 if (levelFile == NULL) return false;
463
464 // seek to the appropriate place and read the level information
465 bool res = ((fseek (levelFile, sizeof (u_long), SEEK_SET) == 0) &&
466 levels.Read (levelFile));
467
468 // close the file
469 fclose (levelFile);
470
471 return res;
472}
473
474
475TextData::TextData () {
476 // put file pointers in known state first
477 textFile = NULL;
478 textIdxFile = NULL;
479 Clear ();
480}
481
482void TextData::Clear () {
483 cd.Clear();
484 textFile = NULL;
485 textIdxFile = NULL;
486 cth.Clear();
487 levels.Clear();
488}
489
/* Load everything needed to decompress text for collection `textname'
 * under `basepath': the compression dictionary, the compressed-text
 * file and its index, the compressed-text header, and the level table.
 *
 * Returns true on success.  NOTE(review): on a failure part-way
 * through, files opened so far remain open and partially-loaded state
 * remains in this object -- presumably the caller is expected to call
 * UnloadData(); confirm before relying on it. */
bool TextData::LoadData (char *basepath, char *textname) {

  if (textname[0] == '\0') return false;

  // set the basepath
  set_basepath(basepath);

  // load the compression dictionary
  if (!OpenLoadCompDict (textname, cd)) return false;

  // open the compressed text and text index file
  textFile = open_file (textname, TEXT_SUFFIX, "rb", MAGIC_TEXT, MG_CONTINUE);
  if (textFile == NULL) return false;

  textIdxFile = open_file (textname, TEXT_IDX_SUFFIX, "rb", MAGIC_TEXI, MG_CONTINUE);
  if (textIdxFile == NULL) return false;

  // read in the compressed text header (skipping the magic number)
  if ((fseek (textFile, sizeof (u_long), SEEK_SET) != 0) || !cth.Read (textFile))
    return false;

  // read in the level information
  if (!LoadLevels (textname, levels)) return false;

  return true;
}
516
517bool TextData::UnloadData () {
518 // close any open files
519 if (textFile != NULL) {
520 fclose (textFile);
521 textFile = NULL;
522 }
523 if (textIdxFile != NULL) {
524 fclose (textIdxFile);
525 textIdxFile = NULL;
526 }
527
528 // do general clear
529 Clear ();
530
531 return true;
532}
533
534
535bool GetDocIdx (TextData &td, const UCArray &docLevel,
536 unsigned long docNum, TextIdx &docIdx) {
537 // make sure the text index file was opened successfully
538 if (td.textIdxFile == NULL) return false;
539
540 // read in the index
541 TextLevelInfo &levelInfo = td.levels.levelInfo[docLevel];
542 if (!docIdx.Read (td.textIdxFile, levelInfo, docNum)) return false;
543
544 return true;
545}
546
547
548
549
/* Decode one canonical Huffman code from the bit stream.
 * `mcodes' is the min_code table; on exit (len) holds the code length
 * and (code) the code's rank within that length.  Relies on a
 * `stdio_bitio_buffer buffer' being in scope at the expansion site.
 * BUG FIX: the trailing semicolon after while(0) has been removed --
 * it defeated the do{}while(0) single-statement idiom and would break
 * an expansion such as `if (x) MY_HUFF_DECODE(...); else ...'. */
#define MY_HUFF_DECODE(len, code, mcodes)                               \
  do {                                                                  \
    register unsigned long *__min_code = (mcodes);                      \
    register unsigned long *__mclen = __min_code;                       \
    register unsigned long __code = 0;                                  \
    do                                                                  \
      {                                                                 \
        __code += __code + buffer.bit();                                \
      }                                                                 \
    while (__code < *++__mclen);                                        \
    (len) = __mclen - __min_code;                                       \
    (code) = __code - *__mclen;                                         \
  } while (0)
563
564
/* Decompress document `docNum' at level `docLevel' into docText.
 *
 * Looks up the document's bit range in the text index, seeks the
 * compressed-text bit stream there, and alternately decodes words and
 * non-words (the stream parity flips each token via `which') until the
 * document's bit budget is exhausted.  Dictionary words are Huffman
 * decoded and reconstructed from their front-coded storage; the escape
 * code switches to one of the novel-word methods (spelled-out Huffman
 * chars, delta-coded, or hybrid-coded auxiliary-dictionary lookup).
 *
 * Returns false only if the index lookup fails. */
bool GetDocText (TextData &td, const UCArray &docLevel,
                 unsigned long docNum, UCArray &docText) {
  // erase the current text
  docText.erase (docText.begin(), docText.end());

  // look up the information about this document
  TextIdx docIdx;
  if (!GetDocIdx (td, docLevel, docNum, docIdx)) return false;

  // do seek to appropriate position
  stdio_bitio_buffer buffer (td.textFile);
  buffer.seek (docIdx.start.byte, docIdx.start.bit);

  // decompress the document
  compression_dict &cd = td.cd;
  auxiliary_dict *ad = cd.ad;
  int which = docIdx.which;
  // total number of compressed bits in [start, end), counting partial bytes
  unsigned long num_bits = (docIdx.end.byte*8+(8-docIdx.end.bit)) -
    (docIdx.start.byte*8+(8-docIdx.start.bit));
  unsigned long bits = 0;

  // keep decoding bits until enough bits have been decoded
  while (bits < num_bits) {
    register unsigned code, len;
    register int r;
    register u_char *t, *b = NULL;
    u_char word[MAXWORDLEN + 1];

    if (cd.cfh[which]) {
      // decode one dictionary code for the current stream parity
      MY_HUFF_DECODE (len, code, cd.cfh[which]->hd.min_code);
      bits += len;

      // split the code rank into a bucket index and an offset within
      // the bucket (2^lookback entries per bucket, front-coded)
      r = code & ((1 << cd.cdh.lookback) - 1);
      t = cd.values[which][len][code >> cd.cdh.lookback];

      /* step through from base pointer */
      // each stored entry is <prefix-len:4><suffix-len:4><suffix>;
      // replay r entries to reconstruct the word's prefix state in `word'
      b = word + 1;
      while (r--) {
        register int copy = *t >> 4;
        memcpy (word + copy + 1, t + 1, *t & 0xf);
        word[0] = copy + (*t & 0xf);
        t += ((*t) & 0xf) + 1;
      }
    } else t = NULL;

    if (t == cd.escape[which]) {
      // escape code: the token is a novel word, encoded per novel_method
      switch (cd.cdh.novel_method) {
      case MG_NOVEL_HUFFMAN_CHARS:
        {
          // novel word spelled out: Huffman-coded length, then chars
          int len, i;
          int c;
          len = buffer.huff_decode(cd.lens_huff[which]->min_code,
                                   cd.lens_vals[which], &bits);
          for (i = 0; i < len; i++) {
            c = buffer.huff_decode(cd.chars_huff[which]->min_code,
                                   cd.chars_vals[which], &bits);
            docText.push_back (c);
          }
        }
        break;
      case MG_NOVEL_DELTA:
      case MG_NOVEL_HYBRID:
        {
          // novel word referenced by index into the auxiliary dictionary
          int idx = 0, len;
          u_char *base;
          switch (cd.cdh.novel_method)
            {
            case MG_NOVEL_DELTA:
              {
                // 1-based delta code -> 0-based fragment index
                idx = buffer.delta_decode (&bits);
                idx--;
              }
              break;
            case MG_NOVEL_HYBRID:
              {
                // gamma-coded block number, then a binary code sized to
                // that block's span (blocks precomputed in LoadAuxDict)
                int k;
                k = buffer.gamma_decode (&bits);
                k--;
                idx = buffer.binary_decode(ad->blk_end[which][k] -
                                           ad->blk_start[which][k] + 1,
                                           &bits);
                idx += ad->blk_start[which][k] - 1;
              }
              break;
            }
          // copy the length-prefixed fragment into the output
          base = ad->words[which][idx];
          len = *base++;
          for (; len; len--)
            {
              docText.push_back (*base++);
            }
        }
        break;
      }
    }
    else
      {
        /* copy over the matching prefix */
        // b walks the reconstructed prefix in `word'; t points at the
        // final stored entry <prefix-len:4><suffix-len:4><suffix>
        r = (*t >> 4);
        while (r--) {
          docText.push_back (*b++);
        }

        /* and the stored suffix */
        r = ((*t) & 0xf);
        while (r--) {
          docText.push_back (*++t);
        }
      }
    // alternate word / non-word streams
    which = !which;
  }

  buffer.done();

  return true;
}
681
Note: See TracBrowser for help on using the repository browser.