source: trunk/indexers/mgpp/text/TextGet.cpp@ 3365

Last change on this file since 3365 was 3365, checked in by kjdon, 22 years ago

Initial revision

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.8 KB
Line 
1/**************************************************************************
2 *
3 * TextGet.cpp -- Decompressing the text
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22// need this to avoid bizarre compiler problems under VC++ 6.0
23#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
24# include <iostream>
25#endif
26
27#include "TextGet.h"
28#include "mg_files.h"
29#include "netorder.h"
30#include "mg_errors.h"
31#include "locallib.h"
32#include "words.h"
33#include "local_strings.h"
34#include "bitio_m_stdio.h"
35
36typedef enum huff_type {lengths, chars};
37
38
39static auxiliary_dict *LoadAuxDict (compression_dict &cd, FILE *text_aux_dict) {
40 auxiliary_dict *ad;
41 int i;
42
43 if (!(ad = new auxiliary_dict))
44 {
45 mg_errno = MG_NOMEM;
46 return (NULL);
47 }
48
49 memset (ad, '\0', sizeof (*ad));
50
51 for (i = 0; i <= 1; i++)
52 {
53 int j;
54 u_char *pos;
55
56 fread (&ad->afh[i], sizeof (aux_frags_header), 1, text_aux_dict);
57
58 /* [RPAP - Jan 97: Endian Ordering] */
59 NTOHUL(ad->afh[i].num_frags);
60 NTOHUL(ad->afh[i].mem_for_frags);
61
62 if (!(ad->word_data[i] = new u_char[ad->afh[i].mem_for_frags]))
63 {
64 mg_errno = MG_NOMEM;
65 delete ad;
66 return (NULL);
67 }
68 if (!(ad->words[i] = new u_char* [ad->afh[i].num_frags]))
69 {
70 mg_errno = MG_NOMEM;
71 delete ad;
72 return (NULL);
73 }
74
75 fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char),
76 text_aux_dict);
77
78 pos = ad->word_data[i];
79 for (j = 0; j < (int)ad->afh[i].num_frags; j++)
80 {
81 ad->words[i][j] = pos;
82 pos += *pos + 1;
83 }
84 if (cd.cdh.novel_method == MG_NOVEL_HYBRID)
85 {
86 int num;
87 num = 1;
88 ad->blk_start[i][0] = 0;
89 ad->blk_end[i][0] = cd.cdh.num_words[i] - 1;
90 while (num < 33)
91 {
92 ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1;
93 ad->blk_end[i][num] = ad->blk_start[i][num] +
94 (ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2;
95 num++;
96 }
97 }
98 }
99 return (ad);
100}
101
102
// Read the dictionary words for one comp_frags_header and build the
// lookup table used by the Huffman decoder.
//
// The result is indexed as values[codelen][n >> lookback]: for each code
// length, a pointer to every (1 << lookback)-th word.  Words at block
// boundaries are stored fully decoded; the words in between are stored
// front-coded (prefix length in the high nibble of the first byte,
// suffix length in the low nibble) and are reconstructed at decode time
// by GetDocText.  cd.MemForCompDict is increased by the memory allocated
// here.  If escape is non-NULL it receives a pointer to the stored form
// of the last word (the escape code).  Returns NULL on allocation
// failure -- earlier allocations are leaked on that path.
static u_char ***ReadInWords (FILE *dict, compression_dict &cd,
			      comp_frags_header *cfh, u_char **escape) {
  int i, lookback;
  int ptrs_reqd = 0;
  int mem_reqd = 0;
  int num_set[MAX_HUFFCODE_LEN + 1];
  u_char *next_word[MAX_HUFFCODE_LEN + 1];
  u_char **vals;
  u_char ***values;
  u_char word[MAXWORDLEN + 1];
  u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1];

  lookback = cd.cdh.lookback;

  // count the block pointers and word-text bytes needed overall
  for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++) {
    ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
    mem_reqd += cfh->huff_words_size[i];
  }

  if (!(vals = new u_char* [ptrs_reqd]))
    return (NULL);

  if (!(values = new u_char** [MAX_HUFFCODE_LEN + 1]))
    return (NULL);

  if (!(next_word[0] = new u_char[mem_reqd]))
    return (NULL);

  cd.MemForCompDict += ptrs_reqd * sizeof (*vals) +
    (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) +
    mem_reqd;

  // carve the flat pointer array and text buffer into slices, one per
  // code length
  values[0] = vals;
  values[0][0] = next_word[0];
  for (i = 1; i <= cfh->hd.maxcodelen; i++)
    {
      int next_start = (values[i - 1] - vals) +
	((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback);
      values[i] = &vals[next_start];
      next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1];
      values[i][0] = next_word[i];
    }

  memset (num_set, '\0', sizeof (num_set));

  for (i = 0; i < cfh->hd.num_codes; i++)
    {
      register int val, copy;
      register int len = cfh->hd.clens[i];
      // on-disk word: one byte holding prefix-copy count (high nibble)
      // and suffix length (low nibble), then the suffix bytes; the
      // shared prefix is still in `word` from the previous iteration
      val = getc (dict);
      copy = (val >> 4) & 0xf;
      val &= 0xf;

      fread (word + copy + 1, sizeof (u_char), val, dict);
      *word = val + copy;	// word[0] = full decoded length

      if ((num_set[len] & ((1 << lookback) - 1)) == 0)
	{
	  // block boundary: store the word fully decoded
	  values[len][num_set[len] >> lookback] = next_word[len];
	  memcpy (next_word[len], word, *word + 1);
	  if (escape && i == cfh->hd.num_codes - 1)
	    *escape = next_word[len];
	  next_word[len] += *word + 1;
	}
      else
	{
	  // inside a block: store front-coded against the previous word
	  // of the same code length
	  copy = prefixlen (last_word[len], word);
	  memcpy (next_word[len] + 1, word + copy + 1, *word - copy);
	  *next_word[len] = (copy << 4) + (*word - copy);
	  if (escape && i == cfh->hd.num_codes - 1)
	    *escape = next_word[len];
	  next_word[len] += (*word - copy) + 1;
	}
      memcpy (last_word[len], word, *word + 1);
      num_set[len]++;
    }
  // code lengths are no longer needed once the tables are built
  // NOTE(review): clens looks new[]-allocated (see Read_Huffman_Data);
  // plain delete here is the historical form -- confirm
  if (cfh->hd.clens)
    delete cfh->hd.clens;
  cfh->hd.clens = NULL;
  return values;
}
184
185static int Load_Comp_HuffData(compression_dict &cd, int which, FILE *dict,
186 huff_type type) {
187 huff_data * hd;
188 u_long ** vals;
189
190 if (!(hd = new huff_data))
191 return 1;
192 cd.MemForCompDict += sizeof (huff_data);
193 if (Read_Huffman_Data (dict, hd, &cd.MemForCompDict, NULL) == -1)
194 return 2;
195 if (!(vals = Generate_Huffman_Vals (hd, &cd.MemForCompDict)))
196 return 3;
197 if (hd->clens)
198 delete hd->clens;
199 hd->clens = NULL;
200 if (type == chars)
201 {
202 cd.chars_huff[which] = hd;
203 cd.chars_vals[which] = vals;
204 }
205 else
206 {
207 cd.lens_huff[which] = hd;
208 cd.lens_vals[which] = vals;
209 }
210
211 return 0;
212}
213
214static int Load_Comp_FragsHeader(compression_dict &cd, int which, int getEscape,
215 FILE *dict) {
216 if (!(cd.cfh[which] = new comp_frags_header))
217 return 1;
218 cd.MemForCompDict += sizeof (*cd.cfh[which]);
219 if (Read_cfh (dict, cd.cfh[which], &cd.MemForCompDict, NULL) == -1)
220 return 2;
221
222 if (!(cd.values[which] = ReadInWords (dict, cd, cd.cfh[which],
223 getEscape == 0 ? NULL : &cd.escape[which])))
224 return 3;
225
226 return 0;
227}
228
// Load a compression dictionary by parsing the structured dictionary
// file piece by piece (as opposed to the relocatable "fast" blob).
// aux_dict is only needed for the delta/hybrid novel-word methods.
// Returns false on any failure; note that only the novel-method paths
// call cd.Clear() before returning, earlier failures leave cd partially
// filled.
static bool LoadSlowCompDict (FILE *dict, FILE *aux_dict, compression_dict &cd) {
  if (dict == NULL) return false;

  int which;

  memset (&cd, '\0', sizeof (compression_dict));

  cd.MemForCompDict = sizeof (compression_dict);

  // common dictionary header first
  if (Read_cdh (dict, &cd.cdh, &cd.MemForCompDict, NULL) == -1)
    return false;

  // one pass per parsing mode (which = 0 and 1)
  for (which = 0; which < 2; which++)
    switch (cd.cdh.dict_type)
      {
      case MG_COMPLETE_DICTIONARY:
	{
	  // every word is in the dictionary -- no escape code
	  if (Load_Comp_FragsHeader(cd, which, 0, dict) != 0)
	    return false;
	  cd.escape[which] = NULL;

	}
	break;
      case MG_PARTIAL_DICTIONARY:
	{
	  // dictionary words plus an escape; novel words are spelled
	  // out with the char/length Huffman codes read below
	  if (cd.cdh.num_words[which])
	    {
	      if (Load_Comp_FragsHeader(cd, which, 1, dict) != 0)
		return false;
	    }

	  if (Load_Comp_HuffData(cd, which, dict, chars) != 0)
	    return false;

	  if (Load_Comp_HuffData(cd, which, dict, lengths) != 0)
	    return false;
	}
	break;
      case MG_SEED_DICTIONARY:
	{
	  if (cd.cdh.num_words[which])
	    {
	      if (Load_Comp_FragsHeader(cd, which, 1, dict) != 0)
		return false;
	    }
	  // how novel words were coded decides what else must be read;
	  // delta/hybrid need no extra tables here (aux dict below)
	  switch (cd.cdh.novel_method)
	    {
	    case MG_NOVEL_HUFFMAN_CHARS:
	      if (Load_Comp_HuffData(cd, which, dict, chars) != 0)
		return false;

	      if (Load_Comp_HuffData(cd, which, dict, lengths) != 0)
		return false;
	      break;
	    case MG_NOVEL_DELTA:
	      break;
	    case MG_NOVEL_HYBRID:
	      break;
	    }
	  break;
	}
      }

  // delta/hybrid novel words live in the auxiliary dictionary
  if (cd.cdh.novel_method == MG_NOVEL_DELTA ||
      cd.cdh.novel_method == MG_NOVEL_HYBRID)
    {
      if (!aux_dict)
	{
	  mg_errno = MG_NOFILE;
	  cd.Clear();
	  return false;
	}

      if (!(cd.ad = LoadAuxDict (cd, aux_dict)))
	{
	  cd.Clear();
	  return false;
	}
    }

  mg_errno = MG_NOERROR;

  // record that this dictionary came through the slow path
  cd.fast_loaded = 0;

  return true;
}
315
316
317
// Index of pointer p within the fast-dict blob, counted in u_char* slots
// from base.
#define WORDNO(p, base) ((((char*)(p))-((char*)(base)))/sizeof(u_char*))
// Test bit WORDNO(p,cd) of the fixup bitmap: set means the u_long at p
// holds a stored offset that must be relocated into an absolute pointer.
// (Both macros expect `fixup` and `cd` in scope at the expansion site.)
#define IS_FIXUP(p) ((fixup[WORDNO(p,cd)/8] & (1<<(WORDNO(p,cd) & 7))) != 0)
320
321// fast loading really needs to be totally re-writen. "Unloading" the
322// text data will currently cause a crash because memory is being
323// deleted multiple times (and probably a zillion other reasons).
324static bool LoadFastCompDict (FILE *text_fast_comp_dict, compression_dict &_cd) {
325 if (text_fast_comp_dict == NULL) return false;
326
327 u_long *p, *end;
328 u_char *fixup;
329 u_long mem;
330 u_long fixup_mem;
331 int i; /* [RPAP - Jan 97: Endian Ordering] */
332
333 fread (&mem, sizeof (mem), 1, text_fast_comp_dict);
334 NTOHUL(mem); /* [RPAP - Jan 97: Endian Ordering] */
335 fread (&fixup_mem, sizeof (fixup_mem), 1, text_fast_comp_dict);
336 NTOHUL(fixup_mem); /* [RPAP - Jan 97: Endian Ordering] */
337
338 compression_dict *cd;
339 if (!(cd = (compression_dict *)malloc (mem))) {
340 mg_errno = MG_NOMEM;
341 return false;
342 }
343
344 end = (u_long *) (((u_char *) cd) + mem);
345 fread (cd, sizeof (u_char), mem, text_fast_comp_dict);
346
347 if (!(fixup = new u_char[fixup_mem]))
348 {
349 mg_errno = MG_NOMEM;
350 return false;
351 }
352
353 fread (fixup, fixup_mem, sizeof (u_char), text_fast_comp_dict);
354
355 for (p = (u_long *) cd; (u_long) p < (u_long) end; p++)
356 if (IS_FIXUP (p))
357 {
358 NTOHUL(*p); /* [RPAP - Jan 97: Endian Ordering] */
359 *p = *p + (u_long) cd;
360 }
361
362 /* [RPAP - Jan 97: Endian Ordering] */
363 /* cdh */
364 NTOHUL(cd->cdh.dict_type);
365 NTOHUL(cd->cdh.novel_method);
366 for (i = 0; i < TEXT_PARAMS; i++)
367 NTOHUL(cd->cdh.params[i]);
368 NTOHUL(cd->cdh.num_words[0]);
369 NTOHUL(cd->cdh.num_words[1]);
370 NTOHUL(cd->cdh.num_word_chars[0]);
371 NTOHUL(cd->cdh.num_word_chars[1]);
372 NTOHUL(cd->cdh.lookback);
373 /* cfh */
374 for (i = 0; i <= 1; i++)
375 {
376 int j;
377
378 NTOHSI(cd->cfh[i]->hd.num_codes);
379 NTOHSI(cd->cfh[i]->hd.mincodelen);
380 NTOHSI(cd->cfh[i]->hd.maxcodelen);
381 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
382 {
383 NTOHSI(cd->cfh[i]->hd.lencount[j]);
384 NTOHUL(cd->cfh[i]->hd.min_code[j]);
385 }
386 NTOHUL(cd->cfh[i]->uncompressed_size);
387 for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
388 NTOHUL(cd->cfh[i]->huff_words_size[j]);
389 }
390 NTOHUL(cd->MemForCompDict);
391 /* ad */
392 if (cd->cdh.novel_method == MG_NOVEL_DELTA ||
393 cd->cdh.novel_method == MG_NOVEL_HYBRID)
394 for (i = 0; i <= 1; i++)
395 {
396 int j;
397
398 NTOHUL(cd->ad->afh[i].num_frags);
399 NTOHUL(cd->ad->afh[i].mem_for_frags);
400 for (j = 0; j < 33; j++)
401 {
402 NTOHSI(cd->ad->blk_start[i][j]);
403 NTOHSI(cd->ad->blk_end[i][j]);
404 }
405 }
406 NTOHSI(cd->fast_loaded);
407
408 delete fixup;
409
410 // the whole fast comp dict is a bit of a hack so I don't
411 // feel too bad about the next line :-) -- Rodger.
412 _cd = *cd;
413
414 return true;
415}
416
417
418static bool LoadCompDict (FILE *compDictFile,
419 FILE *auxDictFile,
420 FILE *fastCompDictFile,
421 compression_dict &cd) {
422 // see if we have a fast loading compression dictionary
423 if (fastCompDictFile != NULL)
424 return LoadFastCompDict (fastCompDictFile, cd);
425
426 // slow compression dictionary
427 return LoadSlowCompDict (compDictFile, auxDictFile, cd);
428}
429
430
431// try to open the dictionary files and load the dictionary
432static bool OpenLoadCompDict (char *textname, compression_dict &cd) {
433 FILE *compDictFile = NULL;
434 FILE *auxDictFile = NULL;
435 FILE *fastCompDictFile = NULL;
436
437 fastCompDictFile = open_file (textname, TEXT_DICT_FAST_SUFFIX,
438 "rb", MAGIC_FAST_DICT, MG_CONTINUE);
439
440 if (fastCompDictFile == NULL) {
441 compDictFile = open_file (textname, TEXT_DICT_SUFFIX,
442 "rb", MAGIC_DICT, MG_MESSAGE);
443 auxDictFile = open_file (textname, TEXT_DICT_AUX_SUFFIX,
444 "rb", MAGIC_AUX_DICT, MG_CONTINUE);
445 }
446
447 bool res = LoadCompDict (compDictFile, auxDictFile, fastCompDictFile, cd);
448
449 if (compDictFile != NULL) fclose (compDictFile);
450 if (auxDictFile != NULL) fclose (auxDictFile);
451 if (fastCompDictFile != NULL) fclose (fastCompDictFile);
452
453 return res;
454}
455
456static bool LoadLevels (char *textname, FTextLevel &levels) {
457 FILE *levelFile = NULL;
458
459 // open the text level file
460 levelFile = open_file (textname, TEXT_LEVEL_SUFFIX,
461 "rb", MAGIC_TEXT_LEVELS, MG_CONTINUE);
462 if (levelFile == NULL) return false;
463
464 // seek to the appropriate place and read the level information
465 bool res = ((fseek (levelFile, sizeof (u_long), SEEK_SET) == 0) &&
466 levels.Read (levelFile));
467
468 // close the file
469 fclose (levelFile);
470
471 return res;
472}
473
474
475TextData::TextData () {
476 // put file pointers in known state first
477 textFile = NULL;
478 textIdxFile = NULL;
479 Clear ();
480}
481
482void TextData::Clear () {
483 cd.Clear();
484 textFile = NULL;
485 textIdxFile = NULL;
486 cth.Clear();
487 levels.Clear();
488}
489
490bool TextData::LoadData (char *basepath, char *textname) {
491
492 if (textname[0] == '\0') return false;
493
494 // set the basepath
495 set_basepath(basepath);
496
497 // load the compression dictionary
498 if (!OpenLoadCompDict (textname, cd)) return false;
499
500 // open the compressed text and text index file
501 textFile = open_file (textname, TEXT_SUFFIX, "rb", MAGIC_TEXT, MG_CONTINUE);
502 if (textFile == NULL) return false;
503
504 textIdxFile = open_file (textname, TEXT_IDX_SUFFIX, "rb", MAGIC_TEXI, MG_CONTINUE);
505 if (textIdxFile == NULL) return false;
506
507 // read in the compressed text header
508 if ((fseek (textFile, sizeof (u_long), SEEK_SET) != 0) || !cth.Read (textFile))
509 return false;
510
511 // read in the level information
512 if (!LoadLevels (textname, levels)) return false;
513
514 return true;
515}
516
517bool TextData::UnloadData () {
518 // close any open files
519 if (textFile != NULL) {
520 fclose (textFile);
521 textFile = NULL;
522 }
523 if (textIdxFile != NULL) {
524 fclose (textIdxFile);
525 textIdxFile = NULL;
526 }
527
528 // do general clear
529 Clear ();
530
531 return true;
532}
533
534
535bool GetDocIdx (TextData &td, const UCArray &docLevel,
536 unsigned long docNum, TextIdx &docIdx) {
537 // make sure the text index file was opened successfully
538 if (td.textIdxFile == NULL) return false;
539
540 // read in the index
541 TextLevelInfo &levelInfo = td.levels.levelInfo[docLevel];
542 if (!docIdx.Read (td.textIdxFile, levelInfo, docNum)) return false;
543
544 return true;
545}
546
547
548
549
// Decode one canonical Huffman codeword by pulling bits from `buffer`
// (which must be in scope at the expansion site).  mcodes is the
// per-length min_code table; on exit `len` holds the code length and
// `code` the zero-based symbol index within that length.
// NOTE(review): the expansion already ends with ';', so a semicolon at
// the use site adds an empty statement (breaks un-braced if/else).
#define MY_HUFF_DECODE(len, code, mcodes) \
 do { \
  register unsigned long *__min_code = (mcodes); \
  register unsigned long *__mclen = __min_code; \
  register unsigned long __code = 0; \
  do \
  { \
  __code += __code + buffer.bit(); \
  } \
  while (__code < *++__mclen); \
  (len) = __mclen - __min_code; \
  (code) = __code - *__mclen; \
 } while(0);
563
564
// Decompress one document (identified by level and number) into docText.
// The compressed stream alternates between the two parsing modes
// (`which` flips every symbol).  Each symbol is either a dictionary
// word, the escape code followed by a novel word, or -- when there is no
// cfh for this mode -- purely novel-coded text.
bool GetDocText (TextData &td, const UCArray &docLevel,
		 unsigned long docNum, UCArray &docText) {
  // erase the current text
  docText.erase (docText.begin(), docText.end());

  // look up the information about this document
  TextIdx docIdx;
  if (!GetDocIdx (td, docLevel, docNum, docIdx)) return false;

  // do seek to appropriate position
  stdio_bitio_buffer buffer (td.textFile);
  buffer.seek (docIdx.start.byte, docIdx.start.bit);

  // decompress the document
  compression_dict &cd = td.cd;
  auxiliary_dict *ad = cd.ad;
  int which = docIdx.which;
  // total compressed length of this document, in bits
  unsigned long num_bits = (docIdx.end.byte*8+(8-docIdx.end.bit)) -
    (docIdx.start.byte*8+(8-docIdx.start.bit));
  unsigned long bits = 0;

  // keep decoding bits until enough bits have been decoded
  while (bits < num_bits) {
    register unsigned code, len;
    register int r;
    register u_char *t, *b = NULL;
    u_char word[MAXWORDLEN + 1];

    if (cd.cfh[which]) {
      MY_HUFF_DECODE (len, code, cd.cfh[which]->hd.min_code);
      bits += len;

      // split the symbol: r = offset within a lookback block,
      // high bits = which block pointer to start from
      r = code & ((1 << cd.cdh.lookback) - 1);
      t = cd.values[which][len][code >> cd.cdh.lookback];

      /* step through from base pointer */
      // intermediate words are front-coded (prefix count in the high
      // nibble, suffix length in the low nibble); rebuild the running
      // word until t points at the symbol's stored form
      b = word + 1;
      while (r--) {
	register int copy = *t >> 4;
	memcpy (word + copy + 1, t + 1, *t & 0xf);
	word[0] = copy + (*t & 0xf);
	t += ((*t) & 0xf) + 1;
      }
    } else t = NULL;

    if (t == cd.escape[which]) {
      // escape (or no dictionary at all): decode a novel word
      switch (cd.cdh.novel_method) {
      case MG_NOVEL_HUFFMAN_CHARS:
	{
	  // Huffman-coded length, then that many Huffman-coded chars
	  int len, i;
	  int c;
	  len = buffer.huff_decode(cd.lens_huff[which]->min_code,
				   cd.lens_vals[which], &bits);
	  for (i = 0; i < len; i++) {
	    c = buffer.huff_decode(cd.chars_huff[which]->min_code,
				   cd.chars_vals[which], &bits);
	    docText.push_back (c);
	  }
	}
	break;
      case MG_NOVEL_DELTA:
      case MG_NOVEL_HYBRID:
	{
	  // novel word = index into the auxiliary dictionary
	  int idx = 0, len;
	  u_char *base;
	  switch (cd.cdh.novel_method)
	    {
	    case MG_NOVEL_DELTA:
	      {
		idx = buffer.delta_decode (&bits);
		idx--;
	      }
	      break;
	    case MG_NOVEL_HYBRID:
	      {
		// gamma-coded block number, then a binary-coded offset
		// sized to that block (see LoadAuxDict's block table)
		int k;
		k = buffer.gamma_decode (&bits);
		k--;
		idx = buffer.binary_decode(ad->blk_end[which][k] -
					   ad->blk_start[which][k] + 1,
					   &bits);
		idx += ad->blk_start[which][k] - 1;
	      }
	      break;
	    }
	  // auxiliary words are stored length-prefixed
	  base = ad->words[which][idx];
	  len = *base++;
	  for (; len; len--)
	    {
	      docText.push_back (*base++);
	    }
	}
	break;
      }
    }
    else
      {
	/* copy over the matching prefix */
	r = (*t >> 4);
	while (r--) {
	  docText.push_back (*b++);
	}

	/* and the stored suffix */
	r = ((*t) & 0xf);
	while (r--) {
	  docText.push_back (*++t);
	}
      }
    // words and non-words alternate in the stream
    which = !which;
  }

  buffer.done();

  return true;
}
681
Note: See TracBrowser for help on using the repository browser.