1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * TextGet.cpp -- Decompressing the text
|
---|
4 | * Copyright (C) 1999 Rodger McNab
|
---|
5 | *
|
---|
6 | * This program is free software; you can redistribute it and/or modify
|
---|
7 | * it under the terms of the GNU General Public License as published by
|
---|
8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
9 | * (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This program is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | * GNU General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU General Public License
|
---|
17 | * along with this program; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 | *
|
---|
20 | **************************************************************************/
|
---|
21 |
|
---|
22 | // is important to be first, so we escape the truncation warning on VC++
|
---|
23 | #include "TextGet.h"
|
---|
24 | // need this to avoid bizarre compiler problems under VC++ 6.0
|
---|
25 | #if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
|
---|
26 | # include <iostream>
|
---|
27 | #endif
|
---|
28 |
|
---|
29 | #include "mg_files.h"
|
---|
30 | #include "netorder.h"
|
---|
31 | #include "mg_errors.h"
|
---|
32 | #include "locallib.h"
|
---|
33 | #include "words.h"
|
---|
34 | #include "local_strings.h"
|
---|
35 | #include "bitio_m_stdio.h"
|
---|
36 |
|
---|
// Selector for which Huffman table Load_Comp_HuffData is loading.
// (The original read "typedef enum huff_type {...};" -- a typedef with no
// declarator name, which compilers warn about; a plain enum is what was meant.)
enum huff_type {lengths, chars};
|
---|
38 |
|
---|
39 |
|
---|
40 | static auxiliary_dict *LoadAuxDict (compression_dict &cd, FILE *text_aux_dict) {
|
---|
41 | auxiliary_dict *ad;
|
---|
42 | int i;
|
---|
43 |
|
---|
44 | if (!(ad = new auxiliary_dict))
|
---|
45 | {
|
---|
46 | mg_errno = MG_NOMEM;
|
---|
47 | return (NULL);
|
---|
48 | }
|
---|
49 |
|
---|
50 | memset (ad, '\0', sizeof (*ad));
|
---|
51 |
|
---|
52 | for (i = 0; i <= 1; ++i)
|
---|
53 | {
|
---|
54 | int j;
|
---|
55 | u_char *pos;
|
---|
56 |
|
---|
57 | fread (&ad->afh[i], sizeof (aux_frags_header), 1, text_aux_dict);
|
---|
58 |
|
---|
59 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
60 | NTOHUL(ad->afh[i].num_frags);
|
---|
61 | NTOHUL(ad->afh[i].mem_for_frags);
|
---|
62 |
|
---|
63 | if (!(ad->word_data[i] = new u_char[ad->afh[i].mem_for_frags]))
|
---|
64 | {
|
---|
65 | mg_errno = MG_NOMEM;
|
---|
66 | delete ad;
|
---|
67 | return (NULL);
|
---|
68 | }
|
---|
69 | if (!(ad->words[i] = new u_char* [ad->afh[i].num_frags]))
|
---|
70 | {
|
---|
71 | mg_errno = MG_NOMEM;
|
---|
72 | delete ad;
|
---|
73 | return (NULL);
|
---|
74 | }
|
---|
75 |
|
---|
76 | fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char),
|
---|
77 | text_aux_dict);
|
---|
78 |
|
---|
79 | pos = ad->word_data[i];
|
---|
80 | for (j = 0; j < (int)ad->afh[i].num_frags; ++j)
|
---|
81 | {
|
---|
82 | ad->words[i][j] = pos;
|
---|
83 | pos += *pos + 1;
|
---|
84 | }
|
---|
85 | if (cd.cdh.novel_method == MG_NOVEL_HYBRID)
|
---|
86 | {
|
---|
87 | int num;
|
---|
88 | num = 1;
|
---|
89 | ad->blk_start[i][0] = 0;
|
---|
90 | ad->blk_end[i][0] = cd.cdh.num_words[i] - 1;
|
---|
91 | while (num < 33)
|
---|
92 | {
|
---|
93 | ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1;
|
---|
94 | ad->blk_end[i][num] = ad->blk_start[i][num] +
|
---|
95 | (ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2;
|
---|
96 | ++num;
|
---|
97 | }
|
---|
98 | }
|
---|
99 | }
|
---|
100 | return (ad);
|
---|
101 | }
|
---|
102 |
|
---|
103 |
|
---|
// Read the front-coded word list for one comp_frags_header and build the
// decoder's lookup table.  The result is indexed as
// values[codelen][wordno >> lookback]: every 2^lookback-th word of each
// code length is a "sync" word stored in full; the words in between are
// stored front-coded, with the shared-prefix length in the high nibble
// and the suffix length in the low nibble of their first byte.
// If `escape` is non-NULL it receives a pointer to the very last word,
// which the decoder treats as the novel-word escape code.
// Returns NULL on allocation failure.
static u_char ***ReadInWords (FILE *dict, compression_dict &cd,
                              comp_frags_header *cfh, u_char **escape) {
  int i, lookback;
  int ptrs_reqd = 0;            // number of sync-point pointer slots needed
  int mem_reqd = 0;             // total bytes of (front-coded) word text
  int num_set[MAX_HUFFCODE_LEN + 1];        // words stored so far, per code length
  u_char *next_word[MAX_HUFFCODE_LEN + 1];  // next free text byte, per code length
  u_char **vals;                // flat array of sync-point pointers
  u_char ***values;             // per-code-length views into vals
  u_char word[MAXWORDLEN + 1];  // current word as <len><chars...>
  u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1];  // previous word per length

  lookback = cd.cdh.lookback;

  // one sync pointer per 2^lookback words of each code length
  for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; ++i) {
    ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
    mem_reqd += cfh->huff_words_size[i];
  }

  // NOTE(review): the later allocations leak the earlier ones on failure
  // (vals / values are not freed before returning NULL) -- TODO fix
  if (!(vals = new u_char* [ptrs_reqd]))
    return (NULL);

  if (!(values = new u_char** [MAX_HUFFCODE_LEN + 1]))
    return (NULL);

  if (!(next_word[0] = new u_char[mem_reqd]))
    return (NULL);

  cd.MemForCompDict += ptrs_reqd * sizeof (*vals) +
    (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) +
    mem_reqd;

  // carve the pointer array and the text buffer up by code length
  values[0] = vals;
  values[0][0] = next_word[0];
  for (i = 1; i <= cfh->hd.maxcodelen; ++i)
    {
      int next_start = (values[i - 1] - vals) +
        ((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback);
      values[i] = &vals[next_start];
      next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1];
      values[i][0] = next_word[i];
    }

  memset (num_set, '\0', sizeof (num_set));

  for (i = 0; i < cfh->hd.num_codes; ++i)
    {
      register int val, copy;
      register int len = cfh->hd.clens[i];
      // on disk: one byte <prefix_len:4><suffix_len:4>, then suffix chars
      val = getc (dict);
      copy = (val >> 4) & 0xf;
      val &= 0xf;

      // rebuild the full word in `word` (length byte + characters);
      // the prefix bytes word[1..copy] carry over from the previous word
      fread (word + copy + 1, sizeof (u_char), val, dict);
      *word = val + copy;

      if ((num_set[len] & ((1 << lookback) - 1)) == 0)
        {
          // sync point: store the word in full and record its address
          values[len][num_set[len] >> lookback] = next_word[len];
          memcpy (next_word[len], word, *word + 1);
          if (escape && i == cfh->hd.num_codes - 1)
            *escape = next_word[len];
          next_word[len] += *word + 1;
        }
      else
        {
          // in-between word: re-front-code it against the previous word
          // of the same code length
          copy = prefixlen (last_word[len], word);
          memcpy (next_word[len] + 1, word + copy + 1, *word - copy);
          *next_word[len] = (copy << 4) + (*word - copy);
          if (escape && i == cfh->hd.num_codes - 1)
            *escape = next_word[len];
          next_word[len] += (*word - copy) + 1;
        }
      memcpy (last_word[len], word, *word + 1);
      ++num_set[len];
    }

  // the code-length array is no longer needed once the words are in place
  if (cfh->hd.clens)
    delete []cfh->hd.clens;
  cfh->hd.clens = NULL;
  return values;
}
|
---|
185 |
|
---|
186 | static int Load_Comp_HuffData(compression_dict &cd, int which, FILE *dict,
|
---|
187 | huff_type type) {
|
---|
188 | huff_data * hd;
|
---|
189 | u_long ** vals;
|
---|
190 |
|
---|
191 | if (!(hd = new huff_data))
|
---|
192 | return 1;
|
---|
193 | cd.MemForCompDict += sizeof (huff_data);
|
---|
194 | if (Read_Huffman_Data (dict, hd, &cd.MemForCompDict, NULL) == -1)
|
---|
195 | return 2;
|
---|
196 | if (!(vals = Generate_Huffman_Vals (hd, &cd.MemForCompDict)))
|
---|
197 | return 3;
|
---|
198 | if (hd->clens)
|
---|
199 | delete []hd->clens;
|
---|
200 | hd->clens = NULL;
|
---|
201 | if (type == chars)
|
---|
202 | {
|
---|
203 | cd.chars_huff[which] = hd;
|
---|
204 | cd.chars_vals[which] = vals;
|
---|
205 | }
|
---|
206 | else
|
---|
207 | {
|
---|
208 | cd.lens_huff[which] = hd;
|
---|
209 | cd.lens_vals[which] = vals;
|
---|
210 | }
|
---|
211 |
|
---|
212 | return 0;
|
---|
213 | }
|
---|
214 |
|
---|
215 | static int Load_Comp_FragsHeader(compression_dict &cd, int which, int getEscape,
|
---|
216 | FILE *dict) {
|
---|
217 | if (!(cd.cfh[which] = new comp_frags_header))
|
---|
218 | return 1;
|
---|
219 | cd.MemForCompDict += sizeof (*cd.cfh[which]);
|
---|
220 | if (Read_cfh (dict, cd.cfh[which], &cd.MemForCompDict, NULL) == -1)
|
---|
221 | return 2;
|
---|
222 |
|
---|
223 | if (!(cd.values[which] = ReadInWords (dict, cd, cd.cfh[which],
|
---|
224 | getEscape == 0 ? NULL : &cd.escape[which])))
|
---|
225 | return 3;
|
---|
226 |
|
---|
227 | return 0;
|
---|
228 | }
|
---|
229 |
|
---|
// Load a "slow" compression dictionary: the cdh header followed by, per
// channel (word / non-word), whatever tables the dictionary type
// requires.  The MG_NOVEL_DELTA and MG_NOVEL_HYBRID methods additionally
// require the auxiliary dictionary file.
// Returns false on any error (mg_errno is set by the failing helper, or
// to MG_NOFILE when the required aux dictionary is missing).
static bool LoadSlowCompDict (FILE *dict, FILE *aux_dict, compression_dict &cd) {
  if (dict == NULL) return false;

  int which;

  memset (&cd, '\0', sizeof (compression_dict));

  cd.MemForCompDict = sizeof (compression_dict);

  if (Read_cdh (dict, &cd.cdh, &cd.MemForCompDict, NULL) == -1)
    return false;

  // channel 0 and channel 1 tables are stored back to back in the file
  for (which = 0; which < 2; ++which)
    switch (cd.cdh.dict_type)
      {
      case MG_COMPLETE_DICTIONARY:
        {
          // every word is in the dictionary, so no escape code is loaded
          if (Load_Comp_FragsHeader(cd, which, 0, dict) != 0)
            return false;
          cd.escape[which] = NULL;

        }
        break;
      case MG_PARTIAL_DICTIONARY:
        {
          // dictionary words (with escape) plus per-character Huffman
          // tables used to spell out novel words
          if (cd.cdh.num_words[which])
            {
              if (Load_Comp_FragsHeader(cd, which, 1, dict) != 0)
                return false;
            }

          if (Load_Comp_HuffData(cd, which, dict, chars) != 0)
            return false;

          if (Load_Comp_HuffData(cd, which, dict, lengths) != 0)
            return false;
        }
        break;
      case MG_SEED_DICTIONARY:
        {
          if (cd.cdh.num_words[which])
            {
              if (Load_Comp_FragsHeader(cd, which, 1, dict) != 0)
                return false;
            }
          // how novel words are coded depends on the novel method;
          // DELTA and HYBRID need no extra tables here (they use the
          // auxiliary dictionary loaded below)
          switch (cd.cdh.novel_method)
            {
            case MG_NOVEL_HUFFMAN_CHARS:
              if (Load_Comp_HuffData(cd, which, dict, chars) != 0)
                return false;

              if (Load_Comp_HuffData(cd, which, dict, lengths) != 0)
                return false;
              break;
            case MG_NOVEL_DELTA:
              break;
            case MG_NOVEL_HYBRID:
              break;
            }
          break;
        }
      }

  // DELTA and HYBRID keep their novel words in a separate aux dictionary
  if (cd.cdh.novel_method == MG_NOVEL_DELTA ||
      cd.cdh.novel_method == MG_NOVEL_HYBRID)
    {
      if (!aux_dict)
        {
          mg_errno = MG_NOFILE;
          cd.Clear();
          return false;
        }

      if (!(cd.ad = LoadAuxDict (cd, aux_dict)))
        {
          cd.Clear();
          return false;
        }
    }

  mg_errno = MG_NOERROR;

  cd.fast_loaded = 0;

  return true;
}
|
---|
316 |
|
---|
317 |
|
---|
318 |
|
---|
// WORDNO(p, base): index of pointer p, measured in u_char*-sized slots,
// relative to base.  Used to address bits in the fixup bitmap.
#define WORDNO(p, base) ((((char*)(p))-((char*)(base)))/sizeof(u_char*))
// IS_FIXUP(p): true when p's bit is set in the fixup bitmap, i.e. the
// word at p is a stored offset that must be rebased to a live pointer.
// Relies on `fixup` and `cd` being in scope at the point of use
// (see LoadFastCompDict).
#define IS_FIXUP(p) ((fixup[WORDNO(p,cd)/8] & (1<<(WORDNO(p,cd) & 7))) != 0)
|
---|
321 |
|
---|
// fast loading really needs to be totally re-writen. "Unloading" the
// text data will currently cause a crash because memory is being
// deleted multiple times (and probably a zillion other reasons).
//
// On-disk layout: <mem><fixup_mem><mem-byte dictionary image><fixup
// bitmap>.  Every word flagged in the bitmap holds a file offset that is
// rebased to a real pointer after loading, then the embedded headers are
// byte-swapped field by field.
// NOTE(review): the pointer fixup round-trips through u_long, which
// assumes sizeof(u_long) == sizeof(void*) -- not true on LLP64 platforms;
// confirm before porting.  The malloc'd image is deliberately never
// freed: _cd is a shallow copy whose internal pointers point into it.
static bool LoadFastCompDict (FILE *text_fast_comp_dict, compression_dict &_cd) {
  if (text_fast_comp_dict == NULL) return false;

  u_long *p, *end;
  u_char *fixup;        // fixup bitmap, one bit per pointer-sized word
  u_long mem;           // size of the dictionary image in bytes
  u_long fixup_mem;     // size of the fixup bitmap in bytes
  int i; /* [RPAP - Jan 97: Endian Ordering] */

  // NOTE(review): all fread results here are unchecked -- TODO confirm
  fread (&mem, sizeof (mem), 1, text_fast_comp_dict);
  NTOHUL(mem); /* [RPAP - Jan 97: Endian Ordering] */
  fread (&fixup_mem, sizeof (fixup_mem), 1, text_fast_comp_dict);
  NTOHUL(fixup_mem); /* [RPAP - Jan 97: Endian Ordering] */

  compression_dict *cd;
  if (!(cd = (compression_dict *)malloc (mem))) {
    mg_errno = MG_NOMEM;
    return false;
  }

  // slurp the whole dictionary image in one read
  end = (u_long *) (((u_char *) cd) + mem);
  fread (cd, sizeof (u_char), mem, text_fast_comp_dict);

  if (!(fixup = new u_char[fixup_mem]))
    {
      // NOTE(review): cd is leaked on this path
      mg_errno = MG_NOMEM;
      return false;
    }

  fread (fixup, fixup_mem, sizeof (u_char), text_fast_comp_dict);

  // rebase every flagged word from a stored offset to a live pointer
  for (p = (u_long *) cd; (u_long) p < (u_long) end; ++p)
    if (IS_FIXUP (p))
      {
        NTOHUL(*p); /* [RPAP - Jan 97: Endian Ordering] */
        *p = *p + (u_long) cd;
      }

  /* [RPAP - Jan 97: Endian Ordering] */
  /* cdh */
  NTOHUL(cd->cdh.dict_type);
  NTOHUL(cd->cdh.novel_method);
  for (i = 0; i < TEXT_PARAMS; ++i)
    NTOHUL(cd->cdh.params[i]);
  NTOHUL(cd->cdh.num_words[0]);
  NTOHUL(cd->cdh.num_words[1]);
  NTOHUL(cd->cdh.num_word_chars[0]);
  NTOHUL(cd->cdh.num_word_chars[1]);
  NTOHUL(cd->cdh.lookback);
  /* cfh */
  for (i = 0; i <= 1; ++i)
    {
      int j;

      NTOHSI(cd->cfh[i]->hd.num_codes);
      NTOHSI(cd->cfh[i]->hd.mincodelen);
      NTOHSI(cd->cfh[i]->hd.maxcodelen);
      for (j = 0; j < MAX_HUFFCODE_LEN + 1; ++j)
        {
          NTOHSI(cd->cfh[i]->hd.lencount[j]);
          NTOHUL(cd->cfh[i]->hd.min_code[j]);
        }
      NTOHUL(cd->cfh[i]->uncompressed_size);
      for (j = 0; j < MAX_HUFFCODE_LEN + 1; ++j)
        NTOHUL(cd->cfh[i]->huff_words_size[j]);
    }
  NTOHUL(cd->MemForCompDict);
  /* ad -- only present for the DELTA and HYBRID novel methods */
  if (cd->cdh.novel_method == MG_NOVEL_DELTA ||
      cd->cdh.novel_method == MG_NOVEL_HYBRID)
    for (i = 0; i <= 1; ++i)
      {
        int j;

        NTOHUL(cd->ad->afh[i].num_frags);
        NTOHUL(cd->ad->afh[i].mem_for_frags);
        for (j = 0; j < 33; ++j)
          {
            NTOHSI(cd->ad->blk_start[i][j]);
            NTOHSI(cd->ad->blk_end[i][j]);
          }
      }
  NTOHSI(cd->fast_loaded);

  delete []fixup;

  // the whole fast comp dict is a bit of a hack so I don't
  // feel too bad about the next line :-) -- Rodger.
  _cd = *cd;

  return true;
}
|
---|
417 |
|
---|
418 |
|
---|
419 | static bool LoadCompDict (FILE *compDictFile,
|
---|
420 | FILE *auxDictFile,
|
---|
421 | FILE *fastCompDictFile,
|
---|
422 | compression_dict &cd) {
|
---|
423 | // see if we have a fast loading compression dictionary
|
---|
424 | if (fastCompDictFile != NULL)
|
---|
425 | return LoadFastCompDict (fastCompDictFile, cd);
|
---|
426 |
|
---|
427 | // slow compression dictionary
|
---|
428 | return LoadSlowCompDict (compDictFile, auxDictFile, cd);
|
---|
429 | }
|
---|
430 |
|
---|
431 |
|
---|
432 | // try to open the dictionary files and load the dictionary
|
---|
433 | static bool OpenLoadCompDict (char *textname, compression_dict &cd) {
|
---|
434 | FILE *compDictFile = NULL;
|
---|
435 | FILE *auxDictFile = NULL;
|
---|
436 | FILE *fastCompDictFile = NULL;
|
---|
437 |
|
---|
438 | fastCompDictFile = open_file (textname, TEXT_DICT_FAST_SUFFIX,
|
---|
439 | "rb", MAGIC_FAST_DICT, MG_CONTINUE);
|
---|
440 |
|
---|
441 | if (fastCompDictFile == NULL) {
|
---|
442 | compDictFile = open_file (textname, TEXT_DICT_SUFFIX,
|
---|
443 | "rb", MAGIC_DICT, MG_MESSAGE);
|
---|
444 | auxDictFile = open_file (textname, TEXT_DICT_AUX_SUFFIX,
|
---|
445 | "rb", MAGIC_AUX_DICT, MG_CONTINUE);
|
---|
446 | }
|
---|
447 |
|
---|
448 | bool res = LoadCompDict (compDictFile, auxDictFile, fastCompDictFile, cd);
|
---|
449 |
|
---|
450 | if (compDictFile != NULL) fclose (compDictFile);
|
---|
451 | if (auxDictFile != NULL) fclose (auxDictFile);
|
---|
452 | if (fastCompDictFile != NULL) fclose (fastCompDictFile);
|
---|
453 |
|
---|
454 | return res;
|
---|
455 | }
|
---|
456 |
|
---|
457 | static bool LoadLevels (char *textname, FTextLevel &levels) {
|
---|
458 | FILE *levelFile = NULL;
|
---|
459 |
|
---|
460 | // open the text level file
|
---|
461 | levelFile = open_file (textname, TEXT_LEVEL_SUFFIX,
|
---|
462 | "rb", MAGIC_TEXT_LEVELS, MG_CONTINUE);
|
---|
463 | if (levelFile == NULL) return false;
|
---|
464 |
|
---|
465 | // seek to the appropriate place and read the level information
|
---|
466 | bool res = ((fseek (levelFile, sizeof (u_long), SEEK_SET) == 0) &&
|
---|
467 | levels.Read (levelFile));
|
---|
468 |
|
---|
469 | // close the file
|
---|
470 | fclose (levelFile);
|
---|
471 |
|
---|
472 | return res;
|
---|
473 | }
|
---|
474 |
|
---|
475 |
|
---|
476 | TextData::TextData () {
|
---|
477 | // put file pointers in known state first
|
---|
478 | textFile = NULL;
|
---|
479 | textIdxFile = NULL;
|
---|
480 | Clear ();
|
---|
481 | }
|
---|
482 |
|
---|
483 | void TextData::Clear () {
|
---|
484 | cd.Clear();
|
---|
485 | textFile = NULL;
|
---|
486 | textIdxFile = NULL;
|
---|
487 | cth.Clear();
|
---|
488 | levels.Clear();
|
---|
489 | }
|
---|
490 |
|
---|
491 | bool TextData::LoadData (char *basepath, char *textname) {
|
---|
492 |
|
---|
493 | if (textname[0] == '\0') return false;
|
---|
494 |
|
---|
495 | // set the basepath
|
---|
496 | set_basepath(basepath);
|
---|
497 |
|
---|
498 | // load the compression dictionary
|
---|
499 | if (!OpenLoadCompDict (textname, cd)) return false;
|
---|
500 |
|
---|
501 | // open the compressed text and text index file
|
---|
502 | textFile = open_file (textname, TEXT_SUFFIX, "rb", MAGIC_TEXT, MG_CONTINUE);
|
---|
503 | if (textFile == NULL) return false;
|
---|
504 |
|
---|
505 | textIdxFile = open_file (textname, TEXT_IDX_SUFFIX, "rb", MAGIC_TEXI, MG_CONTINUE);
|
---|
506 | if (textIdxFile == NULL) return false;
|
---|
507 |
|
---|
508 | // read in the compressed text header
|
---|
509 | if ((fseek (textFile, sizeof (u_long), SEEK_SET) != 0) || !cth.Read (textFile))
|
---|
510 | return false;
|
---|
511 |
|
---|
512 | // read in the level information
|
---|
513 | if (!LoadLevels (textname, levels)) return false;
|
---|
514 |
|
---|
515 | return true;
|
---|
516 | }
|
---|
517 |
|
---|
518 | bool TextData::UnloadData () {
|
---|
519 | // close any open files
|
---|
520 | if (textFile != NULL) {
|
---|
521 | fclose (textFile);
|
---|
522 | textFile = NULL;
|
---|
523 | }
|
---|
524 | if (textIdxFile != NULL) {
|
---|
525 | fclose (textIdxFile);
|
---|
526 | textIdxFile = NULL;
|
---|
527 | }
|
---|
528 |
|
---|
529 | // do general clear
|
---|
530 | Clear ();
|
---|
531 |
|
---|
532 | return true;
|
---|
533 | }
|
---|
534 |
|
---|
535 |
|
---|
// Look up the text-index entry for document `docNum` at level `docLevel`
// and fill in `docIdx`.  Returns false if the index file is not open or
// the entry cannot be read.
bool GetDocIdx (TextData &td, const UCArray &docLevel,
                unsigned long docNum, TextIdx &docIdx) {
  // make sure the text index file was opened successfully
  if (td.textIdxFile == NULL) return false;

  // read in the index
  // NOTE(review): if levelInfo is a std::map, operator[] inserts a
  // default-constructed entry when docLevel is unknown, mutating
  // td.levels -- confirm whether a find() + failure return was intended
  TextLevelInfo &levelInfo = td.levels.levelInfo[docLevel];
  if (!docIdx.Read (td.textIdxFile, levelInfo, docNum)) return false;

  return true;
}
|
---|
547 |
|
---|
548 |
|
---|
549 |
|
---|
550 |
|
---|
// Decode one canonical-Huffman codeword from `buffer` (the bit stream in
// scope at the call site) using the min_code table `mcodes`.  On exit
// `len` holds the code length in bits and `code` the symbol index within
// that length.  Bits are pulled one at a time until the accumulated code
// drops below the next length's minimum code.
// Wrapped in do/while(0) so it behaves as a single statement; the
// original definition carried a stray trailing ';' after the while(0),
// which would break use inside an unbraced if/else.
#define MY_HUFF_DECODE(len, code, mcodes)                       \
  do {                                                          \
    register unsigned long *__min_code = (mcodes);              \
    register unsigned long *__mclen = __min_code;               \
    register unsigned long __code = 0;                          \
    do                                                          \
      {                                                         \
        __code += __code + buffer.bit();                        \
      }                                                         \
    while (__code < *++__mclen);                                \
    (len) = __mclen - __min_code;                               \
    (code) = __code - *__mclen;                                 \
  } while(0)
|
---|
564 |
|
---|
565 |
|
---|
// Decompress document `docNum` at level `docLevel` into docText.
// The compressed stream alternates between the two channels (word /
// non-word); each codeword is either a dictionary word -- recovered by
// walking forward from the nearest sync word -- or the escape code,
// after which the novel word is decoded per the dictionary's novel
// method.  Decoding stops once the document's bit budget is consumed.
bool GetDocText (TextData &td, const UCArray &docLevel,
                 unsigned long docNum, UCArray &docText) {
  // erase the current text
  docText.erase (docText.begin(), docText.end());

  // look up the information about this document
  TextIdx docIdx;
  if (!GetDocIdx (td, docLevel, docNum, docIdx)) return false;

  // do seek to appropriate position
  stdio_bitio_buffer buffer (td.textFile);
  buffer.seek (docIdx.start.byte, docIdx.start.bit);

  // decompress the document
  compression_dict &cd = td.cd;
  auxiliary_dict *ad = cd.ad;
  int which = docIdx.which;
  // total compressed size of this document in bits
  unsigned long num_bits = (docIdx.end.byte*8+(8-docIdx.end.bit)) -
    (docIdx.start.byte*8+(8-docIdx.start.bit));
  unsigned long bits = 0;

  // pre-grow the output; num_bits is used as a rough upper bound on the
  // decompressed length
  if (docText.capacity() < docText.size() + num_bits + 1) {
    docText.reserve(docText.size() + num_bits + 1);
  }
  // keep decoding bits until enough bits have been decoded
  while (bits < num_bits) {
    register unsigned code, len;
    register int r;
    register u_char *t, *b = NULL;
    u_char word[MAXWORDLEN + 1];

    if (cd.cfh[which]) {
      MY_HUFF_DECODE (len, code, cd.cfh[which]->hd.min_code);
      bits += len;

      // r = distance from the sync word; t = the sync word itself
      r = code & ((1 << cd.cdh.lookback) - 1);
      t = cd.values[which][len][code >> cd.cdh.lookback];

      /* step through from base pointer */
      // each front-coded entry is <prefix_len:4><suffix_len:4> + suffix;
      // replay them into `word` until we reach the wanted entry
      b = word + 1;
      while (r--) {
        register int copy = *t >> 4;
        memcpy (word + copy + 1, t + 1, *t & 0xf);
        word[0] = copy + (*t & 0xf);
        t += ((*t) & 0xf) + 1;
      }
    } else t = NULL;

    if (t == cd.escape[which]) {
      // escape code: the word is not in the dictionary
      switch (cd.cdh.novel_method) {
      case MG_NOVEL_HUFFMAN_CHARS:
        {
          // novel word spelled out character by character
          int len, i;
          int c;
          len = buffer.huff_decode(cd.lens_huff[which]->min_code,
                                   cd.lens_vals[which], &bits);
          for (i = 0; i < len; ++i) {
            c = buffer.huff_decode(cd.chars_huff[which]->min_code,
                                   cd.chars_vals[which], &bits);
            docText.push_back (c);
          }
        }
        break;
      case MG_NOVEL_DELTA:
      case MG_NOVEL_HYBRID:
        {
          // novel word is an index into the auxiliary dictionary
          int idx = 0, len;
          u_char *base;
          switch (cd.cdh.novel_method)
            {
            case MG_NOVEL_DELTA:
              {
                idx = buffer.delta_decode (&bits);
                --idx;
              }
              break;
            case MG_NOVEL_HYBRID:
              {
                // gamma-coded block number, then a binary index within
                // that block's range (see LoadAuxDict's blk tables)
                int k;
                k = buffer.gamma_decode (&bits);
                --k;
                idx = buffer.binary_decode(ad->blk_end[which][k] -
                                           ad->blk_start[which][k] + 1,
                                           &bits);
                idx += ad->blk_start[which][k] - 1;
              }
              break;
            }
          // aux entries are <length byte><characters>
          base = ad->words[which][idx];
          len = *base++;
          for (; len; --len)
            {
              docText.push_back (*base++);
            }
        }
        break;
      }
    }
    else
      {
        /* copy over the matching prefix */
        r = (*t >> 4);
        while (r--) {
          docText.push_back (*b++);
        }

        /* and the stored suffix */
        r = ((*t) & 0xf);
        while (r--) {
          docText.push_back (*++t);
        }
      }
    // words and non-words strictly alternate in the stream
    which = !which;
  }

  buffer.done();

  return true;
}
|
---|
685 |
|
---|