source: trunk/mgpp/text/text.pass2.cpp@ 3365

Last change on this file since 3365 was 3365, checked in by kjdon, 22 years ago

Initial revision

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.1 KB
Line 
1/**************************************************************************
2 *
3 * text.pass2.cpp -- Text compression (Pass 2)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23// need this to avoid bizarre compiler problems under VC++ 6.0
24#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
25# include <iostream>
26#endif
27
28#if defined (__WIN32__)
29# include <stdio.h>
30# define unlink _unlink
31#endif
32
33#include "sysfuncs.h"
34#include "memlib.h"
35#include "messages.h"
36#include "local_strings.h"
37#include "bitio_m_stdio.h"
38#include "huffman.h"
39#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
40#include "mg.h"
41#include "mg_files.h"
42#include "build.h"
43#include "words.h"
44#include "text.h"
45#include "hash.h"
46#include "locallib.h"
47#include "comp_dict.h"
48#include "FText.h"
49
50#define POOL_SIZE 1024*256
51
52
// Fixed-size bump-allocation arena for the length-prefixed strings of
// novel (out-of-dictionary) words.  Pools are chained into a list as
// each one fills up (see new_pool()).
struct char_pool {
  struct char_pool *next;    // next pool in the chain, NULL for the last one
  u_long left;               // bytes still unused in pool[]
  u_char *ptr;               // first free byte within pool[]
  u_char pool[POOL_SIZE];    // the backing storage itself
};
59
// One novel (out-of-dictionary) word together with the ordinal number it
// was assigned when first encountered during pass 2.
struct novel_hash_rec {
  u_long ordinal_num;    // 1-based order of first appearance (used for delta/hybrid coding)
  u_char *word;          // length-prefixed string, stored inside a char_pool
};
64
65
// initial (prime) number of slots in a novel-word hash table; the table
// doubles (to the next prime) once it becomes half full
#define INITIAL_HASH_SIZE 7927

// Open-addressing hash table of novel words for one word type
// (non-word or word).  Strings live in the chained char_pools.
struct novel_hash_table {
  novel_hash_rec *HashTable;        // the slot array (empty slot <=> word == NULL)
  u_long HashSize, HashUsed;        // slot count / occupied slots
  char_pool *first_pool;            // head of the string-pool chain (for later traversal)
  char_pool *pool;                  // current pool that new strings are carved from
  u_long next_num, binary_start;    // next ordinal to hand out; binary_start unused in this file — verify elsewhere
  novel_hash_rec **code_to_nhr;     // NOTE(review): only ever set to NULL here — presumably used by other passes; confirm
};
76
77
// the compressed text output file and the bit-level buffer attached to it
static FILE *text;
static stdio_bitio_buffer textOutBuf;
// bytes preceding the compressed text proper (magic number + header)
static u_long headLen;

// novel-word tables, one per word type: [0] = non-words, [1] = words
static novel_hash_table nht[2];

// per-word-type block boundaries used by the MG_NOVEL_HYBRID coding scheme
int blk_start[2][33], blk_end[2][33];

// 0 = non-word, 1 = word
unsigned char whichWordType = 0;

// maps a level tag name to the compressed-text positions of its documents
CompressTextInfoMap textInfoMap;
90
91
92static char_pool *new_pool (char_pool * pool) {
93 char_pool *p = (char_pool *) Xmalloc (sizeof (char_pool));
94 if (!p) FatalError (1, "Unable to allocate memory for pool");
95
96 if (pool) pool->next = p;
97 p->next = NULL;
98 p->left = POOL_SIZE;
99 p->ptr = p->pool;
100 return p;
101}
102
103
104int init_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
105 char path[512];
106
107 // load the compression dictionary
108 if (LoadCompressionDictionary (make_name (fileName, TEXT_DICT_SUFFIX,
109 path)) == COMPERROR)
110 return COMPERROR;
111
112 // create the compressed text file and attach it to the text buffer
113 if (!(text = create_file (fileName, TEXT_SUFFIX, "w+b",
114 MAGIC_TEXT, MG_MESSAGE)))
115 return COMPERROR;
116 textOutBuf.attachFile (text);
117 textOutBuf.encodeStart ();
118
119 // write out the compressed text header
120 memset (&cth, '\0', sizeof (cth));
121 if (fwrite (&cth, sizeof (cth), 1, text) != 1)
122 return COMPERROR;
123
124 // the length of the compressed header is the size of the
125 // magic number plus the size of the compressed text header
126 headLen = sizeof (u_long) + sizeof (cth);
127
128 // start off with non-word
129 whichWordType = 0;
130
131 // clear the text pointers
132 textInfoMap.erase (textInfoMap.begin(), textInfoMap.end());
133
134 // set up the hash table of novel words
135 int i;
136 if (cdh.novel_method != MG_NOVEL_HUFFMAN_CHARS)
137 for (i = 0; i <= 1; i++)
138 {
139 nht[i].HashSize = INITIAL_HASH_SIZE;
140 nht[i].HashTable = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * nht[i].HashSize);
141 memset (nht[i].HashTable, '\0',
142 sizeof (novel_hash_rec) * nht[i].HashSize);
143 nht[i].HashUsed = 0;
144 nht[i].HashSize = INITIAL_HASH_SIZE;
145 nht[i].pool = nht[i].first_pool = new_pool (NULL);
146 nht[i].next_num = 1;
147 nht[i].binary_start = 1;
148 nht[i].code_to_nhr = NULL;
149 if (cdh.novel_method == MG_NOVEL_HYBRID)
150 {
151 int num;
152 num = 1;
153 blk_start[i][0] = 0;
154 blk_end[i][0] = cdh.num_words[i] - 1;
155 while (num < 33)
156 {
157 blk_start[i][num] = blk_end[i][num - 1] + 1;
158 blk_end[i][num] = blk_start[i][num] +
159 (blk_end[i][num - 1] - blk_start[i][num - 1]) * 2;
160 num++;
161 }
162 }
163 }
164
165 return (COMPALLOK);
166}
167
168
// Compress the byte range s_in[0 .. l_in-1] into `buffer', alternating
// between non-word and word tokens.  `which' carries the token-type state
// across calls (0 = non-word, 1 = word) and on return holds the type of
// the NEXT expected token; `numWords' is incremented once per word token.
// Dictionary words are emitted as huffman codes; out-of-dictionary words
// are handled according to cdh.dict_type / cdh.novel_method.
// Returns COMPALLOK, or COMPERROR when a word is missing from a
// MG_COMPLETE_DICTIONARY.
static int compress_text (bitio_buffer &buffer, const u_char *s_in,
			  int l_in, unsigned char &which,
			  unsigned long &numWords) {
  const u_char *end = s_in + l_in - 1;

  for (; s_in <= end; which = !which) {
    u_char Word[MAXWORDLEN + 1];  // length-prefixed token: Word[0] = length
    int res;

    if (which) numWords++;

    /* First parse a word or non-word out of the string */
    if (which) PARSE_WORD (Word, s_in, end);
    else PARSE_NON_WORD (Word, s_in, end);

    /* Search the hash table for Word */
    if (ht[which]) {
      // open addressing with a double-hashing probe step
      register unsigned long hashval, step;
      register int tsize = ht[which]->size;
      register u_char **wptr;
      HASH (hashval, step, Word, tsize);
      for (;;) {
	register u_char *s1;
	register u_char *s2;
	register int len;
	wptr = ht[which]->table[hashval];
	if (wptr == NULL) {
	  // empty slot: the word is not in the dictionary
	  res = COMPERROR;
	  break;
	}

	/* Compare the words (length byte plus characters) */
	s1 = Word;
	s2 = *wptr;
	len = *s1 + 1;
	for (; len; len--)
	  if (*s1++ != *s2++) break;

	if (len) {
	  // mismatch: advance to the next probe slot (wrapping around)
	  hashval += step;
	  if (hashval >= (unsigned long)tsize) hashval -= tsize;

	} else {
	  // match: the dictionary code is the word's index in the table
	  res = ht[which]->table[hashval] - ht[which]->words;
	  break;
	}
      }
    }
    else
      res = COMPERROR;
    /* Check that the word was found in the dictionary */
    if (res == COMPERROR)
      {
	if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
	  {
	    // a complete dictionary must contain every word: hard error
	    Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	    return (COMPERROR);
	  }
	if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
	  {
	    // emit the escape code (last huffman code), then spell the
	    // word out with the length and character models
	    u_long i;
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    buffer.huff_encode (Word[0], lens_codes[which], lens_huff[which].clens, NULL);
	    for (i = 0; i < Word[0]; i++)
	      buffer.huff_encode (Word[i + 1], char_codes[which],
				  char_huff[which].clens, NULL);
	  }
	if (cdh.dict_type == MG_SEED_DICTIONARY)
	  {
	    // emit the escape code, then encode the novel word according
	    // to the configured novel method
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    switch (cdh.novel_method)
	      {
	      case MG_NOVEL_HUFFMAN_CHARS:
		{
		  // spell the word character by character, as for a
		  // partial dictionary
		  u_long i;
		  buffer.huff_encode (Word[0], lens_codes[which],
				      lens_huff[which].clens, NULL);
		  for (i = 0; i < Word[0]; i++)
		    buffer.huff_encode (Word[i + 1], char_codes[which],
					char_huff[which].clens, NULL);
		}
		break;
	      case MG_NOVEL_DELTA:
	      case MG_NOVEL_HYBRID:
		{
		  // look the word up in (or add it to) the novel-word
		  // hash table, then encode its ordinal number
		  register unsigned long hashval, step;
		  register novel_hash_table *h = &nht[which];
		  register int hsize = h->HashSize;
		  register novel_hash_rec *ent;
		  HASH (hashval, step, Word, hsize);
		  for (;;)
		    {
		      register u_char *s1, *s2;
		      register int len;
		      ent = h->HashTable + hashval;
		      if (!ent->word)
			{
			  // empty slot: first sighting of this word --
			  // copy it into the string pool and assign the
			  // next ordinal (note: this `len' intentionally
			  // shadows the outer one)
			  int len = *Word + 1;
			  if ((unsigned)len > h->pool->left)
			    h->pool = new_pool (h->pool);
			  ent->word = h->pool->ptr;
			  ent->ordinal_num = h->next_num++;
			  memcpy (h->pool->ptr, Word, len);
			  h->pool->ptr += len;
			  h->pool->left -= len;
			  h->HashUsed++;
			  break;
			}
		      /* Compare the words */
		      s1 = Word;
		      s2 = ent->word;
		      len = *s1 + 1;
		      for (; len; len--)
			if (*s1++ != *s2++)
			  break;

		      if (!len)
			break;

		      hashval = (hashval + step);
		      if (hashval >= (unsigned)hsize) hashval -= hsize;
		    }

		  switch (cdh.novel_method) {
		  case MG_NOVEL_DELTA:
		    buffer.delta_encode (ent->ordinal_num, NULL);
		    break;
		  case MG_NOVEL_HYBRID:
		    // hybrid coding: gamma-encode the block number, then
		    // binary-encode the offset within that block
		    int k = 0;
		    int j = ent->ordinal_num - 1;
		    while (j > blk_end[which][k])
		      k++;
		    assert (j - blk_start[which][k] + 1 >= 1 &&
			    j - blk_start[which][k] + 1 <=
			    blk_end[which][k] - blk_start[which][k] + 1);

		    buffer.gamma_encode (k + 1, NULL);
		    buffer.binary_encode (j - blk_start[which][k] + 1,
					  blk_end[which][k] - blk_start[which][k] + 1,
					  NULL);
		    break;
		  }

		  // grow the table (next prime past double) once it is
		  // half full, rehashing every stored entry
		  if (h->HashUsed >= h->HashSize >> 1)
		    {
		      novel_hash_rec *ht;
		      unsigned long size;
		      unsigned long i;
		      size = prime (h->HashSize * 2);
		      if (!(ht = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * size)))
			{
			  Message ("Unable to allocate memory for table");
			  return (COMPERROR);
			}
		      memset (ht, '\0', sizeof (novel_hash_rec) * size);

		      for (i = 0; i < h->HashSize; i++)
			if (h->HashTable[i].word)
			  {
			    register u_char *wptr;
			    register unsigned long hashval, step;

			    wptr = h->HashTable[i].word;
			    HASH (hashval, step, wptr, size);
			    // linear re-probe until a free slot is found
			    wptr = (ht + hashval)->word;
			    while (wptr)
			      {
				hashval += step;
				if (hashval >= size)
				  hashval -= size;
				wptr = (ht + hashval)->word;
			      }
			    ht[hashval] = h->HashTable[i];
			  }
		      Xfree (h->HashTable);
		      h->HashTable = ht;
		      h->HashSize = size;
		    }
		}
		break;
	      }
	  }
      }
    else
      {
	// the word is in the dictionary: emit its huffman code
	buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
      }
  }

  return COMPALLOK;
}
368
369
// Compress one document (an array of text elements) into the output
// buffer, recording for every level tag where its documents start and
// end in the compressed text (byte position + bit within that byte +
// word-type phase).  Returns COMPALLOK or COMPERROR.
int process_text_2 (const TagInfo &tagInfo, const TextElArray &doc) {
  // we need to remember where the last text element ends
  unsigned char endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
  unsigned long endPos = textOutBuf.GetBytesWritten () + headLen;

  // process each text element
  TextElArray::const_iterator here = doc.begin();
  TextElArray::const_iterator end = doc.end();
  while (here != end) {
    // remember current place in compressed text (before this element)
    unsigned char startWhich = whichWordType;
    unsigned char startBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
    unsigned long startPos = textOutBuf.GetBytesWritten () + headLen;

    // compress the text
    if (compress_text (textOutBuf,
		       &(here->text[0]),
		       (*here).text.size(),
		       whichWordType,
		       cth.num_of_words) != COMPALLOK)
      return COMPERROR;

    endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
    endPos = textOutBuf.GetBytesWritten () + headLen;

    // update the pointers if this is a level tag
    if (tagInfo.levelTags.find ((*here).tagName) != tagInfo.levelTags.end()) {
      if ((*here).elType == OpenTagE) {
	// set the start of the document
	textInfoMap[(*here).tagName].SetStart (startPos, startBit, startWhich);

      } else if ((*here).elType == CloseTagE) {
	// set the end of the document
	textInfoMap[(*here).tagName].SetEnd (endPos, endBit);

      } else {
	// TextE, nothing needs to be done
      }
    }

    cth.num_of_bytes += (*here).text.size();
    here++;
  }

  // close off any unclosed tags at the final end position
  CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
  CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
  while (tiHere != tiEnd) {
    if ((*tiHere).second.inDoc) (*tiHere).second.SetEnd (endPos, endBit);
    tiHere++;
  }

  // we've processed one more document
  cth.num_of_docs++;

  return COMPALLOK;
}
427
428
429static int write_aux_dict (char *fileName) {
430 int i;
431 FILE *aux;
432 if (!(aux = create_file (fileName, TEXT_DICT_AUX_SUFFIX, "wb",
433 MAGIC_AUX_DICT, MG_MESSAGE)))
434 return COMPERROR;
435
436 for (i = 0; i <= 1; i++)
437 {
438 aux_frags_header afh;
439 char_pool *cp;
440
441 afh.num_frags = nht[i].HashUsed;
442 afh.mem_for_frags = 0;
443 for (cp = nht[i].first_pool; cp; cp = cp->next)
444 afh.mem_for_frags += POOL_SIZE - cp->left;
445
446 /* [RPAP - Jan 97: Endian Ordering] */
447 HTONUL(afh.num_frags);
448 HTONUL(afh.mem_for_frags);
449
450 fwrite (&afh, sizeof (afh), 1, aux);
451
452 for (cp = nht[i].first_pool; cp; cp = cp->next)
453 fwrite (cp->pool, POOL_SIZE - cp->left, sizeof (u_char), aux);
454 }
455 fclose (aux);
456 return COMPALLOK;
457}
458
459
// Estimate how large the auxiliary dictionary would be if its length
// bytes and characters were huffman compressed, by building frequency
// tables over the pooled novel-word strings.
// NOTE(review): both aux_compressed and total_uncomp are computed but
// never stored, returned or printed — presumably leftover from a stats
// report in the original MG code; confirm before relying on this.
static void estimate_compressed_aux_dict (void) {
  int i;
  u_long aux_compressed = 0, total_uncomp = 0;
  for (i = 0; i <= 1; i++)
    {
      int j;
      long chars[256], fchars[256];
      long lens[16], flens[16];
      char_pool *cp;
      memset (chars, '\0', sizeof (chars));
      memset (lens, '\0', sizeof (lens));
      // walk every length-prefixed entry in every pool, tallying
      // length-byte and character frequencies
      for (cp = nht[i].first_pool; cp; cp = cp->next)
	{
	  u_char *buf = cp->pool;
	  while (buf != cp->ptr)
	    {
	      int len = *buf++;
	      lens[len]++;
	      total_uncomp += len + 4;
	      for (; len; len--)
		chars[*buf++]++;
	    }
	}
      // smooth zero frequencies so every symbol that could occur for
      // this word type gets a code (frequency 1)
      for (j = 0; j < 256; j++)
	if (!chars[j] && PESINAWORD (j) == i)
	  fchars[j] = 1;
	else
	  fchars[j] = chars[j];
      for (j = 0; j < 16; j++)
	if (!lens[j])
	  flens[j] = 1;
	else
	  flens[j] = lens[j];

      // huffman sizes are in bits; divide by 8 for bytes
      aux_compressed += (long unsigned int) ((Calculate_Huffman_Size (16, flens, lens) +
					      Calculate_Huffman_Size (256, fchars, chars)) / 8);

    }
}
499
500
501
502static bool WriteTextIdx (char *fileName, FTextLevel &textLevels) {
503 // create the text index
504 FILE *textIdxFile;
505 if (!(textIdxFile = create_file (fileName, TEXT_IDX_SUFFIX, "w+b",
506 MAGIC_TEXI, MG_MESSAGE)))
507 return false;
508
509 // output each level
510 CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
511 CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
512 while (tiHere != tiEnd) {
513 // remember the level information
514 TextLevelInfo levelInfo;
515 levelInfo.levelTag = (*tiHere).first;
516 levelInfo.textIdxPtr = ftell (textIdxFile);
517 levelInfo.numEntries = (*tiHere).second.docPtrs.size();
518 textLevels.levelInfo[levelInfo.levelTag] = levelInfo;
519
520 // write out the index array for this level
521 if (!WriteTextIdxArray (textIdxFile, (*tiHere).second.docPtrs))
522 return false;
523
524 tiHere++;
525 }
526
527 // close the file
528 fclose (textIdxFile);
529
530 return true;
531}
532
533
534bool WriteTextLevels (char *fileName, const FTextLevel &textLevels) {
535 // cout << textLevels;
536
537 // create the text index
538 FILE *textLevelFile;
539 if (!(textLevelFile = create_file (fileName, TEXT_LEVEL_SUFFIX, "w+b",
540 MAGIC_TEXT_LEVELS, MG_MESSAGE)))
541 return false;
542
543 // output each level
544 if (!textLevels.Write (textLevelFile)) return false;
545
546 // close the file
547 fclose (textLevelFile);
548
549 return true;
550}
551
552
// Finish pass 2: flush the bit buffer, rewrite the compressed text
// header with the final statistics, and write the text index, level
// information and (when applicable) auxiliary dictionary files.
// Returns COMPALLOK or COMPERROR.
int done_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
  // write out any remaining bits
  textOutBuf.encodeDone();

  // write out the compressed text header (with updated statistics);
  // seek past the magic number written by create_file in init_text_2
  if (fseek (text, sizeof (u_long), SEEK_SET) == -1 || !cth.Write (text))
    return COMPERROR;
  fclose (text);

  // write out the text index file and get the level information
  FTextLevel textLevels;
  if (!WriteTextIdx (fileName, textLevels)) return COMPERROR;

  // write out the text level information
  if (!WriteTextLevels (fileName, textLevels)) return COMPERROR;

  // write out the auxiliary dictionary (only needed when novel words
  // are being assigned ordinals rather than spelled out)
  if (cdh.dict_type != MG_COMPLETE_DICTIONARY &&
      (cdh.novel_method == MG_NOVEL_DELTA ||
       cdh.novel_method == MG_NOVEL_HYBRID)) {
    if (write_aux_dict (fileName) == COMPERROR) return COMPERROR;
    estimate_compressed_aux_dict ();

  } else {
    // no aux dictionary wanted: remove any stale one from a prior run
    unlink (make_name (fileName, TEXT_DICT_AUX_SUFFIX, NULL));
  }

  return (COMPALLOK);
}
Note: See TracBrowser for help on using the repository browser.