source: trunk/gsdl/src/mgpp/text/text.pass2.cpp@ 2554

Last change on this file since 2554 was 2541, checked in by jrm21, 23 years ago

1) replaced obsolete bzero() calls with memset() calls.
2) Makefile.in fixed for C++ variables (CXX instead of CPP), and GSDLOS is now set from configure, so files are installed in the right place.

3) text.pass1.cpp now prints an error message if index file can't be created.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.1 KB
Line 
1/**************************************************************************
2 *
3 * text.pass2.cpp -- Text compression (Pass 2)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23// need this to avoid bizarre compiler problems under VC++ 6.0
24#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
25# include <iostream>
26#endif
27
28#if defined (__WIN32__)
29# include <stdio.h>
30# define unlink _unlink
31#endif
32
33#include "sysfuncs.h"
34#include "memlib.h"
35#include "messages.h"
36#include "local_strings.h"
37#include "bitio_m_stdio.h"
38#include "huffman.h"
39#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
40#include "mg.h"
41#include "mg_files.h"
42#include "build.h"
43#include "words.h"
44#include "text.h"
45#include "hash.h"
46#include "locallib.h"
47#include "comp_dict.h"
48#include "FText.h"
49
50#define POOL_SIZE 1024*256
51
52
53struct char_pool {
54 struct char_pool *next;
55 u_long left;
56 u_char *ptr;
57 u_char pool[POOL_SIZE];
58};
59
60struct novel_hash_rec {
61 u_long ordinal_num;
62 u_char *word;
63};
64
65
66#define INITIAL_HASH_SIZE 7927
67
68struct novel_hash_table {
69 novel_hash_rec *HashTable;
70 u_long HashSize, HashUsed;
71 char_pool *first_pool;
72 char_pool *pool;
73 u_long next_num, binary_start;
74 novel_hash_rec **code_to_nhr;
75};
76
77
78static FILE *text;
79static stdio_bitio_buffer textOutBuf;
80static u_long headLen;
81
82static novel_hash_table nht[2];
83
84int blk_start[2][33], blk_end[2][33];
85
86// 0 = non-word, 1 = word
87unsigned char whichWordType = 0;
88
89CompressTextInfoMap textInfoMap;
90
91
92static char_pool *new_pool (char_pool * pool) {
93 char_pool *p = (char_pool *) Xmalloc (sizeof (char_pool));
94 if (!p) FatalError (1, "Unable to allocate memory for pool");
95
96 if (pool) pool->next = p;
97 p->next = NULL;
98 p->left = POOL_SIZE;
99 p->ptr = p->pool;
100 return p;
101}
102
103
104int init_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
105 char path[512];
106
107 // load the compression dictionary
108 if (LoadCompressionDictionary (make_name (fileName, TEXT_DICT_SUFFIX,
109 path)) == COMPERROR)
110 return COMPERROR;
111
112 // create the compressed text file and attach it to the text buffer
113 if (!(text = create_file (fileName, TEXT_SUFFIX, "w+b",
114 MAGIC_TEXT, MG_MESSAGE)))
115 return COMPERROR;
116 textOutBuf.attachFile (text);
117 textOutBuf.encodeStart ();
118
119 // write out the compressed text header
120 memset (&cth, '\0', sizeof (cth));
121 if (fwrite (&cth, sizeof (cth), 1, text) != 1)
122 return COMPERROR;
123
124 // the length of the compressed header is the size of the
125 // magic number plus the size of the compressed text header
126 headLen = sizeof (u_long) + sizeof (cth);
127
128 // start off with non-word
129 whichWordType = 0;
130
131 // clear the text pointers
132 textInfoMap.erase (textInfoMap.begin(), textInfoMap.end());
133
134 // set up the hash table of novel words
135 int i;
136 if (cdh.novel_method != MG_NOVEL_HUFFMAN_CHARS)
137 for (i = 0; i <= 1; i++)
138 {
139 nht[i].HashSize = INITIAL_HASH_SIZE;
140 nht[i].HashTable = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * nht[i].HashSize);
141 memset (nht[i].HashTable, '\0',
142 sizeof (novel_hash_rec) * nht[i].HashSize);
143 nht[i].HashUsed = 0;
144 nht[i].HashSize = INITIAL_HASH_SIZE;
145 nht[i].pool = nht[i].first_pool = new_pool (NULL);
146 nht[i].next_num = 1;
147 nht[i].binary_start = 1;
148 nht[i].code_to_nhr = NULL;
149 if (cdh.novel_method == MG_NOVEL_HYBRID)
150 {
151 int num;
152 num = 1;
153 blk_start[i][0] = 0;
154 blk_end[i][0] = cdh.num_words[i] - 1;
155 while (num < 33)
156 {
157 blk_start[i][num] = blk_end[i][num - 1] + 1;
158 blk_end[i][num] = blk_start[i][num] +
159 (blk_end[i][num - 1] - blk_start[i][num - 1]) * 2;
160 num++;
161 }
162 }
163 }
164
165 return (COMPALLOK);
166}
167
168
// Compress one run of raw text (l_in bytes starting at s_in) into the
// output bit buffer, alternating between non-word (which == 0) and word
// (which == 1) tokens.  `which` carries parser state across calls so a
// run split over several text elements stays in sync; `numWords` is
// incremented for every word token encoded.
// Returns COMPALLOK, or COMPERROR when an unknown word is met and the
// dictionary type is MG_COMPLETE_DICTIONARY (or on rehash allocation
// failure).
static int compress_text (bitio_buffer &buffer, const u_char *s_in,
			  int l_in, unsigned char &which,
			  unsigned long &numWords) {
  const u_char *end = s_in + l_in - 1;

  // each iteration consumes one token and flips word/non-word mode
  for (; s_in <= end; which = !which) {
    u_char Word[MAXWORDLEN + 1];   // length-prefixed: Word[0] = length
    int res;

    if (which) numWords++;

    /* First parse a word or non-word out of the string */
    if (which) PARSE_WORD (Word, s_in, end);
    else PARSE_NON_WORD (Word, s_in, end);

    /* Search the hash table for Word */
    if (ht[which]) {
      register unsigned long hashval, step;
      register int tsize = ht[which]->size;
      register u_char **wptr;
      HASH (hashval, step, Word, tsize);
      // open-addressing probe: advance by `step` until the word or an
      // empty slot is found
      for (;;) {
	register u_char *s1;
	register u_char *s2;
	register int len;
	wptr = ht[which]->table[hashval];
	if (wptr == NULL) {
	  // empty slot -- word is not in the compression dictionary
	  res = COMPERROR;
	  break;
	}

	/* Compare the words */
	s1 = Word;
	s2 = *wptr;
	len = *s1 + 1;   // compare the length byte plus the characters
	for (; len; len--)
	  if (*s1++ != *s2++) break;

	if (len) {
	  // mismatch -- move to the next probe slot (wrap mod tsize)
	  hashval += step;
	  if (hashval >= (unsigned long)tsize) hashval -= tsize;

	} else {
	  // found: res is the word's index in the dictionary word array
	  res = ht[which]->table[hashval] - ht[which]->words;
	  break;
	}
      }
    }
    else
      res = COMPERROR;
    /* Check that the word was found in the dictionary */
    if (res == COMPERROR)
      {
	if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
	  {
	    Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	    return (COMPERROR);
	  }
	if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
	  {
	    // emit the escape code, then spell the word out using the
	    // length and character Huffman models
	    u_long i;
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;   // escape symbol
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    buffer.huff_encode (Word[0], lens_codes[which], lens_huff[which].clens, NULL);
	    for (i = 0; i < Word[0]; i++)
	      buffer.huff_encode (Word[i + 1], char_codes[which],
				  char_huff[which].clens, NULL);
	  }
	if (cdh.dict_type == MG_SEED_DICTIONARY)
	  {
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;   // escape symbol
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    switch (cdh.novel_method)
	      {
	      case MG_NOVEL_HUFFMAN_CHARS:
		{
		  // spell the novel word out character by character
		  u_long i;
		  buffer.huff_encode (Word[0], lens_codes[which],
				      lens_huff[which].clens, NULL);
		  for (i = 0; i < Word[0]; i++)
		    buffer.huff_encode (Word[i + 1], char_codes[which],
					char_huff[which].clens, NULL);
		}
		break;
	      case MG_NOVEL_DELTA:
	      case MG_NOVEL_HYBRID:
		{
		  // look the word up in (or insert it into) the
		  // novel-word hash table; each novel word is assigned
		  // an ordinal number on first sight
		  register unsigned long hashval, step;
		  register novel_hash_table *h = &nht[which];
		  register int hsize = h->HashSize;
		  register novel_hash_rec *ent;
		  HASH (hashval, step, Word, hsize);
		  for (;;)
		    {
		      register u_char *s1, *s2;
		      register int len;
		      ent = h->HashTable + hashval;
		      if (!ent->word)
			{
			  // empty slot: insert the new word, copying
			  // its length-prefixed bytes into the pool
			  int len = *Word + 1;
			  if ((unsigned)len > h->pool->left)
			    h->pool = new_pool (h->pool);
			  ent->word = h->pool->ptr;
			  ent->ordinal_num = h->next_num++;
			  memcpy (h->pool->ptr, Word, len);
			  h->pool->ptr += len;
			  h->pool->left -= len;
			  h->HashUsed++;
			  break;
			}
		      /* Compare the words */
		      s1 = Word;
		      s2 = ent->word;
		      len = *s1 + 1;
		      for (; len; len--)
			if (*s1++ != *s2++)
			  break;

		      if (!len)
			break;   // found an existing entry

		      hashval = (hashval + step);
		      if (hashval >= (unsigned)hsize) hashval -= hsize;
		    }

		  // encode the ordinal number of the novel word
		  switch (cdh.novel_method) {
		  case MG_NOVEL_DELTA:
		    buffer.delta_encode (ent->ordinal_num, NULL);
		    break;
		  case MG_NOVEL_HYBRID:
		    // find the block k containing ordinal j, then emit
		    // k+1 as a gamma code and the in-block offset in
		    // binary, bounded by the block size
		    int k = 0;
		    int j = ent->ordinal_num - 1;
		    while (j > blk_end[which][k])
		      k++;
		    assert (j - blk_start[which][k] + 1 >= 1 &&
			    j - blk_start[which][k] + 1 <=
			    blk_end[which][k] - blk_start[which][k] + 1);

		    buffer.gamma_encode (k + 1, NULL);
		    buffer.binary_encode (j - blk_start[which][k] + 1,
					  blk_end[which][k] - blk_start[which][k] + 1,
					  NULL);
		    break;
		  }

		  // grow the hash table once it reaches half full
		  if (h->HashUsed >= h->HashSize >> 1)
		    {
		      novel_hash_rec *ht;
		      unsigned long size;
		      unsigned long i;
		      size = prime (h->HashSize * 2);
		      if (!(ht = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * size)))
			{
			  Message ("Unable to allocate memory for table");
			  return (COMPERROR);
			}
		      memset (ht, '\0', sizeof (novel_hash_rec) * size);

		      // re-insert every existing entry into the larger
		      // table (the pool strings themselves stay put)
		      for (i = 0; i < h->HashSize; i++)
			if (h->HashTable[i].word)
			  {
			    register u_char *wptr;
			    register unsigned long hashval, step;

			    wptr = h->HashTable[i].word;
			    HASH (hashval, step, wptr, size);
			    wptr = (ht + hashval)->word;
			    while (wptr)
			      {
				hashval += step;
				if (hashval >= size)
				  hashval -= size;
				wptr = (ht + hashval)->word;
			      }
			    ht[hashval] = h->HashTable[i];
			  }
		      Xfree (h->HashTable);
		      h->HashTable = ht;
		      h->HashSize = size;
		    }
		}
		break;
	      }
	  }
      }
    else
      {
	// word found in the dictionary: emit its Huffman code
	buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
      }
  }

  return COMPALLOK;
}
368
369
// Compress every text element of one document, recording for each
// level tag where the document's compressed text starts and ends
// (byte position, bit within the byte, and word/non-word parser state),
// and updating the global statistics in cth.
// Returns COMPALLOK on success, COMPERROR if compression fails.
int process_text_2 (const TagInfo &tagInfo, const TextElArray &doc) {
  // we need to remember where the last text element ends
  unsigned char endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
  unsigned long endPos = textOutBuf.GetBytesWritten () + headLen;

  // process each text element
  TextElArray::const_iterator here = doc.begin();
  TextElArray::const_iterator end = doc.end();
  while (here != end) {
    // remember current place in compressed text
    unsigned char startWhich = whichWordType;
    unsigned char startBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
    unsigned long startPos = textOutBuf.GetBytesWritten () + headLen;

    // compress the text (whichWordType is updated in place so the
    // word/non-word alternation continues across elements)
    if (compress_text (textOutBuf,
		       (*here).text.begin(),
		       (*here).text.size(),
		       whichWordType,
		       cth.num_of_words) != COMPALLOK)
      return COMPERROR;

    endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
    endPos = textOutBuf.GetBytesWritten () + headLen;

    // update the pointers if this is a level tag
    if (tagInfo.levelTags.find ((*here).tagName) != tagInfo.levelTags.end()) {
      if ((*here).elType == OpenTagE) {
	// set the start of the document
	textInfoMap[(*here).tagName].SetStart (startPos, startBit, startWhich);

      } else if ((*here).elType == CloseTagE) {
	// set the end of the document
	textInfoMap[(*here).tagName].SetEnd (endPos, endBit);

      } else {
	// TextE, nothing needs to be done
      }
    }

    cth.num_of_bytes += (*here).text.size();
    here++;
  }

  // close off any unclosed tags: any level still marked as inside a
  // document gets the final end position of this document
  CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
  CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
  while (tiHere != tiEnd) {
    if ((*tiHere).second.inDoc) (*tiHere).second.SetEnd (endPos, endBit);
    tiHere++;
  }

  // we've processed one more document
  cth.num_of_docs++;

  return COMPALLOK;
}
427
428
429static int write_aux_dict (char *fileName) {
430 int i;
431 FILE *aux;
432 if (!(aux = create_file (fileName, TEXT_DICT_AUX_SUFFIX, "wb",
433 MAGIC_AUX_DICT, MG_MESSAGE)))
434 return COMPERROR;
435
436 for (i = 0; i <= 1; i++)
437 {
438 aux_frags_header afh;
439 char_pool *cp;
440
441 afh.num_frags = nht[i].HashUsed;
442 afh.mem_for_frags = 0;
443 for (cp = nht[i].first_pool; cp; cp = cp->next)
444 afh.mem_for_frags += POOL_SIZE - cp->left;
445
446 /* [RPAP - Jan 97: Endian Ordering] */
447 HTONUL(afh.num_frags);
448 HTONUL(afh.mem_for_frags);
449
450 fwrite (&afh, sizeof (afh), 1, aux);
451
452 for (cp = nht[i].first_pool; cp; cp = cp->next)
453 fwrite (cp->pool, POOL_SIZE - cp->left, sizeof (u_char), aux);
454 }
455 fclose (aux);
456 return COMPALLOK;
457}
458
459
// Estimate the Huffman-compressed size of the auxiliary dictionary.
// Walks the novel-word pools for both word types, gathering character
// and word-length frequencies, then sizes a Huffman coding of each.
// NOTE(review): both accumulators (aux_compressed, total_uncomp) are
// computed and then discarded -- the function returns void and has no
// other side effect in this file.  Presumably reporting code existed
// here once; confirm before relying on this function for anything.
static void estimate_compressed_aux_dict (void) {
  int i;
  u_long aux_compressed = 0, total_uncomp = 0;
  for (i = 0; i <= 1; i++)
    {
      int j;
      long chars[256], fchars[256];   // raw / smoothed character counts
      long lens[16], flens[16];       // raw / smoothed length counts
      char_pool *cp;
      memset (chars, '\0', sizeof (chars));
      memset (lens, '\0', sizeof (lens));
      // scan every length-prefixed fragment stored in the pools
      for (cp = nht[i].first_pool; cp; cp = cp->next)
	{
	  u_char *buf = cp->pool;
	  while (buf != cp->ptr)
	    {
	      int len = *buf++;
	      lens[len]++;   // assumes fragment length < 16 -- TODO confirm bound
	      total_uncomp += len + 4;   // +4 overhead per fragment; origin unclear -- verify
	      for (; len; len--)
		chars[*buf++]++;
	    }
	}
      // give every symbol that could legitimately occur a nonzero
      // frequency so the Huffman size calculation is well defined
      for (j = 0; j < 256; j++)
	if (!chars[j] && PESINAWORD (j) == i)
	  fchars[j] = 1;
	else
	  fchars[j] = chars[j];
      for (j = 0; j < 16; j++)
	if (!lens[j])
	  flens[j] = 1;
	else
	  flens[j] = lens[j];

      // model built from smoothed counts, sized over the real counts;
      // divide by 8 to convert bits to bytes
      aux_compressed += (long unsigned int) ((Calculate_Huffman_Size (16, flens, lens) +
					     Calculate_Huffman_Size (256, fchars, chars)) / 8);

    }
}
499
500
501
502static bool WriteTextIdx (char *fileName, FTextLevel &textLevels) {
503 // create the text index
504 FILE *textIdxFile;
505 if (!(textIdxFile = create_file (fileName, TEXT_IDX_SUFFIX, "w+b",
506 MAGIC_TEXI, MG_MESSAGE)))
507 return false;
508
509 // output each level
510 CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
511 CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
512 while (tiHere != tiEnd) {
513 // remember the level information
514 TextLevelInfo levelInfo;
515 levelInfo.levelTag = (*tiHere).first;
516 levelInfo.textIdxPtr = ftell (textIdxFile);
517 levelInfo.numEntries = (*tiHere).second.docPtrs.size();
518 textLevels.levelInfo[levelInfo.levelTag] = levelInfo;
519
520 // write out the index array for this level
521 if (!WriteTextIdxArray (textIdxFile, (*tiHere).second.docPtrs))
522 return false;
523
524 tiHere++;
525 }
526
527 // close the file
528 fclose (textIdxFile);
529
530 return true;
531}
532
533
534bool WriteTextLevels (char *fileName, const FTextLevel &textLevels) {
535 // cout << textLevels;
536
537 // create the text index
538 FILE *textLevelFile;
539 if (!(textLevelFile = create_file (fileName, TEXT_LEVEL_SUFFIX, "w+b",
540 MAGIC_TEXT_LEVELS, MG_MESSAGE)))
541 return false;
542
543 // output each level
544 if (!textLevels.Write (textLevelFile)) return false;
545
546 // close the file
547 fclose (textLevelFile);
548
549 return true;
550}
551
552
// Finish pass 2: flush the remaining bits of compressed text, rewrite
// the (now complete) text header, then write the text index, the level
// file and, when novel words were collected, the auxiliary dictionary.
// Returns COMPALLOK on success, COMPERROR on any failure.
int done_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
  // write out any remaining bits
  textOutBuf.encodeDone();

  // write out the compressed text header (with updated statistics);
  // seek past the leading magic number to overwrite the placeholder
  // header written by init_text_2
  if (fseek (text, sizeof (u_long), SEEK_SET) == -1 || !cth.Write (text))
    return COMPERROR;
  fclose (text);

  // write out the text index file and get the level information
  FTextLevel textLevels;
  if (!WriteTextIdx (fileName, textLevels)) return COMPERROR;

  // write out the text level information
  if (!WriteTextLevels (fileName, textLevels)) return COMPERROR;

  // write out the auxiliary dictionary (only kept for the novel-word
  // methods that assign ordinals rather than spelling words out)
  if (cdh.dict_type != MG_COMPLETE_DICTIONARY &&
      (cdh.novel_method == MG_NOVEL_DELTA ||
       cdh.novel_method == MG_NOVEL_HYBRID)) {
    if (write_aux_dict (fileName) == COMPERROR) return COMPERROR;
    estimate_compressed_aux_dict ();

  } else {
    // no aux dictionary applies -- remove any stale one on disk
    unlink (make_name (fileName, TEXT_DICT_AUX_SUFFIX, NULL));
  }

  return (COMPALLOK);
}
Note: See TracBrowser for help on using the repository browser.