source: trunk/indexers/mgpp/text/text.pass2.cpp@ 8692

Last change on this file since 8692 was 8692, checked in by kjdon, 19 years ago

Added the changes from Emanuel Dejanu (Simple Words) - mostly efficiency changes. For example, changing i++ to ++i, delete xxx to delete []xxx, some stuff to do with UCArrays...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.2 KB
Line 
1/**************************************************************************
2 *
3 * text.pass2.cpp -- Text compression (Pass 2)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__)
24#pragma warning(disable:4786)
25#endif
26
27// need this to avoid bizarre compiler problems under VC++ 6.0
28#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
29# include <iostream>
30#endif
31
32#if defined (__WIN32__)
33# include <stdio.h>
34# define unlink _unlink
35#endif
36
37#include "sysfuncs.h"
38#include "memlib.h"
39#include "messages.h"
40#include "local_strings.h"
41#include "bitio_m_stdio.h"
42#include "huffman.h"
43#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
44#include "mg.h"
45#include "mg_files.h"
46#include "build.h"
47#include "words.h"
48#include "text.h"
49#include "hash.h"
50#include "locallib.h"
51#include "comp_dict.h"
52#include "FText.h"
53
54#define POOL_SIZE 1024*256
55
56
57struct char_pool {
58 struct char_pool *next;
59 u_long left;
60 u_char *ptr;
61 u_char pool[POOL_SIZE];
62};
63
64struct novel_hash_rec {
65 u_long ordinal_num;
66 u_char *word;
67};
68
69
70#define INITIAL_HASH_SIZE 7927
71
72struct novel_hash_table {
73 novel_hash_rec *HashTable;
74 u_long HashSize, HashUsed;
75 char_pool *first_pool;
76 char_pool *pool;
77 u_long next_num, binary_start;
78 novel_hash_rec **code_to_nhr;
79};
80
81
82static FILE *text;
83static stdio_bitio_buffer textOutBuf;
84static u_long headLen;
85
86static novel_hash_table nht[2];
87
88int blk_start[2][33], blk_end[2][33];
89
90// 0 = non-word, 1 = word
91unsigned char whichWordType = 0;
92
93CompressTextInfoMap textInfoMap;
94
95
96static char_pool *new_pool (char_pool * pool) {
97 char_pool *p = (char_pool *) Xmalloc (sizeof (char_pool));
98 if (!p) FatalError (1, "Unable to allocate memory for pool");
99
100 if (pool) pool->next = p;
101 p->next = NULL;
102 p->left = POOL_SIZE;
103 p->ptr = p->pool;
104 return p;
105}
106
107
108int init_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
109 char path[512];
110
111 // load the compression dictionary
112 if (LoadCompressionDictionary (make_name (fileName, TEXT_DICT_SUFFIX,
113 path)) == COMPERROR)
114 return COMPERROR;
115
116 // create the compressed text file and attach it to the text buffer
117 if (!(text = create_file (fileName, TEXT_SUFFIX, "w+b",
118 MAGIC_TEXT, MG_MESSAGE)))
119 return COMPERROR;
120 textOutBuf.attachFile (text);
121 textOutBuf.encodeStart ();
122
123 // write out the compressed text header
124 memset (&cth, '\0', sizeof (cth));
125 if (fwrite (&cth, sizeof (cth), 1, text) != 1)
126 return COMPERROR;
127
128 // the length of the compressed header is the size of the
129 // magic number plus the size of the compressed text header
130 headLen = sizeof (u_long) + sizeof (cth);
131
132 // start off with non-word
133 whichWordType = 0;
134
135 // clear the text pointers
136 textInfoMap.erase (textInfoMap.begin(), textInfoMap.end());
137
138 // set up the hash table of novel words
139 int i;
140 if (cdh.novel_method != MG_NOVEL_HUFFMAN_CHARS)
141 for (i = 0; i <= 1; ++i)
142 {
143 nht[i].HashSize = INITIAL_HASH_SIZE;
144 nht[i].HashTable = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * nht[i].HashSize);
145 memset (nht[i].HashTable, '\0',
146 sizeof (novel_hash_rec) * nht[i].HashSize);
147 nht[i].HashUsed = 0;
148 nht[i].HashSize = INITIAL_HASH_SIZE;
149 nht[i].pool = nht[i].first_pool = new_pool (NULL);
150 nht[i].next_num = 1;
151 nht[i].binary_start = 1;
152 nht[i].code_to_nhr = NULL;
153 if (cdh.novel_method == MG_NOVEL_HYBRID)
154 {
155 int num;
156 num = 1;
157 blk_start[i][0] = 0;
158 blk_end[i][0] = cdh.num_words[i] - 1;
159 while (num < 33)
160 {
161 blk_start[i][num] = blk_end[i][num - 1] + 1;
162 blk_end[i][num] = blk_start[i][num] +
163 (blk_end[i][num - 1] - blk_start[i][num - 1]) * 2;
164 ++num;
165 }
166 }
167 }
168
169 return (COMPALLOK);
170}
171
172
/* Compress a run of raw text into the output bit buffer.
 *
 * buffer   -- bit-level output buffer the codes are emitted into
 * s_in     -- start of the text bytes to compress
 * l_in     -- number of bytes at s_in
 * which    -- current parse mode (0 = non-word, 1 = word); it flips on
 *             every iteration and is a reference so the alternation
 *             carries across successive calls
 * numWords -- incremented once per word parsed (reference, cumulative)
 *
 * Known words are emitted as Huffman codes from the pass-1 dictionary;
 * unknown ("novel") words are handled according to cdh.dict_type and
 * cdh.novel_method.  Returns COMPALLOK, or COMPERROR if an unknown word
 * is met under MG_COMPLETE_DICTIONARY.
 */
static int compress_text (bitio_buffer &buffer, const u_char *s_in,
			  int l_in, unsigned char &which,
			  unsigned long &numWords) {
  const u_char *end = s_in + l_in - 1;

  // words and non-words strictly alternate in the MG text model
  for (; s_in <= end; which = !which) {
    u_char Word[MAXWORDLEN + 1];   // length-prefixed: Word[0] = length
    int res;

    if (which) ++numWords;

    /* First parse a word or non-word out of the string */
    if (which) PARSE_WORD (Word, s_in, end);
    else PARSE_NON_WORD (Word, s_in, end);

    /* Search the hash table for Word */
    if (ht[which]) {
      register unsigned long hashval, step;
      register int tsize = ht[which]->size;
      register u_char **wptr;
      HASH (hashval, step, Word, tsize);
      // open addressing with double hashing: an empty slot proves the
      // word is not in the dictionary
      for (;;) {
	register u_char *s1;
	register u_char *s2;
	register int len;
	wptr = ht[which]->table[hashval];
	if (wptr == NULL) {
	  res = COMPERROR;
	  break;
	}

	/* Compare the words (the leading length byte participates too) */
	s1 = Word;
	s2 = *wptr;
	len = *s1 + 1;
	for (; len; --len)
	  if (*s1++ != *s2++) break;

	if (len) {
	  // mismatch: advance to the next probe slot, wrapping around
	  hashval += step;
	  if (hashval >= (unsigned long)tsize) hashval -= tsize;

	} else {
	  // found: the code is the word's ordinal in the words array
	  res = ht[which]->table[hashval] - ht[which]->words;
	  break;
	}
      }
    }
    else
      res = COMPERROR;
    /* Check that the word was found in the dictionary */
    if (res == COMPERROR)
      {
	if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
	  {
	    // a complete dictionary must contain every word: hard error
	    Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	    return (COMPERROR);
	  }
	if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
	  {
	    // emit the escape code, then spell the word out with the
	    // length and character Huffman models
	    u_long i;
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;   // escape symbol
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    buffer.huff_encode (Word[0], lens_codes[which], lens_huff[which].clens, NULL);
	    for (i = 0; i < Word[0]; ++i)
	      buffer.huff_encode (Word[i + 1], char_codes[which],
				  char_huff[which].clens, NULL);
	  }
	if (cdh.dict_type == MG_SEED_DICTIONARY)
	  {
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;   // escape symbol
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    switch (cdh.novel_method)
	      {
	      case MG_NOVEL_HUFFMAN_CHARS:
		{
		  // spell the novel word character by character
		  u_long i;
		  buffer.huff_encode (Word[0], lens_codes[which],
				      lens_huff[which].clens, NULL);
		  for (i = 0; i < Word[0]; ++i)
		    buffer.huff_encode (Word[i + 1], char_codes[which],
					char_huff[which].clens, NULL);
		}
		break;
	      case MG_NOVEL_DELTA:
	      case MG_NOVEL_HYBRID:
		{
		  // look the word up in (or add it to) the novel-word
		  // hash table, then encode its ordinal number
		  register unsigned long hashval, step;
		  register novel_hash_table *h = &nht[which];
		  register int hsize = h->HashSize;
		  register novel_hash_rec *ent;
		  HASH (hashval, step, Word, hsize);
		  for (;;)
		    {
		      register u_char *s1, *s2;
		      register int len;
		      ent = h->HashTable + hashval;
		      if (!ent->word)
			{
			  // empty slot: first sighting of this word --
			  // copy it into the string pool and number it
			  int len = *Word + 1;
			  if ((unsigned)len > h->pool->left)
			    h->pool = new_pool (h->pool);
			  ent->word = h->pool->ptr;
			  ent->ordinal_num = h->next_num++;
			  memcpy (h->pool->ptr, Word, len);
			  h->pool->ptr += len;
			  h->pool->left -= len;
			  ++h->HashUsed;
			  break;
			}
		      /* Compare the words */
		      s1 = Word;
		      s2 = ent->word;
		      len = *s1 + 1;
		      for (; len; --len)
			if (*s1++ != *s2++)
			  break;

		      if (!len)
			break;   // match: reuse existing ordinal

		      hashval = (hashval + step);
		      if (hashval >= (unsigned)hsize) hashval -= hsize;
		    }

		  switch (cdh.novel_method) {
		  case MG_NOVEL_DELTA:
		    buffer.delta_encode (ent->ordinal_num, NULL);
		    break;
		  case MG_NOVEL_HYBRID:
		    // find the doubling block containing the ordinal,
		    // then emit block number (gamma) + offset (binary)
		    int k = 0;
		    int j = ent->ordinal_num - 1;
		    while (j > blk_end[which][k])
		      ++k;
		    assert (j - blk_start[which][k] + 1 >= 1 &&
			    j - blk_start[which][k] + 1 <=
			    blk_end[which][k] - blk_start[which][k] + 1);

		    buffer.gamma_encode (k + 1, NULL);
		    buffer.binary_encode (j - blk_start[which][k] + 1,
					  blk_end[which][k] - blk_start[which][k] + 1,
					  NULL);
		    break;
		  }

		  // grow the hash table once it is half full, rehashing
		  // every stored entry into the larger table
		  if (h->HashUsed >= h->HashSize >> 1)
		    {
		      novel_hash_rec *ht;
		      unsigned long size;
		      unsigned long i;
		      size = prime (h->HashSize * 2);
		      if (!(ht = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * size)))
			{
			  Message ("Unable to allocate memory for table");
			  return (COMPERROR);
			}
		      memset (ht, '\0', sizeof (novel_hash_rec) * size);

		      for (i = 0; i < h->HashSize; ++i)
			if (h->HashTable[i].word)
			  {
			    register u_char *wptr;
			    register unsigned long hashval, step;

			    wptr = h->HashTable[i].word;
			    HASH (hashval, step, wptr, size);
			    // linear probe with the new step until an
			    // empty slot is found in the new table
			    wptr = (ht + hashval)->word;
			    while (wptr)
			      {
				hashval += step;
				if (hashval >= size)
				  hashval -= size;
				wptr = (ht + hashval)->word;
			      }
			    ht[hashval] = h->HashTable[i];
			  }
		      Xfree (h->HashTable);
		      h->HashTable = ht;
		      h->HashSize = size;
		    }
		}
		break;
	      }
	  }
      }
    else
      {
	// dictionary hit: emit the word's Huffman code directly
	buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
      }
  }

  return COMPALLOK;
}
372
373
374int process_text_2 (const TagInfo &tagInfo, const TextElArray &doc) {
375 // we need to remember where the last text element ends
376 unsigned char endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
377 unsigned long endPos = textOutBuf.GetBytesWritten () + headLen;
378
379 // process each text element
380 TextElArray::const_iterator here = doc.begin();
381 TextElArray::const_iterator end = doc.end();
382 while (here != end) {
383 // remember current place in compressed text
384 unsigned char startWhich = whichWordType;
385 unsigned char startBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
386 unsigned long startPos = textOutBuf.GetBytesWritten () + headLen;
387
388 // compress the text
389 if (compress_text (textOutBuf,
390 &(here->text[0]),
391 (*here).text.size(),
392 whichWordType,
393 cth.num_of_words) != COMPALLOK)
394 return COMPERROR;
395
396 endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
397 endPos = textOutBuf.GetBytesWritten () + headLen;
398
399 // update the pointers if this is a level tag
400 if (tagInfo.levelTags.find ((*here).tagName) != tagInfo.levelTags.end()) {
401 if ((*here).elType == OpenTagE) {
402 // set the start of the document
403 textInfoMap[(*here).tagName].SetStart (startPos, startBit, startWhich);
404
405 } else if ((*here).elType == CloseTagE) {
406 // set the end of the document
407 textInfoMap[(*here).tagName].SetEnd (endPos, endBit);
408
409 } else {
410 // TextE, nothing needs to be done
411 }
412 }
413
414 cth.num_of_bytes += (*here).text.size();
415 ++here;
416 }
417
418 // close off any unclosed tags
419 CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
420 CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
421 while (tiHere != tiEnd) {
422 if ((*tiHere).second.inDoc) (*tiHere).second.SetEnd (endPos, endBit);
423 ++tiHere;
424 }
425
426 // we've processed one more document
427 ++cth.num_of_docs;
428
429 return COMPALLOK;
430}
431
432
433static int write_aux_dict (char *fileName) {
434 int i;
435 FILE *aux;
436 if (!(aux = create_file (fileName, TEXT_DICT_AUX_SUFFIX, "wb",
437 MAGIC_AUX_DICT, MG_MESSAGE)))
438 return COMPERROR;
439
440 for (i = 0; i <= 1; ++i)
441 {
442 aux_frags_header afh;
443 char_pool *cp;
444
445 afh.num_frags = nht[i].HashUsed;
446 afh.mem_for_frags = 0;
447 for (cp = nht[i].first_pool; cp; cp = cp->next)
448 afh.mem_for_frags += POOL_SIZE - cp->left;
449
450 /* [RPAP - Jan 97: Endian Ordering] */
451 HTONUL(afh.num_frags);
452 HTONUL(afh.mem_for_frags);
453
454 fwrite (&afh, sizeof (afh), 1, aux);
455
456 for (cp = nht[i].first_pool; cp; cp = cp->next)
457 fwrite (cp->pool, POOL_SIZE - cp->left, sizeof (u_char), aux);
458 }
459 fclose (aux);
460 return COMPALLOK;
461}
462
463
// Estimate the Huffman-compressed size of the auxiliary dictionary by
// building character and length frequency tables over every stored
// novel-word fragment.
// NOTE(review): both accumulators (aux_compressed, total_uncomp) are
// local and discarded on return, so this function currently has no
// observable effect -- presumably the totals were once reported as
// statistics; confirm before removing or wiring up output.
static void estimate_compressed_aux_dict (void) {
  int i;
  u_long aux_compressed = 0, total_uncomp = 0;
  for (i = 0; i <= 1; ++i)
    {
      int j;
      long chars[256], fchars[256];
      long lens[16], flens[16];
      char_pool *cp;
      memset (chars, '\0', sizeof (chars));
      memset (lens, '\0', sizeof (lens));
      // walk every pool's used region; each fragment is a length byte
      // followed by that many characters
      for (cp = nht[i].first_pool; cp; cp = cp->next)
	{
	  u_char *buf = cp->pool;
	  while (buf != cp->ptr)
	    {
	      int len = *buf++;
	      ++lens[len];
	      total_uncomp += len + 4;
	      for (; len; --len)
		++chars[*buf++];
	    }
	}
      // give unseen symbols of the matching class a pseudo-count of 1
      // so the Huffman model can assign every symbol a code
      for (j = 0; j < 256; ++j)
	if (!chars[j] && PESINAWORD (j) == i)
	  fchars[j] = 1;
	else
	  fchars[j] = chars[j];
      for (j = 0; j < 16; ++j)
	if (!lens[j])
	  flens[j] = 1;
	else
	  flens[j] = lens[j];

      // estimated bytes = (length-model bits + char-model bits) / 8
      aux_compressed += (long unsigned int) ((Calculate_Huffman_Size (16, flens, lens) +
					      Calculate_Huffman_Size (256, fchars, chars)) / 8);

    }
}
503
504
505
506static bool WriteTextIdx (char *fileName, FTextLevel &textLevels) {
507 // create the text index
508 FILE *textIdxFile;
509 if (!(textIdxFile = create_file (fileName, TEXT_IDX_SUFFIX, "w+b",
510 MAGIC_TEXI, MG_MESSAGE)))
511 return false;
512
513 // output each level
514 CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
515 CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
516 while (tiHere != tiEnd) {
517 // remember the level information
518 TextLevelInfo levelInfo;
519 levelInfo.levelTag = (*tiHere).first;
520 levelInfo.textIdxPtr = ftell (textIdxFile);
521 levelInfo.numEntries = (*tiHere).second.docPtrs.size();
522 textLevels.levelInfo[levelInfo.levelTag] = levelInfo;
523
524 // write out the index array for this level
525 if (!WriteTextIdxArray (textIdxFile, (*tiHere).second.docPtrs))
526 return false;
527
528 ++tiHere;
529 }
530
531 // close the file
532 fclose (textIdxFile);
533
534 return true;
535}
536
537
538bool WriteTextLevels (char *fileName, const FTextLevel &textLevels) {
539 // cout << textLevels;
540
541 // create the text index
542 FILE *textLevelFile;
543 if (!(textLevelFile = create_file (fileName, TEXT_LEVEL_SUFFIX, "w+b",
544 MAGIC_TEXT_LEVELS, MG_MESSAGE)))
545 return false;
546
547 // output each level
548 if (!textLevels.Write (textLevelFile)) return false;
549
550 // close the file
551 fclose (textLevelFile);
552
553 return true;
554}
555
556
557int done_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
558 // write out any remaining bits
559 textOutBuf.encodeDone();
560
561 // write out the compressed text header (with updated statistics)
562 if (fseek (text, sizeof (u_long), SEEK_SET) == -1 || !cth.Write (text))
563 return COMPERROR;
564 fclose (text);
565
566 // write out the text index file and get the level information
567 FTextLevel textLevels;
568 if (!WriteTextIdx (fileName, textLevels)) return COMPERROR;
569
570 // write out the text level information
571 if (!WriteTextLevels (fileName, textLevels)) return COMPERROR;
572
573 // write out the auxiliary dictionary
574 if (cdh.dict_type != MG_COMPLETE_DICTIONARY &&
575 (cdh.novel_method == MG_NOVEL_DELTA ||
576 cdh.novel_method == MG_NOVEL_HYBRID)) {
577 if (write_aux_dict (fileName) == COMPERROR) return COMPERROR;
578 estimate_compressed_aux_dict ();
579
580 } else {
581 unlink (make_name (fileName, TEXT_DICT_AUX_SUFFIX, NULL));
582 }
583
584 return (COMPALLOK);
585}
Note: See TracBrowser for help on using the repository browser.