source: trunk/gsdl/src/mgpp/text/text.pass2.cpp@ 856

Last change on this file since 856 was 856, checked in by sjboddie, 24 years ago

Rodger's new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.6 KB
Line 
1/**************************************************************************
2 *
3 * text.pass2.cpp -- Text compression (Pass 2)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * $Id: text.pass2.cpp 856 2000-01-14 02:26:25Z sjboddie $
22 *
23 **************************************************************************/
24
25
26#include "sysfuncs.h"
27
28#include "memlib.h"
29#include "messages.h"
30#include "local_strings.h"
31#include "bitio_m_stdio.h"
32#include "huffman.h"
33#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
34
35#include "mg.h"
36#include "mg_files.h"
37#include "build.h"
38#include "words.h"
39#include "text.h"
40#include "hash.h"
41#include "locallib.h"
42#include "comp_dict.h"
43
44#include "FText.h"
45
46
47/*
48 $Log$
49 Revision 1.1 2000/01/14 02:26:24 sjboddie
50 Rodgers new C++ mg
51
52 Revision 1.1 1999/10/11 02:58:38 cs025
53 Base install of MG-PP
54
55 Revision 1.1 1999/08/10 21:18:25 sjboddie
56 renamed mg-1.3d directory mg
57
58 Revision 1.2 1998/12/17 09:12:54 rjmcnab
59
60 Altered mg to process utf-8 encoded Unicode. The main changes
61 are in the parsing of the input, the casefolding, and the stemming.
62
63 Revision 1.1 1998/11/17 09:35:47 rjmcnab
64 *** empty log message ***
65
66 * Revision 1.3 1994/10/20 03:57:10 tes
67 * I have rewritten the boolean query optimiser and abstracted out the
68 * components of the boolean query.
69 *
70 * Revision 1.2 1994/09/20 04:42:14 tes
71 * For version 1.1
72 *
73 */
74
// Size in bytes of each string-pool chunk used to store novel words.
#define POOL_SIZE 1024*256

// One chunk in a growable chain of string pools.  Novel words are copied
// into pool[] length-prefixed; ptr is the next free byte and left counts
// the unused bytes remaining in this chunk.
struct char_pool {
  struct char_pool *next;   // next chunk in the chain (NULL at the tail)
  u_long left;              // bytes still free in this chunk
  u_char *ptr;              // first free byte within pool[]
  u_char pool[POOL_SIZE];   // raw storage for length-prefixed words
};

// One slot of a novel-word hash table: the word (a pointer into a
// char_pool, length-prefixed) and the order in which it was first seen.
struct novel_hash_rec {
  u_long ordinal_num;       // 1-based ordinal of first appearance
  u_char *word;             // NULL marks an empty slot
};


// Initial (prime) number of slots in each novel-word hash table.
#define INITIAL_HASH_SIZE 7927

// Open-addressed hash table of words that are absent from the
// compression dictionary; one instance is kept per word type (see nht).
struct novel_hash_table {
  novel_hash_rec *HashTable;      // slot array, grown by rehashing when half full
  u_long HashSize, HashUsed;      // capacity and number of occupied slots
  char_pool *first_pool;          // head of the string-pool chain
  char_pool *pool;                // chunk currently being filled
  u_long next_num, binary_start;  // next ordinal to assign; binary_start only initialised here -- TODO confirm use elsewhere
  novel_hash_rec **code_to_nhr;   // set to NULL in this pass; presumably filled during decompression -- verify
};


static FILE *text;                     // compressed text output file
static stdio_bitio_buffer textOutBuf;  // bit-level encoder attached to `text'
static u_long headLen;                 // bytes of header (magic + cth) preceding the bit stream

static novel_hash_table nht[2];        // novel words: [0] = non-words, [1] = words

// Block boundaries for the MG_NOVEL_HYBRID ordinal coding, per word type.
int blk_start[2][33], blk_end[2][33];

// 0 = non-word, 1 = word
unsigned char whichWordType = 0;

// Per-level-tag compressed-text pointers collected during this pass.
CompressTextInfoMap textInfoMap;
114
115
116static char_pool *new_pool (char_pool * pool) {
117 char_pool *p = (char_pool *) Xmalloc (sizeof (char_pool));
118 if (!p) FatalError (1, "Unable to allocate memory for pool");
119
120 if (pool) pool->next = p;
121 p->next = NULL;
122 p->left = POOL_SIZE;
123 p->ptr = p->pool;
124 return p;
125}
126
127
128int init_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
129 char path[512];
130
131 // load the compression dictionary
132 if (LoadCompressionDictionary (make_name (fileName, TEXT_DICT_SUFFIX,
133 path)) == COMPERROR)
134 return COMPERROR;
135
136 // create the compressed text file and attach it to the text buffer
137 if (!(text = create_file (fileName, TEXT_SUFFIX, "w+b",
138 MAGIC_TEXT, MG_MESSAGE)))
139 return COMPERROR;
140 textOutBuf.attachFile (text);
141 textOutBuf.encodeStart ();
142
143 // write out the compressed text header
144 bzero ((char *) &cth, sizeof (cth));
145 if (fwrite (&cth, sizeof (cth), 1, text) != 1)
146 return COMPERROR;
147
148 // the length of the compressed header is the size of the
149 // magic number plus the size of the compressed text header
150 headLen = sizeof (u_long) + sizeof (cth);
151
152 // start off with non-word
153 whichWordType = 0;
154
155 // clear the text pointers
156 textInfoMap.erase (textInfoMap.begin(), textInfoMap.end());
157
158 // set up the hash table of novel words
159 int i;
160 if (cdh.novel_method != MG_NOVEL_HUFFMAN_CHARS)
161 for (i = 0; i <= 1; i++)
162 {
163 nht[i].HashSize = INITIAL_HASH_SIZE;
164 nht[i].HashTable = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * nht[i].HashSize);
165 bzero ((char *) nht[i].HashTable,
166 sizeof (novel_hash_rec) * nht[i].HashSize);
167 nht[i].HashUsed = 0;
168 nht[i].HashSize = INITIAL_HASH_SIZE;
169 nht[i].pool = nht[i].first_pool = new_pool (NULL);
170 nht[i].next_num = 1;
171 nht[i].binary_start = 1;
172 nht[i].code_to_nhr = NULL;
173 if (cdh.novel_method == MG_NOVEL_HYBRID)
174 {
175 int num;
176 num = 1;
177 blk_start[i][0] = 0;
178 blk_end[i][0] = cdh.num_words[i] - 1;
179 while (num < 33)
180 {
181 blk_start[i][num] = blk_end[i][num - 1] + 1;
182 blk_end[i][num] = blk_start[i][num] +
183 (blk_end[i][num - 1] - blk_start[i][num - 1]) * 2;
184 num++;
185 }
186 }
187 }
188
189 return (COMPALLOK);
190}
191
192
/* Compress l_in bytes of text starting at s_in onto the given bit
 * buffer, alternating non-word / word parses.  `which' selects the
 * current token type (0 = non-word, 1 = word) and is flipped after
 * every token; it is passed by reference so the parse phase carries
 * across successive calls.  `numWords' is incremented once per word
 * token.  Returns COMPALLOK, or COMPERROR when an unknown word is met
 * under a complete dictionary or memory runs out while growing the
 * novel-word table.  Relies on the globals ht, cdh, lens_codes/huff and
 * char_codes/huff declared in the compression-dictionary headers.
 */
static int compress_text (bitio_buffer &buffer, const u_char *s_in, 
			  int l_in, unsigned char &which,
			  unsigned long &numWords) {
  const u_char *end = s_in + l_in - 1;

  for (; s_in <= end; which = !which) {
    u_char Word[MAXWORDLEN + 1];  // length-prefixed: Word[0] is the length
    int res;

    if (which) numWords++;

    /* First parse a word or non-word out of the string */
    if (which) PARSE_WORD (Word, s_in, end);
    else PARSE_NON_WORD (Word, s_in, end);

    /* Search the hash table for Word */
    if (ht[which]) {
      register unsigned long hashval, step;
      register int tsize = ht[which]->size;
      register u_char **wptr;
      HASH (hashval, step, Word, tsize);
      for (;;) {
	register u_char *s1;
	register u_char *s2;
	register int len;
	wptr = ht[which]->table[hashval];
	if (wptr == NULL) {
	  // empty slot reached: the word is not in the dictionary
	  res = COMPERROR;
	  break;
	}

	/* Compare the words (length byte first, then the characters) */
	s1 = Word;
	s2 = *wptr;
	len = *s1 + 1;
	for (; len; len--)
	  if (*s1++ != *s2++) break;

	if (len) {
	  // mismatch: open addressing, probe onward by the second hash
	  hashval += step;
	  if (hashval >= (unsigned long)tsize) hashval -= tsize;

	} else {
	  // match: the code is the word's index in the dictionary array
	  res = ht[which]->table[hashval] - ht[which]->words;
	  break;
	}
      }
    }
    else
      res = COMPERROR;
    /* Check that the word was found in the dictionary */
    if (res == COMPERROR)
      {
	if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
	  {
	    // a complete dictionary must contain every word: hard error
	    Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	    return (COMPERROR);
	  }
	if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
	  {
	    // emit the escape code, then spell the word out with the
	    // length and character Huffman codes
	    u_long i;
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    buffer.huff_encode (Word[0], lens_codes[which], lens_huff[which].clens, NULL);
	    for (i = 0; i < Word[0]; i++)
	      buffer.huff_encode (Word[i + 1], char_codes[which],
				  char_huff[which].clens, NULL);
	  }
	if (cdh.dict_type == MG_SEED_DICTIONARY)
	  {
	    // emit the escape code, then the novel word in whichever
	    // representation the header selected
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    switch (cdh.novel_method)
	      {
	      case MG_NOVEL_HUFFMAN_CHARS:
		{
		  // spell the word out character by character
		  u_long i;
		  buffer.huff_encode (Word[0], lens_codes[which],
				      lens_huff[which].clens, NULL);
		  for (i = 0; i < Word[0]; i++)
		    buffer.huff_encode (Word[i + 1], char_codes[which],
					char_huff[which].clens, NULL);
		}
		break;
	      case MG_NOVEL_DELTA:
	      case MG_NOVEL_HYBRID:
		{
		  // look the word up in (or insert it into) the
		  // novel-word hash table to obtain its ordinal number
		  register unsigned long hashval, step;
		  register novel_hash_table *h = &nht[which];
		  register int hsize = h->HashSize;
		  register novel_hash_rec *ent;
		  HASH (hashval, step, Word, hsize);
		  for (;;)
		    {
		      register u_char *s1, *s2;
		      register int len;
		      ent = h->HashTable + hashval;
		      if (!ent->word)
			{
			  // free slot: brand-new novel word -- copy it
			  // into the string pool and assign the next
			  // ordinal
			  int len = *Word + 1;
			  if ((unsigned)len > h->pool->left)
			    h->pool = new_pool (h->pool);
			  ent->word = h->pool->ptr;
			  ent->ordinal_num = h->next_num++;
			  memcpy (h->pool->ptr, Word, len);
			  h->pool->ptr += len;
			  h->pool->left -= len;
			  h->HashUsed++;
			  break;
			}
		      /* Compare the words */
		      s1 = Word;
		      s2 = ent->word;
		      len = *s1 + 1;
		      for (; len; len--)
			if (*s1++ != *s2++)
			  break;

		      if (!len)
			break;  // word already seen: reuse its ordinal

		      hashval = (hashval + step);
		      if (hashval >= (unsigned)hsize) hashval -= hsize;
		    }

		  // encode the ordinal by the selected method
		  switch (cdh.novel_method) {
		  case MG_NOVEL_DELTA:
		    buffer.delta_encode (ent->ordinal_num, NULL);
		    break;
		  case MG_NOVEL_HYBRID:
		    // find the doubling block k that contains ordinal j,
		    // then emit gamma(k+1) plus j's offset in the block
		    int k = 0;
		    int j = ent->ordinal_num - 1;
		    while (j > blk_end[which][k])
		      k++;
		    assert (j - blk_start[which][k] + 1 >= 1 &&
			    j - blk_start[which][k] + 1 <=
			    blk_end[which][k] - blk_start[which][k] + 1);

		    buffer.gamma_encode (k + 1, NULL);
		    buffer.binary_encode (j - blk_start[which][k] + 1,
					  blk_end[which][k] - blk_start[which][k] + 1,
					  NULL);
		    break;
		  }

		  // grow the table once it is at least half full, to a
		  // prime roughly double the current size
		  if (h->HashUsed >= h->HashSize >> 1)
		    {
		      novel_hash_rec *ht;
		      unsigned long size;
		      unsigned long i;
		      size = prime (h->HashSize * 2);
		      if (!(ht = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * size)))
			{
			  Message ("Unable to allocate memory for table");
			  return (COMPERROR);
			}
		      bzero ((char *) ht, sizeof (novel_hash_rec) * size);

		      // re-insert every occupied slot into the new table
		      for (i = 0; i < h->HashSize; i++)
			if (h->HashTable[i].word)
			  {
			    register u_char *wptr;
			    register unsigned long hashval, step;

			    wptr = h->HashTable[i].word;
			    HASH (hashval, step, wptr, size);
			    wptr = (ht + hashval)->word;
			    while (wptr)
			      {
				hashval += step;
				if (hashval >= size)
				  hashval -= size;
				wptr = (ht + hashval)->word;
			      }
			    ht[hashval] = h->HashTable[i];
			  }
		      Xfree (h->HashTable);
		      h->HashTable = ht;
		      h->HashSize = size;
		    }
		}
		break;
	      }
	  }
      }
    else
      {
	// known word: emit its Huffman code
	buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
      }
  }

  return COMPALLOK;
}
392
393
394int process_text_2 (const TagInfo &tagInfo, const TextElArray &doc) {
395 // we need to remember where the last text element ends
396 unsigned char endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
397 unsigned long endPos = textOutBuf.GetBytesWritten () + headLen;
398
399 // process each text element
400 TextElArray::const_iterator here = doc.begin();
401 TextElArray::const_iterator end = doc.end();
402 while (here != end) {
403 // remember current place in compressed text
404 unsigned char startWhich = whichWordType;
405 unsigned char startBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
406 unsigned long startPos = textOutBuf.GetBytesWritten () + headLen;
407
408 // compress the text
409 if (compress_text (textOutBuf,
410 (*here).text.begin(),
411 (*here).text.size(),
412 whichWordType,
413 cth.num_of_words) != COMPALLOK)
414 return COMPERROR;
415
416 endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
417 endPos = textOutBuf.GetBytesWritten () + headLen;
418
419 // update the pointers if this is a level tag
420 if (tagInfo.levelTags.find ((*here).tagName) != tagInfo.levelTags.end()) {
421 if ((*here).elType == OpenTagE) {
422 // set the start of the document
423 textInfoMap[(*here).tagName].SetStart (startPos, startBit, startWhich);
424
425 } else if ((*here).elType == CloseTagE) {
426 // set the end of the document
427 textInfoMap[(*here).tagName].SetEnd (endPos, endBit);
428
429 } else {
430 // TextE, nothing needs to be done
431 }
432 }
433
434 cth.num_of_bytes += (*here).text.size();
435 here++;
436 }
437
438 // close off any unclosed tags
439 CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
440 CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
441 while (tiHere != tiEnd) {
442 if ((*tiHere).second.inDoc) (*tiHere).second.SetEnd (endPos, endBit);
443 tiHere++;
444 }
445
446 // we've processed one more document
447 cth.num_of_docs++;
448
449 return COMPALLOK;
450}
451
452
453static int write_aux_dict (char *fileName) {
454 int i;
455 FILE *aux;
456 if (!(aux = create_file (fileName, TEXT_DICT_AUX_SUFFIX, "wb",
457 MAGIC_AUX_DICT, MG_MESSAGE)))
458 return COMPERROR;
459
460 for (i = 0; i <= 1; i++)
461 {
462 aux_frags_header afh;
463 char_pool *cp;
464
465 afh.num_frags = nht[i].HashUsed;
466 afh.mem_for_frags = 0;
467 for (cp = nht[i].first_pool; cp; cp = cp->next)
468 afh.mem_for_frags += POOL_SIZE - cp->left;
469
470 /* [RPAP - Jan 97: Endian Ordering] */
471 HTONUL(afh.num_frags);
472 HTONUL(afh.mem_for_frags);
473
474 fwrite (&afh, sizeof (afh), 1, aux);
475
476 for (cp = nht[i].first_pool; cp; cp = cp->next)
477 fwrite (cp->pool, POOL_SIZE - cp->left, sizeof (u_char), aux);
478 }
479 fclose (aux);
480 return COMPALLOK;
481}
482
483
/* Estimate, in bytes, how large the auxiliary dictionary would be if
 * its novel-word fragments were Huffman coded, by tallying length and
 * character frequencies over every string pool.
 *
 * NOTE(review): `aux_compressed' and `total_uncomp' are locals that are
 * never read after the loop, so the function currently computes the
 * estimate and discards it -- presumably leftover from mg's statistics
 * reporting; confirm before removing the call.
 */
static void estimate_compressed_aux_dict (void) {
  int i;
  u_long aux_compressed = 0, total_uncomp = 0;
  // one pass per word type (0 = non-word, 1 = word)
  for (i = 0; i <= 1; i++)
    {
      int j;
      long chars[256], fchars[256];   // character counts: raw / floored
      long lens[16], flens[16];       // word-length counts: raw / floored
      char_pool *cp;
      bzero ((char *) chars, sizeof (chars));
      bzero ((char *) lens, sizeof (lens));
      // walk every pool chunk; fragments are stored length-prefixed
      for (cp = nht[i].first_pool; cp; cp = cp->next)
	{
	  u_char *buf = cp->pool;
	  while (buf != cp->ptr)
	    {
	      int len = *buf++;
	      lens[len]++;
	      total_uncomp += len + 4;
	      for (; len; len--)
		chars[*buf++]++;
	    }
	}
      // floor unseen-but-possible symbols at frequency 1 so the
      // Huffman size calculation stays well defined
      for (j = 0; j < 256; j++)
	if (!chars[j] && PESINAWORD (j) == i)
	  fchars[j] = 1;
	else
	  fchars[j] = chars[j];
      for (j = 0; j < 16; j++)
	if (!lens[j])
	  flens[j] = 1;
	else
	  flens[j] = lens[j];

      // Huffman-coded size of the length and character streams, bits -> bytes
      aux_compressed += (long unsigned int) ((Calculate_Huffman_Size (16, flens, lens) +
			      Calculate_Huffman_Size (256, fchars, chars)) / 8);

    }
}
523
524
525
526static bool WriteTextIdx (char *fileName, FTextLevel &textLevels) {
527 // create the text index
528 FILE *textIdxFile;
529 if (!(textIdxFile = create_file (fileName, TEXT_IDX_SUFFIX, "w+b",
530 MAGIC_TEXI, MG_MESSAGE)))
531 return false;
532
533 // output each level
534 CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
535 CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
536 while (tiHere != tiEnd) {
537 // remember the level information
538 TextLevelInfo levelInfo;
539 levelInfo.levelTag = (*tiHere).first;
540 levelInfo.textIdxPtr = ftell (textIdxFile);
541 levelInfo.numEntries = (*tiHere).second.docPtrs.size();
542 textLevels.levelInfo[levelInfo.levelTag] = levelInfo;
543
544 // write out the index array for this level
545 if (!WriteTextIdxArray (textIdxFile, (*tiHere).second.docPtrs))
546 return false;
547
548 tiHere++;
549 }
550
551 // close the file
552 fclose (textIdxFile);
553
554 return true;
555}
556
557
558bool WriteTextLevels (char *fileName, const FTextLevel &textLevels) {
559 // cout << textLevels;
560
561 // create the text index
562 FILE *textLevelFile;
563 if (!(textLevelFile = create_file (fileName, TEXT_LEVEL_SUFFIX, "w+b",
564 MAGIC_TEXT_LEVELS, MG_MESSAGE)))
565 return false;
566
567 // output each level
568 if (!textLevels.Write (textLevelFile)) return false;
569
570 // close the file
571 fclose (textLevelFile);
572
573 return true;
574}
575
576
577int done_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
578 // write out any remaining bits
579 textOutBuf.encodeDone();
580
581 // write out the compressed text header (with updated statistics)
582 if (fseek (text, sizeof (u_long), SEEK_SET) == -1 || !cth.Write (text))
583 return COMPERROR;
584 fclose (text);
585
586 // write out the text index file and get the level information
587 FTextLevel textLevels;
588 if (!WriteTextIdx (fileName, textLevels)) return COMPERROR;
589
590 // write out the text level information
591 if (!WriteTextLevels (fileName, textLevels)) return COMPERROR;
592
593 // write out the auxiliary dictionary
594 if (cdh.dict_type != MG_COMPLETE_DICTIONARY &&
595 (cdh.novel_method == MG_NOVEL_DELTA ||
596 cdh.novel_method == MG_NOVEL_HYBRID)) {
597 if (write_aux_dict (fileName) == COMPERROR) return COMPERROR;
598 estimate_compressed_aux_dict ();
599
600 } else {
601 unlink (make_name (fileName, TEXT_DICT_AUX_SUFFIX, NULL));
602 }
603
604 return (COMPALLOK);
605}
Note: See TracBrowser for help on using the repository browser.