/**************************************************************************
 *
 * text.pass2.cpp -- Text compression (Pass 2)
 * Copyright (C) 1994  Neil Sharman, Gary Eddy and Alistair Moffat
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/
---|
// need this to avoid bizarre compiler problems under VC++ 6.0
#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
# include <iostream>
#endif

#if defined (__WIN32__)
# include <stdio.h>
# define unlink _unlink
#endif

#include "sysfuncs.h"
#include "memlib.h"
#include "messages.h"
#include "local_strings.h"
#include "bitio_m_stdio.h"
#include "huffman.h"
#include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */
#include "mg.h"
#include "mg_files.h"
#include "build.h"
#include "words.h"
#include "text.h"
#include "hash.h"
#include "locallib.h"
#include "comp_dict.h"
#include "FText.h"
/* Size in bytes of each character-pool chunk used to store novel words.
   Parenthesized so the macro expands safely inside larger expressions. */
#define POOL_SIZE (1024*256)
[856] | 51 |
|
---|
| 52 |
|
---|
/* One chunk in a singly linked chain of character pools.  Novel words are
   stored back-to-back (length byte followed by the characters) inside
   pool[]; ptr/left track the fill position of this chunk. */
struct char_pool {
  struct char_pool *next;   /* next chunk in the chain, or NULL for the last */
  u_long left;              /* number of unused bytes remaining in pool[] */
  u_char *ptr;              /* next free byte inside pool[] */
  u_char pool[POOL_SIZE];   /* raw storage for length-prefixed words */
};
| 59 |
|
---|
/* One slot of the novel-word hash table. */
struct novel_hash_rec {
  u_long ordinal_num;   /* 1-based order in which the word was first seen */
  u_char *word;         /* length-prefixed word, stored in a char_pool;
                           NULL means the slot is empty */
};
| 64 |
|
---|
| 65 |
|
---|
/* Initial slot count for each novel-word hash table.  7927 is prime, which
   suits the double-hashing probe sequence produced by HASH(). */
#define INITIAL_HASH_SIZE 7927
| 67 |
|
---|
/* Open-addressed hash table of "novel" words -- words met during pass 2
   that are not in the compression dictionary.  One table per word type
   (0 = non-word, 1 = word). */
struct novel_hash_table {
  novel_hash_rec *HashTable;       /* array of HashSize slots */
  u_long HashSize, HashUsed;       /* total slots / occupied slots */
  char_pool *first_pool;           /* head of the string-pool chain */
  char_pool *pool;                 /* current (last) pool; new words go here */
  u_long next_num, binary_start;   /* next ordinal to assign to a new word;
                                      binary_start is initialized to 1 but not
                                      otherwise used in this pass -- presumably
                                      consumed elsewhere, confirm */
  novel_hash_rec **code_to_nhr;    /* reverse code->entry map; set to NULL and
                                      unused in this file */
};
| 76 |
|
---|
| 77 |
|
---|
/* Output file holding the compressed text. */
static FILE *text;
/* Bit-level output buffer attached to `text'. */
static stdio_bitio_buffer textOutBuf;
/* Number of bytes written before the compressed text proper: the magic
   number plus the compressed text header (see init_text_2). */
static u_long headLen;

/* Novel-word hash tables: nht[0] = non-words, nht[1] = words. */
static novel_hash_table nht[2];

/* Block boundaries for the MG_NOVEL_HYBRID ordinal encoding; first index is
   the word type, second the block number (block sizes double each step). */
int blk_start[2][33], blk_end[2][33];

// 0 = non-word, 1 = word
unsigned char whichWordType = 0;

/* Per-level-tag start/end pointer information accumulated during pass 2. */
CompressTextInfoMap textInfoMap;
| 90 |
|
---|
| 91 |
|
---|
| 92 | static char_pool *new_pool (char_pool * pool) {
|
---|
| 93 | char_pool *p = (char_pool *) Xmalloc (sizeof (char_pool));
|
---|
| 94 | if (!p) FatalError (1, "Unable to allocate memory for pool");
|
---|
| 95 |
|
---|
| 96 | if (pool) pool->next = p;
|
---|
| 97 | p->next = NULL;
|
---|
| 98 | p->left = POOL_SIZE;
|
---|
| 99 | p->ptr = p->pool;
|
---|
| 100 | return p;
|
---|
| 101 | }
|
---|
| 102 |
|
---|
| 103 |
|
---|
| 104 | int init_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
|
---|
| 105 | char path[512];
|
---|
| 106 |
|
---|
| 107 | // load the compression dictionary
|
---|
| 108 | if (LoadCompressionDictionary (make_name (fileName, TEXT_DICT_SUFFIX,
|
---|
| 109 | path)) == COMPERROR)
|
---|
| 110 | return COMPERROR;
|
---|
| 111 |
|
---|
| 112 | // create the compressed text file and attach it to the text buffer
|
---|
| 113 | if (!(text = create_file (fileName, TEXT_SUFFIX, "w+b",
|
---|
| 114 | MAGIC_TEXT, MG_MESSAGE)))
|
---|
| 115 | return COMPERROR;
|
---|
| 116 | textOutBuf.attachFile (text);
|
---|
| 117 | textOutBuf.encodeStart ();
|
---|
| 118 |
|
---|
| 119 | // write out the compressed text header
|
---|
| 120 | bzero ((char *) &cth, sizeof (cth));
|
---|
| 121 | if (fwrite (&cth, sizeof (cth), 1, text) != 1)
|
---|
| 122 | return COMPERROR;
|
---|
| 123 |
|
---|
| 124 | // the length of the compressed header is the size of the
|
---|
| 125 | // magic number plus the size of the compressed text header
|
---|
| 126 | headLen = sizeof (u_long) + sizeof (cth);
|
---|
| 127 |
|
---|
| 128 | // start off with non-word
|
---|
| 129 | whichWordType = 0;
|
---|
| 130 |
|
---|
| 131 | // clear the text pointers
|
---|
| 132 | textInfoMap.erase (textInfoMap.begin(), textInfoMap.end());
|
---|
| 133 |
|
---|
| 134 | // set up the hash table of novel words
|
---|
| 135 | int i;
|
---|
| 136 | if (cdh.novel_method != MG_NOVEL_HUFFMAN_CHARS)
|
---|
| 137 | for (i = 0; i <= 1; i++)
|
---|
| 138 | {
|
---|
| 139 | nht[i].HashSize = INITIAL_HASH_SIZE;
|
---|
| 140 | nht[i].HashTable = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * nht[i].HashSize);
|
---|
| 141 | bzero ((char *) nht[i].HashTable,
|
---|
| 142 | sizeof (novel_hash_rec) * nht[i].HashSize);
|
---|
| 143 | nht[i].HashUsed = 0;
|
---|
| 144 | nht[i].HashSize = INITIAL_HASH_SIZE;
|
---|
| 145 | nht[i].pool = nht[i].first_pool = new_pool (NULL);
|
---|
| 146 | nht[i].next_num = 1;
|
---|
| 147 | nht[i].binary_start = 1;
|
---|
| 148 | nht[i].code_to_nhr = NULL;
|
---|
| 149 | if (cdh.novel_method == MG_NOVEL_HYBRID)
|
---|
| 150 | {
|
---|
| 151 | int num;
|
---|
| 152 | num = 1;
|
---|
| 153 | blk_start[i][0] = 0;
|
---|
| 154 | blk_end[i][0] = cdh.num_words[i] - 1;
|
---|
| 155 | while (num < 33)
|
---|
| 156 | {
|
---|
| 157 | blk_start[i][num] = blk_end[i][num - 1] + 1;
|
---|
| 158 | blk_end[i][num] = blk_start[i][num] +
|
---|
| 159 | (blk_end[i][num - 1] - blk_start[i][num - 1]) * 2;
|
---|
| 160 | num++;
|
---|
| 161 | }
|
---|
| 162 | }
|
---|
| 163 | }
|
---|
| 164 |
|
---|
| 165 | return (COMPALLOK);
|
---|
| 166 | }
|
---|
| 167 |
|
---|
| 168 |
|
---|
/* Compress l_in bytes of raw text from s_in into the bit buffer.  The text
   is parsed as an alternating sequence of non-words and words; `which'
   carries the parity across calls (0 = non-word, 1 = word) and `numWords'
   is incremented once per word parsed.  Words found in the compression
   dictionary are Huffman-coded; words not in the dictionary are handled
   according to cdh.dict_type / cdh.novel_method (spelled out with character
   Huffman codes, or assigned ordinals in the novel-word hash tables and
   delta/hybrid coded).  Returns COMPALLOK, or COMPERROR on an unknown word
   with a complete dictionary. */
static int compress_text (bitio_buffer &buffer, const u_char *s_in,
			  int l_in, unsigned char &which,
			  unsigned long &numWords) {
  const u_char *end = s_in + l_in - 1;

  /* alternate word/non-word until the input is exhausted */
  for (; s_in <= end; which = !which) {
    u_char Word[MAXWORDLEN + 1];  /* length-prefixed parse buffer */
    int res;

    if (which) numWords++;

    /* First parse a word or non-word out of the string */
    if (which) PARSE_WORD (Word, s_in, end);
    else PARSE_NON_WORD (Word, s_in, end);

    /* Search the hash table for Word */
    if (ht[which]) {
      register unsigned long hashval, step;
      register int tsize = ht[which]->size;
      register u_char **wptr;
      HASH (hashval, step, Word, tsize);
      /* double-hash probe: advance by `step' until found or empty slot */
      for (;;) {
	register u_char *s1;
	register u_char *s2;
	register int len;
	wptr = ht[which]->table[hashval];
	if (wptr == NULL) {
	  /* empty slot: Word is not in the dictionary */
	  res = COMPERROR;
	  break;
	}

	/* Compare the words (first byte is the length) */
	s1 = Word;
	s2 = *wptr;
	len = *s1 + 1;
	for (; len; len--)
	  if (*s1++ != *s2++) break;

	if (len) {
	  /* mismatch: probe the next slot */
	  hashval += step;
	  if (hashval >= (unsigned long)tsize) hashval -= tsize;

	} else {
	  /* match: res is the word's index in the dictionary */
	  res = ht[which]->table[hashval] - ht[which]->words;
	  break;
	}
      }
    }
    else
      res = COMPERROR;
    /* Check that the word was found in the dictionary */
    if (res == COMPERROR)
      {
	if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
	  {
	    Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	    return (COMPERROR);
	  }
	if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
	  {
	    u_long i;
	    /* emit the escape code (last code in the table), then spell the
	       word out with the length and character Huffman codes */
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    buffer.huff_encode (Word[0], lens_codes[which], lens_huff[which].clens, NULL);
	    for (i = 0; i < Word[0]; i++)
	      buffer.huff_encode (Word[i + 1], char_codes[which],
				  char_huff[which].clens, NULL);
	  }
	if (cdh.dict_type == MG_SEED_DICTIONARY)
	  {
	    /* emit the escape code, then encode the novel word according to
	       the configured novel method */
	    if (ht[which])
	      {
		res = ht[which]->hd->num_codes - 1;
		buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	      }
	    switch (cdh.novel_method)
	      {
	      case MG_NOVEL_HUFFMAN_CHARS:
		{
		  /* spell the word out character by character */
		  u_long i;
		  buffer.huff_encode (Word[0], lens_codes[which],
				      lens_huff[which].clens, NULL);
		  for (i = 0; i < Word[0]; i++)
		    buffer.huff_encode (Word[i + 1], char_codes[which],
					char_huff[which].clens, NULL);
		}
		break;
	      case MG_NOVEL_DELTA:
	      case MG_NOVEL_HYBRID:
		{
		  /* look the word up in (or insert it into) the novel-word
		     hash table, obtaining its ordinal number */
		  register unsigned long hashval, step;
		  register novel_hash_table *h = &nht[which];
		  register int hsize = h->HashSize;
		  register novel_hash_rec *ent;
		  HASH (hashval, step, Word, hsize);
		  for (;;)
		    {
		      register u_char *s1, *s2;
		      register int len;
		      ent = h->HashTable + hashval;
		      if (!ent->word)
			{
			  /* empty slot: insert the new word, copying its
			     bytes into the current char_pool */
			  int len = *Word + 1;
			  if ((unsigned)len > h->pool->left)
			    h->pool = new_pool (h->pool);
			  ent->word = h->pool->ptr;
			  ent->ordinal_num = h->next_num++;
			  memcpy (h->pool->ptr, Word, len);
			  h->pool->ptr += len;
			  h->pool->left -= len;
			  h->HashUsed++;
			  break;
			}
		      /* Compare the words */
		      s1 = Word;
		      s2 = ent->word;
		      len = *s1 + 1;
		      for (; len; len--)
			if (*s1++ != *s2++)
			  break;

		      if (!len)
			break;  /* found a previous occurrence */

		      hashval = (hashval + step);
		      if (hashval >= (unsigned)hsize) hashval -= hsize;
		    }

		  /* encode the ordinal number of the novel word */
		  switch (cdh.novel_method) {
		  case MG_NOVEL_DELTA:
		    buffer.delta_encode (ent->ordinal_num, NULL);
		    break;
		  case MG_NOVEL_HYBRID:
		    /* gamma-code the block number, then binary-code the
		       offset within the block (blocks precomputed in
		       init_text_2) */
		    int k = 0;
		    int j = ent->ordinal_num - 1;
		    while (j > blk_end[which][k])
		      k++;
		    assert (j - blk_start[which][k] + 1 >= 1 &&
			    j - blk_start[which][k] + 1 <=
			    blk_end[which][k] - blk_start[which][k] + 1);

		    buffer.gamma_encode (k + 1, NULL);
		    buffer.binary_encode (j - blk_start[which][k] + 1,
					  blk_end[which][k] - blk_start[which][k] + 1,
					  NULL);
		    break;
		  }

		  /* grow and rehash once the table is half full */
		  if (h->HashUsed >= h->HashSize >> 1)
		    {
		      novel_hash_rec *ht;
		      unsigned long size;
		      unsigned long i;
		      size = prime (h->HashSize * 2);
		      if (!(ht = (novel_hash_rec *) Xmalloc (sizeof (novel_hash_rec) * size)))
			{
			  Message ("Unable to allocate memory for table");
			  return (COMPERROR);
			}
		      bzero ((char *) ht, sizeof (novel_hash_rec) * size);

		      /* reinsert every occupied slot into the new table */
		      for (i = 0; i < h->HashSize; i++)
			if (h->HashTable[i].word)
			  {
			    register u_char *wptr;
			    register unsigned long hashval, step;

			    wptr = h->HashTable[i].word;
			    HASH (hashval, step, wptr, size);
			    wptr = (ht + hashval)->word;
			    while (wptr)
			      {
				hashval += step;
				if (hashval >= size)
				  hashval -= size;
				wptr = (ht + hashval)->word;
			      }
			    ht[hashval] = h->HashTable[i];
			  }
		      Xfree (h->HashTable);
		      h->HashTable = ht;
		      h->HashSize = size;
		    }
		}
		break;
	      }
	  }
      }
    else
      {
	/* word found in the dictionary: emit its Huffman code */
	buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
      }
  }

  return COMPALLOK;
}
| 368 |
|
---|
| 369 |
|
---|
| 370 | int process_text_2 (const TagInfo &tagInfo, const TextElArray &doc) {
|
---|
| 371 | // we need to remember where the last text element ends
|
---|
| 372 | unsigned char endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
|
---|
| 373 | unsigned long endPos = textOutBuf.GetBytesWritten () + headLen;
|
---|
| 374 |
|
---|
| 375 | // process each text element
|
---|
| 376 | TextElArray::const_iterator here = doc.begin();
|
---|
| 377 | TextElArray::const_iterator end = doc.end();
|
---|
| 378 | while (here != end) {
|
---|
| 379 | // remember current place in compressed text
|
---|
| 380 | unsigned char startWhich = whichWordType;
|
---|
| 381 | unsigned char startBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
|
---|
| 382 | unsigned long startPos = textOutBuf.GetBytesWritten () + headLen;
|
---|
| 383 |
|
---|
| 384 | // compress the text
|
---|
| 385 | if (compress_text (textOutBuf,
|
---|
| 386 | (*here).text.begin(),
|
---|
| 387 | (*here).text.size(),
|
---|
| 388 | whichWordType,
|
---|
| 389 | cth.num_of_words) != COMPALLOK)
|
---|
| 390 | return COMPERROR;
|
---|
| 391 |
|
---|
| 392 | endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
|
---|
| 393 | endPos = textOutBuf.GetBytesWritten () + headLen;
|
---|
| 394 |
|
---|
| 395 | // update the pointers if this is a level tag
|
---|
| 396 | if (tagInfo.levelTags.find ((*here).tagName) != tagInfo.levelTags.end()) {
|
---|
| 397 | if ((*here).elType == OpenTagE) {
|
---|
| 398 | // set the start of the document
|
---|
| 399 | textInfoMap[(*here).tagName].SetStart (startPos, startBit, startWhich);
|
---|
| 400 |
|
---|
| 401 | } else if ((*here).elType == CloseTagE) {
|
---|
| 402 | // set the end of the document
|
---|
| 403 | textInfoMap[(*here).tagName].SetEnd (endPos, endBit);
|
---|
| 404 |
|
---|
| 405 | } else {
|
---|
| 406 | // TextE, nothing needs to be done
|
---|
| 407 | }
|
---|
| 408 | }
|
---|
| 409 |
|
---|
| 410 | cth.num_of_bytes += (*here).text.size();
|
---|
| 411 | here++;
|
---|
| 412 | }
|
---|
| 413 |
|
---|
| 414 | // close off any unclosed tags
|
---|
| 415 | CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
|
---|
| 416 | CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
|
---|
| 417 | while (tiHere != tiEnd) {
|
---|
| 418 | if ((*tiHere).second.inDoc) (*tiHere).second.SetEnd (endPos, endBit);
|
---|
| 419 | tiHere++;
|
---|
| 420 | }
|
---|
| 421 |
|
---|
| 422 | // we've processed one more document
|
---|
| 423 | cth.num_of_docs++;
|
---|
| 424 |
|
---|
| 425 | return COMPALLOK;
|
---|
| 426 | }
|
---|
| 427 |
|
---|
| 428 |
|
---|
| 429 | static int write_aux_dict (char *fileName) {
|
---|
| 430 | int i;
|
---|
| 431 | FILE *aux;
|
---|
| 432 | if (!(aux = create_file (fileName, TEXT_DICT_AUX_SUFFIX, "wb",
|
---|
| 433 | MAGIC_AUX_DICT, MG_MESSAGE)))
|
---|
| 434 | return COMPERROR;
|
---|
| 435 |
|
---|
| 436 | for (i = 0; i <= 1; i++)
|
---|
| 437 | {
|
---|
| 438 | aux_frags_header afh;
|
---|
| 439 | char_pool *cp;
|
---|
| 440 |
|
---|
| 441 | afh.num_frags = nht[i].HashUsed;
|
---|
| 442 | afh.mem_for_frags = 0;
|
---|
| 443 | for (cp = nht[i].first_pool; cp; cp = cp->next)
|
---|
| 444 | afh.mem_for_frags += POOL_SIZE - cp->left;
|
---|
| 445 |
|
---|
| 446 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 447 | HTONUL(afh.num_frags);
|
---|
| 448 | HTONUL(afh.mem_for_frags);
|
---|
| 449 |
|
---|
| 450 | fwrite (&afh, sizeof (afh), 1, aux);
|
---|
| 451 |
|
---|
| 452 | for (cp = nht[i].first_pool; cp; cp = cp->next)
|
---|
| 453 | fwrite (cp->pool, POOL_SIZE - cp->left, sizeof (u_char), aux);
|
---|
| 454 | }
|
---|
| 455 | fclose (aux);
|
---|
| 456 | return COMPALLOK;
|
---|
| 457 | }
|
---|
| 458 |
|
---|
| 459 |
|
---|
/* Estimate how large the auxiliary dictionary would be once Huffman
   compressed: gathers character and word-length frequencies over all novel
   words and sums the predicted Huffman-coded sizes.
   NOTE(review): both accumulators (aux_compressed, total_uncomp) are local
   and are neither stored nor reported before the function returns, so the
   computation currently has no observable effect -- presumably diagnostic
   output was removed at some point; confirm before relying on it. */
static void estimate_compressed_aux_dict (void) {
  int i;
  u_long aux_compressed = 0, total_uncomp = 0;
  for (i = 0; i <= 1; i++)
    {
      int j;
      long chars[256], fchars[256];   /* character frequencies (raw/smoothed) */
      long lens[16], flens[16];       /* word-length frequencies (raw/smoothed) */
      char_pool *cp;
      bzero ((char *) chars, sizeof (chars));
      bzero ((char *) lens, sizeof (lens));
      /* walk every pool chunk, tallying each stored word (length byte
         followed by that many characters) */
      for (cp = nht[i].first_pool; cp; cp = cp->next)
	{
	  u_char *buf = cp->pool;
	  while (buf != cp->ptr)
	    {
	      int len = *buf++;
	      lens[len]++;
	      total_uncomp += len + 4;
	      for (; len; len--)
		chars[*buf++]++;
	    }
	}
      /* replace zero frequencies with 1 so every symbol that could occur
         gets a code when the Huffman size is calculated */
      for (j = 0; j < 256; j++)
	if (!chars[j] && PESINAWORD (j) == i)
	  fchars[j] = 1;
	else
	  fchars[j] = chars[j];
      for (j = 0; j < 16; j++)
	if (!lens[j])
	  flens[j] = 1;
	else
	  flens[j] = lens[j];

      /* predicted compressed size in bytes (lengths + characters) */
      aux_compressed += (long unsigned int) ((Calculate_Huffman_Size (16, flens, lens) +
					      Calculate_Huffman_Size (256, fchars, chars)) / 8);

    }
}
| 499 |
|
---|
| 500 |
|
---|
| 501 |
|
---|
| 502 | static bool WriteTextIdx (char *fileName, FTextLevel &textLevels) {
|
---|
| 503 | // create the text index
|
---|
| 504 | FILE *textIdxFile;
|
---|
| 505 | if (!(textIdxFile = create_file (fileName, TEXT_IDX_SUFFIX, "w+b",
|
---|
| 506 | MAGIC_TEXI, MG_MESSAGE)))
|
---|
| 507 | return false;
|
---|
| 508 |
|
---|
| 509 | // output each level
|
---|
| 510 | CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
|
---|
| 511 | CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
|
---|
| 512 | while (tiHere != tiEnd) {
|
---|
| 513 | // remember the level information
|
---|
| 514 | TextLevelInfo levelInfo;
|
---|
| 515 | levelInfo.levelTag = (*tiHere).first;
|
---|
| 516 | levelInfo.textIdxPtr = ftell (textIdxFile);
|
---|
| 517 | levelInfo.numEntries = (*tiHere).second.docPtrs.size();
|
---|
| 518 | textLevels.levelInfo[levelInfo.levelTag] = levelInfo;
|
---|
| 519 |
|
---|
| 520 | // write out the index array for this level
|
---|
| 521 | if (!WriteTextIdxArray (textIdxFile, (*tiHere).second.docPtrs))
|
---|
| 522 | return false;
|
---|
| 523 |
|
---|
| 524 | tiHere++;
|
---|
| 525 | }
|
---|
| 526 |
|
---|
| 527 | // close the file
|
---|
| 528 | fclose (textIdxFile);
|
---|
| 529 |
|
---|
| 530 | return true;
|
---|
| 531 | }
|
---|
| 532 |
|
---|
| 533 |
|
---|
| 534 | bool WriteTextLevels (char *fileName, const FTextLevel &textLevels) {
|
---|
| 535 | // cout << textLevels;
|
---|
| 536 |
|
---|
| 537 | // create the text index
|
---|
| 538 | FILE *textLevelFile;
|
---|
| 539 | if (!(textLevelFile = create_file (fileName, TEXT_LEVEL_SUFFIX, "w+b",
|
---|
| 540 | MAGIC_TEXT_LEVELS, MG_MESSAGE)))
|
---|
| 541 | return false;
|
---|
| 542 |
|
---|
| 543 | // output each level
|
---|
| 544 | if (!textLevels.Write (textLevelFile)) return false;
|
---|
| 545 |
|
---|
| 546 | // close the file
|
---|
| 547 | fclose (textLevelFile);
|
---|
| 548 |
|
---|
| 549 | return true;
|
---|
| 550 | }
|
---|
| 551 |
|
---|
| 552 |
|
---|
| 553 | int done_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
|
---|
| 554 | // write out any remaining bits
|
---|
| 555 | textOutBuf.encodeDone();
|
---|
| 556 |
|
---|
| 557 | // write out the compressed text header (with updated statistics)
|
---|
| 558 | if (fseek (text, sizeof (u_long), SEEK_SET) == -1 || !cth.Write (text))
|
---|
| 559 | return COMPERROR;
|
---|
| 560 | fclose (text);
|
---|
| 561 |
|
---|
| 562 | // write out the text index file and get the level information
|
---|
| 563 | FTextLevel textLevels;
|
---|
| 564 | if (!WriteTextIdx (fileName, textLevels)) return COMPERROR;
|
---|
| 565 |
|
---|
| 566 | // write out the text level information
|
---|
| 567 | if (!WriteTextLevels (fileName, textLevels)) return COMPERROR;
|
---|
| 568 |
|
---|
| 569 | // write out the auxiliary dictionary
|
---|
| 570 | if (cdh.dict_type != MG_COMPLETE_DICTIONARY &&
|
---|
| 571 | (cdh.novel_method == MG_NOVEL_DELTA ||
|
---|
| 572 | cdh.novel_method == MG_NOVEL_HYBRID)) {
|
---|
| 573 | if (write_aux_dict (fileName) == COMPERROR) return COMPERROR;
|
---|
| 574 | estimate_compressed_aux_dict ();
|
---|
| 575 |
|
---|
| 576 | } else {
|
---|
| 577 | unlink (make_name (fileName, TEXT_DICT_AUX_SUFFIX, NULL));
|
---|
| 578 | }
|
---|
| 579 |
|
---|
| 580 | return (COMPALLOK);
|
---|
| 581 | }
|
---|