Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: gsdl/trunk/trunk/mgpp/text/text.pass2.cpp@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago
Undoing change committed in r16582
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 15.2 KB

Rev	Line
[3365]	1	/**************************************************************************
	2	*
	3	* text.pass2.cpp -- Text compression (Pass 2)
	4	* Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
	5	* Copyright (C) 1999 Rodger McNab
	6	*
	7	* This program is free software; you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation; either version 2 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* This program is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with this program; if not, write to the Free Software
	19	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	20	*
	21	**************************************************************************/
	22
[8692]	23	#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__)
	24	#pragma warning(disable:4786)
	25	#endif
	26
[3365]	27	// need this to avoid bizarre compiler problems under VC++ 6.0
	28	#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
	29	# include <iostream>
	30	#endif
	31
	32	#if defined (__WIN32__)
	33	# include <stdio.h>
	34	# define unlink _unlink
	35	#endif
	36
	37	#include "sysfuncs.h"
	38	#include "memlib.h"
	39	#include "messages.h"
	40	#include "local_strings.h"
	41	#include "bitio_m_stdio.h"
	42	#include "huffman.h"
	43	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
	44	#include "mg.h"
	45	#include "mg_files.h"
	46	#include "build.h"
	47	#include "words.h"
	48	#include "text.h"
	49	#include "hash.h"
	50	#include "locallib.h"
	51	#include "comp_dict.h"
	52	#include "FText.h"
	53
	// Number of bytes of word storage held by each char_pool.  The
	// expansion is parenthesised so that it stays a single operand when the
	// macro appears inside a larger expression (e.g. x / POOL_SIZE).
	#define POOL_SIZE (1024 * 256)
	55
	56
	57	struct char_pool {
	58	struct char_pool *next;
	59	u_long left;
	60	u_char *ptr;
	61	u_char pool[POOL_SIZE];
	62	};
	63
	64	struct novel_hash_rec {
	65	u_long ordinal_num;
	66	u_char *word;
	67	};
	68
	69
	70	#define INITIAL_HASH_SIZE 7927
	71
	72	struct novel_hash_table {
	73	novel_hash_rec *HashTable;
	74	u_long HashSize, HashUsed;
	75	char_pool *first_pool;
	76	char_pool *pool;
	77	u_long next_num, binary_start;
	78	novel_hash_rec **code_to_nhr;
	79	};
	80
	81
	82	static FILE *text;
	83	static stdio_bitio_buffer textOutBuf;
	84	static u_long headLen;
	85
	86	static novel_hash_table nht[2];
	87
	88	int blk_start[2][33], blk_end[2][33];
	89
	90	// 0 = non-word, 1 = word
	91	unsigned char whichWordType = 0;
	92
	93	CompressTextInfoMap textInfoMap;
	94
	95
	96	static char_pool new_pool (char_pool pool) {
	97	char_pool p = (char_pool ) Xmalloc (sizeof (char_pool));
	98	if (!p) FatalError (1, "Unable to allocate memory for pool");
	99
	100	if (pool) pool->next = p;
	101	p->next = NULL;
	102	p->left = POOL_SIZE;
	103	p->ptr = p->pool;
	104	return p;
	105	}
	106
	107
	108	int init_text_2 (const TagInfo &/tagInfo/, char *fileName) {
	109	char path[512];
	110
	111	// load the compression dictionary
	112	if (LoadCompressionDictionary (make_name (fileName, TEXT_DICT_SUFFIX,
	113	path)) == COMPERROR)
	114	return COMPERROR;
	115
	116	// create the compressed text file and attach it to the text buffer
	117	if (!(text = create_file (fileName, TEXT_SUFFIX, "w+b",
	118	MAGIC_TEXT, MG_MESSAGE)))
	119	return COMPERROR;
	120	textOutBuf.attachFile (text);
	121	textOutBuf.encodeStart ();
	122
	123	// write out the compressed text header
	124	memset (&cth, '\0', sizeof (cth));
	125	if (fwrite (&cth, sizeof (cth), 1, text) != 1)
	126	return COMPERROR;
	127
	128	// the length of the compressed header is the size of the
	129	// magic number plus the size of the compressed text header
	130	headLen = sizeof (u_long) + sizeof (cth);
	131
	132	// start off with non-word
	133	whichWordType = 0;
	134
	135	// clear the text pointers
	136	textInfoMap.erase (textInfoMap.begin(), textInfoMap.end());
	137
	138	// set up the hash table of novel words
	139	int i;
	140	if (cdh.novel_method != MG_NOVEL_HUFFMAN_CHARS)
[8692]	141	for (i = 0; i <= 1; ++i)
[3365]	142	{
	143	nht[i].HashSize = INITIAL_HASH_SIZE;
	144	nht[i].HashTable = (novel_hash_rec ) Xmalloc (sizeof (novel_hash_rec) nht[i].HashSize);
	145	memset (nht[i].HashTable, '\0',
	146	sizeof (novel_hash_rec) * nht[i].HashSize);
	147	nht[i].HashUsed = 0;
	148	nht[i].HashSize = INITIAL_HASH_SIZE;
	149	nht[i].pool = nht[i].first_pool = new_pool (NULL);
	150	nht[i].next_num = 1;
	151	nht[i].binary_start = 1;
	152	nht[i].code_to_nhr = NULL;
	153	if (cdh.novel_method == MG_NOVEL_HYBRID)
	154	{
	155	int num;
	156	num = 1;
	157	blk_start[i][0] = 0;
	158	blk_end[i][0] = cdh.num_words[i] - 1;
	159	while (num < 33)
	160	{
	161	blk_start[i][num] = blk_end[i][num - 1] + 1;
	162	blk_end[i][num] = blk_start[i][num] +
	163	(blk_end[i][num - 1] - blk_start[i][num - 1]) * 2;
[8692]	164	++num;
[3365]	165	}
	166	}
	167	}
	168
	169	return (COMPALLOK);
	170	}
	171
	172
	173	static int compress_text (bitio_buffer &buffer, const u_char *s_in,
	174	int l_in, unsigned char &which,
	175	unsigned long &numWords) {
	176	const u_char *end = s_in + l_in - 1;
	177
	178	for (; s_in <= end; which = !which) {
	179	u_char Word[MAXWORDLEN + 1];
	180	int res;
	181
[8692]	182	if (which) ++numWords;
[3365]	183
	184	/* First parse a word or non-word out of the string */
	185	if (which) PARSE_WORD (Word, s_in, end);
	186	else PARSE_NON_WORD (Word, s_in, end);
	187
	188	/* Search the hash table for Word */
	189	if (ht[which]) {
	190	register unsigned long hashval, step;
	191	register int tsize = ht[which]->size;
	192	register u_char **wptr;
	193	HASH (hashval, step, Word, tsize);
	194	for (;;) {
	195	register u_char *s1;
	196	register u_char *s2;
	197	register int len;
	198	wptr = ht[which]->table[hashval];
	199	if (wptr == NULL) {
	200	res = COMPERROR;
	201	break;
	202	}
	203
	204	/* Compare the words */
	205	s1 = Word;
	206	s2 = *wptr;
	207	len = *s1 + 1;
[8692]	208	for (; len; --len)
[3365]	209	if (s1++ != s2++) break;
	210
	211	if (len) {
	212	hashval += step;
	213	if (hashval >= (unsigned long)tsize) hashval -= tsize;
	214
	215	} else {
	216	res = ht[which]->table[hashval] - ht[which]->words;
	217	break;
	218	}
	219	}
	220	}
	221	else
	222	res = COMPERROR;
	223	/* Check that the word was found in the dictionary */
	224	if (res == COMPERROR)
	225	{
	226	if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
	227	{
	228	Message ("Unknown word \"%.s\"\n", Word, Word + 1);
	229	return (COMPERROR);
	230	}
	231	if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
	232	{
	233	u_long i;
	234	if (ht[which])
	235	{
	236	res = ht[which]->hd->num_codes - 1;
	237	buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	238	}
	239	buffer.huff_encode (Word[0], lens_codes[which], lens_huff[which].clens, NULL);
[8692]	240	for (i = 0; i < Word[0]; ++i)
[3365]	241	buffer.huff_encode (Word[i + 1], char_codes[which],
	242	char_huff[which].clens, NULL);
	243	}
	244	if (cdh.dict_type == MG_SEED_DICTIONARY)
	245	{
	246	if (ht[which])
	247	{
	248	res = ht[which]->hd->num_codes - 1;
	249	buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	250	}
	251	switch (cdh.novel_method)
	252	{
	253	case MG_NOVEL_HUFFMAN_CHARS:
	254	{
	255	u_long i;
	256	buffer.huff_encode (Word[0], lens_codes[which],
	257	lens_huff[which].clens, NULL);
[8692]	258	for (i = 0; i < Word[0]; ++i)
[3365]	259	buffer.huff_encode (Word[i + 1], char_codes[which],
	260	char_huff[which].clens, NULL);
	261	}
	262	break;
	263	case MG_NOVEL_DELTA:
	264	case MG_NOVEL_HYBRID:
	265	{
	266	register unsigned long hashval, step;
	267	register novel_hash_table *h = &nht[which];
	268	register int hsize = h->HashSize;
	269	register novel_hash_rec *ent;
	270	HASH (hashval, step, Word, hsize);
	271	for (;;)
	272	{
	273	register u_char s1, s2;
	274	register int len;
	275	ent = h->HashTable + hashval;
	276	if (!ent->word)
	277	{
	278	int len = *Word + 1;
	279	if ((unsigned)len > h->pool->left)
	280	h->pool = new_pool (h->pool);
	281	ent->word = h->pool->ptr;
	282	ent->ordinal_num = h->next_num++;
	283	memcpy (h->pool->ptr, Word, len);
	284	h->pool->ptr += len;
	285	h->pool->left -= len;
[8692]	286	++h->HashUsed;
[3365]	287	break;
	288	}
	289	/* Compare the words */
	290	s1 = Word;
	291	s2 = ent->word;
	292	len = *s1 + 1;
[8692]	293	for (; len; --len)
[3365]	294	if (s1++ != s2++)
	295	break;
	296
	297	if (!len)
	298	break;
	299
	300	hashval = (hashval + step);
	301	if (hashval >= (unsigned)hsize) hashval -= hsize;
	302	}
	303
	304	switch (cdh.novel_method) {
	305	case MG_NOVEL_DELTA:
	306	buffer.delta_encode (ent->ordinal_num, NULL);
	307	break;
	308	case MG_NOVEL_HYBRID:
	309	int k = 0;
	310	int j = ent->ordinal_num - 1;
	311	while (j > blk_end[which][k])
[8692]	312	++k;
[3365]	313	assert (j - blk_start[which][k] + 1 >= 1 &&
	314	j - blk_start[which][k] + 1 <=
	315	blk_end[which][k] - blk_start[which][k] + 1);
	316
	317	buffer.gamma_encode (k + 1, NULL);
	318	buffer.binary_encode (j - blk_start[which][k] + 1,
	319	blk_end[which][k] - blk_start[which][k] + 1,
	320	NULL);
	321	break;
	322	}
	323
	324	if (h->HashUsed >= h->HashSize >> 1)
	325	{
	326	novel_hash_rec *ht;
	327	unsigned long size;
	328	unsigned long i;
	329	size = prime (h->HashSize * 2);
	330	if (!(ht = (novel_hash_rec ) Xmalloc (sizeof (novel_hash_rec) size)))
	331	{
	332	Message ("Unable to allocate memory for table");
	333	return (COMPERROR);
	334	}
	335	memset (ht, '\0', sizeof (novel_hash_rec) * size);
	336
[8692]	337	for (i = 0; i < h->HashSize; ++i)
[3365]	338	if (h->HashTable[i].word)
	339	{
	340	register u_char *wptr;
	341	register unsigned long hashval, step;
	342
	343	wptr = h->HashTable[i].word;
	344	HASH (hashval, step, wptr, size);
	345	wptr = (ht + hashval)->word;
	346	while (wptr)
	347	{
	348	hashval += step;
	349	if (hashval >= size)
	350	hashval -= size;
	351	wptr = (ht + hashval)->word;
	352	}
	353	ht[hashval] = h->HashTable[i];
	354	}
	355	Xfree (h->HashTable);
	356	h->HashTable = ht;
	357	h->HashSize = size;
	358	}
	359	}
	360	break;
	361	}
	362	}
	363	}
	364	else
	365	{
	366	buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
	367	}
	368	}
	369
	370	return COMPALLOK;
	371	}
	372
	373
	374	int process_text_2 (const TagInfo &tagInfo, const TextElArray &doc) {
	375	// we need to remember where the last text element ends
	376	unsigned char endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
	377	unsigned long endPos = textOutBuf.GetBytesWritten () + headLen;
	378
	379	// process each text element
	380	TextElArray::const_iterator here = doc.begin();
	381	TextElArray::const_iterator end = doc.end();
	382	while (here != end) {
	383	// remember current place in compressed text
	384	unsigned char startWhich = whichWordType;
	385	unsigned char startBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
	386	unsigned long startPos = textOutBuf.GetBytesWritten () + headLen;
	387
	388	// compress the text
	389	if (compress_text (textOutBuf,
	390	&(here->text[0]),
	391	(*here).text.size(),
	392	whichWordType,
	393	cth.num_of_words) != COMPALLOK)
	394	return COMPERROR;
	395
	396	endBit = textOutBuf.GetBtg (); // 8=high bit, 1=low bit
	397	endPos = textOutBuf.GetBytesWritten () + headLen;
	398
	399	// update the pointers if this is a level tag
	400	if (tagInfo.levelTags.find ((*here).tagName) != tagInfo.levelTags.end()) {
	401	if ((*here).elType == OpenTagE) {
	402	// set the start of the document
	403	textInfoMap[(*here).tagName].SetStart (startPos, startBit, startWhich);
	404
	405	} else if ((*here).elType == CloseTagE) {
	406	// set the end of the document
	407	textInfoMap[(*here).tagName].SetEnd (endPos, endBit);
	408
	409	} else {
	410	// TextE, nothing needs to be done
	411	}
	412	}
	413
	414	cth.num_of_bytes += (*here).text.size();
[8692]	415	++here;
[3365]	416	}
	417
	418	// close off any unclosed tags
	419	CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
	420	CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
	421	while (tiHere != tiEnd) {
	422	if ((tiHere).second.inDoc) (tiHere).second.SetEnd (endPos, endBit);
[8692]	423	++tiHere;
[3365]	424	}
	425
	426	// we've processed one more document
[8692]	427	++cth.num_of_docs;
[3365]	428
	429	return COMPALLOK;
	430	}
	431
	432
	433	static int write_aux_dict (char *fileName) {
	434	int i;
	435	FILE *aux;
	436	if (!(aux = create_file (fileName, TEXT_DICT_AUX_SUFFIX, "wb",
	437	MAGIC_AUX_DICT, MG_MESSAGE)))
	438	return COMPERROR;
	439
[8692]	440	for (i = 0; i <= 1; ++i)
[3365]	441	{
	442	aux_frags_header afh;
	443	char_pool *cp;
	444
	445	afh.num_frags = nht[i].HashUsed;
	446	afh.mem_for_frags = 0;
	447	for (cp = nht[i].first_pool; cp; cp = cp->next)
	448	afh.mem_for_frags += POOL_SIZE - cp->left;
	449
	450	/* [RPAP - Jan 97: Endian Ordering] */
	451	HTONUL(afh.num_frags);
	452	HTONUL(afh.mem_for_frags);
	453
	454	fwrite (&afh, sizeof (afh), 1, aux);
	455
	456	for (cp = nht[i].first_pool; cp; cp = cp->next)
	457	fwrite (cp->pool, POOL_SIZE - cp->left, sizeof (u_char), aux);
	458	}
	459	fclose (aux);
	460	return COMPALLOK;
	461	}
	462
	463
	464	static void estimate_compressed_aux_dict (void) {
	465	int i;
	466	u_long aux_compressed = 0, total_uncomp = 0;
[8692]	467	for (i = 0; i <= 1; ++i)
[3365]	468	{
	469	int j;
	470	long chars[256], fchars[256];
	471	long lens[16], flens[16];
	472	char_pool *cp;
	473	memset (chars, '\0', sizeof (chars));
	474	memset (lens, '\0', sizeof (lens));
	475	for (cp = nht[i].first_pool; cp; cp = cp->next)
	476	{
	477	u_char *buf = cp->pool;
	478	while (buf != cp->ptr)
	479	{
	480	int len = *buf++;
[8692]	481	++lens[len];
[3365]	482	total_uncomp += len + 4;
[8692]	483	for (; len; --len)
	484	++chars[*buf++];
[3365]	485	}
	486	}
[8692]	487	for (j = 0; j < 256; ++j)
[3365]	488	if (!chars[j] && PESINAWORD (j) == i)
	489	fchars[j] = 1;
	490	else
	491	fchars[j] = chars[j];
[8692]	492	for (j = 0; j < 16; ++j)
[3365]	493	if (!lens[j])
	494	flens[j] = 1;
	495	else
	496	flens[j] = lens[j];
	497
	498	aux_compressed += (long unsigned int) ((Calculate_Huffman_Size (16, flens, lens) +
	499	Calculate_Huffman_Size (256, fchars, chars)) / 8);
	500
	501	}
	502	}
	503
	504
	505
	506	static bool WriteTextIdx (char *fileName, FTextLevel &textLevels) {
	507	// create the text index
	508	FILE *textIdxFile;
	509	if (!(textIdxFile = create_file (fileName, TEXT_IDX_SUFFIX, "w+b",
	510	MAGIC_TEXI, MG_MESSAGE)))
	511	return false;
	512
	513	// output each level
	514	CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
	515	CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
	516	while (tiHere != tiEnd) {
	517	// remember the level information
	518	TextLevelInfo levelInfo;
	519	levelInfo.levelTag = (*tiHere).first;
	520	levelInfo.textIdxPtr = ftell (textIdxFile);
	521	levelInfo.numEntries = (*tiHere).second.docPtrs.size();
	522	textLevels.levelInfo[levelInfo.levelTag] = levelInfo;
	523
	524	// write out the index array for this level
	525	if (!WriteTextIdxArray (textIdxFile, (*tiHere).second.docPtrs))
	526	return false;
	527
[8692]	528	++tiHere;
[3365]	529	}
	530
	531	// close the file
	532	fclose (textIdxFile);
	533
	534	return true;
	535	}
	536
	537
	538	bool WriteTextLevels (char *fileName, const FTextLevel &textLevels) {
	539	// cout << textLevels;
	540
	541	// create the text index
	542	FILE *textLevelFile;
	543	if (!(textLevelFile = create_file (fileName, TEXT_LEVEL_SUFFIX, "w+b",
	544	MAGIC_TEXT_LEVELS, MG_MESSAGE)))
	545	return false;
	546
	547	// output each level
	548	if (!textLevels.Write (textLevelFile)) return false;
	549
	550	// close the file
	551	fclose (textLevelFile);
	552
	553	return true;
	554	}
	555
	556
	557	int done_text_2 (const TagInfo &/tagInfo/, char *fileName) {
	558	// write out any remaining bits
	559	textOutBuf.encodeDone();
	560
	561	// write out the compressed text header (with updated statistics)
	562	if (fseek (text, sizeof (u_long), SEEK_SET) == -1 \|\| !cth.Write (text))
	563	return COMPERROR;
	564	fclose (text);
	565
	566	// write out the text index file and get the level information
	567	FTextLevel textLevels;
	568	if (!WriteTextIdx (fileName, textLevels)) return COMPERROR;
	569
	570	// write out the text level information
	571	if (!WriteTextLevels (fileName, textLevels)) return COMPERROR;
	572
	573	// write out the auxiliary dictionary
	574	if (cdh.dict_type != MG_COMPLETE_DICTIONARY &&
	575	(cdh.novel_method == MG_NOVEL_DELTA \|\|
	576	cdh.novel_method == MG_NOVEL_HYBRID)) {
	577	if (write_aux_dict (fileName) == COMPERROR) return COMPERROR;
	578	estimate_compressed_aux_dict ();
	579
	580	} else {
	581	unlink (make_name (fileName, TEXT_DICT_AUX_SUFFIX, NULL));
	582	}
	583
	584	return (COMPALLOK);
	585	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: