Context Navigation

source: trunk/gsdl/src/mgpp/text/mg_compression_dict.cpp@ 860

Last change on this file since 860 was 860, checked in by rjmcnab, 24 years ago
Fixed a couple of bugs and made building silent if needed.
Property svn:keywords set to `Author Date Id Revision`
File size: 26.7 KB

Rev	Line
[856]	1	/**************************************************************************
	2	*
	3	* mg_compression_dict.cpp -- Routines for creating compression dictionary
	4	* Copyright (C) 1994 Neil Sharman
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	* $Id: mg_compression_dict.cpp 860 2000-01-18 03:53:24Z rjmcnab $
	21	*
	22	**************************************************************************/
	23
	24	#include "sysfuncs.h"
	25
	26	#include "memlib.h"
	27	#include "messages.h"
	28	#include "local_strings.h"
	29	#include "bitio_gen.h"
	30	#include "bitio_m_stdio.h"
	31	#include "mgheap.h"
	32	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
	33
	34	#include "mg_files.h"
	35	#include "locallib.h"
	36	#include "invf.h"
	37	#include "text.h"
	38	#include "words.h"
	39	#include "mg.h"
	40	#include "WordData.h"
	41
	42	#define MAXBITS (sizeof(unsigned long) * 8)
	43
	44	/*
	45	$Log$
[860]	46	Revision 1.2 2000/01/18 03:53:22 rjmcnab
	47	Fixed a couple of bugs and made building silent if needed.
	48
[856]	49	Revision 1.1 2000/01/14 02:26:11 sjboddie
	50	Rodgers new C++ mg
	51
	52	Revision 1.3 1999/10/17 23:43:25 cs025
	53	Changes to eradicate Xmalloc
	54
	55	Revision 1.2 1999/10/11 22:03:03 cs025
	56	Changed mistaken use of NTOHUL on a double to NTOHD as per report
	57	from Craig Neville-Manning of problems installing mg/gsdl.
	58
	59	Revision 1.1 1999/10/11 02:57:46 cs025
	60	Base install of MG-PP
	61
	62	Revision 1.1 1999/08/10 21:18:04 sjboddie
	63	renamed mg-1.3d directory mg
	64
	65	Revision 1.4 1999/01/15 03:05:51 rjmcnab
	66
	67	Renamed lib/heap to be lib/mgheap (it was causing some problems with
	68	some versions of the STL libraries which contained a heap.h)
	69
	70	Revision 1.3 1998/12/17 09:12:52 rjmcnab
	71
	72	Altered mg to process utf-8 encoded Unicode. The main changes
	73	are in the parsing of the input, the casefolding, and the stemming.
	74
	75	Revision 1.2 1998/11/25 07:55:44 rjmcnab
	76
	77	Modified mg to that you can specify the stemmer you want
	78	to use via a command line option. You specify it to
	79	mg_passes during the build process. The number of the
	80	stemmer that you used is stored within the inverted
	81	dictionary header and the stemmed dictionary header so
	82	the correct stemmer is used in later stages of building
	83	and querying.
	84
	85	Revision 1.1 1998/11/17 09:34:52 rjmcnab
	86	* empty log message *
	87
	88	* Revision 1.3 1994/10/20 03:56:54 tes
	89	* I have rewritten the boolean query optimiser and abstracted out the
	90	* components of the boolean query.
	91	*
	92	* Revision 1.2 1994/09/20 04:41:45 tes
	93	* For version 1.1
	94	*
	95	*/
	96
	97	/* #define DEBUG1 */
	98
	99	/* #define DEBUG */
	100
	101	#define is_power_of_two(a) ((a) != 0 && (((a) & ((a)-1)) == 0))
	102
	103	#define MAX_RECALCULATIONS 100
	104
	105	typedef struct DictWordData : public WordData
	106	{
	107	float saving;
	108	float char_bit_cost;
	109	u_long num_trans;
	110	u_char *word;
	111	}
	112	DictWordData;
	113
	114	static DictWordData *Words[2];
	115	static u_long Num[2];
	116	static u_long chars[2];
	117
	118	#define KIND(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 0 : 1)
	119	#define IsWord(p) (((p) >= Words[1] && (p) < Words[1] + Num[1]) ? 1 : 0)
	120	#define IsNonWord(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 1 : 0)
	121
	122
	123	typedef struct DictData
	124	{
	125	DictWordData **wd;
	126	u_long num_wds;
	127	u_long chars;
	128	}
	129	DictData;
	130
	131	typedef DictData DictInfo[2];
	132
	133	static DictInfo keep, discard, all;
	134	static compression_stats_header csh;
	135
	136	static char *file_name = "";
	137	static u_long novel_method = MG_NOVEL_HUFFMAN_CHARS;
	138
	139
	140	static void ReadInWords (char *);
	141	static void Select_on (int k, heap_comp hc);
	142	static void Method3 (int k);
	143	static void Select_all (void);
	144	static u_long WriteOutWords (char *, u_long, int);
	145	static int DecFreqIncWL (void a, void b);
	146	static int OccuranceOrder (void a, void b);
	147
	148
	149
	150	int main (int argc, char **argv)
	151	{
	152	int ch;
	153	char type = 'C';
	154	char mode = '0';
	155	int lookback = 2;
	156	double k = 0;
	157	u_long mem_reqd;
	158	opterr = 0;
	159	msg_prefix = argv[0];
	160	while ((ch = getopt (argc, argv, "0123CPSf:d:l:hk:HDY")) != -1)
	161	switch (ch)
	162	{
	163	case 'H':
	164	novel_method = MG_NOVEL_HUFFMAN_CHARS;
	165	break;
	166	case 'D':
	167	novel_method = MG_NOVEL_DELTA;
	168	break;
	169	case 'Y':
	170	novel_method = MG_NOVEL_HYBRID;
	171	break;
	172	case 'f': /* input file */
	173	file_name = optarg;
	174	break;
	175	case 'd':
	176	set_basepath (optarg);
	177	break;
	178	case 'C':
	179	case 'P':
	180	case 'S':
	181	type = ch;
	182	break;
	183	case '0':
	184	case '1':
	185	case '2':
	186	case '3':
	187	mode = ch;
	188	break;
	189	case 'l':
	190	lookback = atoi (optarg);
	191	if (!is_power_of_two (lookback))
	192	FatalError (1, "The lookback value must be a power of 2");
	193	lookback = floorlog_2 (lookback);
	194	break;
	195	case 'k':
	196	k = atof (optarg) * 1024;
	197	break;
	198	case 'h':
	199	case '?':
	200	fprintf (stderr, "usage: %s [-l lookback] [-f input_file]"
	201	"[-d data directory] [-h] [-0\|-1\|-2\|-3] [-C\|-P\|-S] [-k mem (Kb)]\n",
	202	argv[0]);
	203	exit (1);
	204	}
	205
	206	ReadInWords (file_name);
	207
	208	if (type == 'C')
	209	{
	210	Select_all ();
	211	mem_reqd = WriteOutWords (file_name, MG_COMPLETE_DICTIONARY, lookback);
	212	}
	213	else
	214	{
	215	switch (mode)
	216	{
	217	case '0':
	218	Select_all ();
	219	break;
	220	case '1':
[860]	221	#ifndef SILENT
[856]	222	Message ("Dictionary limit of %.2f Kb", k / 1024);
[860]	223	#endif
[856]	224	Select_on ((int) k, OccuranceOrder);
	225	break;
	226	case '2':
[860]	227	#ifndef SILENT
[856]	228	Message ("Dictionary limit of %.2f Kb", k / 1024);
[860]	229	#endif
[856]	230	Select_on ((int) k, DecFreqIncWL);
	231	break;
	232	case '3':
[860]	233	#ifndef SILENT
[856]	234	Message ("Dictionary limit of %.2f Kb", k / 1024);
[860]	235	#endif
[856]	236	Method3 ((int) k);
	237	break;
	238	}
	239	if (type == 'P')
	240	{
	241	mem_reqd = WriteOutWords (file_name, MG_PARTIAL_DICTIONARY, lookback);
	242	}
	243	else
	244	{
	245	mem_reqd = WriteOutWords (file_name, MG_SEED_DICTIONARY, lookback);
	246	}
	247	}
	248
[860]	249	#ifndef SILENT
[856]	250	Message ("Num words : %8u -> %8u\n", Num[1], keep[1].num_wds);
	251	Message ("Num non-words : %8u -> %8u\n", Num[0], keep[0].num_wds);
	252	Message ("Chars of words : %8u -> %8u\n", chars[1], keep[1].chars);
	253	Message ("Chars of non-words : %8u -> %8u\n", chars[0], keep[0].chars);
	254	Message ("Mem usage : %8u -> %8u\n",
	255	(Num[0] + Num[1]) * sizeof (char *) + chars[0] + chars[1],
	256	(keep[0].num_wds + keep[1].num_wds) * sizeof (char *) +
	257	keep[0].chars + keep[1].chars);
	258	Message ("Actual mem required : %8u\n", mem_reqd);
[860]	259	#endif
[856]	260	exit (0);
	261	}
	262
	263
	264
	265
	266	static void
	267	ReadInWords (char *filename)
	268	{
	269	FILE *f;
	270	unsigned long i;
	271	f = open_file (filename, TEXT_STATS_DICT_SUFFIX, "rb",
	272	MAGIC_STATS_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
	273
	274	fread (&csh, sizeof (csh), 1, f);
	275
	276	/* [RPAP - Jan 97: Endian Ordering] */
	277	NTOHUL(csh.num_docs);
	278	NTOHD(csh.num_bytes);
	279
	280	for (i = 0; i < 2; i++)
	281	{
	282	frags_stats_header fsh;
	283	char *buf;
	284	DictWordData *wd;
	285	chars[i] = 0;
	286
	287	fread (&fsh, sizeof (fsh), 1, f);
	288	/* [RPAP - Jan 97: Endian Ordering] */
	289	NTOHUL(fsh.num_frags);
	290	NTOHUL(fsh.mem_for_frags);
	291
	292	Num[i] = all[i].num_wds = fsh.num_frags;
	293
	294	/* The +1 on the following line is to leave room for the esc code. */
	295	all[i].wd = (DictWordData *) Xmalloc (sizeof (DictWordData ) * Num[i] + 1);
	296
	297	buf = (char *) Xmalloc (fsh.mem_for_frags);
	298	wd = Words[i] = (DictWordData ) Xmalloc (sizeof (DictWordData) Num[i]);
	299	unsigned int j;
	300	for (j = 0; j < Num[i]; j++, wd++)
	301	{
	302	int len;
	303
	304	// read docCount and wordCount
	305	wd->read(f);
	306
	307	// read an mgString
	308	len = fgetc (f);
	309	wd->word = (u_char *) buf;
	310	*buf++ = len;
	311	fread (buf, len, 1, f);
	312	buf += len;
	313	all[i].wd[j] = wd;
	314	}
	315	chars[i] = fsh.mem_for_frags - fsh.num_frags;
	316	}
	317	fclose (f);
	318	}
	319
	320
	321	static void
	322	Alloc_keep_discard (void)
	323	{
	324	keep[0].num_wds = 0;
	325	keep[0].wd = (DictWordData *) Xmalloc ((Num[0] + 1) sizeof (DictWordData *));
	326	keep[1].num_wds = 0;
	327	keep[1].wd = (DictWordData *) Xmalloc ((Num[1] + 1) sizeof (DictWordData *));
	328	discard[0].num_wds = 0;
	329	discard[0].wd = (DictWordData *) Xmalloc ((Num[0] + 1) sizeof (DictWordData *));
	330	discard[1].num_wds = 0;
	331	discard[1].wd = (DictWordData *) Xmalloc ((Num[1] + 1) sizeof (DictWordData *));
	332	}
	333
	334
	335	static int
	336	sort_comp (const void a, const void b)
	337	{
	338	DictWordData aa = (DictWordData **) a;
	339	DictWordData bb = (DictWordData **) b;
	340	return casecompare (aa->word, bb->word); /* [RPAP - Jan 97: Stem Index Change] */
	341	}
	342
	343
	344
	345	static void
	346	SortAndCount_DictData (DictData * dd)
	347	{
	348	unsigned int i;
	349	DictWordData **wd;
	350	qsort (dd->wd, dd->num_wds, sizeof (DictWordData *), sort_comp);
	351	dd->chars = 0;
	352	wd = dd->wd;
	353	for (i = 0; i < dd->num_wds; i++, wd++)
	354	dd->chars += (*wd)->word[0];
	355	}
	356
	357
	358	static void
	359	Select_all (void)
	360	{
	361	unsigned int i;
	362	Alloc_keep_discard ();
	363	keep[0].num_wds = Num[0];
	364	for (i = 0; i < Num[0]; i++)
	365	keep[0].wd[i] = Words[0] + i;
	366	keep[1].num_wds = Num[1];
	367	for (i = 0; i < Num[1]; i++)
	368	keep[1].wd[i] = Words[1] + i;
	369	SortAndCount_DictData (&keep[0]);
	370	SortAndCount_DictData (&keep[1]);
	371	}
	372
	373
	374
	375
	376	static int
	377	DecFreqIncWL (void a, void b)
	378	{
	379	DictWordData aa = (DictWordData **) a;
	380	DictWordData bb = (DictWordData **) b;
	381	if (aa->docCount > bb->docCount)
	382	return -1;
	383	else if (aa->docCount < bb->docCount)
	384	return 1;
	385	else
	386	return bb->word[0] - aa->word[0];
	387	}
	388
	389
	390	static int
	391	OccuranceOrder (void a, void b)
	392	{
	393	DictWordData aa = (DictWordData **) a;
	394	DictWordData bb = (DictWordData **) b;
	395	if (aa->words() > bb->words())
	396	return 1;
	397	else if (aa->words() < bb->words())
	398	return -1;
	399	else
	400	return 0;
	401	}
	402
	403
	404	static void
	405	Select_on (int k, heap_comp hc)
	406	{
	407	int i, num, mem;
	408	DictWordData **wd;
	409
	410	Alloc_keep_discard ();
	411
	412	num = Num[0] + Num[1];
	413	wd = (DictWordData *) Xmalloc (num sizeof (DictWordData *));
	414	for (i = 0; (unsigned int)i < Num[0]; i++)
	415	wd[i] = Words[0] + i;
	416	for (i = 0; (unsigned int)i < Num[1]; i++)
	417	wd[i + Num[0]] = Words[1] + i;
	418
	419	heap_build (wd, sizeof (*wd), num, hc);
	420
	421	mem = 0;
	422	while (k > mem && num)
	423	{
	424	int idx;
	425	DictWordData *word = wd[0];
	426	#ifdef DEBUG
	427	fprintf (stderr, "%4d:%4d:%8d :%8d :%8d : \"%s\"\n",
	428	keep[0].num_wds, keep[1].num_wds,
	429	mem, word->documents(), word->word, word2str (word->word));
	430	#endif
	431	mem += sizeof (u_char *) + word->word[0];
	432	heap_deletehead (wd, sizeof (*wd), &num, hc);
	433	idx = KIND (word);
	434	keep[idx].wd[keep[idx].num_wds++] = word;
	435	}
	436
	437	for (i = 0; i < num; i++)
	438	{
	439	DictWordData *word = wd[i];
	440	int idx = KIND (word);
	441	discard[idx].wd[discard[idx].num_wds++] = word;
	442	}
	443	SortAndCount_DictData (&keep[0]);
	444	SortAndCount_DictData (&keep[1]);
	445	SortAndCount_DictData (&discard[0]);
	446	SortAndCount_DictData (&discard[1]);
	447
	448	assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
	449	assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
	450
	451	}
	452
	453
	454
	455	static int
	456	BigSaving (void a, void b)
	457	{
	458	DictWordData aa = (DictWordData **) a;
	459	DictWordData bb = (DictWordData **) b;
	460	return (aa->saving > bb->saving) ? -1 : (aa->saving < bb->saving);
	461	}
	462
	463	static int
	464	SmallSaving (void a, void b)
	465	{
	466	DictWordData aa = (DictWordData **) a;
	467	DictWordData bb = (DictWordData **) b;
	468	return (aa->saving < bb->saving) ? -1 : (aa->saving > bb->saving);
	469	}
	470
	471
	472	static void
	473	CalcCharCounts (DictWordData ** wd, int num,
	474	char char_lens[2], char len_lens[2],
	475	u_long escape[2])
	476	{
	477	long char_freqs[2][256];
	478	long len_freqs[2][16];
	479	huff_data hd;
	480	int i;
	481	escape[0] = 0;
	482	escape[1] = 0;
	483	bzero ((char *) char_freqs, sizeof (char_freqs));
	484	bzero ((char *) len_freqs, sizeof (len_freqs));
	485	for (i = 0; i < num; i++, wd++)
	486	{
	487	u_long freq = (*wd)->documents();
	488	u_char buf = (wd)->word;
	489	int len = *buf++;
	490	int idx = KIND (*wd);
	491	len_freqs[idx][len] += freq;
	492	escape[idx] += freq;
	493	for (; len; len--, buf++)
	494	char_freqs[idx][(u_long) (*buf)] += freq;
	495	}
	496	Generate_Huffman_Data (256, char_freqs[0], &hd, NULL);
	497	char_lens[0] = hd.clens;
	498	Generate_Huffman_Data (256, char_freqs[1], &hd, NULL);
	499	char_lens[1] = hd.clens;
	500
	501	Generate_Huffman_Data (16, len_freqs[0], &hd, NULL);
	502	len_lens[0] = hd.clens;
	503	Generate_Huffman_Data (16, len_freqs[1], &hd, NULL);
	504	len_lens[1] = hd.clens;
	505	}
	506
	507
	508
	509
	510
	511	inline void CalcBitCostForWordData(DictWordData **word, int num,
	512	double freqs_trans[2], u_long escape[2],
	513	char * char_lens[2], char *len_lens[2],
	514	double esc[2], int num_trans)
	515	{
	516	int j;
	517
	518	for (j = 0; j < num; j++, word++)
	519	{
	520	float cbc, wbc;
	521	u_char buf = (word)->word;
	522	int len = *buf++;
	523	u_long freq = (*word)->documents();
	524	int idx = KIND (*word);
	525
	526	cbc = len_lens[idx][len];
	527	for (; len; len--, buf++)
	528	cbc += char_lens[idx][(u_long) (*buf)];
	529
	530	(word)->char_bit_cost = (cbc + esc[idx]) freq;
	531
	532	wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
	533
	534	(word)->saving = ((word)->char_bit_cost - wbc) /
	535	(sizeof (char ) + (word)->word[0]);
	536
	537	(*word)->num_trans = num_trans;
	538	}
	539	}
	540
	541
	542
	543	void
	544	CalcBitCost (DictWordData ** discard_word, int discard_num,
	545	DictWordData ** keep_word, int keep_num, double freqs_trans[2],
	546	u_long escape[2], int num_trans)
	547	{
	548	char *char_lens[2];
	549	char *len_lens[2];
	550	double esc[2];
	551	CalcCharCounts (discard_word, discard_num, char_lens, len_lens, escape);
	552	esc[0] = -log2 (escape[0] / (freqs_trans[0] + escape[0]));
	553	esc[1] = -log2 (escape[1] / (freqs_trans[1] + escape[1]));
	554	CalcBitCostForWordData(discard_word, discard_num, freqs_trans, escape,
	555	char_lens, len_lens, esc, num_trans);
	556	CalcBitCostForWordData(keep_word, keep_num, freqs_trans, escape,
	557	char_lens, len_lens, esc, num_trans);
	558	Xfree (char_lens[0]);
	559	Xfree (char_lens[1]);
	560	Xfree (len_lens[0]);
	561	Xfree (len_lens[1]);
	562	}
	563
	564	inline void m3_transferWord(DictWordData *toHeap, int toNum, heap_comp toSaving,
	565	DictWordData *fromHeap, int fromNum, heap_comp fromSaving,
	566	int num_trans, double freqs_trans[], int mem, double *total,
	567	int *count)
	568	{
	569	/* Transfer the word at the top of the keep heap to the
	570	discard heap. */
	571	DictWordData *word = fromHeap[0];
	572	int idx = KIND (word);
	573	heap_deletehead (fromHeap, sizeof (word), fromNum, fromSaving);
	574	toHeap[*toNum] = word;
	575	heap_additem (toHeap, sizeof (word), toNum, toSaving);
	576	freqs_trans[idx] -= word->documents();
	577	mem = (mem) - sizeof (u_char *) + word->word[0];
	578	*num_trans += 1;
	579	*total += word->saving;
	580	*count += 1;
	581	#ifdef DEBUG
	582	fprintf (stderr,
	583	"KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
	584	*mem, word->documents(), word->char_bit_cost,
	585	word->saving, word2str (word->word));
	586	#endif
	587	}
	588
	589	inline void m3_storeWord(DictWordData **wordHeap, int wordNum, int num_trans,
	590	double freqs_trans[], u_long escape[], heap_comp hc)
	591	{
	592	DictWordData *word = wordHeap[0];
	593	int idx = KIND (word);
	594	float wbc;
	595	#ifdef DEBUG1
	596	fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word),
	597	word->saving);
	598	#endif
	599	wbc = -log2 (word->documents() / (freqs_trans[idx] + escape[idx])) *
	600	word->documents();
	601	word->saving = (word->char_bit_cost - wbc) /
	602	(sizeof (char *) + word->word[0]);
	603	#ifdef DEBUG1
	604	fprintf (stderr, " %.2f\n", word->saving);
	605	#endif
	606	word->num_trans = num_trans;
	607	heap_changedhead (wordHeap, sizeof (word), wordNum, hc);
	608	}
	609
	610
	611	static void
	612	Method3 (int k)
	613	{
	614	int i, keep_num, discard_num, mem, num_trans, recalc_reqd;
	615	int keep_to_discard = 0;
	616	int discard_to_keep = 0;
	617	int recalcs = 0;
	618	double freqs_trans[2], total;
	619	u_long escape[2];
	620	DictWordData keep_heap, discard_heap;
	621
	622	freqs_trans[0] = freqs_trans[1] = 0;
	623	num_trans = 0;
	624
	625	discard_num = Num[0] + Num[1];
	626	discard_heap = (DictWordData *) Xmalloc (discard_num sizeof (DictWordData *));
	627
	628	keep_num = 0;
	629	keep_heap = (DictWordData *) Xmalloc (discard_num sizeof (DictWordData *));
	630
	631
	632	for (i = 0; (unsigned int)i < Num[0]; i++)
	633	discard_heap[i] = Words[0] + i;
	634	for (i = 0; (unsigned int)i < Num[1]; i++)
	635	discard_heap[i + Num[0]] = Words[1] + i;
	636
	637	heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL);
	638
	639	mem = 0;
	640	while (k > mem && discard_num)
	641	{
	642	DictWordData *word = discard_heap[0];
	643	mem += sizeof (u_char *) + word->word[0];
	644	heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL);
	645	keep_heap[keep_num++] = word;
	646	freqs_trans[KIND (word)] += word->documents();
	647	num_trans++;
	648	}
	649
	650	CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
	651	freqs_trans, escape, num_trans);
	652	heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving);
	653	heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving);
	654
	655	total = 0;
	656	recalc_reqd = 0;
	657	while (keep_num && discard_num)
	658	{
	659	if ((keep_num && keep_heap[0]->num_trans < (unsigned)num_trans) \|\|
	660	(discard_num && discard_heap[0]->num_trans < (unsigned)num_trans))
	661	{
	662	if (keep_num && keep_heap[0]->num_trans < (unsigned)num_trans)
	663	{
	664	m3_storeWord(keep_heap, keep_num, num_trans, freqs_trans, escape, SmallSaving);
	665	}
	666
	667	if (discard_num && discard_heap[0]->num_trans < (unsigned)num_trans)
	668	{
	669	m3_storeWord(discard_heap, discard_num, num_trans, freqs_trans, escape, BigSaving);
	670	}
	671	}
	672	else if (keep_heap[0]->saving < discard_heap[0]->saving)
	673	{
	674	assert (keep_num && discard_num);
	675	if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > (unsigned)k)
	676	{
	677	/* Transfer the word at the top of the keep heap to the
	678	discard heap. */
	679	m3_transferWord(discard_heap, &discard_num, BigSaving,
	680	keep_heap, &keep_num, SmallSaving,
	681	&num_trans, freqs_trans, &mem, &total, &keep_to_discard);
	682	}
	683	else
	684	{
	685	/* Transfer the word at the top of the discard heap to the
	686	keep heap. */
	687	m3_transferWord(keep_heap, &keep_num, SmallSaving,
	688	discard_heap, &discard_num, BigSaving,
	689	&num_trans, freqs_trans, &mem, &total, &discard_to_keep);
	690	}
	691
	692	recalc_reqd = 1;
	693
	694	}
	695	else
	696	{
	697	if (recalc_reqd == 0)
	698	break;
	699	#ifdef DEBUG1
	700	fprintf (stderr, "--------------\n");
	701	#endif
	702	if (recalcs == MAX_RECALCULATIONS)
	703	break;
	704	CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
	705	freqs_trans, escape, num_trans);
	706	heap_build (discard_heap, sizeof (*discard_heap),
	707	discard_num, BigSaving);
	708	heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving);
	709	recalc_reqd = 0;
	710	recalcs++;
	711	}
	712	}
	713
	714	Alloc_keep_discard ();
	715
	716	for (i = 0; i < discard_num; i++)
	717	{
	718	DictWordData *word = discard_heap[i];
	719	int idx = KIND (word);
	720	assert (IsWord (word) \|\| IsNonWord (word));
	721	discard[idx].wd[discard[idx].num_wds++] = word;
	722	}
	723	for (i = 0; i < keep_num; i++)
	724	{
	725	DictWordData *word = keep_heap[i];
	726	int idx = KIND (word);
	727	assert (IsWord (word) \|\| IsNonWord (word));
	728	keep[idx].wd[keep[idx].num_wds++] = word;
	729	}
	730	SortAndCount_DictData (&keep[0]);
	731	SortAndCount_DictData (&keep[1]);
	732	SortAndCount_DictData (&discard[0]);
	733	SortAndCount_DictData (&discard[1]);
	734
	735	assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
	736	assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
[860]	737	#ifndef SILENT
[856]	738	Message ("Keep -> Discard : %8d", keep_to_discard);
	739	Message ("Discard -> Keep : %8d", discard_to_keep);
	740	Message ("Huffman Recalculations : %8d", recalcs);
	741	if (recalcs == MAX_RECALCULATIONS)
	742	Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS);
[860]	743	#endif
[856]	744	}
	745
	746
	747
	748
	749
	750
	751
	752
	753
	754
	755	/****************************************************************************
	756	*** ***
	757	*** Dictionary saving code ***
	758	*** ***
	759	****************************************************************************/
	760
	761
	762
	763	static void
	764	Write_cdh (FILE * f, compression_dict_header * cdh)
	765	{
	766	/* [RPAP - Jan 97: Endian Ordering] */
	767	int i;
	768	compression_dict_header tmp = *cdh;
	769	HTONUL(tmp.dict_type);
	770	HTONUL(tmp.novel_method);
	771	for (i = 0; i < TEXT_PARAMS; i++)
	772	HTONUL(tmp.params[i]);
	773	HTONUL(tmp.num_words[0]);
	774	HTONUL(tmp.num_words[1]);
	775	HTONUL(tmp.num_word_chars[0]);
	776	HTONUL(tmp.num_word_chars[1]);
	777	HTONUL(tmp.lookback);
	778
	779	fwrite (&tmp, sizeof (tmp), 1, f);
	780	}
	781
	782
	783	static void
	784	Write_words (FILE * f, DictData * dd)
	785	{
	786	unsigned int i;
	787	u_char curr, prev = NULL;
	788	for (i = 0; i < dd->num_wds; i++)
	789	{
	790	int len;
	791	curr = dd->wd[i]->word;
	792	if (prev)
	793	/* look for prefix match with prev string */
	794	len = prefixlen (prev, curr);
	795	else
	796	len = 0;
	797	fputc ((len << 4) + (curr[0] - len), f);
	798	fwrite (curr + len + 1, sizeof (u_char), curr[0] - len, f);
	799	prev = curr;
	800	}
	801
	802	}
	803
	804
	805	static int
	806	Uncompressed_size (DictData * dd)
	807	{
	808	unsigned int i, us;
	809	for (us = i = 0; i < dd->num_wds; i++)
	810	us += dd->wd[i]->word[0];
	811	return us;
	812	}
	813
	814
	815	static u_long
	816	Write_data (FILE * f, DictData * dd, int lookback)
	817	{
	818	u_long mem_reqd;
	819	huff_data *hd;
	820	int i;
	821	u_long us = dd->num_wds;
	822	long *freqs;
	823	u_long huff_words_size[MAX_HUFFCODE_LEN + 1];
	824	u_long lencounts[MAX_HUFFCODE_LEN + 1];
	825	u_char *lastword[MAX_HUFFCODE_LEN + 1];
	826
	827	if (!(freqs = new long [dd->num_wds]))
	828	FatalError (1, "Unable to allocate memory for freqs");
	829
	830	for (i = 0; (unsigned)i < dd->num_wds; i++)
	831	{
	832	freqs[i] = dd->wd[i]->documents();
	833	us += dd->wd[i]->word[0];
	834	}
	835
	836	if (!(hd = Generate_Huffman_Data (dd->num_wds, freqs, NULL, NULL)))
	837	FatalError (1, "Unable to allocate memory for huffman data");
	838
	839	delete (freqs);
	840	freqs = NULL;
	841
	842	if (Write_Huffman_Data (f, hd) == -1)
	843	FatalError (1, "Unable to write huffman data");
	844
	845	HTONUL(us); /* [RPAP - Jan 97: Endian Ordering] */
	846	fwrite (&us, sizeof (us), 1, f);
	847	NTOHUL(us); /* [RPAP - Jan 97: Endian Ordering] */
	848
	849
	850	/* Calculate the amount of memory that will be required to store the text for
	851	each different huffman code len. Every 1<<lookback words for each different
	852	codelen length will not be prefixed by previous strings. */
	853
	854
	855	bzero ((char *) &huff_words_size, sizeof (huff_words_size));
	856	bzero ((char *) &lencounts, sizeof (lencounts));
	857
	858	mem_reqd = 0;
	859
	860	for (i = 0; (unsigned)i < dd->num_wds; i++)
	861	{
	862	int codelen = hd->clens[i];
	863	u_char *word = dd->wd[i]->word;
	864
	865	if (!codelen)
	866	FatalError (1, "The length of a code for a word was zero");
	867
	868	huff_words_size[codelen] += word[0] + 1;
	869	mem_reqd += word[0] + (lookback != 0);
	870	#if 0
	871	if ((lencounts[codelen] & ((1 << lookback) - 1)) == 0)
	872	lastword[codelen] = word;
	873	else
	874	huff_words_size[codelen] -= prefixlen (lastword[codelen], word);
	875	#else
	876	if ((lencounts[codelen] & ((1 << lookback) - 1)) != 0)
	877	{
	878	int save = prefixlen (lastword[codelen], word);
	879	mem_reqd -= save;
	880	huff_words_size[codelen] -= save;
	881	}
	882	else
	883	{
	884	mem_reqd += sizeof (u_char *);
	885	}
	886	lastword[codelen] = word;
	887	#endif
	888	lencounts[codelen]++;
	889	}
	890
	891	/* [RPAP - Jan 97: Endian Ordering] */
	892	for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
	893	HTONUL(huff_words_size[i]);
	894
	895	fwrite (huff_words_size + hd->mincodelen, sizeof (*huff_words_size),
	896	hd->maxcodelen - hd->mincodelen + 1, f);
	897
	898	/* [RPAP - Jan 97: Endian Ordering] */
	899	for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
	900	NTOHUL(huff_words_size[i]);
	901
	902	Write_words (f, dd);
	903
	904	delete hd->clens;
	905	delete hd;
	906
	907	return mem_reqd;
	908	}
	909
	910
	911
	912	static void
	913	Write_charfreqs (FILE * f, DictData * dd, int words,
	914	int zero_freq_permitted)
	915	{
	916	unsigned int j;
	917	long freqs[256];
	918	DictWordData **wd = dd->wd;
	919	huff_data *hd;
	920
	921	bzero ((char *) freqs, sizeof (freqs));
	922
	923	for (j = 0; j < dd->num_wds; j++, wd++)
	924	{
	925	u_char buf = (wd)->word;
	926	int len = *buf++;
	927	for (; len; len--, buf++)
	928	freqs[(u_long) (buf)] += (wd)->documents();
	929	}
	930
	931	if (!zero_freq_permitted)
	932	for (j = 0; j < 256; j++)
	933	if (!freqs[j] && PESINAWORD (j) == words)
	934	freqs[j] = 1;
	935
	936	if (!(hd = Generate_Huffman_Data (256, freqs, NULL, NULL)))
	937	FatalError (1, "Unable to allocate memory for huffman data");
	938
	939	if (Write_Huffman_Data (f, hd) == -1)
	940	FatalError (1, "Unable to write huffman data");
	941
	942	delete hd->clens;
	943	delete hd;
	944	}
	945
	946
	947
	948
	949	static void
	950	Write_wordlens (FILE * f, DictData * dd, int zero_freq_permitted)
	951	{
	952	unsigned int j;
	953	long freqs[16];
	954	DictWordData **wd = dd->wd;
	955	huff_data *hd;
	956
	957	bzero ((char *) freqs, sizeof (freqs));
	958
	959	for (j = 0; j < dd->num_wds; j++, wd++)
	960	freqs[(wd)->word[0]] += (wd)->documents();
	961
	962	if (!zero_freq_permitted)
	963	for (j = 0; j < 16; j++)
	964	if (!freqs[j])
	965	freqs[j] = 1;
	966
	967	if (!(hd = Generate_Huffman_Data (16, freqs, NULL, NULL)))
	968	FatalError (1, "Unable to allocate memory for huffman data");
	969
	970	if (Write_Huffman_Data (f, hd) == -1)
	971	FatalError (1, "Unable to write huffman data");
	972
	973
	974	delete hd->clens;
	975	delete hd;
	976	}
	977
	978
	979
	980	static u_long
	981	WriteOutWords (char *file_name, u_long type, int lookback)
	982	{
	983	FILE *f;
	984	int i;
	985	u_long mem_reqd = 0;
	986
	987	compression_dict_header cdh;
	988	f = create_file (file_name, TEXT_DICT_SUFFIX, "w+b",
	989	MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
	990
	991	bzero((char *) &cdh, sizeof(cdh));
	992
	993	cdh.dict_type = type;
	994	cdh.novel_method = (type == MG_SEED_DICTIONARY) ? novel_method :
	995	MG_NOVEL_HUFFMAN_CHARS;
	996
	997	cdh.num_words[1] = keep[1].num_wds;
	998	cdh.num_words[0] = keep[0].num_wds;
	999	cdh.num_word_chars[1] = Uncompressed_size (&keep[1]);
	1000	cdh.num_word_chars[0] = Uncompressed_size (&keep[0]);
	1001	cdh.lookback = lookback;
	1002
	1003	Write_cdh (f, &cdh);
	1004
	1005	for (i = 0; i < 2; i++)
	1006	switch (type)
	1007	{
	1008	case MG_COMPLETE_DICTIONARY:
	1009	{
	1010	mem_reqd += Write_data (f, &keep[i], lookback);
	1011	}
	1012	break;
	1013	case MG_PARTIAL_DICTIONARY:
	1014	{
	1015	if (keep[i].num_wds)
	1016	{
	1017	int j;
	1018	DictWordData esc;
	1019	esc.docCount = 0;
	1020	esc.word = (u_char *) "";
	1021	keep[i].wd[keep[i].num_wds++] = &esc;
	1022	for (j = 0; (unsigned)j < discard[i].num_wds; j++)
	1023	esc.docCount += discard[i].wd[j]->documents();
	1024	if (!esc.docCount)
	1025	esc.docCount++;
	1026	mem_reqd += Write_data (f, &keep[i], lookback);
	1027	}
	1028	Write_charfreqs (f, &discard[i], i, 1);
	1029	Write_wordlens (f, &discard[i], 1);
	1030	}
	1031	break;
	1032	case MG_SEED_DICTIONARY:
	1033	{
	1034	if (keep[i].num_wds)
	1035	{
	1036	int j;
	1037	DictWordData esc;
	1038	esc.docCount = 0;
	1039	esc.word = (u_char *) "";
	1040	keep[i].wd[keep[i].num_wds++] = &esc;
	1041	for (j = 0; (unsigned)j < all[i].num_wds; j++)
	1042	if (all[i].wd[j]->documents() == 1)
	1043	esc.docCount++;
	1044	if (!esc.docCount)
	1045	esc.docCount++;
	1046	mem_reqd += Write_data (f, &keep[i], lookback);
	1047	}
	1048	switch (novel_method)
	1049	{
	1050	case MG_NOVEL_HUFFMAN_CHARS:
	1051	Write_charfreqs (f, &all[i], i, 0);
	1052	Write_wordlens (f, &all[i], 0);
	1053	break;
	1054	case MG_NOVEL_DELTA:
	1055	break;
	1056	case MG_NOVEL_HYBRID:
	1057	break;
	1058	default:
	1059	FatalError (1, "Bad novel method");
	1060	}
	1061	}
	1062	break;
	1063	}
	1064	fclose (f);
	1065	return mem_reqd;
	1066	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: