Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/src/mgpp/text/text.pass1.cpp@ 879

Last change on this file since 879 was 856, checked in by sjboddie, 24 years ago
Rodgers new C++ mg
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 10.2 KB

Line
1	/**************************************************************************
2	*
3	* text.pass1.cpp -- Text compression (Pass 1)
4	* Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5	* Copyright (C) 1999 Rodger McNab
6	*
7	* This program is free software; you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation; either version 2 of the License, or
10	* (at your option) any later version.
11	*
12	* This program is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with this program; if not, write to the Free Software
19	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20	*
21	* $Id: text.pass1.cpp 856 2000-01-14 02:26:25Z sjboddie $
22	*
23	**************************************************************************/
24
25	#include "sysfuncs.h"
26
27	#include "memlib.h"
28	#include "messages.h"
29	#include "huffman.h"
30	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
31
32
33	#include "mg_files.h"
34	#include "mg.h"
35	#include "build.h"
36	#include "locallib.h"
37	#include "words.h"
38	#include "text.h"
39	#include "hash.h"
40	#include "local_strings.h"
41
42	#include "TextEl.h"
43
44
45	/*
46	$Log$
47	Revision 1.1 2000/01/14 02:26:23 sjboddie
48	Rodgers new C++ mg
49
50	Revision 1.1 1999/10/11 02:58:37 cs025
51	Base install of MG-PP
52
53	Revision 1.1 1999/08/10 21:18:25 sjboddie
54	renamed mg-1.3d directory mg
55
56	Revision 1.2 1998/12/17 09:12:54 rjmcnab
57
58	Altered mg to process utf-8 encoded Unicode. The main changes
59	are in the parsing of the input, the casefolding, and the stemming.
60
61	Revision 1.1 1998/11/17 09:35:46 rjmcnab
62	* empty log message *
63
64	* Revision 1.4 1994/11/25 03:47:47 tes
65	* Committing files before adding the merge stuff.
66	*
67	* Revision 1.3 1994/10/20 03:57:09 tes
68	* I have rewritten the boolean query optimiser and abstracted out the
69	* components of the boolean query.
70	*
71	* Revision 1.2 1994/09/20 04:42:13 tes
72	* For version 1.1
73	*
74	*/
75
76
77	#define POOL_SIZE 1024*1024
78	#define INITIAL_HASH_SIZE 7927
79
80
81
82
83
84
85
86	typedef struct hash_rec
87	{
88	unsigned long wcnt; /* word frequency */
89	unsigned long occurance_num;
90	u_char *word;
91	}
92	hash_rec;
93
94	typedef struct dict_data
95	{
96	hash_rec *HashTable;
97	unsigned long HashSize;
98	unsigned long HashUsed;
99	unsigned long wordnum;
100	unsigned long words_read;
101	unsigned long bytes_diff;
102	huff_data hd;
103	}
104	dict_data;
105
106
107
108	static unsigned long longestDoc = 0;
109	static unsigned long occurance_num = 0;
110	static dict_data DictData[2];
111
112	static u_char *Pool;
113	static int PoolLeft;
114	static double inputbytes = 0; /* [RJM 07/97: 4G limit] */
115	static unsigned long MaxMemInUse = 0;
116	static unsigned long MemInUse = 0;
117	static compression_stats_header csh = {0, 0, 0.0}; /* [RJM 07/97: 4G limit] */
118
119
120	static void ChangeMem (int Change) {
121	MemInUse += Change;
122	if (MemInUse > MaxMemInUse) MaxMemInUse = MemInUse;
123	}
124
125
126	int init_text_1 (const TagInfo &/tagInfo/, char * /FileName/) {
127	int which;
128
129	if (!(Pool = (u_char *) Xmalloc (POOL_SIZE))) {
130	Message ("Unable to allocate memory for pool");
131	return (COMPERROR);
132	}
133	PoolLeft = POOL_SIZE;
134	ChangeMem (POOL_SIZE);
135
136	for (which = 1; which >= 0; which--) {
137	u_char *word;
138	hash_rec *ent;
139	dict_data *dd = &DictData[which];
140
141	dd->wordnum = 0;
142	dd->words_read = 0;
143	dd->bytes_diff = 0;
144	dd->HashSize = INITIAL_HASH_SIZE;
145	dd->HashUsed = 0;
146
147	if (!(dd->HashTable = (hash_rec ) Xmalloc (sizeof (hash_rec) dd->HashSize))) {
148	Message ("Unable to allocate memory for table");
149	return (COMPERROR);
150	}
151	ChangeMem (sizeof (hash_rec) * dd->HashSize);
152	bzero ((char ) (dd->HashTable), sizeof (hash_rec) dd->HashSize);
153
154	word = Pool;
155	*Pool++ = '\0';
156	PoolLeft--;
157	{
158	register u_char *wptr;
159	register int hsize = dd->HashSize;
160	register unsigned long hashval, step;
161
162	HASH (hashval, step, word, hsize);
163	wptr = (dd->HashTable + hashval)->word;
164	while (wptr) {
165	hashval += step;
166	if (hashval >= (unsigned long)hsize)
167	hashval -= hsize;
168	wptr = (dd->HashTable + hashval)->word;
169	}
170	ent = dd->HashTable + hashval;
171	}
172	ent->wcnt = 1;
173	ent->word = word;
174	dd->HashUsed = 1;
175	}
176	return (COMPALLOK);
177	}
178
179
180	static int process_text_element (const u_char *s_in, int l_in) {
181	const u_char *end = s_in + l_in - 1;
182
183	/*
184	** Alternately parse off words and non-words from the input
185	** stream beginning with a non-word. Each token is then
186	** inserted into the set if it does not exist or has it's
187	** frequency count incremented if it does.
188	*/
189
190	bool which = false; // non-word
191	for (; s_in <= end; which = !which) {
192	u_char Word[MAXWORDLEN + 1];
193	dict_data *dd = &DictData[which];
194
195	/* First parse a word or non-word out of the string */
196	if (which) PARSE_WORD (Word, s_in, end);
197	else PARSE_NON_WORD (Word, s_in, end);
198
199	dd->wordnum++;
200	inputbytes += *Word;
201	dd->words_read++;
202
203	/* Search the hash table for Word */
204	{
205	register unsigned long hashval, step;
206	register int hsize = dd->HashSize;
207	HASH (hashval, step, Word, hsize);
208	for (;;) {
209	register u_char *s1;
210	register u_char *s2;
211	register int len;
212	register hash_rec *ent;
213	ent = dd->HashTable + hashval;
214	if (!ent->word) {
215	int len = *Word + 1;
216	if (len > PoolLeft) {
217	if (!(Pool = (u_char *) Xmalloc (POOL_SIZE))) {
218	Message ("Unable to allocate memory for pool");
219	return (COMPERROR);
220	}
221	PoolLeft = POOL_SIZE;
222	ChangeMem (POOL_SIZE);
223	}
224	ent->occurance_num = occurance_num++;
225	ent->wcnt = 1;
226	ent->word = Pool;
227	memcpy (Pool, Word, len);
228	Pool += len;
229	PoolLeft -= len;
230	dd->HashUsed++;
231	dd->bytes_diff += Word[0];
232	break;
233	}
234
235	/* Compare the words */
236	s1 = Word;
237	s2 = ent->word;
238	len = *s1 + 1;
239	for (; len; len--)
240	if (s1++ != s2++) break;
241
242	if (len) {
243	hashval = (hashval + step);
244	if (hashval >= (unsigned long)hsize) hashval -= hsize;
245	} else {
246	ent->wcnt++;
247	break;
248	}
249	}
250	}
251
252
253	if (dd->HashUsed >= dd->HashSize >> 1) {
254	hash_rec *ht;
255	unsigned long size;
256	unsigned long i;
257	size = prime (dd->HashSize * 2);
258	if (!(ht = (hash_rec ) Xmalloc (sizeof (hash_rec) size))) {
259	Message ("Unable to allocate memory for table");
260	return (COMPERROR);
261	}
262	ChangeMem (sizeof (hash_rec) * size);
263	bzero ((char ) ht, sizeof (hash_rec) size);
264
265	for (i = 0; i < dd->HashSize; i++)
266	if (dd->HashTable[i].word) {
267	register u_char *wptr;
268	register unsigned long hashval, step;
269
270	wptr = dd->HashTable[i].word;
271	HASH (hashval, step, wptr, size);
272	wptr = (ht + hashval)->word;
273	while (wptr) {
274	hashval += step;
275	if (hashval >= size) hashval -= size;
276	wptr = (ht + hashval)->word;
277	}
278	ht[hashval] = dd->HashTable[i];
279	}
280	Xfree (dd->HashTable);
281	ChangeMem (-sizeof (hash_rec) * dd->HashSize);
282	dd->HashTable = ht;
283	dd->HashSize = size;
284	}
285	}
286
287	return COMPALLOK;
288	}
289
290
291	int process_text_1 (const TagInfo &/tagInfo/, const TextElArray &doc) {
292	unsigned long textLen = 0;
293	unsigned long docLen = 0;
294	int retValue;
295
296	// process each text element in this document
297	TextElArray::const_iterator here = doc.begin();
298	TextElArray::const_iterator end = doc.end();
299	while (here != end) {
300	textLen = (*here).text.size();
301	docLen += textLen;
302
303	retValue = process_text_element ((*here).text.begin(), textLen);
304	if (retValue != COMPALLOK) return retValue;
305
306	here++;
307	}
308
309	// get max document length
310	if (docLen > longestDoc) longestDoc = docLen;
311
312	// update header information
313	csh.num_docs++;
314	csh.num_bytes += docLen;
315
316	return COMPALLOK;
317	}
318
319
320	static int PackHashTable (dict_data * dd) {
321	int s, d;
322	for (s = d = 0; (unsigned int)s < dd->HashSize; s++)
323	if (dd->HashTable[s].word)
324	dd->HashTable[d++] = dd->HashTable[s];
325
326	ChangeMem (-sizeof (hash_rec) * dd->HashSize);
327	ChangeMem (sizeof (hash_rec) * dd->HashUsed);
328
329	if (!(dd->HashTable = (hash_rec *) Xrealloc (dd->HashTable,
330	sizeof (hash_rec) * dd->HashUsed))) {
331	Message ("Out of memory");
332	return COMPERROR;
333	}
334	dd->HashSize = dd->HashUsed;
335	return COMPALLOK;
336	}
337
338
339	static int ent_comp (const void s1, const void s2) {
340	return casecompare (((hash_rec ) s1)->word, ((hash_rec ) s2)->word);
341	}
342
343
344	static void WriteHashTable (FILE * fp, dict_data * dd) {
345	frags_stats_header fsh;
346	u_long j = 0;
347	u_char *curr;
348
349	if (PackHashTable (dd) == COMPERROR) return;
350
351	qsort (dd->HashTable, dd->HashUsed, sizeof (hash_rec), ent_comp);
352
353	fsh.num_frags = dd->HashSize;
354	fsh.mem_for_frags = dd->HashSize;
355	for (j = 0; j < dd->HashSize; j++)
356	fsh.mem_for_frags += dd->HashTable[j].word[0];
357
358	/* [RPAP - Jan 97: Endian Ordering] */
359	HTONUL(fsh.num_frags);
360	HTONUL(fsh.mem_for_frags);
361
362	fwrite (&fsh, sizeof (fsh), 1, fp);
363
364	for (j = 0; j < dd->HashSize; j++) {
365	curr = dd->HashTable[j].word;
366
367	/* [RPAP - Jan 97: Endian Ordering] */
368	HTONUL(dd->HashTable[j].wcnt);
369	HTONUL(dd->HashTable[j].occurance_num);
370
371	fwrite (&dd->HashTable[j].wcnt, sizeof (dd->HashTable[j].wcnt), 1, fp);
372	fwrite (&dd->HashTable[j].occurance_num,
373	sizeof (dd->HashTable[j].occurance_num), 1, fp);
374
375	/* [RPAP - Jan 97: Endian Ordering] */
376	NTOHUL(dd->HashTable[j].wcnt);
377	NTOHUL(dd->HashTable[j].occurance_num);
378
379	fwrite (curr, sizeof (u_char), curr[0] + 1, fp);
380	}
381	}
382
383
384	int done_text_1 (const TagInfo &/tagInfo/, char *file_name) {
385	char *temp_str;
386	FILE *fp;
387
388	if (!(fp = create_file (file_name, TEXT_STATS_DICT_SUFFIX, "wb",
389	MAGIC_STATS_DICT, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
390	return COMPERROR;
391
392	temp_str = msg_prefix;
393	msg_prefix = "text.pass1";
394
395	/* [RPAP - Jan 97: Endian Ordering] */
396	HTONUL(csh.num_docs);
397	HTOND(csh.num_bytes); /* [RJM 07/97: 4G limit] */
398
399	fwrite (&csh, sizeof (csh), 1, fp);
400
401	/* [RPAP - Jan 97: Endian Ordering] */
402	NTOHUL(csh.num_docs);
403	NTOHD(csh.num_bytes); /* [RJM 07/97: 4G limit] */
404
405	WriteHashTable (fp, &DictData[0]);
406	WriteHashTable (fp, &DictData[1]);
407	msg_prefix = temp_str;
408	return COMPALLOK;
409	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: