1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * ivf.pass2.c -- Memory efficient pass 2 inversion
|
---|
4 | * Copyright (C) 1994 Neil Sharman
|
---|
5 | *
|
---|
6 | * This program is free software; you can redistribute it and/or modify
|
---|
7 | * it under the terms of the GNU General Public License as published by
|
---|
8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
9 | * (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This program is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | * GNU General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU General Public License
|
---|
17 | * along with this program; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 | *
|
---|
20 | * $Id: ivf.pass2.c 439 1999-08-10 21:23:37Z sjboddie $
|
---|
21 | *
|
---|
22 | **************************************************************************/
|
---|
23 |
|
---|
24 | /*
|
---|
25 | $Log$
|
---|
26 | Revision 1.1 1999/08/10 21:17:54 sjboddie
|
---|
27 | renamed mg-1.3d directory mg
|
---|
28 |
|
---|
29 | Revision 1.3 1998/12/17 09:12:51 rjmcnab
|
---|
30 |
|
---|
31 | Altered mg to process utf-8 encoded Unicode. The main changes
|
---|
32 | are in the parsing of the input, the casefolding, and the stemming.
|
---|
33 |
|
---|
34 | Revision 1.2 1998/11/25 07:55:43 rjmcnab
|
---|
35 |
|
---|
36 | Modified mg to that you can specify the stemmer you want
|
---|
37 | to use via a command line option. You specify it to
|
---|
38 | mg_passes during the build process. The number of the
|
---|
39 | stemmer that you used is stored within the inverted
|
---|
40 | dictionary header and the stemmed dictionary header so
|
---|
41 | the correct stemmer is used in later stages of building
|
---|
42 | and querying.
|
---|
43 |
|
---|
44 | Revision 1.1 1998/11/17 09:34:45 rjmcnab
|
---|
45 | *** empty log message ***
|
---|
46 |
|
---|
47 | * Revision 1.3 1994/10/20 03:56:49 tes
|
---|
48 | * I have rewritten the boolean query optimiser and abstracted out the
|
---|
49 | * components of the boolean query.
|
---|
50 | *
|
---|
51 | * Revision 1.2 1994/09/20 04:41:35 tes
|
---|
52 | * For version 1.1
|
---|
53 | *
|
---|
54 | */
|
---|
55 |
|
---|
56 | static char *RCSID = "$Id: ivf.pass2.c 439 1999-08-10 21:23:37Z sjboddie $";
|
---|
57 |
|
---|
58 | #include "local_strings.h"
|
---|
59 | #include "sysfuncs.h"
|
---|
60 | #include "memlib.h"
|
---|
61 | #include "messages.h"
|
---|
62 | #include "stemmer.h"
|
---|
63 | #include "perf_hash.h"
|
---|
64 | #include "bitio_m.h"
|
---|
65 | #include "bitio_m_mems.h"
|
---|
66 | #include "bitio_gen.h"
|
---|
67 | #include "bitio_random.h"
|
---|
68 | #include "bitio_stdio.h"
|
---|
69 | #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
|
---|
70 |
|
---|
71 | #include "mg_files.h"
|
---|
72 | #include "invf.h"
|
---|
73 | #include "locallib.h"
|
---|
74 | #include "mg.h"
|
---|
75 | #include "build.h"
|
---|
76 | #include "words.h"
|
---|
77 | #include "hash.h"
|
---|
78 |
|
---|
79 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
80 | #ifdef __WIN32__
|
---|
81 | #include <io.h>
|
---|
82 | #endif
|
---|
83 |
|
---|
/* Buffer size handed to BIO_Random_Start() for every random-access bit
   stream in this file (also used for the memory accounting).  The
   expansion is parenthesised so that the macro behaves as a single value
   inside larger expressions such as a division. */
#ifndef RND_BUF_SIZE
#define RND_BUF_SIZE (8*1024)
/*#define RND_BUF_SIZE 128 */
#endif
|
---|
88 |
|
---|
/* Report the current size of the inverted file through Message().
   st_size is an off_t, so it is cast to long to match the %ld
   conversion.
   NOTE(review): the `file' argument is ignored -- the macro always
   stats invf_out.  Confirm against the callers before changing that. */
#define print_fsize(file)\
  do\
    {\
      struct stat file_state;\
      fstat(fileno(invf_out), &file_state);\
      Message("len(invf) = %ld", (long) file_state.st_size);\
    }while(0)
|
---|
96 |
|
---|
/* Per-term state kept in memory while a chunk is being built. */
typedef struct word_rec {
  /* Bit position inside MemoryBuffer where the next pointer for this
     term is encoded (LoadCounts briefly stores the term's bit length
     here before turning it into an offset). */
  unsigned long ptr;
  /* Document number most recently encoded for this term. */
  unsigned long last;
} word_rec;
|
---|
103 |
|
---|
/* Per-term state of the on-disk inverted file; records of this type are
   paged through the invf_state scratch file by in_cache(). */
typedef struct invf_state_rec {
  /* Bit position in the inverted file where this term's next entry
     will be written. */
  unsigned long Disk_Ptr;
  /* Last document number already written to disk for this term. */
  unsigned long Disk_Last;
  /* Bblock parameter used when encoding this term's gaps to disk. */
  unsigned long Disk_B;
} invf_state_rec;
|
---|
111 |
|
---|
/* Bookkeeping for one chunk that has been inverted in memory and is
   waiting to be merged into the inverted file. */
typedef struct chunk {
  /* Value of callnum when the chunk was started. */
  unsigned long start_doc;
  /* Position in the chunk_state bit stream of this chunk's per-term
     counts. */
  unsigned long params_pos;
  /* Bit offset of the chunk's data inside the chunks scratch file. */
  unsigned long disk_pos;
  /* Number of documents in the chunk. */
  unsigned long N;
} chunk;
|
---|
120 |
|
---|
121 |
|
---|
122 | static FILE *dict; /* Stemmed dictionary file */
|
---|
123 | static FILE *hash; /* Stemmed dictionary hash file */
|
---|
124 | static FILE *invf; /* Inverted file */
|
---|
125 | static FILE *invf_in; /* Inverted file */
|
---|
126 | static FILE *invf_out; /* Inverted file */
|
---|
127 | static FILE *invf_idx; /* Inverted index file */
|
---|
128 | static FILE *count; /* Count file */
|
---|
129 | static FILE *count_trans; /* Count translation file */
|
---|
130 | static FILE *invf_state; /* Inverted file State */
|
---|
131 | static FILE *chunk_state; /* Chunk state */
|
---|
132 | static FILE *chunks; /* Chunk state */
|
---|
133 | static FILE *invf_para = NULL; /* Paragraph counts file */
|
---|
134 | static FILE *weights = NULL; /* Weights file */
|
---|
135 |
|
---|
136 | static stdio_bitio_state sbs;
|
---|
137 | static random_bitio_state crbs;
|
---|
138 | static chunk *chunk_data = NULL;
|
---|
139 | static random_bitio_state rbs, rbsp;
|
---|
140 |
|
---|
141 | static int docs_left = 0, next_docs_left = 0;
|
---|
142 | static unsigned long N;
|
---|
143 |
|
---|
144 | static word_rec *WordRecs;
|
---|
145 | static u_char *lg_bs;
|
---|
146 | static float *idf = NULL;
|
---|
147 |
|
---|
148 | static char *MemoryBuffer = NULL;
|
---|
149 | static unsigned long MemBufSize;
|
---|
150 | static unsigned long BufToUse;
|
---|
151 | static struct invf_dict_header idh;
|
---|
152 |
|
---|
153 | static perf_hash_data *phd;
|
---|
154 |
|
---|
155 | static unsigned long *word_list = NULL;
|
---|
156 | static unsigned long wl_size = 0;
|
---|
157 |
|
---|
158 | static unsigned long dict_size;
|
---|
159 | static unsigned long no_of_ptrs = 0;
|
---|
160 | static unsigned long chunks_read = 0;
|
---|
161 | static unsigned long Disk_pos = 0;
|
---|
162 | static unsigned long callnum = 0;
|
---|
163 | static unsigned long wordnum = 0;
|
---|
164 |
|
---|
165 | static unsigned long totalIbytes = 0;
|
---|
166 | static unsigned long totalDbytes = 0;
|
---|
167 | static unsigned long totalHbytes = 0;
|
---|
168 |
|
---|
169 | static unsigned long MemInUse = 0;
|
---|
170 | static unsigned long MaxMemInUse = 0;
|
---|
171 | static unsigned long max_buffer_len;
|
---|
172 |
|
---|
173 | void
|
---|
174 | ChangeMemInUse (int mem)
|
---|
175 | {
|
---|
176 | MemInUse += mem;
|
---|
177 | if (MemInUse > MaxMemInUse)
|
---|
178 | MaxMemInUse = MemInUse;
|
---|
179 | }
|
---|
180 |
|
---|
181 |
|
---|
182 |
|
---|
183 |
|
---|
184 | static int
|
---|
185 | open_files (char *file_name)
|
---|
186 | {
|
---|
187 | char FName[200];
|
---|
188 |
|
---|
189 | if (!(dict = open_file (file_name, INVF_DICT_SUFFIX, "rb",
|
---|
190 | MAGIC_STEM_BUILD, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
191 | return (COMPERROR);
|
---|
192 |
|
---|
193 | if (!(hash = open_file (file_name, INVF_DICT_HASH_SUFFIX, "rb",
|
---|
194 | MAGIC_HASH, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
195 | return (COMPERROR);
|
---|
196 |
|
---|
197 | if (!(count = open_file (file_name, INVF_CHUNK_SUFFIX, "rb",
|
---|
198 | MAGIC_CHUNK, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
199 | return (COMPERROR);
|
---|
200 | fread (&max_buffer_len, sizeof (max_buffer_len), 1, count);
|
---|
201 | NTOHUL(max_buffer_len); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
202 |
|
---|
203 | BIO_Stdio_Decode_Start (count, &sbs);
|
---|
204 | next_docs_left = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
|
---|
205 |
|
---|
206 | if (!(count_trans = open_file (file_name, INVF_CHUNK_TRANS_SUFFIX, "rb",
|
---|
207 | MAGIC_CHUNK_TRANS, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
208 | return (COMPERROR);
|
---|
209 |
|
---|
210 | if (!(invf = create_file (file_name, INVF_SUFFIX, "w+b",
|
---|
211 | MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
212 | return (COMPERROR);
|
---|
213 | fflush (invf);
|
---|
214 | if (!(invf_in = open_file (file_name, INVF_SUFFIX, "rb",
|
---|
215 | MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
216 | return (COMPERROR);
|
---|
217 | if (!(invf_out = create_file (file_name, INVF_SUFFIX, "wb",
|
---|
218 | MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
219 | return (COMPERROR);
|
---|
220 | BIO_Random_Start (invf, RND_BUF_SIZE, &rbs);
|
---|
221 | BIO_Random_Start (invf, RND_BUF_SIZE, &rbsp);
|
---|
222 | ChangeMemInUse (RND_BUF_SIZE * 2);
|
---|
223 |
|
---|
224 | if (!(invf_idx = create_file (file_name, INVF_IDX_SUFFIX, "wb",
|
---|
225 | MAGIC_INVI, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
226 | return (COMPERROR);
|
---|
227 |
|
---|
228 | if (InvfLevel == 3)
|
---|
229 | if (!(invf_para = create_file (file_name, INVF_PARAGRAPH_SUFFIX, "wb",
|
---|
230 | MAGIC_PARAGRAPH, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
231 | return (COMPERROR);
|
---|
232 |
|
---|
233 | sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
|
---|
234 | ".invf.state", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
235 | if (!(invf_state = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
236 | {
|
---|
237 | Message ("Unable to create \"%s\"", FName);
|
---|
238 | return (COMPERROR);
|
---|
239 | }
|
---|
240 | unlink (FName);
|
---|
241 |
|
---|
242 | sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
|
---|
243 | ".chunk.state", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
244 | if (!(chunk_state = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
245 | {
|
---|
246 | Message ("Unable to create \"%s\"", FName);
|
---|
247 | return (COMPERROR);
|
---|
248 | }
|
---|
249 | unlink (FName);
|
---|
250 | BIO_Random_Start (chunk_state, RND_BUF_SIZE, &crbs);
|
---|
251 | ChangeMemInUse (RND_BUF_SIZE);
|
---|
252 |
|
---|
253 | sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
|
---|
254 | ".chunks", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
255 | if (!(chunks = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
256 | {
|
---|
257 | Message ("Unable to create \"%s\"", FName);
|
---|
258 | return (COMPERROR);
|
---|
259 | }
|
---|
260 | unlink (FName);
|
---|
261 |
|
---|
262 | return (COMPALLOK);
|
---|
263 | }
|
---|
264 |
|
---|
265 |
|
---|
266 |
|
---|
267 |
|
---|
268 |
|
---|
269 | #define ISR_CACHE 1024
|
---|
270 | #define ISR_ENTRY_SIZE (sizeof(unsigned long)*2 + sizeof(unsigned long))
|
---|
271 |
|
---|
272 | invf_state_rec *
|
---|
273 | in_cache (int pos)
|
---|
274 | {
|
---|
275 | static char isr_data[ISR_CACHE * ISR_ENTRY_SIZE];
|
---|
276 | static invf_state_rec isr;
|
---|
277 | static int isr_base = 0, isr_num = -1, isr_pos = -1;
|
---|
278 | if (isr_pos >= 0)
|
---|
279 | bcopy ((char *) &isr, &isr_data[isr_pos * ISR_ENTRY_SIZE], ISR_ENTRY_SIZE);
|
---|
280 | if (pos < isr_base || pos >= isr_base + isr_num)
|
---|
281 | {
|
---|
282 | if (isr_num >= 0)
|
---|
283 | {
|
---|
284 | fseek (invf_state, isr_base * ISR_ENTRY_SIZE, 0);
|
---|
285 | fwrite (isr_data, 1, ISR_ENTRY_SIZE * isr_num, invf_state);
|
---|
286 | }
|
---|
287 | isr_base = pos;
|
---|
288 | fseek (invf_state, isr_base * ISR_ENTRY_SIZE, 0);
|
---|
289 | fread (isr_data, 1, ISR_ENTRY_SIZE * ISR_CACHE, invf_state);
|
---|
290 | isr_num = ISR_CACHE;
|
---|
291 | }
|
---|
292 | isr_pos = pos - isr_base;
|
---|
293 | bcopy (&isr_data[isr_pos * ISR_ENTRY_SIZE], (char *) &isr, ISR_ENTRY_SIZE);
|
---|
294 | return &isr;
|
---|
295 | }
|
---|
296 |
|
---|
297 |
|
---|
298 |
|
---|
299 |
|
---|
300 |
|
---|
301 | unsigned long
|
---|
302 | occur_to_lexical (long occ)
|
---|
303 | {
|
---|
304 | static long pos = -1;
|
---|
305 | static random_bitio_state rbs;
|
---|
306 | static int val = 0;
|
---|
307 | if (pos == -1)
|
---|
308 | {
|
---|
309 | BIO_Random_Start (count_trans, RND_BUF_SIZE, &rbs);
|
---|
310 | pos = 0x7fffffff;
|
---|
311 | }
|
---|
312 | if (occ < pos)
|
---|
313 | {
|
---|
314 | if (occ == -1)
|
---|
315 | {
|
---|
316 | BIO_Random_Done (&rbs);
|
---|
317 | return 0;
|
---|
318 | }
|
---|
319 | BIO_Random_Seek (32, &rbs);
|
---|
320 | pos = 0;
|
---|
321 | }
|
---|
322 | while (pos <= occ)
|
---|
323 | {
|
---|
324 | val = BIO_Random_Binary_Decode (dict_size + 1, &rbs, NULL) - 1;
|
---|
325 | pos++;
|
---|
326 | }
|
---|
327 | return (val);
|
---|
328 | }
|
---|
329 |
|
---|
330 |
|
---|
331 | void
|
---|
332 | add_chunk_state (unsigned long pos, unsigned long start_doc,
|
---|
333 | unsigned long N)
|
---|
334 | {
|
---|
335 | chunk_data[chunks_read].params_pos = pos;
|
---|
336 | chunk_data[chunks_read].start_doc = start_doc;
|
---|
337 | chunk_data[chunks_read].N = N;
|
---|
338 | chunks_read++;
|
---|
339 | }
|
---|
340 |
|
---|
341 |
|
---|
342 | int
|
---|
343 | init_ivf_2 (char *file_name)
|
---|
344 | {
|
---|
345 | u_char prev[MAXSTEMLEN + 1];
|
---|
346 | int i;
|
---|
347 | unsigned long totalIbits;
|
---|
348 | double logN = 0.0;
|
---|
349 |
|
---|
350 | if (open_files (file_name) == COMPERROR)
|
---|
351 | return COMPERROR;
|
---|
352 |
|
---|
353 |
|
---|
354 | /* Read in the stemmed dictionary file header */
|
---|
355 | fread ((char *) &idh, sizeof (idh), 1, dict);
|
---|
356 |
|
---|
357 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
358 | NTOHUL(idh.lookback);
|
---|
359 | NTOHUL(idh.dict_size);
|
---|
360 | NTOHUL(idh.total_bytes);
|
---|
361 | NTOHUL(idh.index_string_bytes);
|
---|
362 | NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
|
---|
363 | NTOHUL(idh.num_of_docs);
|
---|
364 | NTOHUL(idh.static_num_of_docs);
|
---|
365 | NTOHUL(idh.num_of_words);
|
---|
366 | NTOHUL(idh.stemmer_num);
|
---|
367 | NTOHUL(idh.stem_method);
|
---|
368 |
|
---|
369 | dict_size = idh.dict_size;
|
---|
370 |
|
---|
371 | N = idh.num_of_docs;
|
---|
372 |
|
---|
373 | if (!(phd = read_perf_hash_data (hash)))
|
---|
374 | {
|
---|
375 | Message ("Unable to read in hash data");
|
---|
376 | return COMPERROR;
|
---|
377 | }
|
---|
378 | totalHbytes = sizeof (perf_hash_data) + sizeof (u_char) * 256 +
|
---|
379 | sizeof (int) * (phd->MAX_N + 1) + sizeof (int *) * 3 * phd->MAX_CH +
|
---|
380 | sizeof (long) * phd->MAX_CH * phd->MAX_L;
|
---|
381 |
|
---|
382 | if (!(WordRecs = Xmalloc (sizeof (word_rec) * idh.dict_size)))
|
---|
383 | {
|
---|
384 | Message ("No memory for word entries");
|
---|
385 | return COMPERROR;
|
---|
386 | }
|
---|
387 | totalDbytes += sizeof (word_rec) * idh.dict_size;
|
---|
388 |
|
---|
389 | /* separate storage for the log(b) values, one byte each */
|
---|
390 | if (!(lg_bs = Xmalloc (sizeof (u_char) * idh.dict_size)))
|
---|
391 | {
|
---|
392 | Message ("No memory for lg b's");
|
---|
393 | return COMPERROR;
|
---|
394 | }
|
---|
395 | totalDbytes += sizeof (u_char) * idh.dict_size;
|
---|
396 |
|
---|
397 | if (MakeWeights)
|
---|
398 | {
|
---|
399 | /* separate storage for the idf values, one single each */
|
---|
400 | if (!(idf = Xmalloc (sizeof (float) * idh.dict_size)))
|
---|
401 | {
|
---|
402 | Message ("No memory for idf's");
|
---|
403 | return COMPERROR;
|
---|
404 | }
|
---|
405 | totalDbytes += sizeof (float) * idh.dict_size;
|
---|
406 |
|
---|
407 | if (!(weights = create_file (file_name, WEIGHTS_SUFFIX, "wb",
|
---|
408 | MAGIC_WGHT, MG_CONTINUE))) { /* [RPAP - Feb 97: WIN32 Port] */
|
---|
409 | Message ("Couldn't open weights file for writing");
|
---|
410 | return (COMPERROR);
|
---|
411 | }
|
---|
412 | }
|
---|
413 | else
|
---|
414 | {
|
---|
415 | unlink (make_name (file_name, WEIGHTS_SUFFIX, NULL));
|
---|
416 | }
|
---|
417 |
|
---|
418 | chunk_data = Xmalloc (sizeof (chunk) * (ChunkLimit + 2));
|
---|
419 | totalDbytes += sizeof (chunk) * (ChunkLimit + 2);
|
---|
420 |
|
---|
421 | totalIbits = 32; /* The magic number */
|
---|
422 | totalIbits += 8 * 100; /* A 100 byte gap */
|
---|
423 |
|
---|
424 | if (MakeWeights)
|
---|
425 | {
|
---|
426 | wl_size = 1024;
|
---|
427 | if (!(word_list = Xmalloc (sizeof (*word_list) * wl_size)))
|
---|
428 | {
|
---|
429 | Message ("No memory for word_list");
|
---|
430 | return COMPERROR;
|
---|
431 | }
|
---|
432 |
|
---|
433 | logN = log ((double) N);
|
---|
434 | }
|
---|
435 |
|
---|
436 | for (i = 0; i < idh.dict_size; i++)
|
---|
437 | {
|
---|
438 | invf_state_rec *isr;
|
---|
439 | register unsigned long copy, suff, p;
|
---|
440 | unsigned long fcnt, wcnt;
|
---|
441 |
|
---|
442 | copy = fgetc (dict);
|
---|
443 | suff = fgetc (dict);
|
---|
444 | *prev = copy + suff;
|
---|
445 | fread (prev + copy + 1, sizeof (u_char), suff, dict);
|
---|
446 |
|
---|
447 | fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
|
---|
448 | fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
|
---|
449 |
|
---|
450 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
451 | NTOHUL(fcnt);
|
---|
452 | NTOHUL(wcnt);
|
---|
453 |
|
---|
454 | WordRecs[i].last = 0;
|
---|
455 | WordRecs[i].ptr = 0;
|
---|
456 |
|
---|
457 | p = fcnt;
|
---|
458 |
|
---|
459 | if (MakeWeights)
|
---|
460 | idf[i] = logN - log ((double) fcnt);
|
---|
461 |
|
---|
462 |
|
---|
463 | isr = in_cache (i);
|
---|
464 |
|
---|
465 | isr->Disk_Last = 0;
|
---|
466 | isr->Disk_Ptr = totalIbits;
|
---|
467 |
|
---|
468 | isr->Disk_B = BIO_Bblock_Init (N, p);
|
---|
469 |
|
---|
470 | totalIbits += BIO_Bblock_Bound_b (N, p, isr->Disk_B);
|
---|
471 |
|
---|
472 | if (InvfLevel >= 2)
|
---|
473 | totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
|
---|
474 |
|
---|
475 | totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
|
---|
476 |
|
---|
477 | }
|
---|
478 |
|
---|
479 |
|
---|
480 | /* now convert to bytes, and actually get the space */
|
---|
481 | totalIbytes = (totalIbits + 7ul) >> 3ul;
|
---|
482 |
|
---|
483 |
|
---|
484 | return (COMPALLOK);
|
---|
485 |
|
---|
486 | }
|
---|
487 |
|
---|
488 |
|
---|
489 |
|
---|
490 |
|
---|
491 |
|
---|
492 | static void
|
---|
493 | LoadCounts (void)
|
---|
494 | {
|
---|
495 | unsigned long numwords, i, last_total;
|
---|
496 | static unsigned long local_N = 0;
|
---|
497 | unsigned long totalIbits, crbs_pos;
|
---|
498 | word_rec *wr;
|
---|
499 | unsigned long *counts;
|
---|
500 |
|
---|
501 | if (MemoryBuffer == NULL)
|
---|
502 | {
|
---|
503 | MemBufSize = sizeof (unsigned long) * dict_size;
|
---|
504 | if (max_buffer_len > MemBufSize)
|
---|
505 | MemBufSize = max_buffer_len;
|
---|
506 | if (!(MemoryBuffer = Xmalloc (MemBufSize)))
|
---|
507 | FatalError (1, "Unable to allocate memory for buffer");
|
---|
508 | ChangeMemInUse (MemBufSize);
|
---|
509 | }
|
---|
510 |
|
---|
511 | counts = (unsigned long *) MemoryBuffer;
|
---|
512 | bzero ((char *) counts, sizeof (unsigned long) * dict_size);
|
---|
513 |
|
---|
514 | docs_left = next_docs_left;
|
---|
515 | if (!docs_left)
|
---|
516 | FatalError (1, "The number of docs in the current chunk is 0");
|
---|
517 |
|
---|
518 | BufToUse = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
|
---|
519 |
|
---|
520 | numwords = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
|
---|
521 |
|
---|
522 | local_N = docs_left;
|
---|
523 |
|
---|
524 |
|
---|
525 |
|
---|
526 | for (wr = WordRecs, i = 0; i < dict_size; i++, wr++)
|
---|
527 | wr->ptr = 0;
|
---|
528 |
|
---|
529 | bzero ((char *) lg_bs, dict_size);
|
---|
530 |
|
---|
531 | for (i = 0; i < numwords; i++)
|
---|
532 | {
|
---|
533 | unsigned long word_num, wcnt, fcnt, p;
|
---|
534 | word_num = occur_to_lexical (i);
|
---|
535 |
|
---|
536 | wr = &WordRecs[word_num];
|
---|
537 |
|
---|
538 | wcnt = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
|
---|
539 | if (wcnt >= 2)
|
---|
540 | fcnt = BIO_Stdio_Gamma_Decode (&sbs, NULL);
|
---|
541 | else
|
---|
542 | fcnt = wcnt;
|
---|
543 |
|
---|
544 | p = fcnt;
|
---|
545 |
|
---|
546 | if (wcnt)
|
---|
547 | {
|
---|
548 | register unsigned long length;
|
---|
549 | counts[word_num] = p;
|
---|
550 | length = BIO_Bblock_Bound (local_N, p);
|
---|
551 | if (InvfLevel >= 2)
|
---|
552 | length += wcnt;
|
---|
553 | wr->ptr = length;
|
---|
554 | lg_bs[word_num] = floorlog_2 (BIO_Bblock_Init_W (local_N, p));
|
---|
555 | }
|
---|
556 |
|
---|
557 | }
|
---|
558 |
|
---|
559 | crbs_pos = BIO_Random_Tell (&crbs);
|
---|
560 |
|
---|
561 | totalIbits = 0;
|
---|
562 | last_total = 0;
|
---|
563 | for (wr = WordRecs, i = 0; i < dict_size; i++, wr++)
|
---|
564 | {
|
---|
565 | register unsigned long length;
|
---|
566 | length = wr->ptr;
|
---|
567 | wr->last = callnum;
|
---|
568 | BIO_Random_Gamma_Encode (counts[i] + 1, &crbs, NULL);
|
---|
569 | if (counts[i])
|
---|
570 | {
|
---|
571 | if (i)
|
---|
572 | BIO_Random_Delta_Encode (totalIbits - last_total + 1, &crbs, NULL);
|
---|
573 | else
|
---|
574 | BIO_Random_Delta_Encode (1, &crbs, NULL);
|
---|
575 |
|
---|
576 | last_total = totalIbits;
|
---|
577 | }
|
---|
578 | wr->ptr = totalIbits;
|
---|
579 | totalIbits += length;
|
---|
580 | }
|
---|
581 | add_chunk_state (crbs_pos, callnum, local_N);
|
---|
582 |
|
---|
583 | if ((totalIbits + 7ul) >> 3ul > BufToUse)
|
---|
584 | FatalError (1, "Pointers exceed buffer size");
|
---|
585 |
|
---|
586 | next_docs_left = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
|
---|
587 | }
|
---|
588 |
|
---|
589 |
|
---|
590 |
|
---|
591 |
|
---|
592 | static void
|
---|
593 | DumpChunk (void)
|
---|
594 | {
|
---|
595 | chunk_data[chunks_read - 1].disk_pos = Disk_pos << 3;
|
---|
596 | fseek (chunks, Disk_pos, 0);
|
---|
597 | fwrite (MemoryBuffer, sizeof (char), BufToUse, chunks);
|
---|
598 | Disk_pos += BufToUse;
|
---|
599 | }
|
---|
600 |
|
---|
601 |
|
---|
602 |
|
---|
603 |
|
---|
604 | static void
|
---|
605 | DiskMerge (void)
|
---|
606 | {
|
---|
607 | random_bitio_state *rbsi;
|
---|
608 | random_bitio_state *chks = NULL;
|
---|
609 | unsigned long *chunk_ptrs;
|
---|
610 | int i;
|
---|
611 |
|
---|
612 | BIO_Random_Flush (&crbs);
|
---|
613 |
|
---|
614 | chunk_ptrs = Xmalloc (chunks_read * sizeof (unsigned long));
|
---|
615 | ChangeMemInUse (chunks_read * sizeof (unsigned long));
|
---|
616 | bzero ((char *) chunk_ptrs, chunks_read * sizeof (unsigned long));
|
---|
617 |
|
---|
618 | rbsi = Xmalloc (chunks_read * sizeof (random_bitio_state));
|
---|
619 | ChangeMemInUse (chunks_read * sizeof (random_bitio_state));
|
---|
620 | for (i = 0; i < chunks_read; i++)
|
---|
621 | {
|
---|
622 | rbsi[i] = crbs;
|
---|
623 | rbsi[i].Buf = Xmalloc (rbsi[i].len);
|
---|
624 | ChangeMemInUse (rbsi[i].len);
|
---|
625 | bcopy ((char *) (crbs.Buf), (char *) (rbsi[i].Buf), rbsi[i].len);
|
---|
626 | BIO_Random_Seek (chunk_data[i].params_pos, &rbsi[i]);
|
---|
627 | }
|
---|
628 |
|
---|
629 | if (chunks_read > 1)
|
---|
630 | {
|
---|
631 | int j;
|
---|
632 | chks = Xmalloc ((chunks_read - 1) * sizeof (random_bitio_state));
|
---|
633 | ChangeMemInUse ((chunks_read - 1) * sizeof (random_bitio_state));
|
---|
634 | BIO_Random_Start (chunks, RND_BUF_SIZE, &chks[0]);
|
---|
635 | ChangeMemInUse (RND_BUF_SIZE);
|
---|
636 | for (j = 1; j < chunks_read - 1; j++)
|
---|
637 | {
|
---|
638 | chks[j] = chks[0];
|
---|
639 | chks[j].Buf = Xmalloc (chks[0].len);
|
---|
640 | ChangeMemInUse (chks[0].len);
|
---|
641 | bcopy ((char *) (chks[0].Buf), (char *) (chks[j].Buf), chks[0].len);
|
---|
642 | }
|
---|
643 | }
|
---|
644 | for (i = 0; i < dict_size; i++)
|
---|
645 | {
|
---|
646 | int j;
|
---|
647 | invf_state_rec *isr = in_cache (i);
|
---|
648 | register int B;
|
---|
649 |
|
---|
650 | BIO_Random_Seek (isr->Disk_Ptr, &rbs); /* Position in invf file */
|
---|
651 |
|
---|
652 | B = isr->Disk_B;
|
---|
653 |
|
---|
654 | for (j = 0; j < chunks_read; j++)
|
---|
655 | {
|
---|
656 | int p;
|
---|
657 | p = BIO_Random_Gamma_Decode (&rbsi[j], NULL) - 1;
|
---|
658 |
|
---|
659 | if (p)
|
---|
660 | {
|
---|
661 | int ptr, b;
|
---|
662 | chunk_ptrs[j] += BIO_Random_Delta_Decode (&rbsi[j], NULL) - 1;
|
---|
663 | ptr = chunk_ptrs[j];
|
---|
664 | b = 1 << floorlog_2 (BIO_Bblock_Init_W (chunk_data[j].N, p));
|
---|
665 |
|
---|
666 | if (j == chunks_read - 1)
|
---|
667 | {
|
---|
668 | int k, CurrDoc;
|
---|
669 | DECODE_START ((u_char *) MemoryBuffer, ptr)
|
---|
670 | CurrDoc = isr->Disk_Last;
|
---|
671 | for (k = 0; k < p; k++)
|
---|
672 | {
|
---|
673 | register unsigned long x, tf;
|
---|
674 | BBLOCK_DECODE (x, b);
|
---|
675 | if (k == 0)
|
---|
676 | x = x + chunk_data[j].start_doc - isr->Disk_Last;
|
---|
677 | CurrDoc += x;
|
---|
678 | BIO_Random_Bblock_Encode (x, B, &rbs, NULL);
|
---|
679 | if (InvfLevel >= 2)
|
---|
680 | {
|
---|
681 | UNARY_DECODE (tf);
|
---|
682 | BIO_Random_Gamma_Encode (tf, &rbs, NULL);
|
---|
683 | }
|
---|
684 | }
|
---|
685 | DECODE_DONE
|
---|
686 | isr->Disk_Last = CurrDoc;
|
---|
687 | }
|
---|
688 | else
|
---|
689 | {
|
---|
690 | int k, CurrDoc;
|
---|
691 | random_bitio_state *Chks = chks + j;
|
---|
692 | BIO_Random_Seek (chunk_data[j].disk_pos + ptr, Chks);
|
---|
693 | CurrDoc = isr->Disk_Last;
|
---|
694 | for (k = 0; k < p; k++)
|
---|
695 | {
|
---|
696 | register unsigned long x, tf;
|
---|
697 | x = BIO_Random_Bblock_Decode (b, Chks, NULL);
|
---|
698 | if (k == 0)
|
---|
699 | x = x + chunk_data[j].start_doc - isr->Disk_Last;
|
---|
700 | CurrDoc += x;
|
---|
701 | BIO_Random_Bblock_Encode (x, B, &rbs, NULL);
|
---|
702 | if (InvfLevel >= 2)
|
---|
703 | {
|
---|
704 | tf = BIO_Random_Unary_Decode (Chks, NULL);
|
---|
705 | BIO_Random_Gamma_Encode (tf, &rbs, NULL);
|
---|
706 | }
|
---|
707 | }
|
---|
708 | isr->Disk_Last = CurrDoc;
|
---|
709 | }
|
---|
710 | }
|
---|
711 | }
|
---|
712 |
|
---|
713 | isr->Disk_Ptr = BIO_Random_Tell (&rbs);
|
---|
714 |
|
---|
715 | }
|
---|
716 | if (chunks_read > 1)
|
---|
717 | {
|
---|
718 | int j;
|
---|
719 | for (j = 0; j < chunks_read - 1; j++)
|
---|
720 | {
|
---|
721 | Xfree (chks[j].Buf);
|
---|
722 | ChangeMemInUse (-chks[j].len);
|
---|
723 | }
|
---|
724 | Xfree (chks);
|
---|
725 | ChangeMemInUse (-(chunks_read - 1) * sizeof (random_bitio_state));
|
---|
726 | }
|
---|
727 |
|
---|
728 | for (i = 0; i < chunks_read; i++)
|
---|
729 | {
|
---|
730 | Xfree (rbsi[i].Buf);
|
---|
731 | ChangeMemInUse (-rbsi[i].len);
|
---|
732 | }
|
---|
733 | Xfree (rbsi);
|
---|
734 | ChangeMemInUse (-chunks_read * sizeof (random_bitio_state));
|
---|
735 | chunks_read = 0;
|
---|
736 | Xfree (chunk_ptrs);
|
---|
737 | ChangeMemInUse (-chunks_read * sizeof (unsigned long));
|
---|
738 | Disk_pos = 0;
|
---|
739 | BIO_Random_Seek (0, &crbs);
|
---|
740 | }
|
---|
741 |
|
---|
742 | static void
|
---|
743 | MergeIn (void)
|
---|
744 | {
|
---|
745 | static int disk_chunks = 0;
|
---|
746 | static header = 0;
|
---|
747 | if (!header)
|
---|
748 | {
|
---|
749 | fprintf (stderr, "ivf.pass2 : ");
|
---|
750 | header = 1;
|
---|
751 | }
|
---|
752 | if (disk_chunks == ChunkLimit || next_docs_left == 0)
|
---|
753 | {
|
---|
754 | fprintf (stderr, "M");
|
---|
755 | DiskMerge ();
|
---|
756 | disk_chunks = 0;
|
---|
757 | }
|
---|
758 | else
|
---|
759 | {
|
---|
760 | fprintf (stderr, "-");
|
---|
761 | DumpChunk ();
|
---|
762 | disk_chunks++;
|
---|
763 | }
|
---|
764 | if (next_docs_left == 0)
|
---|
765 | fprintf (stderr, "\n");
|
---|
766 | }
|
---|
767 |
|
---|
768 |
|
---|
/* qsort() comparison function for word_list, whose elements are
   unsigned long term numbers.  The elements are read with their real
   type (reading them through int * looks at only part of each element
   where long is wider than int) and ordered without subtraction, so no
   overflow can occur.
   Returns a negative, zero or positive value for a < b, a == b, a > b. */
static int
wl_comp (const void *a, const void *b)
{
  unsigned long lhs = *(const unsigned long *) a;
  unsigned long rhs = *(const unsigned long *) b;

  return (lhs > rhs) - (lhs < rhs);
}
|
---|
774 |
|
---|
775 | static int
|
---|
776 | process_doc (u_char * s_in, int l_in)
|
---|
777 | {
|
---|
778 | int res;
|
---|
779 | u_char *end = s_in + l_in - 1;
|
---|
780 | unsigned long tocode;
|
---|
781 | unsigned long wl_pos = 0;
|
---|
782 |
|
---|
783 | if (!docs_left)
|
---|
784 | LoadCounts ();
|
---|
785 |
|
---|
786 | callnum++;
|
---|
787 |
|
---|
788 | if (!inaword (s_in, end))
|
---|
789 | if (SkipSGML)
|
---|
790 | PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
|
---|
791 | else
|
---|
792 | PARSE_NON_STEM_WORD (s_in, end);
|
---|
793 |
|
---|
794 | while (s_in <= end)
|
---|
795 | {
|
---|
796 | u_char Word[MAXSTEMLEN + 1];
|
---|
797 |
|
---|
798 | PARSE_STEM_WORD (Word, s_in, end);
|
---|
799 | stemmer (idh.stem_method, idh.stemmer_num, Word);
|
---|
800 | if (SkipSGML)
|
---|
801 | PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
|
---|
802 | else
|
---|
803 | PARSE_NON_STEM_WORD (s_in, end);
|
---|
804 |
|
---|
805 | if (*Word == 0)
|
---|
806 | continue;
|
---|
807 |
|
---|
808 | res = perf_hash (phd, Word);
|
---|
809 |
|
---|
810 | {
|
---|
811 | word_rec *arr = &WordRecs[res];
|
---|
812 | int b = 1 << lg_bs[res];
|
---|
813 | wordnum++;
|
---|
814 |
|
---|
815 | tocode = callnum;
|
---|
816 |
|
---|
817 | ENCODE_START ((u_char *) MemoryBuffer, arr->ptr)
|
---|
818 |
|
---|
819 | if (tocode > arr->last)
|
---|
820 | {
|
---|
821 | register int x;
|
---|
822 | x = tocode - arr->last - 1;
|
---|
823 | BBLOCK_ENCODE (x + 1, b);
|
---|
824 | if (InvfLevel >= 2)
|
---|
825 | ENCODE_BIT (1);
|
---|
826 | no_of_ptrs++;
|
---|
827 | arr->last = tocode;
|
---|
828 | }
|
---|
829 | else if (InvfLevel >= 2)
|
---|
830 | {
|
---|
831 | __pos--;
|
---|
832 | ENCODE_BIT (0);
|
---|
833 | ENCODE_BIT (1);
|
---|
834 | }
|
---|
835 | arr->ptr = __pos;
|
---|
836 | ENCODE_DONE
|
---|
837 | }
|
---|
838 |
|
---|
839 | if (MakeWeights)
|
---|
840 | {
|
---|
841 | if (wl_pos >= wl_size)
|
---|
842 | {
|
---|
843 | wl_size += (wl_size >> 1);
|
---|
844 | word_list = Xrealloc (word_list, sizeof (*word_list) * wl_size);
|
---|
845 | }
|
---|
846 | word_list[wl_pos++] = res;
|
---|
847 | }
|
---|
848 | }
|
---|
849 | if (MakeWeights)
|
---|
850 | {
|
---|
851 | float doc_weight = 0.0;
|
---|
852 | if (wl_pos)
|
---|
853 | {
|
---|
854 | unsigned long *wl = word_list;
|
---|
855 | unsigned long i, count, val;
|
---|
856 | qsort (wl, wl_pos, sizeof (*wl), wl_comp);
|
---|
857 | count = 1;
|
---|
858 | val = *wl++;
|
---|
859 | for (i = 1; i <= wl_pos; i++, wl++)
|
---|
860 | if (i == wl_pos || val != *wl)
|
---|
861 | {
|
---|
862 | double weight = count * idf[val];
|
---|
863 | doc_weight += weight * weight;
|
---|
864 | count = 1;
|
---|
865 | val = *wl;
|
---|
866 | }
|
---|
867 | else
|
---|
868 | count++;
|
---|
869 | }
|
---|
870 | HTONF(doc_weight); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
871 | fwrite ((char *) &doc_weight, sizeof (doc_weight), 1, weights);
|
---|
872 | }
|
---|
873 | docs_left--;
|
---|
874 | if (!docs_left)
|
---|
875 | MergeIn ();
|
---|
876 |
|
---|
877 | return COMPALLOK;
|
---|
878 | }
|
---|
879 |
|
---|
880 | int
|
---|
881 | process_ivf_2 (u_char * s_in, int l_in)
|
---|
882 | {
|
---|
883 | if (InvfLevel <= 2)
|
---|
884 | return process_doc (s_in, l_in);
|
---|
885 | else
|
---|
886 | {
|
---|
887 | int count = 0;
|
---|
888 | int pos = 0;
|
---|
889 | u_char *start = s_in;
|
---|
890 | while (pos < l_in)
|
---|
891 | {
|
---|
892 | if (s_in[pos] == TERMPARAGRAPH)
|
---|
893 | {
|
---|
894 | int len = pos + s_in + 1 - start;
|
---|
895 | if (process_doc (start, len) != COMPALLOK)
|
---|
896 | return (COMPERROR);
|
---|
897 | start = s_in + pos + 1;
|
---|
898 | count++;
|
---|
899 | }
|
---|
900 | pos++;
|
---|
901 | }
|
---|
902 | if (start < s_in + pos)
|
---|
903 | {
|
---|
904 | if (process_doc (start, pos + s_in - start) != COMPALLOK)
|
---|
905 | return (COMPERROR);
|
---|
906 | count++;
|
---|
907 | }
|
---|
908 | HTONSI(count); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
909 | fwrite ((char *) &count, sizeof (count), 1, invf_para);
|
---|
910 | }
|
---|
911 | return COMPALLOK;
|
---|
912 | }
|
---|
913 |
|
---|
914 |
|
---|
915 |
|
---|
916 |
|
---|
917 |
|
---|
918 | static void
|
---|
919 | stats (unsigned long len)
|
---|
920 | {
|
---|
921 | #ifndef SILENT
|
---|
922 | fseek (count, 0, 2);
|
---|
923 | fseek (count_trans, 0, 2);
|
---|
924 | fseek (invf_state, 0, 2);
|
---|
925 | fseek (invf, 0, 0);
|
---|
926 | fseek (invf, 0, 2);
|
---|
927 | fseek (chunks, 0, 2);
|
---|
928 | fseek (chunk_state, 0, 2);
|
---|
929 | Message ("File sizes\n");
|
---|
930 | Message (" Chunk desc : %10u bytes\n", ftell (count));
|
---|
931 | Message (" Chunk trans : %10u bytes\n", ftell (count_trans));
|
---|
932 | Message (" Chunks : %10u bytes\n", ftell (chunks));
|
---|
933 | Message (" Chunk state : %10u bytes\n", ftell (chunk_state));
|
---|
934 | Message (" Invf state : %10u bytes\n", ftell (invf_state));
|
---|
935 | Message (" Peak invf : %10u bytes\n", len);
|
---|
936 | Message (" Final invf : %10u bytes\n", ftell (invf));
|
---|
937 | Message ("Peak disk usage : %10.2f %%\n",
|
---|
938 | (double) (ftell (count) + ftell (count_trans) +
|
---|
939 | ftell (invf_state) + ftell (chunks) +
|
---|
940 | ftell (chunk_state) + len) / ftell (invf) * 100.0);
|
---|
941 | #endif
|
---|
942 | }
|
---|
943 |
|
---|
944 |
|
---|
945 | /* ARGSUSED */
|
---|
946 | int
|
---|
947 | done_ivf_2 (char *FileName)
|
---|
948 | {
|
---|
949 | long i;
|
---|
950 | unsigned long totalIbits;
|
---|
951 | unsigned long invf_len;
|
---|
952 | unsigned long bytes_output;
|
---|
953 | struct invf_file_header ifh;
|
---|
954 |
|
---|
955 | if (weights)
|
---|
956 | fclose (weights);
|
---|
957 | if (invf_para)
|
---|
958 | fclose (invf_para);
|
---|
959 |
|
---|
960 | free_perf_hash (phd);
|
---|
961 |
|
---|
962 | free (MemoryBuffer);
|
---|
963 | ChangeMemInUse (-MemBufSize);
|
---|
964 |
|
---|
965 | BIO_Random_Done (&rbs);
|
---|
966 | BIO_Random_Done (&rbsp);
|
---|
967 | fflush (invf);
|
---|
968 |
|
---|
969 | fseek (invf, 0, 2);
|
---|
970 | invf_len = ftell (invf);
|
---|
971 |
|
---|
972 | fseek (invf_out, sizeof (long), 0);
|
---|
973 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
974 | HTONUL2(dict_size, ifh.no_of_words);
|
---|
975 | HTONUL2(no_of_ptrs, ifh.no_of_ptrs);
|
---|
976 | ifh.skip_mode = 0;
|
---|
977 | bzero ((char *) ifh.params, sizeof (ifh.params));
|
---|
978 | HTONUL2(InvfLevel, ifh.InvfLevel);
|
---|
979 | fwrite ((char *) &ifh, sizeof (ifh), 1, invf_out);
|
---|
980 |
|
---|
981 | bytes_output = ftell (invf_out);
|
---|
982 |
|
---|
983 | totalIbits = 32; /* The magic number */
|
---|
984 | totalIbits += 8 * 100; /* A 100 byte gap */
|
---|
985 |
|
---|
986 | /* find the right place in the file to start reading p values */
|
---|
987 | fseek (dict, sizeof (unsigned long) + sizeof (struct invf_dict_header), 0);
|
---|
988 | for (i = 0; i < dict_size; i++)
|
---|
989 | {
|
---|
990 | invf_state_rec *isr;
|
---|
991 | unsigned long fcnt, wcnt, s, e;
|
---|
992 | register unsigned long p;
|
---|
993 | u_char dummy1, dummy2[MAXSTEMLEN + 1];
|
---|
994 |
|
---|
995 | /* output location to the invf_idx */
|
---|
996 | HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
997 | fwrite ((char *) &bytes_output, sizeof (bytes_output), 1, invf_idx);
|
---|
998 | NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
999 |
|
---|
1000 | /* read an entry for a word, just to get p value */
|
---|
1001 | dummy1 = fgetc (dict);
|
---|
1002 | dummy1 = fgetc (dict);
|
---|
1003 | fread (dummy2, sizeof (u_char), dummy1, dict);
|
---|
1004 | fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
|
---|
1005 | fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
|
---|
1006 |
|
---|
1007 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
1008 | NTOHUL(fcnt);
|
---|
1009 | NTOHUL(wcnt);
|
---|
1010 |
|
---|
1011 | p = fcnt;
|
---|
1012 |
|
---|
1013 | isr = in_cache (i);
|
---|
1014 |
|
---|
1015 | e = (isr->Disk_Ptr + 7ul) >> 3ul;
|
---|
1016 | s = totalIbits >> 3;
|
---|
1017 |
|
---|
1018 | fseek (invf_in, s, 0);
|
---|
1019 | while (s < e)
|
---|
1020 | {
|
---|
1021 | u_char c = getc (invf_in);
|
---|
1022 | if (s == e - 1)
|
---|
1023 | {
|
---|
1024 | u_char ands[8] =
|
---|
1025 | {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
|
---|
1026 | c &= ands[isr->Disk_Ptr & 7ul];
|
---|
1027 | }
|
---|
1028 | putc (c, invf_out);
|
---|
1029 | bytes_output++;
|
---|
1030 | s++;
|
---|
1031 | }
|
---|
1032 |
|
---|
1033 | totalIbits += BIO_Bblock_Bound_b (N, p, isr->Disk_B);
|
---|
1034 | if (InvfLevel >= 2)
|
---|
1035 | totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
|
---|
1036 | totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
|
---|
1037 |
|
---|
1038 | }
|
---|
1039 |
|
---|
1040 | fclose (invf_in);
|
---|
1041 |
|
---|
1042 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1043 | #ifdef __WIN32__
|
---|
1044 | if (!(_chsize (_fileno (invf_out), bytes_output)))
|
---|
1045 | Message ("Could not truncate invf.");
|
---|
1046 | #else
|
---|
1047 | ftruncate (fileno (invf_out), bytes_output);
|
---|
1048 | #endif
|
---|
1049 |
|
---|
1050 | fclose (invf_out);
|
---|
1051 |
|
---|
1052 | HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
1053 | fwrite ((char *) &bytes_output, sizeof (bytes_output), 1, invf_idx);
|
---|
1054 | NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
1055 |
|
---|
1056 | fclose (invf_idx);
|
---|
1057 |
|
---|
1058 | #ifndef SILENT
|
---|
1059 | {
|
---|
1060 | char *temp_str = msg_prefix;
|
---|
1061 | unsigned long total;
|
---|
1062 | msg_prefix = "ivf.pass2";
|
---|
1063 | stats (invf_len);
|
---|
1064 | Message ("Pass two data structures : %6.3f Mbyte\n",
|
---|
1065 | (double) totalDbytes / 1024 / 1024);
|
---|
1066 | total = totalDbytes;
|
---|
1067 | Message ("Pass two hash structure(s) : %6.3f Mbyte\n",
|
---|
1068 | (double) totalHbytes / 1024 / 1024);
|
---|
1069 | total += totalHbytes;
|
---|
1070 | Message ("Peak extra memory in use : %6.3f Mbyte\n",
|
---|
1071 | (double) MaxMemInUse / 1024 / 1024);
|
---|
1072 | total += MaxMemInUse;
|
---|
1073 | Message ("Peak total memory in use : %6.3f Mbyte\n",
|
---|
1074 | (double) total / 1024 / 1024);
|
---|
1075 | msg_prefix = temp_str;
|
---|
1076 | }
|
---|
1077 | #endif
|
---|
1078 |
|
---|
1079 | Xfree (WordRecs);
|
---|
1080 | Xfree (lg_bs);
|
---|
1081 |
|
---|
1082 | /* Free the memory allocated for the BIO_Random */
|
---|
1083 | occur_to_lexical (-1);
|
---|
1084 |
|
---|
1085 | BIO_Random_Done (&crbs);
|
---|
1086 |
|
---|
1087 | fclose (invf);
|
---|
1088 | fclose (dict);
|
---|
1089 | fclose (hash);
|
---|
1090 | fclose (count);
|
---|
1091 | fclose (count_trans);
|
---|
1092 | fclose (chunk_state);
|
---|
1093 | fclose (chunks);
|
---|
1094 | fclose (invf_state);
|
---|
1095 | return (COMPALLOK);
|
---|
1096 | }
|
---|