1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * ivf.pass2.c -- Memory efficient pass 2 inversion
|
---|
4 | * Copyright (C) 1994 Neil Sharman
|
---|
5 | *
|
---|
6 | * This program is free software; you can redistribute it and/or modify
|
---|
7 | * it under the terms of the GNU General Public License as published by
|
---|
8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
9 | * (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This program is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | * GNU General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU General Public License
|
---|
17 | * along with this program; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 | *
|
---|
20 | * $Id: ivf.pass2.c 16583 2008-07-29 10:20:36Z davidb $
|
---|
21 | *
|
---|
22 | **************************************************************************/
|
---|
23 |
|
---|
24 | /*
|
---|
25 | $Log$
|
---|
26 | Revision 1.3 2004/06/10 05:07:43 kjdon
|
---|
27 | have to declare vars before calling functions!!
|
---|
28 |
|
---|
29 | Revision 1.2 2004/06/10 03:02:05 kjdon
|
---|
30 | fixed the bug that was causing it not to be able to create a second index using jni - basically had to reset all the static variables at the start of each pass. the tricky thing to find was the static variables in occur_to_lexical in ivf.pass2
|
---|
31 |
|
---|
32 | Revision 1.1 2003/02/20 21:18:23 mdewsnip
|
---|
33 | Addition of MG package for search and retrieval
|
---|
34 |
|
---|
35 | Revision 1.2 2001/09/21 12:46:42 kjm18
|
---|
36 | updated mg to be in line with mg_1.3f. Now uses long long for some variables
|
---|
37 | to enable indexing of very large collections.
|
---|
38 |
|
---|
39 | * Revision 1.2 1997/08/02 05:01:57 wew
|
---|
40 | * changed literal values of 32 for the bit size of magic numbers of
|
---|
41 | * files to sizeof (unsigned long) * 8, increased the gap at the start
|
---|
42 | * of the invf during processing to 200 bytes
|
---|
43 |
|
---|
44 | Revision 1.1 1999/08/10 21:17:54 sjboddie
|
---|
45 | renamed mg-1.3d directory mg
|
---|
46 |
|
---|
47 | Revision 1.3 1998/12/17 09:12:51 rjmcnab
|
---|
48 |
|
---|
49 | Altered mg to process utf-8 encoded Unicode. The main changes
|
---|
50 | are in the parsing of the input, the casefolding, and the stemming.
|
---|
51 |
|
---|
52 | Revision 1.2 1998/11/25 07:55:43 rjmcnab
|
---|
53 |
|
---|
54 | Modified mg so that you can specify the stemmer you want
|
---|
55 | to use via a command line option. You specify it to
|
---|
56 | mg_passes during the build process. The number of the
|
---|
57 | stemmer that you used is stored within the inverted
|
---|
58 | dictionary header and the stemmed dictionary header so
|
---|
59 | the correct stemmer is used in later stages of building
|
---|
60 | and querying.
|
---|
61 |
|
---|
62 | Revision 1.1 1998/11/17 09:34:45 rjmcnab
|
---|
63 | *** empty log message ***
|
---|
64 |
|
---|
65 | * Revision 1.3 1994/10/20 03:56:49 tes
|
---|
66 | * I have rewritten the boolean query optimiser and abstracted out the
|
---|
67 | * components of the boolean query.
|
---|
68 | *
|
---|
69 | * Revision 1.2 1994/09/20 04:41:35 tes
|
---|
70 | * For version 1.1
|
---|
71 | *
|
---|
72 | */
|
---|
73 |
|
---|
74 | /*
|
---|
75 | * Modified:
|
---|
76 | * - long long disk pointers and bit counts for inverted file
|
---|
77 | * (1999-08-03 Tim Bell <[email protected]>)
|
---|
78 | * Code provided by Owen de Kretser <[email protected]>
|
---|
79 | */
|
---|
80 |
|
---|
81 | static char *RCSID = "$Id: ivf.pass2.c 16583 2008-07-29 10:20:36Z davidb $";
|
---|
82 |
|
---|
83 | #include "local_strings.h"
|
---|
84 | #include "sysfuncs.h"
|
---|
85 | #include "memlib.h"
|
---|
86 | #include "messages.h"
|
---|
87 | #include "stemmer.h"
|
---|
88 | #include "perf_hash.h"
|
---|
89 | #include "bitio_m.h"
|
---|
90 | #include "bitio_m_mems.h"
|
---|
91 | #include "bitio_gen.h"
|
---|
92 | #include "bitio_random.h"
|
---|
93 | #include "bitio_stdio.h"
|
---|
94 | #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
|
---|
95 |
|
---|
96 | #include "mg_files.h"
|
---|
97 | #include "invf.h"
|
---|
98 | #include "locallib.h"
|
---|
99 | #include "mg.h"
|
---|
100 | #include "build.h"
|
---|
101 | #include "words.h"
|
---|
102 | #include "hash.h"
|
---|
103 |
|
---|
104 | #include "longlong.h"
|
---|
105 |
|
---|
/*
 * Pick the seek/tell primitives used for positions inside the inverted
 * file: the _LL variants when USE_LONG_LONG is defined, the plain ones
 * otherwise.
 */
#ifndef USE_LONG_LONG
#  define BIO_Random_Seek_X BIO_Random_Seek
#  define BIO_Random_Tell_X BIO_Random_Tell
#else
#  define BIO_Random_Seek_X BIO_Random_Seek_LL
#  define BIO_Random_Tell_X BIO_Random_Tell_LL
#endif
113 |
|
---|
114 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
115 | #ifdef __WIN32__
|
---|
116 | #include <io.h>
|
---|
117 | #endif
|
---|
118 |
|
---|
#ifndef RND_BUF_SIZE
/* Size handed to BIO_Random_Start() for each random-access bit I/O buffer.
   Parenthesised so the macro behaves as a single operand when it appears
   inside a larger expression (e.g. x / RND_BUF_SIZE). */
#define RND_BUF_SIZE (8 * 1024)
/*#define RND_BUF_SIZE 128 */
#endif

/*
 * print_fsize -- report the current length of the inverted file through
 * Message().
 *
 * NOTE(review): the `file' argument is not used; the size reported is always
 * that of invf_out.  Confirm whether any caller expects otherwise.
 *
 * st_size is cast to long so the argument matches the %ld conversion even on
 * platforms where off_t is not a long.
 */
#define print_fsize(file)\
do\
{\
  struct stat file_state;\
  fstat(fileno(invf_out), &file_state);\
  Message("len(invf) = %ld", (long) file_state.st_size);\
}while(0)
131 |
|
---|
/*
 * Per-term bookkeeping for the chunk currently being built in memory.
 */
typedef struct word_rec {
  unsigned long ptr;   /* bit position of this term inside the memory buffer:
                          LoadCounts() first stores the term's bit length
                          here, then replaces it with the running bit offset;
                          process_doc() encodes at it and advances it */
  unsigned long last;  /* document number (callnum) most recently recorded
                          for this term */
} word_rec;
138 |
|
---|
139 | typedef struct invf_state_rec
|
---|
140 | {
|
---|
141 | mg_ullong Disk_Ptr;
|
---|
142 | mg_ullong Disk_Last;
|
---|
143 | unsigned long Disk_B;
|
---|
144 | }
|
---|
145 | invf_state_rec;
|
---|
146 |
|
---|
/*
 * Description of one chunk that has been loaded (and possibly dumped to the
 * chunks scratch file) and is waiting to be merged by DiskMerge().
 */
typedef struct chunk {
  unsigned long start_doc;   /* value of callnum when the chunk was loaded */
  unsigned long params_pos;  /* position in the chunk-state bit stream where
                                this chunk's per-term parameters begin */
  unsigned long disk_pos;    /* bit offset of the chunk's data in the chunks
                                scratch file (byte offset << 3) */
  unsigned long N;           /* number of documents in the chunk */
} chunk;
155 |
|
---|
156 |
|
---|
157 | static FILE *dict; /* Stemmed dictionary file */
|
---|
158 | static FILE *hash; /* Stemmed dictionary hash file */
|
---|
159 | static FILE *invf; /* Inverted file */
|
---|
160 | static FILE *invf_in; /* Inverted file */
|
---|
161 | static FILE *invf_out; /* Inverted file */
|
---|
162 | static FILE *invf_idx; /* Inverted index file */
|
---|
163 | static FILE *count; /* Count file */
|
---|
164 | static FILE *count_trans; /* Count translation file */
|
---|
165 | static FILE *invf_state; /* Inverted file State */
|
---|
166 | static FILE *chunk_state; /* Chunk state */
|
---|
167 | static FILE *chunks; /* Chunk state */
|
---|
168 | static FILE *invf_para = NULL; /* Paragraph counts file */
|
---|
169 | static FILE *weights = NULL; /* Weights file */
|
---|
170 |
|
---|
171 | static stdio_bitio_state sbs;
|
---|
172 | static random_bitio_state crbs;
|
---|
173 | static chunk *chunk_data = NULL;
|
---|
174 | static random_bitio_state rbs, rbsp;
|
---|
175 |
|
---|
176 | static int docs_left = 0, next_docs_left = 0;
|
---|
177 | static unsigned long N;
|
---|
178 |
|
---|
179 | static word_rec *WordRecs;
|
---|
180 | static u_char *lg_bs;
|
---|
181 | static float *idf = NULL;
|
---|
182 |
|
---|
183 | static char *MemoryBuffer = NULL;
|
---|
184 | static unsigned long MemBufSize;
|
---|
185 | static unsigned long BufToUse;
|
---|
186 | static struct invf_dict_header idh;
|
---|
187 |
|
---|
188 | static perf_hash_data *phd;
|
---|
189 |
|
---|
190 | static unsigned long *word_list = NULL;
|
---|
191 | static unsigned long wl_size = 0;
|
---|
192 |
|
---|
193 | static unsigned long dict_size;
|
---|
194 | static unsigned long no_of_ptrs = 0;
|
---|
195 | static unsigned long chunks_read = 0;
|
---|
196 | static unsigned long Disk_pos = 0;
|
---|
197 | static unsigned long callnum = 0;
|
---|
198 | static unsigned long wordnum = 0;
|
---|
199 |
|
---|
200 | static unsigned long totalIbytes = 0;
|
---|
201 | static unsigned long totalDbytes = 0;
|
---|
202 | static unsigned long totalHbytes = 0;
|
---|
203 |
|
---|
204 | static unsigned long MemInUse = 0;
|
---|
205 | static unsigned long MaxMemInUse = 0;
|
---|
206 | static unsigned long max_buffer_len;
|
---|
207 |
|
---|
208 | void
|
---|
209 | ChangeMemInUse (int mem)
|
---|
210 | {
|
---|
211 | MemInUse += mem;
|
---|
212 | if (MemInUse > MaxMemInUse)
|
---|
213 | MaxMemInUse = MemInUse;
|
---|
214 | }
|
---|
215 |
|
---|
216 | void
|
---|
217 | ResetStaticI2Vars()
|
---|
218 | {
|
---|
219 | docs_left = 0;
|
---|
220 | next_docs_left = 0;
|
---|
221 | N = 0;
|
---|
222 | MemBufSize=0;
|
---|
223 | BufToUse=0;
|
---|
224 | memset(&idh, 0, sizeof(idh));
|
---|
225 | wl_size = 0;
|
---|
226 |
|
---|
227 | dict_size = 0;
|
---|
228 | no_of_ptrs = 0;
|
---|
229 | chunks_read = 0;
|
---|
230 | Disk_pos = 0;
|
---|
231 | callnum = 0;
|
---|
232 | wordnum = 0;
|
---|
233 |
|
---|
234 | totalIbytes = 0;
|
---|
235 | totalDbytes = 0;
|
---|
236 | totalHbytes = 0;
|
---|
237 |
|
---|
238 | MemInUse = 0;
|
---|
239 | MaxMemInUse = 0;
|
---|
240 | max_buffer_len = 0;
|
---|
241 |
|
---|
242 | }
|
---|
243 |
|
---|
244 |
|
---|
245 | static int
|
---|
246 | open_files (char *file_name)
|
---|
247 | {
|
---|
248 | char FName[200];
|
---|
249 |
|
---|
250 | if (!(dict = open_file (file_name, INVF_DICT_SUFFIX, "rb",
|
---|
251 | MAGIC_STEM_BUILD, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
252 | return (COMPERROR);
|
---|
253 |
|
---|
254 | if (!(hash = open_file (file_name, INVF_DICT_HASH_SUFFIX, "rb",
|
---|
255 | MAGIC_HASH, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
256 | return (COMPERROR);
|
---|
257 |
|
---|
258 | if (!(count = open_file (file_name, INVF_CHUNK_SUFFIX, "rb",
|
---|
259 | MAGIC_CHUNK, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
260 | return (COMPERROR);
|
---|
261 | fread (&max_buffer_len, sizeof (max_buffer_len), 1, count);
|
---|
262 | NTOHUL(max_buffer_len); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
263 |
|
---|
264 | BIO_Stdio_Decode_Start (count, &sbs);
|
---|
265 | next_docs_left = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
|
---|
266 |
|
---|
267 | if (!(count_trans = open_file (file_name, INVF_CHUNK_TRANS_SUFFIX, "rb",
|
---|
268 | MAGIC_CHUNK_TRANS, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
269 | return (COMPERROR);
|
---|
270 |
|
---|
271 | if (!(invf = create_file (file_name, INVF_SUFFIX, "w+b",
|
---|
272 | MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
273 | return (COMPERROR);
|
---|
274 | fflush (invf);
|
---|
275 | if (!(invf_in = open_file (file_name, INVF_SUFFIX, "rb",
|
---|
276 | MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
277 | return (COMPERROR);
|
---|
278 | if (!(invf_out = create_file (file_name, INVF_SUFFIX, "wb",
|
---|
279 | MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
280 | return (COMPERROR);
|
---|
281 | BIO_Random_Start (invf, RND_BUF_SIZE, &rbs);
|
---|
282 | BIO_Random_Start (invf, RND_BUF_SIZE, &rbsp);
|
---|
283 | ChangeMemInUse (RND_BUF_SIZE * 2);
|
---|
284 |
|
---|
285 | if (!(invf_idx = create_file (file_name, INVF_IDX_SUFFIX, "wb",
|
---|
286 | MAGIC_INVI, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
287 | return (COMPERROR);
|
---|
288 |
|
---|
289 | if (InvfLevel == 3)
|
---|
290 | if (!(invf_para = create_file (file_name, INVF_PARAGRAPH_SUFFIX, "wb",
|
---|
291 | MAGIC_PARAGRAPH, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
292 | return (COMPERROR);
|
---|
293 |
|
---|
294 | sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
|
---|
295 | ".invf.state", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
296 | if (!(invf_state = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
297 | {
|
---|
298 | Message ("Unable to create \"%s\"", FName);
|
---|
299 | return (COMPERROR);
|
---|
300 | }
|
---|
301 | unlink (FName);
|
---|
302 |
|
---|
303 | sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
|
---|
304 | ".chunk.state", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
305 | if (!(chunk_state = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
306 | {
|
---|
307 | Message ("Unable to create \"%s\"", FName);
|
---|
308 | return (COMPERROR);
|
---|
309 | }
|
---|
310 | unlink (FName);
|
---|
311 | BIO_Random_Start (chunk_state, RND_BUF_SIZE, &crbs);
|
---|
312 | ChangeMemInUse (RND_BUF_SIZE);
|
---|
313 |
|
---|
314 | sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
|
---|
315 | ".chunks", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
316 | if (!(chunks = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
317 | {
|
---|
318 | Message ("Unable to create \"%s\"", FName);
|
---|
319 | return (COMPERROR);
|
---|
320 | }
|
---|
321 | unlink (FName);
|
---|
322 |
|
---|
323 | return (COMPALLOK);
|
---|
324 | }
|
---|
325 |
|
---|
326 |
|
---|
327 |
|
---|
328 |
|
---|
329 |
|
---|
330 | #define ISR_CACHE 1024
|
---|
331 | #define ISR_ENTRY_SIZE (sizeof(mg_ullong)*2 + sizeof(unsigned long))
|
---|
332 |
|
---|
333 | invf_state_rec *
|
---|
334 | in_cache (int pos)
|
---|
335 | {
|
---|
336 | static char isr_data[ISR_CACHE * ISR_ENTRY_SIZE];
|
---|
337 | static invf_state_rec isr;
|
---|
338 | static int isr_base = 0, isr_num = -1, isr_pos = -1;
|
---|
339 | if (isr_pos >= 0)
|
---|
340 | bcopy ((char *) &isr, &isr_data[isr_pos * ISR_ENTRY_SIZE], ISR_ENTRY_SIZE);
|
---|
341 | if (pos < isr_base || pos >= isr_base + isr_num)
|
---|
342 | {
|
---|
343 | if (isr_num >= 0)
|
---|
344 | {
|
---|
345 | fseek (invf_state, isr_base * ISR_ENTRY_SIZE, 0);
|
---|
346 | fwrite (isr_data, 1, ISR_ENTRY_SIZE * isr_num, invf_state);
|
---|
347 | }
|
---|
348 | isr_base = pos;
|
---|
349 | fseek (invf_state, isr_base * ISR_ENTRY_SIZE, 0);
|
---|
350 | fread (isr_data, 1, ISR_ENTRY_SIZE * ISR_CACHE, invf_state);
|
---|
351 | isr_num = ISR_CACHE;
|
---|
352 | }
|
---|
353 | isr_pos = pos - isr_base;
|
---|
354 | bcopy (&isr_data[isr_pos * ISR_ENTRY_SIZE], (char *) &isr, ISR_ENTRY_SIZE);
|
---|
355 | return &isr;
|
---|
356 | }
|
---|
357 |
|
---|
358 |
|
---|
359 |
|
---|
360 |
|
---|
361 |
|
---|
362 | unsigned long
|
---|
363 | occur_to_lexical (long occ, int clear_state)
|
---|
364 | {
|
---|
365 | static long pos = -1;
|
---|
366 | static random_bitio_state rbs;
|
---|
367 | static int val = 0;
|
---|
368 | if (clear_state) {
|
---|
369 | pos = -1;
|
---|
370 | val = 0;
|
---|
371 | return 0;
|
---|
372 | }
|
---|
373 | if (pos == -1)
|
---|
374 | {
|
---|
375 | BIO_Random_Start (count_trans, RND_BUF_SIZE, &rbs);
|
---|
376 | pos = 0x7fffffff;
|
---|
377 | }
|
---|
378 | if (occ < pos)
|
---|
379 | {
|
---|
380 | if (occ == -1)
|
---|
381 | {
|
---|
382 | BIO_Random_Done (&rbs);
|
---|
383 | return 0;
|
---|
384 | }
|
---|
385 | BIO_Random_Seek_X (sizeof (unsigned long) * 8, &rbs);
|
---|
386 | pos = 0;
|
---|
387 | }
|
---|
388 | while (pos <= occ)
|
---|
389 | {
|
---|
390 | val = BIO_Random_Binary_Decode (dict_size + 1, &rbs, NULL) - 1;
|
---|
391 | pos++;
|
---|
392 | }
|
---|
393 | return (val);
|
---|
394 | }
|
---|
395 |
|
---|
396 |
|
---|
397 | void
|
---|
398 | add_chunk_state (unsigned long pos, unsigned long start_doc,
|
---|
399 | unsigned long N)
|
---|
400 | {
|
---|
401 | chunk_data[chunks_read].params_pos = pos;
|
---|
402 | chunk_data[chunks_read].start_doc = start_doc;
|
---|
403 | chunk_data[chunks_read].N = N;
|
---|
404 | chunks_read++;
|
---|
405 | }
|
---|
406 |
|
---|
407 |
|
---|
408 | int
|
---|
409 | init_ivf_2 (char *file_name)
|
---|
410 | {
|
---|
411 | u_char prev[MAXSTEMLEN + 1];
|
---|
412 | int i;
|
---|
413 | mg_ullong totalIbits;
|
---|
414 | mg_ullong lasttotalIbits;
|
---|
415 | double logN = 0.0;
|
---|
416 |
|
---|
417 | ResetStaticI2Vars(); /* clear the global statics */
|
---|
418 | occur_to_lexical(0, 1); /* clear the statics in here*/
|
---|
419 |
|
---|
420 | if (open_files (file_name) == COMPERROR)
|
---|
421 | return COMPERROR;
|
---|
422 |
|
---|
423 |
|
---|
424 | /* Read in the stemmed dictionary file header */
|
---|
425 | fread ((char *) &idh, sizeof (idh), 1, dict);
|
---|
426 |
|
---|
427 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
428 | NTOHUL(idh.lookback);
|
---|
429 | NTOHUL(idh.dict_size);
|
---|
430 | NTOHUL(idh.total_bytes);
|
---|
431 | NTOHUL(idh.index_string_bytes);
|
---|
432 | NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
|
---|
433 | NTOHUL(idh.num_of_docs);
|
---|
434 | NTOHUL(idh.static_num_of_docs);
|
---|
435 | NTOHUL(idh.num_of_words);
|
---|
436 | NTOHUL(idh.stemmer_num);
|
---|
437 | NTOHUL(idh.stem_method);
|
---|
438 |
|
---|
439 | dict_size = idh.dict_size;
|
---|
440 |
|
---|
441 | N = idh.num_of_docs;
|
---|
442 |
|
---|
443 | if (!(phd = read_perf_hash_data (hash)))
|
---|
444 | {
|
---|
445 | Message ("Unable to read in hash data");
|
---|
446 | return COMPERROR;
|
---|
447 | }
|
---|
448 | totalHbytes = sizeof (perf_hash_data) + sizeof (u_char) * 256 +
|
---|
449 | sizeof (int) * (phd->MAX_N + 1) + sizeof (int *) * 3 * phd->MAX_CH +
|
---|
450 | sizeof (long) * phd->MAX_CH * phd->MAX_L;
|
---|
451 |
|
---|
452 | if (!(WordRecs = Xmalloc (sizeof (word_rec) * idh.dict_size)))
|
---|
453 | {
|
---|
454 | Message ("No memory for word entries");
|
---|
455 | return COMPERROR;
|
---|
456 | }
|
---|
457 | totalDbytes += sizeof (word_rec) * idh.dict_size;
|
---|
458 | /* separate storage for the log(b) values, one byte each */
|
---|
459 | if (!(lg_bs = Xmalloc (sizeof (u_char) * idh.dict_size)))
|
---|
460 | {
|
---|
461 | Message ("No memory for lg b's");
|
---|
462 | return COMPERROR;
|
---|
463 | }
|
---|
464 | totalDbytes += sizeof (u_char) * idh.dict_size;
|
---|
465 |
|
---|
466 | if (MakeWeights)
|
---|
467 | {
|
---|
468 | /* separate storage for the idf values, one single each */
|
---|
469 | if (!(idf = Xmalloc (sizeof (float) * idh.dict_size)))
|
---|
470 | {
|
---|
471 | Message ("No memory for idf's");
|
---|
472 | return COMPERROR;
|
---|
473 | }
|
---|
474 | totalDbytes += sizeof (float) * idh.dict_size;
|
---|
475 |
|
---|
476 | if (!(weights = create_file (file_name, WEIGHTS_SUFFIX, "wb",
|
---|
477 | MAGIC_WGHT, MG_CONTINUE))) { /* [RPAP - Feb 97: WIN32 Port] */
|
---|
478 | Message ("Couldn't open weights file for writing");
|
---|
479 | return (COMPERROR);
|
---|
480 | }
|
---|
481 | }
|
---|
482 | else
|
---|
483 | {
|
---|
484 | unlink (make_name (file_name, WEIGHTS_SUFFIX, NULL));
|
---|
485 | }
|
---|
486 |
|
---|
487 | chunk_data = Xmalloc (sizeof (chunk) * (ChunkLimit + 2));
|
---|
488 | totalDbytes += sizeof (chunk) * (ChunkLimit + 2);
|
---|
489 |
|
---|
490 | totalIbits = sizeof (unsigned long) * 8; /* The magic number */
|
---|
491 | totalIbits += 8 * 200; /* A 200 byte gap */
|
---|
492 |
|
---|
493 | if (MakeWeights)
|
---|
494 | {
|
---|
495 | wl_size = 1024;
|
---|
496 | if (!(word_list = Xmalloc (sizeof (*word_list) * wl_size)))
|
---|
497 | {
|
---|
498 | Message ("No memory for word_list");
|
---|
499 | return COMPERROR;
|
---|
500 | }
|
---|
501 |
|
---|
502 | logN = log ((double) N);
|
---|
503 | }
|
---|
504 |
|
---|
505 | for (i = 0; i < idh.dict_size; i++)
|
---|
506 | {
|
---|
507 | invf_state_rec *isr;
|
---|
508 | register unsigned long copy, suff, p;
|
---|
509 | unsigned long fcnt, wcnt;
|
---|
510 |
|
---|
511 | lasttotalIbits = totalIbits;
|
---|
512 |
|
---|
513 | copy = fgetc (dict);
|
---|
514 | suff = fgetc (dict);
|
---|
515 | *prev = copy + suff;
|
---|
516 | fread (prev + copy + 1, sizeof (u_char), suff, dict);
|
---|
517 |
|
---|
518 | fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
|
---|
519 | fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
|
---|
520 |
|
---|
521 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
522 | NTOHUL(fcnt);
|
---|
523 | NTOHUL(wcnt);
|
---|
524 |
|
---|
525 | WordRecs[i].last = 0;
|
---|
526 | WordRecs[i].ptr = 0;
|
---|
527 |
|
---|
528 | p = fcnt;
|
---|
529 |
|
---|
530 | if (MakeWeights)
|
---|
531 | idf[i] = logN - log ((double) fcnt);
|
---|
532 |
|
---|
533 |
|
---|
534 | isr = in_cache (i);
|
---|
535 |
|
---|
536 | isr->Disk_Last = 0;
|
---|
537 | isr->Disk_Ptr = totalIbits;
|
---|
538 |
|
---|
539 | isr->Disk_B = BIO_Bblock_Init (N, p);
|
---|
540 |
|
---|
541 | totalIbits += BIO_Bblock_Bound_b (N, p, isr->Disk_B);
|
---|
542 |
|
---|
543 | if (InvfLevel >= 2)
|
---|
544 | totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
|
---|
545 |
|
---|
546 | #ifdef USE_LONG_LONG
|
---|
547 | totalIbits = (totalIbits + 7ull) & 0xfffffffffffffff8ull;
|
---|
548 | #else
|
---|
549 | totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
|
---|
550 | #endif
|
---|
551 |
|
---|
552 | if (totalIbits < lasttotalIbits) {
|
---|
553 | fprintf(stderr, "ERROR: The totalIbits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
|
---|
554 | if (sizeof (mg_ullong) < 8) {
|
---|
555 | fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
|
---|
556 | }
|
---|
557 | fprintf(stderr, " Build aborted.\n");
|
---|
558 | exit(1);
|
---|
559 | }
|
---|
560 | }
|
---|
561 |
|
---|
562 |
|
---|
563 | /* now convert to bytes, and actually get the space */
|
---|
564 | #ifdef USE_LONG_LONG
|
---|
565 | totalIbytes = (totalIbits + 7ull) >> 3ull;
|
---|
566 | #else
|
---|
567 | totalIbytes = (totalIbits + 7ul) >> 3ul;
|
---|
568 | #endif
|
---|
569 | return (COMPALLOK);
|
---|
570 |
|
---|
571 | }
|
---|
572 |
|
---|
573 |
|
---|
574 |
|
---|
575 |
|
---|
576 |
|
---|
577 | static void
|
---|
578 | LoadCounts (void)
|
---|
579 | {
|
---|
580 | unsigned long numwords, i, last_total;
|
---|
581 | static unsigned long local_N = 0;
|
---|
582 | unsigned long totalIbits, crbs_pos;
|
---|
583 | word_rec *wr;
|
---|
584 | unsigned long *counts;
|
---|
585 |
|
---|
586 | if (MemoryBuffer == NULL)
|
---|
587 | {
|
---|
588 | MemBufSize = sizeof (unsigned long) * dict_size;
|
---|
589 | if (max_buffer_len > MemBufSize)
|
---|
590 | MemBufSize = max_buffer_len;
|
---|
591 | if (!(MemoryBuffer = Xmalloc (MemBufSize)))
|
---|
592 | FatalError (1, "Unable to allocate memory for buffer");
|
---|
593 | ChangeMemInUse (MemBufSize);
|
---|
594 | }
|
---|
595 | counts = (unsigned long *) MemoryBuffer;
|
---|
596 | /* bzero ((char *) counts, sizeof (unsigned long) * dict_size); */
|
---|
597 | bzero ((char *) counts, MemBufSize);
|
---|
598 | docs_left = next_docs_left;
|
---|
599 | if (!docs_left)
|
---|
600 | FatalError (1, "The number of docs in the current chunk is 0");
|
---|
601 |
|
---|
602 | BufToUse = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
|
---|
603 |
|
---|
604 | numwords = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
|
---|
605 |
|
---|
606 | local_N = docs_left;
|
---|
607 |
|
---|
608 | for (wr = WordRecs, i = 0; i < dict_size; i++, wr++)
|
---|
609 | wr->ptr = 0;
|
---|
610 |
|
---|
611 | bzero ((char *) lg_bs, dict_size);
|
---|
612 |
|
---|
613 | for (i = 0; i < numwords; i++)
|
---|
614 | {
|
---|
615 | unsigned long word_num, wcnt, fcnt, p;
|
---|
616 | word_num = occur_to_lexical (i,0);
|
---|
617 |
|
---|
618 | wr = &WordRecs[word_num];
|
---|
619 |
|
---|
620 | wcnt = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
|
---|
621 | if (wcnt >= 2)
|
---|
622 | fcnt = BIO_Stdio_Gamma_Decode (&sbs, NULL);
|
---|
623 | else
|
---|
624 | fcnt = wcnt;
|
---|
625 |
|
---|
626 |
|
---|
627 | p = fcnt;
|
---|
628 | if (wcnt)
|
---|
629 | {
|
---|
630 | register unsigned long length;
|
---|
631 | counts[word_num] = p;
|
---|
632 | length = BIO_Bblock_Bound (local_N, p);
|
---|
633 | if (InvfLevel >= 2)
|
---|
634 | length += wcnt;
|
---|
635 | wr->ptr = length;
|
---|
636 | lg_bs[word_num] = floorlog_2 (BIO_Bblock_Init_W (local_N, p));
|
---|
637 | }
|
---|
638 |
|
---|
639 |
|
---|
640 | }
|
---|
641 | crbs_pos = BIO_Random_Tell (&crbs);
|
---|
642 | totalIbits = 0;
|
---|
643 | last_total = 0;
|
---|
644 | for (wr = WordRecs, i = 0; i < dict_size; i++, wr++)
|
---|
645 | {
|
---|
646 | register unsigned long length;
|
---|
647 | length = wr->ptr;
|
---|
648 | wr->last = callnum;
|
---|
649 | BIO_Random_Gamma_Encode (counts[i] + 1, &crbs, NULL);
|
---|
650 | if (counts[i])
|
---|
651 | {
|
---|
652 | if (i)
|
---|
653 | BIO_Random_Delta_Encode (totalIbits - last_total + 1, &crbs, NULL);
|
---|
654 | else
|
---|
655 | BIO_Random_Delta_Encode (1, &crbs, NULL);
|
---|
656 |
|
---|
657 | last_total = totalIbits;
|
---|
658 | }
|
---|
659 | wr->ptr = totalIbits;
|
---|
660 | totalIbits += length;
|
---|
661 | }
|
---|
662 | add_chunk_state (crbs_pos, callnum, local_N);
|
---|
663 |
|
---|
664 | if ((totalIbits + 7ul) >> 3ul > BufToUse)
|
---|
665 | FatalError (1, "Pointers exceed buffer size");
|
---|
666 |
|
---|
667 | next_docs_left = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
|
---|
668 | }
|
---|
669 |
|
---|
670 |
|
---|
671 |
|
---|
672 |
|
---|
673 | static void
|
---|
674 | DumpChunk (void)
|
---|
675 | {
|
---|
676 | chunk_data[chunks_read - 1].disk_pos = Disk_pos << 3;
|
---|
677 | fseek (chunks, Disk_pos, 0);
|
---|
678 | fwrite (MemoryBuffer, sizeof (char), BufToUse, chunks);
|
---|
679 | Disk_pos += BufToUse;
|
---|
680 | }
|
---|
681 |
|
---|
682 |
|
---|
683 |
|
---|
684 |
|
---|
685 | static void
|
---|
686 | DiskMerge (void)
|
---|
687 | {
|
---|
688 | random_bitio_state *rbsi;
|
---|
689 | random_bitio_state *chks = NULL;
|
---|
690 | unsigned long *chunk_ptrs;
|
---|
691 | int i;
|
---|
692 |
|
---|
693 | BIO_Random_Flush (&crbs);
|
---|
694 |
|
---|
695 | chunk_ptrs = Xmalloc (chunks_read * sizeof (unsigned long));
|
---|
696 | ChangeMemInUse (chunks_read * sizeof (unsigned long));
|
---|
697 | bzero ((char *) chunk_ptrs, chunks_read * sizeof (unsigned long));
|
---|
698 |
|
---|
699 | rbsi = Xmalloc (chunks_read * sizeof (random_bitio_state));
|
---|
700 | ChangeMemInUse (chunks_read * sizeof (random_bitio_state));
|
---|
701 | for (i = 0; i < chunks_read; i++)
|
---|
702 | {
|
---|
703 | rbsi[i] = crbs;
|
---|
704 | rbsi[i].Buf = Xmalloc (rbsi[i].len);
|
---|
705 | ChangeMemInUse (rbsi[i].len);
|
---|
706 | bcopy ((char *) (crbs.Buf), (char *) (rbsi[i].Buf), rbsi[i].len);
|
---|
707 | BIO_Random_Seek (chunk_data[i].params_pos, &rbsi[i]);
|
---|
708 | }
|
---|
709 |
|
---|
710 | if (chunks_read > 1)
|
---|
711 | {
|
---|
712 | int j;
|
---|
713 | chks = Xmalloc ((chunks_read - 1) * sizeof (random_bitio_state));
|
---|
714 | ChangeMemInUse ((chunks_read - 1) * sizeof (random_bitio_state));
|
---|
715 | BIO_Random_Start (chunks, RND_BUF_SIZE, &chks[0]);
|
---|
716 | ChangeMemInUse (RND_BUF_SIZE);
|
---|
717 | for (j = 1; j < chunks_read - 1; j++)
|
---|
718 | {
|
---|
719 | chks[j] = chks[0];
|
---|
720 | chks[j].Buf = Xmalloc (chks[0].len);
|
---|
721 | ChangeMemInUse (chks[0].len);
|
---|
722 | bcopy ((char *) (chks[0].Buf), (char *) (chks[j].Buf), chks[0].len);
|
---|
723 | }
|
---|
724 | }
|
---|
725 | for (i = 0; i < dict_size; i++)
|
---|
726 | {
|
---|
727 | int j;
|
---|
728 | invf_state_rec *isr = in_cache (i);
|
---|
729 | register int B;
|
---|
730 |
|
---|
731 | BIO_Random_Seek_X (isr->Disk_Ptr, &rbs); /* Position in invf file */
|
---|
732 |
|
---|
733 | B = isr->Disk_B;
|
---|
734 |
|
---|
735 | for (j = 0; j < chunks_read; j++)
|
---|
736 | {
|
---|
737 | int p;
|
---|
738 | p = BIO_Random_Gamma_Decode (&rbsi[j], NULL) - 1;
|
---|
739 |
|
---|
740 | if (p)
|
---|
741 | {
|
---|
742 | int ptr, b;
|
---|
743 | chunk_ptrs[j] += BIO_Random_Delta_Decode (&rbsi[j], NULL) - 1;
|
---|
744 | ptr = chunk_ptrs[j];
|
---|
745 | b = 1 << floorlog_2 (BIO_Bblock_Init_W (chunk_data[j].N, p));
|
---|
746 |
|
---|
747 | if (j == chunks_read - 1)
|
---|
748 | {
|
---|
749 | int k, CurrDoc;
|
---|
750 | DECODE_START ((u_char *) MemoryBuffer, ptr)
|
---|
751 | CurrDoc = isr->Disk_Last;
|
---|
752 | for (k = 0; k < p; k++)
|
---|
753 | {
|
---|
754 | register unsigned long x, tf;
|
---|
755 | BBLOCK_DECODE (x, b);
|
---|
756 | if (k == 0)
|
---|
757 | x = x + chunk_data[j].start_doc - isr->Disk_Last;
|
---|
758 | CurrDoc += x;
|
---|
759 | BIO_Random_Bblock_Encode (x, B, &rbs, NULL);
|
---|
760 | if (InvfLevel >= 2)
|
---|
761 | {
|
---|
762 | UNARY_DECODE (tf);
|
---|
763 | BIO_Random_Gamma_Encode (tf, &rbs, NULL);
|
---|
764 | }
|
---|
765 | }
|
---|
766 | DECODE_DONE
|
---|
767 | isr->Disk_Last = CurrDoc;
|
---|
768 | }
|
---|
769 | else
|
---|
770 | {
|
---|
771 | int k, CurrDoc;
|
---|
772 | random_bitio_state *Chks = chks + j;
|
---|
773 | BIO_Random_Seek (chunk_data[j].disk_pos + ptr, Chks);
|
---|
774 | CurrDoc = isr->Disk_Last;
|
---|
775 | for (k = 0; k < p; k++)
|
---|
776 | {
|
---|
777 | register unsigned long x, tf;
|
---|
778 | x = BIO_Random_Bblock_Decode (b, Chks, NULL);
|
---|
779 | if (k == 0)
|
---|
780 | x = x + chunk_data[j].start_doc - isr->Disk_Last;
|
---|
781 | CurrDoc += x;
|
---|
782 | BIO_Random_Bblock_Encode (x, B, &rbs, NULL);
|
---|
783 | if (InvfLevel >= 2)
|
---|
784 | {
|
---|
785 | tf = BIO_Random_Unary_Decode (Chks, NULL);
|
---|
786 | BIO_Random_Gamma_Encode (tf, &rbs, NULL);
|
---|
787 | }
|
---|
788 | }
|
---|
789 | isr->Disk_Last = CurrDoc;
|
---|
790 | }
|
---|
791 | }
|
---|
792 | }
|
---|
793 |
|
---|
794 | isr->Disk_Ptr = BIO_Random_Tell_X (&rbs);
|
---|
795 |
|
---|
796 | }
|
---|
797 | if (chunks_read > 1)
|
---|
798 | {
|
---|
799 | int j;
|
---|
800 | for (j = 0; j < chunks_read - 1; j++)
|
---|
801 | {
|
---|
802 | Xfree (chks[j].Buf);
|
---|
803 | ChangeMemInUse (-chks[j].len);
|
---|
804 | }
|
---|
805 | Xfree (chks);
|
---|
806 | ChangeMemInUse (-(chunks_read - 1) * sizeof (random_bitio_state));
|
---|
807 | }
|
---|
808 |
|
---|
809 | for (i = 0; i < chunks_read; i++)
|
---|
810 | {
|
---|
811 | Xfree (rbsi[i].Buf);
|
---|
812 | ChangeMemInUse (-rbsi[i].len);
|
---|
813 | }
|
---|
814 | Xfree (rbsi);
|
---|
815 | ChangeMemInUse (-chunks_read * sizeof (random_bitio_state));
|
---|
816 | /* chunks_read = 0; */
|
---|
817 | Xfree (chunk_ptrs);
|
---|
818 | ChangeMemInUse (-chunks_read * sizeof (unsigned long));
|
---|
819 | chunks_read = 0;
|
---|
820 | Disk_pos = 0;
|
---|
821 | BIO_Random_Seek (0, &crbs);
|
---|
822 | }
|
---|
823 |
|
---|
824 | static void
|
---|
825 | MergeIn (void)
|
---|
826 | {
|
---|
827 | static int disk_chunks = 0;
|
---|
828 | static header = 0;
|
---|
829 | if (!header)
|
---|
830 | {
|
---|
831 | fprintf (stderr, "ivf.pass2 : ");
|
---|
832 | header = 1;
|
---|
833 | }
|
---|
834 | if (disk_chunks == ChunkLimit || next_docs_left == 0)
|
---|
835 | {
|
---|
836 | fprintf (stderr, "M");
|
---|
837 | DiskMerge ();
|
---|
838 | disk_chunks = 0;
|
---|
839 | }
|
---|
840 | else
|
---|
841 | {
|
---|
842 | fprintf (stderr, "-");
|
---|
843 | DumpChunk ();
|
---|
844 | disk_chunks++;
|
---|
845 | }
|
---|
846 | if (next_docs_left == 0)
|
---|
847 | fprintf (stderr, "\n");
|
---|
848 | }
|
---|
849 |
|
---|
850 |
|
---|
/*
 * wl_comp -- qsort() comparison function for the word list.
 *
 * The word list is an array of unsigned long, so the elements are compared
 * as unsigned long.  Returns a negative value, zero or a positive value when
 * *a is less than, equal to or greater than *b.  The result is built from
 * two comparisons instead of a subtraction so it cannot overflow.
 */
static int
wl_comp (const void *a, const void *b)
{
  unsigned long lhs = *(const unsigned long *) a;
  unsigned long rhs = *(const unsigned long *) b;

  return (lhs > rhs) - (lhs < rhs);
}
856 |
|
---|
857 | static int
|
---|
858 | process_doc (u_char * s_in, int l_in)
|
---|
859 | {
|
---|
860 | int res;
|
---|
861 | u_char *end = s_in + l_in - 1;
|
---|
862 | unsigned long tocode;
|
---|
863 | unsigned long wl_pos = 0;
|
---|
864 |
|
---|
865 | if (!docs_left)
|
---|
866 | LoadCounts ();
|
---|
867 |
|
---|
868 | callnum++;
|
---|
869 |
|
---|
870 | if (!inaword (s_in, end))
|
---|
871 | if (SkipSGML)
|
---|
872 | PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
|
---|
873 | else
|
---|
874 | PARSE_NON_STEM_WORD (s_in, end);
|
---|
875 |
|
---|
876 | while (s_in <= end)
|
---|
877 | {
|
---|
878 | u_char Word[MAXSTEMLEN + 1];
|
---|
879 |
|
---|
880 | PARSE_STEM_WORD (Word, s_in, end);
|
---|
881 | stemmer (idh.stem_method, idh.stemmer_num, Word);
|
---|
882 | if (SkipSGML)
|
---|
883 | PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
|
---|
884 | else
|
---|
885 | PARSE_NON_STEM_WORD (s_in, end);
|
---|
886 |
|
---|
887 | if (*Word == 0)
|
---|
888 | continue;
|
---|
889 |
|
---|
890 | res = perf_hash (phd, Word);
|
---|
891 |
|
---|
892 | {
|
---|
893 | word_rec *arr = &WordRecs[res];
|
---|
894 | int b = 1 << lg_bs[res];
|
---|
895 | wordnum++;
|
---|
896 |
|
---|
897 | tocode = callnum;
|
---|
898 |
|
---|
899 | ENCODE_START ((u_char *) MemoryBuffer, arr->ptr)
|
---|
900 |
|
---|
901 | if (tocode > arr->last)
|
---|
902 | {
|
---|
903 | register int x;
|
---|
904 | x = tocode - arr->last - 1;
|
---|
905 | BBLOCK_ENCODE (x + 1, b);
|
---|
906 | if (InvfLevel >= 2)
|
---|
907 | ENCODE_BIT (1);
|
---|
908 | no_of_ptrs++;
|
---|
909 | arr->last = tocode;
|
---|
910 | }
|
---|
911 | else if (InvfLevel >= 2)
|
---|
912 | {
|
---|
913 | __pos--;
|
---|
914 | ENCODE_BIT (0);
|
---|
915 | ENCODE_BIT (1);
|
---|
916 | }
|
---|
917 | arr->ptr = __pos;
|
---|
918 | ENCODE_DONE
|
---|
919 | }
|
---|
920 |
|
---|
921 | if (MakeWeights)
|
---|
922 | {
|
---|
923 | if (wl_pos >= wl_size)
|
---|
924 | {
|
---|
925 | wl_size += (wl_size >> 1);
|
---|
926 | word_list = Xrealloc (word_list, sizeof (*word_list) * wl_size);
|
---|
927 | }
|
---|
928 | word_list[wl_pos++] = res;
|
---|
929 | }
|
---|
930 | }
|
---|
931 | if (MakeWeights)
|
---|
932 | {
|
---|
933 | float doc_weight = 0.0;
|
---|
934 | if (wl_pos)
|
---|
935 | {
|
---|
936 | unsigned long *wl = word_list;
|
---|
937 | unsigned long i, count, val;
|
---|
938 | qsort (wl, wl_pos, sizeof (*wl), wl_comp);
|
---|
939 | count = 1;
|
---|
940 | val = *wl++;
|
---|
941 | for (i = 1; i <= wl_pos; i++, wl++)
|
---|
942 | if (i == wl_pos || val != *wl)
|
---|
943 | {
|
---|
944 | double weight = count * idf[val];
|
---|
945 | doc_weight += weight * weight;
|
---|
946 | count = 1;
|
---|
947 | val = *wl;
|
---|
948 | }
|
---|
949 | else
|
---|
950 | count++;
|
---|
951 | }
|
---|
952 | HTONF(doc_weight); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
953 | fwrite ((char *) &doc_weight, sizeof (doc_weight), 1, weights);
|
---|
954 | }
|
---|
955 | docs_left--;
|
---|
956 | if (!docs_left)
|
---|
957 | MergeIn ();
|
---|
958 |
|
---|
959 | return COMPALLOK;
|
---|
960 | }
|
---|
961 |
|
---|
962 | int
|
---|
963 | process_ivf_2 (u_char * s_in, int l_in)
|
---|
964 | {
|
---|
965 | if (InvfLevel <= 2)
|
---|
966 | return process_doc (s_in, l_in);
|
---|
967 | else
|
---|
968 | {
|
---|
969 | int count = 0;
|
---|
970 | int pos = 0;
|
---|
971 | u_char *start = s_in;
|
---|
972 | while (pos < l_in)
|
---|
973 | {
|
---|
974 | if (s_in[pos] == TERMPARAGRAPH)
|
---|
975 | {
|
---|
976 | int len = pos + s_in + 1 - start;
|
---|
977 | if (process_doc (start, len) != COMPALLOK)
|
---|
978 | return (COMPERROR);
|
---|
979 | start = s_in + pos + 1;
|
---|
980 | count++;
|
---|
981 | }
|
---|
982 | pos++;
|
---|
983 | }
|
---|
984 | if (start < s_in + pos)
|
---|
985 | {
|
---|
986 | if (process_doc (start, pos + s_in - start) != COMPALLOK)
|
---|
987 | return (COMPERROR);
|
---|
988 | count++;
|
---|
989 | }
|
---|
990 | HTONSI(count); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
991 | fwrite ((char *) &count, sizeof (count), 1, invf_para);
|
---|
992 | }
|
---|
993 | return COMPALLOK;
|
---|
994 | }
|
---|
995 |
|
---|
996 |
|
---|
997 |
|
---|
998 |
|
---|
999 |
|
---|
1000 | static void
|
---|
1001 | stats (unsigned long len)
|
---|
1002 | {
|
---|
1003 | #ifndef SILENT
|
---|
1004 | fseek (count, 0, 2);
|
---|
1005 | fseek (count_trans, 0, 2);
|
---|
1006 | fseek (invf_state, 0, 2);
|
---|
1007 | fseek (invf, 0, 0);
|
---|
1008 | fseek (invf, 0, 2);
|
---|
1009 | fseek (chunks, 0, 2);
|
---|
1010 | fseek (chunk_state, 0, 2);
|
---|
1011 | Message ("File sizes\n");
|
---|
1012 | Message (" Chunk desc : %10u bytes\n", ftell (count));
|
---|
1013 | Message (" Chunk trans : %10u bytes\n", ftell (count_trans));
|
---|
1014 | Message (" Chunks : %10u bytes\n", ftell (chunks));
|
---|
1015 | Message (" Chunk state : %10u bytes\n", ftell (chunk_state));
|
---|
1016 | Message (" Invf state : %10u bytes\n", ftell (invf_state));
|
---|
1017 | Message (" Peak invf : %10u bytes\n", len);
|
---|
1018 | Message (" Final invf : %10u bytes\n", ftell (invf));
|
---|
1019 | Message ("Peak disk usage : %10.2f %%\n",
|
---|
1020 | (double) (ftell (count) + ftell (count_trans) +
|
---|
1021 | ftell (invf_state) + ftell (chunks) +
|
---|
1022 | ftell (chunk_state) + len) / ftell (invf) * 100.0);
|
---|
1023 | #endif
|
---|
1024 | }
|
---|
1025 |
|
---|
1026 |
|
---|
1027 | /* ARGSUSED */
|
---|
1028 | int
|
---|
1029 | done_ivf_2 (char *FileName)
|
---|
1030 | {
|
---|
1031 | long i;
|
---|
1032 | mg_ullong totalIbits;
|
---|
1033 | unsigned long invf_len;
|
---|
1034 | unsigned long bytes_output;
|
---|
1035 | struct invf_file_header ifh;
|
---|
1036 |
|
---|
1037 | if (weights)
|
---|
1038 | fclose (weights);
|
---|
1039 | if (invf_para)
|
---|
1040 | fclose (invf_para);
|
---|
1041 |
|
---|
1042 | free_perf_hash (phd);
|
---|
1043 | phd = NULL;
|
---|
1044 |
|
---|
1045 | Xfree (MemoryBuffer);
|
---|
1046 | MemoryBuffer = NULL;
|
---|
1047 | ChangeMemInUse (-MemBufSize);
|
---|
1048 |
|
---|
1049 | BIO_Random_Done (&rbs);
|
---|
1050 | BIO_Random_Done (&rbsp);
|
---|
1051 | fflush (invf);
|
---|
1052 |
|
---|
1053 | fseek (invf, 0, 2);
|
---|
1054 | invf_len = ftell (invf);
|
---|
1055 |
|
---|
1056 | fseek (invf_out, sizeof (long), 0);
|
---|
1057 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
1058 | HTONUL2(dict_size, ifh.no_of_words);
|
---|
1059 | HTONUL2(no_of_ptrs, ifh.no_of_ptrs);
|
---|
1060 | ifh.skip_mode = 0;
|
---|
1061 | bzero ((char *) ifh.params, sizeof (ifh.params));
|
---|
1062 | HTONUL2(InvfLevel, ifh.InvfLevel);
|
---|
1063 | fwrite ((char *) &ifh, sizeof (ifh), 1, invf_out);
|
---|
1064 |
|
---|
1065 | bytes_output = ftell (invf_out);
|
---|
1066 |
|
---|
1067 | totalIbits = sizeof (unsigned long) * 8; /* The magic number */
|
---|
1068 | totalIbits += 8 * 200; /* A 200 byte gap */
|
---|
1069 |
|
---|
1070 | /* find the right place in the file to start reading p values */
|
---|
1071 | fseek (dict, sizeof (unsigned long) + sizeof (struct invf_dict_header), 0);
|
---|
1072 | for (i = 0; i < dict_size; i++)
|
---|
1073 | {
|
---|
1074 | invf_state_rec *isr;
|
---|
1075 | unsigned long fcnt, wcnt, s, e;
|
---|
1076 | register unsigned long p;
|
---|
1077 | u_char dummy1, dummy2[MAXSTEMLEN + 1];
|
---|
1078 |
|
---|
1079 | /* output location to the invf_idx */
|
---|
1080 | HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
1081 | fwrite ((char *) &bytes_output, sizeof (bytes_output), 1, invf_idx);
|
---|
1082 | NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
1083 |
|
---|
1084 | /* read an entry for a word, just to get p value */
|
---|
1085 | dummy1 = fgetc (dict);
|
---|
1086 | dummy1 = fgetc (dict);
|
---|
1087 | fread (dummy2, sizeof (u_char), dummy1, dict);
|
---|
1088 | fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
|
---|
1089 | fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
|
---|
1090 |
|
---|
1091 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
1092 | NTOHUL(fcnt);
|
---|
1093 | NTOHUL(wcnt);
|
---|
1094 |
|
---|
1095 | p = fcnt;
|
---|
1096 |
|
---|
1097 | isr = in_cache (i);
|
---|
1098 |
|
---|
1099 | e = (isr->Disk_Ptr + 7ul) >> 3ul;
|
---|
1100 | s = totalIbits >> 3;
|
---|
1101 |
|
---|
1102 | fseek (invf_in, s, 0);
|
---|
1103 | while (s < e)
|
---|
1104 | {
|
---|
1105 | u_char c = getc (invf_in);
|
---|
1106 | if (s == e - 1)
|
---|
1107 | {
|
---|
1108 | u_char ands[8] =
|
---|
1109 | {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
|
---|
1110 | c &= ands[isr->Disk_Ptr & 7ul];
|
---|
1111 | }
|
---|
1112 | putc (c, invf_out);
|
---|
1113 | bytes_output++;
|
---|
1114 | s++;
|
---|
1115 | }
|
---|
1116 |
|
---|
1117 | totalIbits += BIO_Bblock_Bound_b (N, p, isr->Disk_B);
|
---|
1118 | if (InvfLevel >= 2)
|
---|
1119 | totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
|
---|
1120 | #ifdef USE_LONG_LONG
|
---|
1121 | totalIbits = (totalIbits + 7ull) & 0xfffffffffffffff8ull;
|
---|
1122 | #else
|
---|
1123 | totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
|
---|
1124 | #endif
|
---|
1125 |
|
---|
1126 | }
|
---|
1127 |
|
---|
1128 | fclose (invf_in);
|
---|
1129 |
|
---|
1130 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1131 | #ifdef __WIN32__
|
---|
1132 | if (!(_chsize (_fileno (invf_out), bytes_output)))
|
---|
1133 | Message ("Could not truncate invf.");
|
---|
1134 | #else
|
---|
1135 | ftruncate (fileno (invf_out), bytes_output);
|
---|
1136 | #endif
|
---|
1137 |
|
---|
1138 | fclose (invf_out);
|
---|
1139 |
|
---|
1140 | HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
1141 | fwrite ((char *) &bytes_output, sizeof (bytes_output), 1, invf_idx);
|
---|
1142 | NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
1143 |
|
---|
1144 | fclose (invf_idx);
|
---|
1145 |
|
---|
1146 | #ifndef SILENT
|
---|
1147 | {
|
---|
1148 | char *temp_str = msg_prefix;
|
---|
1149 | unsigned long total;
|
---|
1150 | msg_prefix = "ivf.pass2";
|
---|
1151 | stats (invf_len);
|
---|
1152 | Message ("Pass two data structures : %6.3f Mbyte\n",
|
---|
1153 | (double) totalDbytes / 1024 / 1024);
|
---|
1154 | total = totalDbytes;
|
---|
1155 | Message ("Pass two hash structure(s) : %6.3f Mbyte\n",
|
---|
1156 | (double) totalHbytes / 1024 / 1024);
|
---|
1157 | total += totalHbytes;
|
---|
1158 | Message ("Peak extra memory in use : %6.3f Mbyte\n",
|
---|
1159 | (double) MaxMemInUse / 1024 / 1024);
|
---|
1160 | total += MaxMemInUse;
|
---|
1161 | Message ("Peak total memory in use : %6.3f Mbyte\n",
|
---|
1162 | (double) total / 1024 / 1024);
|
---|
1163 | msg_prefix = temp_str;
|
---|
1164 | }
|
---|
1165 | #endif
|
---|
1166 |
|
---|
1167 | Xfree(chunk_data);
|
---|
1168 | chunk_data = NULL;
|
---|
1169 | Xfree (WordRecs);
|
---|
1170 | WordRecs = NULL;
|
---|
1171 | Xfree (lg_bs);
|
---|
1172 | lg_bs = NULL;
|
---|
1173 | Xfree (idf);
|
---|
1174 | idf = NULL;
|
---|
1175 | Xfree (word_list);
|
---|
1176 | word_list = NULL;
|
---|
1177 | /* Free the memory allocated for the BIO_Random */
|
---|
1178 | occur_to_lexical (-1,1);
|
---|
1179 |
|
---|
1180 | BIO_Random_Done (&crbs);
|
---|
1181 |
|
---|
1182 | fclose (invf);
|
---|
1183 | fclose (dict);
|
---|
1184 | fclose (hash);
|
---|
1185 | fclose (count);
|
---|
1186 | fclose (count_trans);
|
---|
1187 | fclose (chunk_state);
|
---|
1188 | fclose (chunks);
|
---|
1189 | fclose (invf_state);
|
---|
1190 | return (COMPALLOK);
|
---|
1191 | }
|
---|