source: trunk/indexers/mg/src/text/ivf.pass2.c@ 7582

Last change on this file since 7582 was 7582, checked in by kjdon, 20 years ago

fixed the bug that was causing it not to be able to create a second index using jni - basically had to reset all the static variables at the start of each pass. the tricky thing to find was the static variables in occur_to_lexical in ivf.pass2

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 29.4 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass2.c -- Memory efficient pass 2 inversion
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: ivf.pass2.c 7582 2004-06-10 03:02:05Z kjdon $
21 *
22 **************************************************************************/
23
24/*
25 $Log$
26 Revision 1.2 2004/06/10 03:02:05 kjdon
27 fixed the bug that was causing it not to be able to create a second index using jni - basically had to reset all the static variables at the start of each pass. the tricky thing to find was the static variables in occur_to_lexical in ivf.pass2
28
29 Revision 1.1 2003/02/20 21:18:23 mdewsnip
30 Addition of MG package for search and retrieval
31
32 Revision 1.2 2001/09/21 12:46:42 kjm18
33 updated mg to be in line with mg_1.3f. Now uses long long for some variables
34 to enable indexing of very large collections.
35
36 * Revision 1.2 1997/08/02 05:01:57 wew
37 * changed literal values of 32 for the bit size of magic numbers of
38 * files to sizeof (unsigned long) * 8, increased the gap at the start
39 * of the invf during processing to 200 bytes
40
41 Revision 1.1 1999/08/10 21:17:54 sjboddie
42 renamed mg-1.3d directory mg
43
44 Revision 1.3 1998/12/17 09:12:51 rjmcnab
45
46 Altered mg to process utf-8 encoded Unicode. The main changes
47 are in the parsing of the input, the casefolding, and the stemming.
48
49 Revision 1.2 1998/11/25 07:55:43 rjmcnab
50
51 Modified mg to that you can specify the stemmer you want
52 to use via a command line option. You specify it to
53 mg_passes during the build process. The number of the
54 stemmer that you used is stored within the inverted
55 dictionary header and the stemmed dictionary header so
56 the correct stemmer is used in later stages of building
57 and querying.
58
59 Revision 1.1 1998/11/17 09:34:45 rjmcnab
60 *** empty log message ***
61
62 * Revision 1.3 1994/10/20 03:56:49 tes
63 * I have rewritten the boolean query optimiser and abstracted out the
64 * components of the boolean query.
65 *
66 * Revision 1.2 1994/09/20 04:41:35 tes
67 * For version 1.1
68 *
69 */
70
71/*
72 * Modified:
73 * - long long disk pointers and bit counts for inverted file
74 * (1999-08-03 Tim Bell <[email protected]>)
75 * Code provided by Owen de Kretser <[email protected]>
76 */
77
/* Revision-control identification string for this file.  Declared as a
 * pointer to const because it refers to a string literal. */
static const char *RCSID = "$Id: ivf.pass2.c 7582 2004-06-10 03:02:05Z kjdon $";
79
80#include "local_strings.h"
81#include "sysfuncs.h"
82#include "memlib.h"
83#include "messages.h"
84#include "stemmer.h"
85#include "perf_hash.h"
86#include "bitio_m.h"
87#include "bitio_m_mems.h"
88#include "bitio_gen.h"
89#include "bitio_random.h"
90#include "bitio_stdio.h"
91#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
92
93#include "mg_files.h"
94#include "invf.h"
95#include "locallib.h"
96#include "mg.h"
97#include "build.h"
98#include "words.h"
99#include "hash.h"
100
101#include "longlong.h"
102
/* The _X names resolve to the long long seek/tell variants when
 * USE_LONG_LONG is defined, and to the plain variants otherwise. */
#ifdef USE_LONG_LONG
#define BIO_Random_Seek_X BIO_Random_Seek_LL
#define BIO_Random_Tell_X BIO_Random_Tell_LL
#else
#define BIO_Random_Seek_X BIO_Random_Seek
#define BIO_Random_Tell_X BIO_Random_Tell
#endif

/* [RPAP - Feb 97: WIN32 Port] */
#ifdef __WIN32__
#include <io.h>
#endif

/* Buffer size handed to BIO_Random_Start.  Parenthesised so that the
 * macro expands correctly inside larger expressions such as
 * `x / RND_BUF_SIZE`. */
#ifndef RND_BUF_SIZE
#define RND_BUF_SIZE (8*1024)
/*#define RND_BUF_SIZE 128 */
#endif

/* Report, via Message, the length of the file behind the stdio stream
 * `file`.  The size is cast to long to match the %ld conversion. */
#define print_fsize(file)\
do\
  {\
    struct stat file_state;\
    fstat(fileno(file), &file_state);\
    Message("len(invf) = %ld", (long) file_state.st_size);\
  }while(0)
128
/* Per-term bookkeeping used while postings are coded into MemoryBuffer. */
typedef struct word_rec {
  unsigned long ptr;		/* LoadCounts stores a bit length here, then the
				   term's bit offset in the buffer; process_doc
				   advances it as bits are encoded */
  unsigned long last;		/* document number most recently coded for
				   this term */
} word_rec;
135
136typedef struct invf_state_rec
137 {
138 mg_ullong Disk_Ptr;
139 mg_ullong Disk_Last;
140 unsigned long Disk_B;
141 }
142invf_state_rec;
143
/* Description of one chunk of documents awaiting a disk merge. */
typedef struct chunk {
  unsigned long start_doc;	/* value of callnum when the chunk was loaded */
  unsigned long params_pos;	/* BIO_Random_Tell position in chunk_state at
				   which this chunk's parameters start */
  unsigned long disk_pos;	/* bit offset (Disk_pos << 3) of the chunk's
				   data in the chunks file */
  unsigned long N;		/* number of documents in the chunk */
} chunk;
152
153
154static FILE *dict; /* Stemmed dictionary file */
155static FILE *hash; /* Stemmed dictionary hash file */
156static FILE *invf; /* Inverted file */
157static FILE *invf_in; /* Inverted file */
158static FILE *invf_out; /* Inverted file */
159static FILE *invf_idx; /* Inverted index file */
160static FILE *count; /* Count file */
161static FILE *count_trans; /* Count translation file */
162static FILE *invf_state; /* Inverted file State */
163static FILE *chunk_state; /* Chunk state */
164static FILE *chunks; /* Chunk state */
165static FILE *invf_para = NULL; /* Paragraph counts file */
166static FILE *weights = NULL; /* Weights file */
167
168static stdio_bitio_state sbs;
169static random_bitio_state crbs;
170static chunk *chunk_data = NULL;
171static random_bitio_state rbs, rbsp;
172
173static int docs_left = 0, next_docs_left = 0;
174static unsigned long N;
175
176static word_rec *WordRecs;
177static u_char *lg_bs;
178static float *idf = NULL;
179
180static char *MemoryBuffer = NULL;
181static unsigned long MemBufSize;
182static unsigned long BufToUse;
183static struct invf_dict_header idh;
184
185static perf_hash_data *phd;
186
187static unsigned long *word_list = NULL;
188static unsigned long wl_size = 0;
189
190static unsigned long dict_size;
191static unsigned long no_of_ptrs = 0;
192static unsigned long chunks_read = 0;
193static unsigned long Disk_pos = 0;
194static unsigned long callnum = 0;
195static unsigned long wordnum = 0;
196
197static unsigned long totalIbytes = 0;
198static unsigned long totalDbytes = 0;
199static unsigned long totalHbytes = 0;
200
201static unsigned long MemInUse = 0;
202static unsigned long MaxMemInUse = 0;
203static unsigned long max_buffer_len;
204
205void
206ChangeMemInUse (int mem)
207{
208 MemInUse += mem;
209 if (MemInUse > MaxMemInUse)
210 MaxMemInUse = MemInUse;
211}
212
213void
214ResetStaticI2Vars()
215{
216 docs_left = 0;
217 next_docs_left = 0;
218 N = 0;
219 MemBufSize=0;
220 BufToUse=0;
221 memset(&idh, 0, sizeof(idh));
222 wl_size = 0;
223
224 dict_size = 0;
225 no_of_ptrs = 0;
226 chunks_read = 0;
227 Disk_pos = 0;
228 callnum = 0;
229 wordnum = 0;
230
231 totalIbytes = 0;
232 totalDbytes = 0;
233 totalHbytes = 0;
234
235 MemInUse = 0;
236 MaxMemInUse = 0;
237 max_buffer_len = 0;
238
239}
240
241
242static int
243open_files (char *file_name)
244{
245 char FName[200];
246
247 if (!(dict = open_file (file_name, INVF_DICT_SUFFIX, "rb",
248 MAGIC_STEM_BUILD, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
249 return (COMPERROR);
250
251 if (!(hash = open_file (file_name, INVF_DICT_HASH_SUFFIX, "rb",
252 MAGIC_HASH, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
253 return (COMPERROR);
254
255 if (!(count = open_file (file_name, INVF_CHUNK_SUFFIX, "rb",
256 MAGIC_CHUNK, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
257 return (COMPERROR);
258 fread (&max_buffer_len, sizeof (max_buffer_len), 1, count);
259 NTOHUL(max_buffer_len); /* [RPAP - Jan 97: Endian Ordering] */
260
261 BIO_Stdio_Decode_Start (count, &sbs);
262 next_docs_left = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
263
264 if (!(count_trans = open_file (file_name, INVF_CHUNK_TRANS_SUFFIX, "rb",
265 MAGIC_CHUNK_TRANS, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
266 return (COMPERROR);
267
268 if (!(invf = create_file (file_name, INVF_SUFFIX, "w+b",
269 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
270 return (COMPERROR);
271 fflush (invf);
272 if (!(invf_in = open_file (file_name, INVF_SUFFIX, "rb",
273 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
274 return (COMPERROR);
275 if (!(invf_out = create_file (file_name, INVF_SUFFIX, "wb",
276 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
277 return (COMPERROR);
278 BIO_Random_Start (invf, RND_BUF_SIZE, &rbs);
279 BIO_Random_Start (invf, RND_BUF_SIZE, &rbsp);
280 ChangeMemInUse (RND_BUF_SIZE * 2);
281
282 if (!(invf_idx = create_file (file_name, INVF_IDX_SUFFIX, "wb",
283 MAGIC_INVI, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
284 return (COMPERROR);
285
286 if (InvfLevel == 3)
287 if (!(invf_para = create_file (file_name, INVF_PARAGRAPH_SUFFIX, "wb",
288 MAGIC_PARAGRAPH, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
289 return (COMPERROR);
290
291 sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
292 ".invf.state", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
293 if (!(invf_state = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
294 {
295 Message ("Unable to create \"%s\"", FName);
296 return (COMPERROR);
297 }
298 unlink (FName);
299
300 sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
301 ".chunk.state", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
302 if (!(chunk_state = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
303 {
304 Message ("Unable to create \"%s\"", FName);
305 return (COMPERROR);
306 }
307 unlink (FName);
308 BIO_Random_Start (chunk_state, RND_BUF_SIZE, &crbs);
309 ChangeMemInUse (RND_BUF_SIZE);
310
311 sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
312 ".chunks", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
313 if (!(chunks = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
314 {
315 Message ("Unable to create \"%s\"", FName);
316 return (COMPERROR);
317 }
318 unlink (FName);
319
320 return (COMPALLOK);
321}
322
323
324
325
326
327#define ISR_CACHE 1024
328#define ISR_ENTRY_SIZE (sizeof(mg_ullong)*2 + sizeof(unsigned long))
329
330invf_state_rec *
331in_cache (int pos)
332{
333 static char isr_data[ISR_CACHE * ISR_ENTRY_SIZE];
334 static invf_state_rec isr;
335 static int isr_base = 0, isr_num = -1, isr_pos = -1;
336 if (isr_pos >= 0)
337 bcopy ((char *) &isr, &isr_data[isr_pos * ISR_ENTRY_SIZE], ISR_ENTRY_SIZE);
338 if (pos < isr_base || pos >= isr_base + isr_num)
339 {
340 if (isr_num >= 0)
341 {
342 fseek (invf_state, isr_base * ISR_ENTRY_SIZE, 0);
343 fwrite (isr_data, 1, ISR_ENTRY_SIZE * isr_num, invf_state);
344 }
345 isr_base = pos;
346 fseek (invf_state, isr_base * ISR_ENTRY_SIZE, 0);
347 fread (isr_data, 1, ISR_ENTRY_SIZE * ISR_CACHE, invf_state);
348 isr_num = ISR_CACHE;
349 }
350 isr_pos = pos - isr_base;
351 bcopy (&isr_data[isr_pos * ISR_ENTRY_SIZE], (char *) &isr, ISR_ENTRY_SIZE);
352 return &isr;
353}
354
355
356
357
358
359unsigned long
360occur_to_lexical (long occ, int clear_state)
361{
362 static long pos = -1;
363 static random_bitio_state rbs;
364 static int val = 0;
365 if (clear_state) {
366 pos = -1;
367 val = 0;
368 return 0;
369 }
370 if (pos == -1)
371 {
372 BIO_Random_Start (count_trans, RND_BUF_SIZE, &rbs);
373 pos = 0x7fffffff;
374 }
375 if (occ < pos)
376 {
377 if (occ == -1)
378 {
379 BIO_Random_Done (&rbs);
380 return 0;
381 }
382 BIO_Random_Seek_X (sizeof (unsigned long) * 8, &rbs);
383 pos = 0;
384 }
385 while (pos <= occ)
386 {
387 val = BIO_Random_Binary_Decode (dict_size + 1, &rbs, NULL) - 1;
388 pos++;
389 }
390 return (val);
391}
392
393
394void
395add_chunk_state (unsigned long pos, unsigned long start_doc,
396 unsigned long N)
397{
398 chunk_data[chunks_read].params_pos = pos;
399 chunk_data[chunks_read].start_doc = start_doc;
400 chunk_data[chunks_read].N = N;
401 chunks_read++;
402}
403
404
405int
406init_ivf_2 (char *file_name)
407{
408 ResetStaticI2Vars();
409 occur_to_lexical(0, 1); /* clear the static vars in here*/
410 u_char prev[MAXSTEMLEN + 1];
411 int i;
412 mg_ullong totalIbits;
413 mg_ullong lasttotalIbits;
414 double logN = 0.0;
415
416
417 if (open_files (file_name) == COMPERROR)
418 return COMPERROR;
419
420
421 /* Read in the stemmed dictionary file header */
422 fread ((char *) &idh, sizeof (idh), 1, dict);
423
424 /* [RPAP - Jan 97: Endian Ordering] */
425 NTOHUL(idh.lookback);
426 NTOHUL(idh.dict_size);
427 NTOHUL(idh.total_bytes);
428 NTOHUL(idh.index_string_bytes);
429 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
430 NTOHUL(idh.num_of_docs);
431 NTOHUL(idh.static_num_of_docs);
432 NTOHUL(idh.num_of_words);
433 NTOHUL(idh.stemmer_num);
434 NTOHUL(idh.stem_method);
435
436 dict_size = idh.dict_size;
437
438 N = idh.num_of_docs;
439
440 if (!(phd = read_perf_hash_data (hash)))
441 {
442 Message ("Unable to read in hash data");
443 return COMPERROR;
444 }
445 totalHbytes = sizeof (perf_hash_data) + sizeof (u_char) * 256 +
446 sizeof (int) * (phd->MAX_N + 1) + sizeof (int *) * 3 * phd->MAX_CH +
447 sizeof (long) * phd->MAX_CH * phd->MAX_L;
448
449 if (!(WordRecs = Xmalloc (sizeof (word_rec) * idh.dict_size)))
450 {
451 Message ("No memory for word entries");
452 return COMPERROR;
453 }
454 totalDbytes += sizeof (word_rec) * idh.dict_size;
455 /* separate storage for the log(b) values, one byte each */
456 if (!(lg_bs = Xmalloc (sizeof (u_char) * idh.dict_size)))
457 {
458 Message ("No memory for lg b's");
459 return COMPERROR;
460 }
461 totalDbytes += sizeof (u_char) * idh.dict_size;
462
463 if (MakeWeights)
464 {
465 /* separate storage for the idf values, one single each */
466 if (!(idf = Xmalloc (sizeof (float) * idh.dict_size)))
467 {
468 Message ("No memory for idf's");
469 return COMPERROR;
470 }
471 totalDbytes += sizeof (float) * idh.dict_size;
472
473 if (!(weights = create_file (file_name, WEIGHTS_SUFFIX, "wb",
474 MAGIC_WGHT, MG_CONTINUE))) { /* [RPAP - Feb 97: WIN32 Port] */
475 Message ("Couldn't open weights file for writing");
476 return (COMPERROR);
477 }
478 }
479 else
480 {
481 unlink (make_name (file_name, WEIGHTS_SUFFIX, NULL));
482 }
483
484 chunk_data = Xmalloc (sizeof (chunk) * (ChunkLimit + 2));
485 totalDbytes += sizeof (chunk) * (ChunkLimit + 2);
486
487 totalIbits = sizeof (unsigned long) * 8; /* The magic number */
488 totalIbits += 8 * 200; /* A 200 byte gap */
489
490 if (MakeWeights)
491 {
492 wl_size = 1024;
493 if (!(word_list = Xmalloc (sizeof (*word_list) * wl_size)))
494 {
495 Message ("No memory for word_list");
496 return COMPERROR;
497 }
498
499 logN = log ((double) N);
500 }
501
502 for (i = 0; i < idh.dict_size; i++)
503 {
504 invf_state_rec *isr;
505 register unsigned long copy, suff, p;
506 unsigned long fcnt, wcnt;
507
508 lasttotalIbits = totalIbits;
509
510 copy = fgetc (dict);
511 suff = fgetc (dict);
512 *prev = copy + suff;
513 fread (prev + copy + 1, sizeof (u_char), suff, dict);
514
515 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
516 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
517
518 /* [RPAP - Jan 97: Endian Ordering] */
519 NTOHUL(fcnt);
520 NTOHUL(wcnt);
521
522 WordRecs[i].last = 0;
523 WordRecs[i].ptr = 0;
524
525 p = fcnt;
526
527 if (MakeWeights)
528 idf[i] = logN - log ((double) fcnt);
529
530
531 isr = in_cache (i);
532
533 isr->Disk_Last = 0;
534 isr->Disk_Ptr = totalIbits;
535
536 isr->Disk_B = BIO_Bblock_Init (N, p);
537
538 totalIbits += BIO_Bblock_Bound_b (N, p, isr->Disk_B);
539
540 if (InvfLevel >= 2)
541 totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
542
543#ifdef USE_LONG_LONG
544 totalIbits = (totalIbits + 7ull) & 0xfffffffffffffff8ull;
545#else
546 totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
547#endif
548
549 if (totalIbits < lasttotalIbits) {
550 fprintf(stderr, "ERROR: The totalIbits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
551 if (sizeof (mg_ullong) < 8) {
552 fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
553 }
554 fprintf(stderr, " Build aborted.\n");
555 exit(1);
556 }
557 }
558
559
560 /* now convert to bytes, and actually get the space */
561#ifdef USE_LONG_LONG
562 totalIbytes = (totalIbits + 7ull) >> 3ull;
563#else
564 totalIbytes = (totalIbits + 7ul) >> 3ul;
565#endif
566 return (COMPALLOK);
567
568}
569
570
571
572
573
574static void
575LoadCounts (void)
576{
577 unsigned long numwords, i, last_total;
578 static unsigned long local_N = 0;
579 unsigned long totalIbits, crbs_pos;
580 word_rec *wr;
581 unsigned long *counts;
582
583 if (MemoryBuffer == NULL)
584 {
585 MemBufSize = sizeof (unsigned long) * dict_size;
586 if (max_buffer_len > MemBufSize)
587 MemBufSize = max_buffer_len;
588 if (!(MemoryBuffer = Xmalloc (MemBufSize)))
589 FatalError (1, "Unable to allocate memory for buffer");
590 ChangeMemInUse (MemBufSize);
591 }
592 counts = (unsigned long *) MemoryBuffer;
593/* bzero ((char *) counts, sizeof (unsigned long) * dict_size); */
594 bzero ((char *) counts, MemBufSize);
595 docs_left = next_docs_left;
596 if (!docs_left)
597 FatalError (1, "The number of docs in the current chunk is 0");
598
599 BufToUse = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
600
601 numwords = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
602
603 local_N = docs_left;
604
605 for (wr = WordRecs, i = 0; i < dict_size; i++, wr++)
606 wr->ptr = 0;
607
608 bzero ((char *) lg_bs, dict_size);
609
610 for (i = 0; i < numwords; i++)
611 {
612 unsigned long word_num, wcnt, fcnt, p;
613 word_num = occur_to_lexical (i,0);
614
615 wr = &WordRecs[word_num];
616
617 wcnt = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
618 if (wcnt >= 2)
619 fcnt = BIO_Stdio_Gamma_Decode (&sbs, NULL);
620 else
621 fcnt = wcnt;
622
623
624 p = fcnt;
625 if (wcnt)
626 {
627 register unsigned long length;
628 counts[word_num] = p;
629 length = BIO_Bblock_Bound (local_N, p);
630 if (InvfLevel >= 2)
631 length += wcnt;
632 wr->ptr = length;
633 lg_bs[word_num] = floorlog_2 (BIO_Bblock_Init_W (local_N, p));
634 }
635
636
637 }
638 crbs_pos = BIO_Random_Tell (&crbs);
639 totalIbits = 0;
640 last_total = 0;
641 for (wr = WordRecs, i = 0; i < dict_size; i++, wr++)
642 {
643 register unsigned long length;
644 length = wr->ptr;
645 wr->last = callnum;
646 BIO_Random_Gamma_Encode (counts[i] + 1, &crbs, NULL);
647 if (counts[i])
648 {
649 if (i)
650 BIO_Random_Delta_Encode (totalIbits - last_total + 1, &crbs, NULL);
651 else
652 BIO_Random_Delta_Encode (1, &crbs, NULL);
653
654 last_total = totalIbits;
655 }
656 wr->ptr = totalIbits;
657 totalIbits += length;
658 }
659 add_chunk_state (crbs_pos, callnum, local_N);
660
661 if ((totalIbits + 7ul) >> 3ul > BufToUse)
662 FatalError (1, "Pointers exceed buffer size");
663
664 next_docs_left = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
665}
666
667
668
669
670static void
671DumpChunk (void)
672{
673 chunk_data[chunks_read - 1].disk_pos = Disk_pos << 3;
674 fseek (chunks, Disk_pos, 0);
675 fwrite (MemoryBuffer, sizeof (char), BufToUse, chunks);
676 Disk_pos += BufToUse;
677}
678
679
680
681
682static void
683DiskMerge (void)
684{
685 random_bitio_state *rbsi;
686 random_bitio_state *chks = NULL;
687 unsigned long *chunk_ptrs;
688 int i;
689
690 BIO_Random_Flush (&crbs);
691
692 chunk_ptrs = Xmalloc (chunks_read * sizeof (unsigned long));
693 ChangeMemInUse (chunks_read * sizeof (unsigned long));
694 bzero ((char *) chunk_ptrs, chunks_read * sizeof (unsigned long));
695
696 rbsi = Xmalloc (chunks_read * sizeof (random_bitio_state));
697 ChangeMemInUse (chunks_read * sizeof (random_bitio_state));
698 for (i = 0; i < chunks_read; i++)
699 {
700 rbsi[i] = crbs;
701 rbsi[i].Buf = Xmalloc (rbsi[i].len);
702 ChangeMemInUse (rbsi[i].len);
703 bcopy ((char *) (crbs.Buf), (char *) (rbsi[i].Buf), rbsi[i].len);
704 BIO_Random_Seek (chunk_data[i].params_pos, &rbsi[i]);
705 }
706
707 if (chunks_read > 1)
708 {
709 int j;
710 chks = Xmalloc ((chunks_read - 1) * sizeof (random_bitio_state));
711 ChangeMemInUse ((chunks_read - 1) * sizeof (random_bitio_state));
712 BIO_Random_Start (chunks, RND_BUF_SIZE, &chks[0]);
713 ChangeMemInUse (RND_BUF_SIZE);
714 for (j = 1; j < chunks_read - 1; j++)
715 {
716 chks[j] = chks[0];
717 chks[j].Buf = Xmalloc (chks[0].len);
718 ChangeMemInUse (chks[0].len);
719 bcopy ((char *) (chks[0].Buf), (char *) (chks[j].Buf), chks[0].len);
720 }
721 }
722 for (i = 0; i < dict_size; i++)
723 {
724 int j;
725 invf_state_rec *isr = in_cache (i);
726 register int B;
727
728 BIO_Random_Seek_X (isr->Disk_Ptr, &rbs); /* Position in invf file */
729
730 B = isr->Disk_B;
731
732 for (j = 0; j < chunks_read; j++)
733 {
734 int p;
735 p = BIO_Random_Gamma_Decode (&rbsi[j], NULL) - 1;
736
737 if (p)
738 {
739 int ptr, b;
740 chunk_ptrs[j] += BIO_Random_Delta_Decode (&rbsi[j], NULL) - 1;
741 ptr = chunk_ptrs[j];
742 b = 1 << floorlog_2 (BIO_Bblock_Init_W (chunk_data[j].N, p));
743
744 if (j == chunks_read - 1)
745 {
746 int k, CurrDoc;
747 DECODE_START ((u_char *) MemoryBuffer, ptr)
748 CurrDoc = isr->Disk_Last;
749 for (k = 0; k < p; k++)
750 {
751 register unsigned long x, tf;
752 BBLOCK_DECODE (x, b);
753 if (k == 0)
754 x = x + chunk_data[j].start_doc - isr->Disk_Last;
755 CurrDoc += x;
756 BIO_Random_Bblock_Encode (x, B, &rbs, NULL);
757 if (InvfLevel >= 2)
758 {
759 UNARY_DECODE (tf);
760 BIO_Random_Gamma_Encode (tf, &rbs, NULL);
761 }
762 }
763 DECODE_DONE
764 isr->Disk_Last = CurrDoc;
765 }
766 else
767 {
768 int k, CurrDoc;
769 random_bitio_state *Chks = chks + j;
770 BIO_Random_Seek (chunk_data[j].disk_pos + ptr, Chks);
771 CurrDoc = isr->Disk_Last;
772 for (k = 0; k < p; k++)
773 {
774 register unsigned long x, tf;
775 x = BIO_Random_Bblock_Decode (b, Chks, NULL);
776 if (k == 0)
777 x = x + chunk_data[j].start_doc - isr->Disk_Last;
778 CurrDoc += x;
779 BIO_Random_Bblock_Encode (x, B, &rbs, NULL);
780 if (InvfLevel >= 2)
781 {
782 tf = BIO_Random_Unary_Decode (Chks, NULL);
783 BIO_Random_Gamma_Encode (tf, &rbs, NULL);
784 }
785 }
786 isr->Disk_Last = CurrDoc;
787 }
788 }
789 }
790
791 isr->Disk_Ptr = BIO_Random_Tell_X (&rbs);
792
793 }
794 if (chunks_read > 1)
795 {
796 int j;
797 for (j = 0; j < chunks_read - 1; j++)
798 {
799 Xfree (chks[j].Buf);
800 ChangeMemInUse (-chks[j].len);
801 }
802 Xfree (chks);
803 ChangeMemInUse (-(chunks_read - 1) * sizeof (random_bitio_state));
804 }
805
806 for (i = 0; i < chunks_read; i++)
807 {
808 Xfree (rbsi[i].Buf);
809 ChangeMemInUse (-rbsi[i].len);
810 }
811 Xfree (rbsi);
812 ChangeMemInUse (-chunks_read * sizeof (random_bitio_state));
813/* chunks_read = 0; */
814 Xfree (chunk_ptrs);
815 ChangeMemInUse (-chunks_read * sizeof (unsigned long));
816 chunks_read = 0;
817 Disk_pos = 0;
818 BIO_Random_Seek (0, &crbs);
819}
820
821static void
822MergeIn (void)
823{
824 static int disk_chunks = 0;
825 static header = 0;
826 if (!header)
827 {
828 fprintf (stderr, "ivf.pass2 : ");
829 header = 1;
830 }
831 if (disk_chunks == ChunkLimit || next_docs_left == 0)
832 {
833 fprintf (stderr, "M");
834 DiskMerge ();
835 disk_chunks = 0;
836 }
837 else
838 {
839 fprintf (stderr, "-");
840 DumpChunk ();
841 disk_chunks++;
842 }
843 if (next_docs_left == 0)
844 fprintf (stderr, "\n");
845}
846
847
/*
 * qsort comparator for word_list, whose elements are unsigned long.
 *
 * Reads each element at its real type (the old int cast looked at only
 * part of the value) and compares without subtracting, so the result
 * cannot overflow or change sign for large values.
 * Returns <0, 0 or >0 as *a is less than, equal to or greater than *b.
 */
static int
wl_comp (const void *a, const void *b)
{
  unsigned long lhs = *(const unsigned long *) a;
  unsigned long rhs = *(const unsigned long *) b;

  return (lhs > rhs) - (lhs < rhs);
}
853
854static int
855process_doc (u_char * s_in, int l_in)
856{
857 int res;
858 u_char *end = s_in + l_in - 1;
859 unsigned long tocode;
860 unsigned long wl_pos = 0;
861
862 if (!docs_left)
863 LoadCounts ();
864
865 callnum++;
866
867 if (!inaword (s_in, end))
868 if (SkipSGML)
869 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
870 else
871 PARSE_NON_STEM_WORD (s_in, end);
872
873 while (s_in <= end)
874 {
875 u_char Word[MAXSTEMLEN + 1];
876
877 PARSE_STEM_WORD (Word, s_in, end);
878 stemmer (idh.stem_method, idh.stemmer_num, Word);
879 if (SkipSGML)
880 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
881 else
882 PARSE_NON_STEM_WORD (s_in, end);
883
884 if (*Word == 0)
885 continue;
886
887 res = perf_hash (phd, Word);
888
889 {
890 word_rec *arr = &WordRecs[res];
891 int b = 1 << lg_bs[res];
892 wordnum++;
893
894 tocode = callnum;
895
896 ENCODE_START ((u_char *) MemoryBuffer, arr->ptr)
897
898 if (tocode > arr->last)
899 {
900 register int x;
901 x = tocode - arr->last - 1;
902 BBLOCK_ENCODE (x + 1, b);
903 if (InvfLevel >= 2)
904 ENCODE_BIT (1);
905 no_of_ptrs++;
906 arr->last = tocode;
907 }
908 else if (InvfLevel >= 2)
909 {
910 __pos--;
911 ENCODE_BIT (0);
912 ENCODE_BIT (1);
913 }
914 arr->ptr = __pos;
915 ENCODE_DONE
916 }
917
918 if (MakeWeights)
919 {
920 if (wl_pos >= wl_size)
921 {
922 wl_size += (wl_size >> 1);
923 word_list = Xrealloc (word_list, sizeof (*word_list) * wl_size);
924 }
925 word_list[wl_pos++] = res;
926 }
927 }
928 if (MakeWeights)
929 {
930 float doc_weight = 0.0;
931 if (wl_pos)
932 {
933 unsigned long *wl = word_list;
934 unsigned long i, count, val;
935 qsort (wl, wl_pos, sizeof (*wl), wl_comp);
936 count = 1;
937 val = *wl++;
938 for (i = 1; i <= wl_pos; i++, wl++)
939 if (i == wl_pos || val != *wl)
940 {
941 double weight = count * idf[val];
942 doc_weight += weight * weight;
943 count = 1;
944 val = *wl;
945 }
946 else
947 count++;
948 }
949 HTONF(doc_weight); /* [RPAP - Jan 97: Endian Ordering] */
950 fwrite ((char *) &doc_weight, sizeof (doc_weight), 1, weights);
951 }
952 docs_left--;
953 if (!docs_left)
954 MergeIn ();
955
956 return COMPALLOK;
957}
958
959int
960process_ivf_2 (u_char * s_in, int l_in)
961{
962 if (InvfLevel <= 2)
963 return process_doc (s_in, l_in);
964 else
965 {
966 int count = 0;
967 int pos = 0;
968 u_char *start = s_in;
969 while (pos < l_in)
970 {
971 if (s_in[pos] == TERMPARAGRAPH)
972 {
973 int len = pos + s_in + 1 - start;
974 if (process_doc (start, len) != COMPALLOK)
975 return (COMPERROR);
976 start = s_in + pos + 1;
977 count++;
978 }
979 pos++;
980 }
981 if (start < s_in + pos)
982 {
983 if (process_doc (start, pos + s_in - start) != COMPALLOK)
984 return (COMPERROR);
985 count++;
986 }
987 HTONSI(count); /* [RPAP - Jan 97: Endian Ordering] */
988 fwrite ((char *) &count, sizeof (count), 1, invf_para);
989 }
990 return COMPALLOK;
991}
992
993
994
995
996
997static void
998stats (unsigned long len)
999{
1000#ifndef SILENT
1001 fseek (count, 0, 2);
1002 fseek (count_trans, 0, 2);
1003 fseek (invf_state, 0, 2);
1004 fseek (invf, 0, 0);
1005 fseek (invf, 0, 2);
1006 fseek (chunks, 0, 2);
1007 fseek (chunk_state, 0, 2);
1008 Message ("File sizes\n");
1009 Message (" Chunk desc : %10u bytes\n", ftell (count));
1010 Message (" Chunk trans : %10u bytes\n", ftell (count_trans));
1011 Message (" Chunks : %10u bytes\n", ftell (chunks));
1012 Message (" Chunk state : %10u bytes\n", ftell (chunk_state));
1013 Message (" Invf state : %10u bytes\n", ftell (invf_state));
1014 Message (" Peak invf : %10u bytes\n", len);
1015 Message (" Final invf : %10u bytes\n", ftell (invf));
1016 Message ("Peak disk usage : %10.2f %%\n",
1017 (double) (ftell (count) + ftell (count_trans) +
1018 ftell (invf_state) + ftell (chunks) +
1019 ftell (chunk_state) + len) / ftell (invf) * 100.0);
1020#endif
1021}
1022
1023
1024/* ARGSUSED */
1025int
1026done_ivf_2 (char *FileName)
1027{
1028 long i;
1029 mg_ullong totalIbits;
1030 unsigned long invf_len;
1031 unsigned long bytes_output;
1032 struct invf_file_header ifh;
1033
1034 if (weights)
1035 fclose (weights);
1036 if (invf_para)
1037 fclose (invf_para);
1038
1039 free_perf_hash (phd);
1040 phd = NULL;
1041
1042 Xfree (MemoryBuffer);
1043 MemoryBuffer = NULL;
1044 ChangeMemInUse (-MemBufSize);
1045
1046 BIO_Random_Done (&rbs);
1047 BIO_Random_Done (&rbsp);
1048 fflush (invf);
1049
1050 fseek (invf, 0, 2);
1051 invf_len = ftell (invf);
1052
1053 fseek (invf_out, sizeof (long), 0);
1054 /* [RPAP - Jan 97: Endian Ordering] */
1055 HTONUL2(dict_size, ifh.no_of_words);
1056 HTONUL2(no_of_ptrs, ifh.no_of_ptrs);
1057 ifh.skip_mode = 0;
1058 bzero ((char *) ifh.params, sizeof (ifh.params));
1059 HTONUL2(InvfLevel, ifh.InvfLevel);
1060 fwrite ((char *) &ifh, sizeof (ifh), 1, invf_out);
1061
1062 bytes_output = ftell (invf_out);
1063
1064 totalIbits = sizeof (unsigned long) * 8; /* The magic number */
1065 totalIbits += 8 * 200; /* A 200 byte gap */
1066
1067 /* find the right place in the file to start reading p values */
1068 fseek (dict, sizeof (unsigned long) + sizeof (struct invf_dict_header), 0);
1069 for (i = 0; i < dict_size; i++)
1070 {
1071 invf_state_rec *isr;
1072 unsigned long fcnt, wcnt, s, e;
1073 register unsigned long p;
1074 u_char dummy1, dummy2[MAXSTEMLEN + 1];
1075
1076 /* output location to the invf_idx */
1077 HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1078 fwrite ((char *) &bytes_output, sizeof (bytes_output), 1, invf_idx);
1079 NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1080
1081 /* read an entry for a word, just to get p value */
1082 dummy1 = fgetc (dict);
1083 dummy1 = fgetc (dict);
1084 fread (dummy2, sizeof (u_char), dummy1, dict);
1085 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
1086 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
1087
1088 /* [RPAP - Jan 97: Endian Ordering] */
1089 NTOHUL(fcnt);
1090 NTOHUL(wcnt);
1091
1092 p = fcnt;
1093
1094 isr = in_cache (i);
1095
1096 e = (isr->Disk_Ptr + 7ul) >> 3ul;
1097 s = totalIbits >> 3;
1098
1099 fseek (invf_in, s, 0);
1100 while (s < e)
1101 {
1102 u_char c = getc (invf_in);
1103 if (s == e - 1)
1104 {
1105 u_char ands[8] =
1106 {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1107 c &= ands[isr->Disk_Ptr & 7ul];
1108 }
1109 putc (c, invf_out);
1110 bytes_output++;
1111 s++;
1112 }
1113
1114 totalIbits += BIO_Bblock_Bound_b (N, p, isr->Disk_B);
1115 if (InvfLevel >= 2)
1116 totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
1117#ifdef USE_LONG_LONG
1118 totalIbits = (totalIbits + 7ull) & 0xfffffffffffffff8ull;
1119#else
1120 totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
1121#endif
1122
1123 }
1124
1125 fclose (invf_in);
1126
1127 /* [RPAP - Feb 97: WIN32 Port] */
1128#ifdef __WIN32__
1129 if (!(_chsize (_fileno (invf_out), bytes_output)))
1130 Message ("Could not truncate invf.");
1131#else
1132 ftruncate (fileno (invf_out), bytes_output);
1133#endif
1134
1135 fclose (invf_out);
1136
1137 HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1138 fwrite ((char *) &bytes_output, sizeof (bytes_output), 1, invf_idx);
1139 NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1140
1141 fclose (invf_idx);
1142
1143#ifndef SILENT
1144 {
1145 char *temp_str = msg_prefix;
1146 unsigned long total;
1147 msg_prefix = "ivf.pass2";
1148 stats (invf_len);
1149 Message ("Pass two data structures : %6.3f Mbyte\n",
1150 (double) totalDbytes / 1024 / 1024);
1151 total = totalDbytes;
1152 Message ("Pass two hash structure(s) : %6.3f Mbyte\n",
1153 (double) totalHbytes / 1024 / 1024);
1154 total += totalHbytes;
1155 Message ("Peak extra memory in use : %6.3f Mbyte\n",
1156 (double) MaxMemInUse / 1024 / 1024);
1157 total += MaxMemInUse;
1158 Message ("Peak total memory in use : %6.3f Mbyte\n",
1159 (double) total / 1024 / 1024);
1160 msg_prefix = temp_str;
1161 }
1162#endif
1163
1164 Xfree(chunk_data);
1165 chunk_data = NULL;
1166 Xfree (WordRecs);
1167 WordRecs = NULL;
1168 Xfree (lg_bs);
1169 lg_bs = NULL;
1170 Xfree (idf);
1171 idf = NULL;
1172 Xfree (word_list);
1173 word_list = NULL;
1174 /* Free the memory allocated for the BIO_Random */
1175 occur_to_lexical (-1,1);
1176
1177 BIO_Random_Done (&crbs);
1178
1179 fclose (invf);
1180 fclose (dict);
1181 fclose (hash);
1182 fclose (count);
1183 fclose (count_trans);
1184 fclose (chunk_state);
1185 fclose (chunks);
1186 fclose (invf_state);
1187 return (COMPALLOK);
1188}
Note: See TracBrowser for help on using the repository browser.