source: trunk/gsdl3/src/packages/mg/src/text/ivf.pass2.c@ 7456

Last change on this file since 7456 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 28.3 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass2.c -- Memory efficient pass 2 inversion
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: ivf.pass2.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24/*
25 $Log$
26 Revision 1.1 2003/02/20 21:18:23 mdewsnip
27 Addition of MG package for search and retrieval
28
29 Revision 1.2 2001/09/21 12:46:42 kjm18
30 updated mg to be in line with mg_1.3f. Now uses long long for some variables
31 to enable indexing of very large collections.
32
33 * Revision 1.2 1997/08/02 05:01:57 wew
34 * changed literal values of 32 for the bit size of magic numbers of
35 * files to sizeof (unsigned long) * 8, increased the gap at the start
36 * of the invf during processing to 200 bytes
37
38 Revision 1.1 1999/08/10 21:17:54 sjboddie
39 renamed mg-1.3d directory mg
40
41 Revision 1.3 1998/12/17 09:12:51 rjmcnab
42
43 Altered mg to process utf-8 encoded Unicode. The main changes
44 are in the parsing of the input, the casefolding, and the stemming.
45
46 Revision 1.2 1998/11/25 07:55:43 rjmcnab
47
48 Modified mg to that you can specify the stemmer you want
49 to use via a command line option. You specify it to
50 mg_passes during the build process. The number of the
51 stemmer that you used is stored within the inverted
52 dictionary header and the stemmed dictionary header so
53 the correct stemmer is used in later stages of building
54 and querying.
55
56 Revision 1.1 1998/11/17 09:34:45 rjmcnab
57 *** empty log message ***
58
59 * Revision 1.3 1994/10/20 03:56:49 tes
60 * I have rewritten the boolean query optimiser and abstracted out the
61 * components of the boolean query.
62 *
63 * Revision 1.2 1994/09/20 04:41:35 tes
64 * For version 1.1
65 *
66 */
67
68/*
69 * Modified:
70 * - long long disk pointers and bit counts for inverted file
71 * (1999-08-03 Tim Bell <[email protected]>)
72 * Code provided by Owen de Kretser <[email protected]>
73 */
74
75static char *RCSID = "$Id: ivf.pass2.c 3745 2003-02-20 21:20:24Z mdewsnip $";
76
77#include "local_strings.h"
78#include "sysfuncs.h"
79#include "memlib.h"
80#include "messages.h"
81#include "stemmer.h"
82#include "perf_hash.h"
83#include "bitio_m.h"
84#include "bitio_m_mems.h"
85#include "bitio_gen.h"
86#include "bitio_random.h"
87#include "bitio_stdio.h"
88#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
89
90#include "mg_files.h"
91#include "invf.h"
92#include "locallib.h"
93#include "mg.h"
94#include "build.h"
95#include "words.h"
96#include "hash.h"
97
98#include "longlong.h"
99
100#ifdef USE_LONG_LONG
101#define BIO_Random_Seek_X BIO_Random_Seek_LL
102#define BIO_Random_Tell_X BIO_Random_Tell_LL
103#else
104#define BIO_Random_Seek_X BIO_Random_Seek
105#define BIO_Random_Tell_X BIO_Random_Tell
106#endif
107
108/* [RPAP - Feb 97: WIN32 Port] */
109#ifdef __WIN32__
110#include <io.h>
111#endif
112
113#ifndef RND_BUF_SIZE
114#define RND_BUF_SIZE 8*1024
115/*#define RND_BUF_SIZE 128 */
116#endif
117
/*
 * print_fsize(file) -- report the current on-disk length of the stdio
 * stream `file' through Message().
 *
 * The length is obtained with fstat() on the stream's descriptor; nothing
 * is printed when fstat() fails.  st_size is cast to long so that it
 * matches the %ld conversion regardless of how wide off_t is.
 */
#define print_fsize(file)\
do\
  {\
    struct stat file_state;\
    if (fstat (fileno (file), &file_state) == 0)\
      Message ("len(invf) = %ld", (long) file_state.st_size);\
  }while(0)
125
/*
 * Per-word in-memory state, one entry per dictionary word (WordRecs[]).
 */
typedef struct word_rec
  {
    /* LoadCounts() first stores the word's bit length for the chunk here,
       then replaces it with the word's bit offset inside MemoryBuffer;
       process_doc() advances it as postings are encoded. */
    unsigned long ptr;
    /* Value of callnum for the last document coded for this word
       (reset to the current callnum by LoadCounts()). */
    unsigned long last;
  }
word_rec;
132
/*
 * Per-word state of the on-disk inverted file.  Records are kept in the
 * invf_state scratch file and paged through in_cache(), which copies the
 * first ISR_ENTRY_SIZE bytes of this structure, so the field order matters.
 */
typedef struct invf_state_rec
  {
    /* Bit position in the inverted file where the next posting for this
       word is written (DiskMerge() seeks here and stores the new position). */
    mg_ullong Disk_Ptr;
    /* Document number of the last posting already written for this word. */
    mg_ullong Disk_Last;
    /* Value returned by BIO_Bblock_Init (N, p) for this word. */
    unsigned long Disk_B;
  }
invf_state_rec;
140
/*
 * Description of one chunk of documents awaiting a disk merge
 * (see add_chunk_state(), DumpChunk() and DiskMerge()).
 */
typedef struct chunk
  {
    /* Value of callnum when the chunk was loaded. */
    unsigned long start_doc;
    /* Position in the chunk_state bit stream (crbs) where the per-word
       parameters of this chunk start. */
    unsigned long params_pos;
    /* Bit offset of the chunk's data in the chunks scratch file
       (Disk_pos << 3, set by DumpChunk()). */
    unsigned long disk_pos;
    /* Number of documents in the chunk. */
    unsigned long N;
  }
chunk;
149
150
/* ---- Files used by pass 2 (all opened in open_files()) ---- */
static FILE *dict;		/* Stemmed dictionary file */
static FILE *hash;		/* Stemmed dictionary hash file */
static FILE *invf;		/* Inverted file, opened "w+b"; backs the rbs and rbsp bit streams */
static FILE *invf_in;		/* Read-only handle on the inverted file, used by done_ivf_2() */
static FILE *invf_out;		/* Write-only handle on the inverted file: header and compacted entries */
static FILE *invf_idx;		/* Inverted index file */
static FILE *count;		/* Count (chunk description) file, decoded through sbs */
static FILE *count_trans;	/* Count translation file, read by occur_to_lexical() */
static FILE *invf_state;	/* Scratch file holding invf_state_rec entries for in_cache() */
static FILE *chunk_state;	/* Scratch file behind the crbs bit stream */
static FILE *chunks;		/* Scratch file that DumpChunk() writes MemoryBuffer to */
static FILE *invf_para = NULL;	/* Paragraph counts file (only created when InvfLevel == 3) */
static FILE *weights = NULL;	/* Weights file (only created when MakeWeights is set) */

/* ---- Bit stream states ---- */
static stdio_bitio_state sbs;	/* Decoder over the count file */
static random_bitio_state crbs;	/* Random access stream over chunk_state */
static chunk *chunk_data = NULL;	/* ChunkLimit + 2 chunk descriptions */
static random_bitio_state rbs, rbsp;	/* Random access streams over invf */

/* Documents still to be processed in the current chunk, and the size of
   the next chunk as already decoded from the count file. */
static int docs_left = 0, next_docs_left = 0;
static unsigned long N;		/* idh.num_of_docs */

static word_rec *WordRecs;	/* One word_rec per dictionary word */
static u_char *lg_bs;		/* floorlog_2 of the chunk Bblock parameter, one byte per word */
static float *idf = NULL;	/* log(N) - log(fcnt) per word; only allocated when MakeWeights */

static char *MemoryBuffer = NULL;	/* In-memory chunk buffer, also reused for per-chunk counts */
static unsigned long MemBufSize;	/* Allocated size of MemoryBuffer in bytes */
static unsigned long BufToUse;		/* Bytes of MemoryBuffer used by the current chunk */
static struct invf_dict_header idh;	/* Header read from the stemmed dictionary */

static perf_hash_data *phd;	/* Perfect hash data read from the hash file */

/* Word numbers seen in the current document (used for document weights). */
static unsigned long *word_list = NULL;
static unsigned long wl_size = 0;	/* Capacity of word_list in entries */

static unsigned long dict_size;		/* idh.dict_size */
static unsigned long no_of_ptrs = 0;	/* Count of postings encoded by process_doc() */
static unsigned long chunks_read = 0;	/* Entries of chunk_data currently in use */
static unsigned long Disk_pos = 0;	/* Byte offset of the next write into the chunks file */
static unsigned long callnum = 0;	/* Number of process_doc() calls so far */
static unsigned long wordnum = 0;	/* Number of words processed so far */

static unsigned long totalIbytes = 0;	/* Upper bound on the inverted file size in bytes */
static unsigned long totalDbytes = 0;	/* Bytes allocated for pass two data structures */
static unsigned long totalHbytes = 0;	/* Estimated bytes used by the perfect hash */

static unsigned long MemInUse = 0;	/* Running total maintained by ChangeMemInUse() */
static unsigned long MaxMemInUse = 0;	/* High-water mark of MemInUse */
static unsigned long max_buffer_len;	/* First value read from the count file */
201
202void
203ChangeMemInUse (int mem)
204{
205 MemInUse += mem;
206 if (MemInUse > MaxMemInUse)
207 MaxMemInUse = MemInUse;
208}
209
210
211
212
213static int
214open_files (char *file_name)
215{
216 char FName[200];
217
218 if (!(dict = open_file (file_name, INVF_DICT_SUFFIX, "rb",
219 MAGIC_STEM_BUILD, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
220 return (COMPERROR);
221
222 if (!(hash = open_file (file_name, INVF_DICT_HASH_SUFFIX, "rb",
223 MAGIC_HASH, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
224 return (COMPERROR);
225
226 if (!(count = open_file (file_name, INVF_CHUNK_SUFFIX, "rb",
227 MAGIC_CHUNK, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
228 return (COMPERROR);
229 fread (&max_buffer_len, sizeof (max_buffer_len), 1, count);
230 NTOHUL(max_buffer_len); /* [RPAP - Jan 97: Endian Ordering] */
231
232 BIO_Stdio_Decode_Start (count, &sbs);
233 next_docs_left = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
234
235 if (!(count_trans = open_file (file_name, INVF_CHUNK_TRANS_SUFFIX, "rb",
236 MAGIC_CHUNK_TRANS, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
237 return (COMPERROR);
238
239 if (!(invf = create_file (file_name, INVF_SUFFIX, "w+b",
240 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
241 return (COMPERROR);
242 fflush (invf);
243 if (!(invf_in = open_file (file_name, INVF_SUFFIX, "rb",
244 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
245 return (COMPERROR);
246 if (!(invf_out = create_file (file_name, INVF_SUFFIX, "wb",
247 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
248 return (COMPERROR);
249 BIO_Random_Start (invf, RND_BUF_SIZE, &rbs);
250 BIO_Random_Start (invf, RND_BUF_SIZE, &rbsp);
251 ChangeMemInUse (RND_BUF_SIZE * 2);
252
253 if (!(invf_idx = create_file (file_name, INVF_IDX_SUFFIX, "wb",
254 MAGIC_INVI, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
255 return (COMPERROR);
256
257 if (InvfLevel == 3)
258 if (!(invf_para = create_file (file_name, INVF_PARAGRAPH_SUFFIX, "wb",
259 MAGIC_PARAGRAPH, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
260 return (COMPERROR);
261
262 sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
263 ".invf.state", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
264 if (!(invf_state = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
265 {
266 Message ("Unable to create \"%s\"", FName);
267 return (COMPERROR);
268 }
269 unlink (FName);
270
271 sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
272 ".chunk.state", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
273 if (!(chunk_state = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
274 {
275 Message ("Unable to create \"%s\"", FName);
276 return (COMPERROR);
277 }
278 unlink (FName);
279 BIO_Random_Start (chunk_state, RND_BUF_SIZE, &crbs);
280 ChangeMemInUse (RND_BUF_SIZE);
281
282 sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
283 ".chunks", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
284 if (!(chunks = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
285 {
286 Message ("Unable to create \"%s\"", FName);
287 return (COMPERROR);
288 }
289 unlink (FName);
290
291 return (COMPALLOK);
292}
293
294
295
296
297
298#define ISR_CACHE 1024
299#define ISR_ENTRY_SIZE (sizeof(mg_ullong)*2 + sizeof(unsigned long))
300
301invf_state_rec *
302in_cache (int pos)
303{
304 static char isr_data[ISR_CACHE * ISR_ENTRY_SIZE];
305 static invf_state_rec isr;
306 static int isr_base = 0, isr_num = -1, isr_pos = -1;
307 if (isr_pos >= 0)
308 bcopy ((char *) &isr, &isr_data[isr_pos * ISR_ENTRY_SIZE], ISR_ENTRY_SIZE);
309 if (pos < isr_base || pos >= isr_base + isr_num)
310 {
311 if (isr_num >= 0)
312 {
313 fseek (invf_state, isr_base * ISR_ENTRY_SIZE, 0);
314 fwrite (isr_data, 1, ISR_ENTRY_SIZE * isr_num, invf_state);
315 }
316 isr_base = pos;
317 fseek (invf_state, isr_base * ISR_ENTRY_SIZE, 0);
318 fread (isr_data, 1, ISR_ENTRY_SIZE * ISR_CACHE, invf_state);
319 isr_num = ISR_CACHE;
320 }
321 isr_pos = pos - isr_base;
322 bcopy (&isr_data[isr_pos * ISR_ENTRY_SIZE], (char *) &isr, ISR_ENTRY_SIZE);
323 return &isr;
324}
325
326
327
328
329
330unsigned long
331occur_to_lexical (long occ)
332{
333 static long pos = -1;
334 static random_bitio_state rbs;
335 static int val = 0;
336 if (pos == -1)
337 {
338 BIO_Random_Start (count_trans, RND_BUF_SIZE, &rbs);
339 pos = 0x7fffffff;
340 }
341 if (occ < pos)
342 {
343 if (occ == -1)
344 {
345 BIO_Random_Done (&rbs);
346 return 0;
347 }
348 BIO_Random_Seek_X (sizeof (unsigned long) * 8, &rbs);
349 pos = 0;
350 }
351 while (pos <= occ)
352 {
353 val = BIO_Random_Binary_Decode (dict_size + 1, &rbs, NULL) - 1;
354 pos++;
355 }
356 return (val);
357}
358
359
360void
361add_chunk_state (unsigned long pos, unsigned long start_doc,
362 unsigned long N)
363{
364 chunk_data[chunks_read].params_pos = pos;
365 chunk_data[chunks_read].start_doc = start_doc;
366 chunk_data[chunks_read].N = N;
367 chunks_read++;
368}
369
370
371int
372init_ivf_2 (char *file_name)
373{
374 u_char prev[MAXSTEMLEN + 1];
375 int i;
376 mg_ullong totalIbits;
377 mg_ullong lasttotalIbits;
378 double logN = 0.0;
379
380
381 if (open_files (file_name) == COMPERROR)
382 return COMPERROR;
383
384
385 /* Read in the stemmed dictionary file header */
386 fread ((char *) &idh, sizeof (idh), 1, dict);
387
388 /* [RPAP - Jan 97: Endian Ordering] */
389 NTOHUL(idh.lookback);
390 NTOHUL(idh.dict_size);
391 NTOHUL(idh.total_bytes);
392 NTOHUL(idh.index_string_bytes);
393 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
394 NTOHUL(idh.num_of_docs);
395 NTOHUL(idh.static_num_of_docs);
396 NTOHUL(idh.num_of_words);
397 NTOHUL(idh.stemmer_num);
398 NTOHUL(idh.stem_method);
399
400 dict_size = idh.dict_size;
401
402 N = idh.num_of_docs;
403
404 if (!(phd = read_perf_hash_data (hash)))
405 {
406 Message ("Unable to read in hash data");
407 return COMPERROR;
408 }
409 totalHbytes = sizeof (perf_hash_data) + sizeof (u_char) * 256 +
410 sizeof (int) * (phd->MAX_N + 1) + sizeof (int *) * 3 * phd->MAX_CH +
411 sizeof (long) * phd->MAX_CH * phd->MAX_L;
412
413 if (!(WordRecs = Xmalloc (sizeof (word_rec) * idh.dict_size)))
414 {
415 Message ("No memory for word entries");
416 return COMPERROR;
417 }
418 totalDbytes += sizeof (word_rec) * idh.dict_size;
419
420 /* separate storage for the log(b) values, one byte each */
421 if (!(lg_bs = Xmalloc (sizeof (u_char) * idh.dict_size)))
422 {
423 Message ("No memory for lg b's");
424 return COMPERROR;
425 }
426 totalDbytes += sizeof (u_char) * idh.dict_size;
427
428 if (MakeWeights)
429 {
430 /* separate storage for the idf values, one single each */
431 if (!(idf = Xmalloc (sizeof (float) * idh.dict_size)))
432 {
433 Message ("No memory for idf's");
434 return COMPERROR;
435 }
436 totalDbytes += sizeof (float) * idh.dict_size;
437
438 if (!(weights = create_file (file_name, WEIGHTS_SUFFIX, "wb",
439 MAGIC_WGHT, MG_CONTINUE))) { /* [RPAP - Feb 97: WIN32 Port] */
440 Message ("Couldn't open weights file for writing");
441 return (COMPERROR);
442 }
443 }
444 else
445 {
446 unlink (make_name (file_name, WEIGHTS_SUFFIX, NULL));
447 }
448
449 chunk_data = Xmalloc (sizeof (chunk) * (ChunkLimit + 2));
450 totalDbytes += sizeof (chunk) * (ChunkLimit + 2);
451
452 totalIbits = sizeof (unsigned long) * 8; /* The magic number */
453 totalIbits += 8 * 200; /* A 200 byte gap */
454
455 if (MakeWeights)
456 {
457 wl_size = 1024;
458 if (!(word_list = Xmalloc (sizeof (*word_list) * wl_size)))
459 {
460 Message ("No memory for word_list");
461 return COMPERROR;
462 }
463
464 logN = log ((double) N);
465 }
466
467 for (i = 0; i < idh.dict_size; i++)
468 {
469 invf_state_rec *isr;
470 register unsigned long copy, suff, p;
471 unsigned long fcnt, wcnt;
472
473 lasttotalIbits = totalIbits;
474
475 copy = fgetc (dict);
476 suff = fgetc (dict);
477 *prev = copy + suff;
478 fread (prev + copy + 1, sizeof (u_char), suff, dict);
479
480 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
481 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
482
483 /* [RPAP - Jan 97: Endian Ordering] */
484 NTOHUL(fcnt);
485 NTOHUL(wcnt);
486
487 WordRecs[i].last = 0;
488 WordRecs[i].ptr = 0;
489
490 p = fcnt;
491
492 if (MakeWeights)
493 idf[i] = logN - log ((double) fcnt);
494
495
496 isr = in_cache (i);
497
498 isr->Disk_Last = 0;
499 isr->Disk_Ptr = totalIbits;
500
501 isr->Disk_B = BIO_Bblock_Init (N, p);
502
503 totalIbits += BIO_Bblock_Bound_b (N, p, isr->Disk_B);
504
505 if (InvfLevel >= 2)
506 totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
507
508#ifdef USE_LONG_LONG
509 totalIbits = (totalIbits + 7ull) & 0xfffffffffffffff8ull;
510#else
511 totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
512#endif
513
514 if (totalIbits < lasttotalIbits) {
515 fprintf(stderr, "ERROR: The totalIbits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
516 if (sizeof (mg_ullong) < 8) {
517 fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
518 }
519 fprintf(stderr, " Build aborted.\n");
520 exit(1);
521 }
522 }
523
524
525 /* now convert to bytes, and actually get the space */
526#ifdef USE_LONG_LONG
527 totalIbytes = (totalIbits + 7ull) >> 3ull;
528#else
529 totalIbytes = (totalIbits + 7ul) >> 3ul;
530#endif
531
532 return (COMPALLOK);
533
534}
535
536
537
538
539
540static void
541LoadCounts (void)
542{
543 unsigned long numwords, i, last_total;
544 static unsigned long local_N = 0;
545 unsigned long totalIbits, crbs_pos;
546 word_rec *wr;
547 unsigned long *counts;
548
549 if (MemoryBuffer == NULL)
550 {
551 MemBufSize = sizeof (unsigned long) * dict_size;
552 if (max_buffer_len > MemBufSize)
553 MemBufSize = max_buffer_len;
554 if (!(MemoryBuffer = Xmalloc (MemBufSize)))
555 FatalError (1, "Unable to allocate memory for buffer");
556 ChangeMemInUse (MemBufSize);
557 }
558
559 counts = (unsigned long *) MemoryBuffer;
560 bzero ((char *) counts, sizeof (unsigned long) * dict_size);
561
562 docs_left = next_docs_left;
563 if (!docs_left)
564 FatalError (1, "The number of docs in the current chunk is 0");
565
566 BufToUse = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
567
568 numwords = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
569
570 local_N = docs_left;
571
572
573
574 for (wr = WordRecs, i = 0; i < dict_size; i++, wr++)
575 wr->ptr = 0;
576
577 bzero ((char *) lg_bs, dict_size);
578
579 for (i = 0; i < numwords; i++)
580 {
581 unsigned long word_num, wcnt, fcnt, p;
582 word_num = occur_to_lexical (i);
583
584 wr = &WordRecs[word_num];
585
586 wcnt = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
587 if (wcnt >= 2)
588 fcnt = BIO_Stdio_Gamma_Decode (&sbs, NULL);
589 else
590 fcnt = wcnt;
591
592 p = fcnt;
593
594 if (wcnt)
595 {
596 register unsigned long length;
597 counts[word_num] = p;
598 length = BIO_Bblock_Bound (local_N, p);
599 if (InvfLevel >= 2)
600 length += wcnt;
601 wr->ptr = length;
602 lg_bs[word_num] = floorlog_2 (BIO_Bblock_Init_W (local_N, p));
603 }
604
605 }
606
607 crbs_pos = BIO_Random_Tell (&crbs);
608
609 totalIbits = 0;
610 last_total = 0;
611 for (wr = WordRecs, i = 0; i < dict_size; i++, wr++)
612 {
613 register unsigned long length;
614 length = wr->ptr;
615 wr->last = callnum;
616 BIO_Random_Gamma_Encode (counts[i] + 1, &crbs, NULL);
617 if (counts[i])
618 {
619 if (i)
620 BIO_Random_Delta_Encode (totalIbits - last_total + 1, &crbs, NULL);
621 else
622 BIO_Random_Delta_Encode (1, &crbs, NULL);
623
624 last_total = totalIbits;
625 }
626 wr->ptr = totalIbits;
627 totalIbits += length;
628 }
629 add_chunk_state (crbs_pos, callnum, local_N);
630
631 if ((totalIbits + 7ul) >> 3ul > BufToUse)
632 FatalError (1, "Pointers exceed buffer size");
633
634 next_docs_left = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
635}
636
637
638
639
640static void
641DumpChunk (void)
642{
643 chunk_data[chunks_read - 1].disk_pos = Disk_pos << 3;
644 fseek (chunks, Disk_pos, 0);
645 fwrite (MemoryBuffer, sizeof (char), BufToUse, chunks);
646 Disk_pos += BufToUse;
647}
648
649
650
651
652static void
653DiskMerge (void)
654{
655 random_bitio_state *rbsi;
656 random_bitio_state *chks = NULL;
657 unsigned long *chunk_ptrs;
658 int i;
659
660 BIO_Random_Flush (&crbs);
661
662 chunk_ptrs = Xmalloc (chunks_read * sizeof (unsigned long));
663 ChangeMemInUse (chunks_read * sizeof (unsigned long));
664 bzero ((char *) chunk_ptrs, chunks_read * sizeof (unsigned long));
665
666 rbsi = Xmalloc (chunks_read * sizeof (random_bitio_state));
667 ChangeMemInUse (chunks_read * sizeof (random_bitio_state));
668 for (i = 0; i < chunks_read; i++)
669 {
670 rbsi[i] = crbs;
671 rbsi[i].Buf = Xmalloc (rbsi[i].len);
672 ChangeMemInUse (rbsi[i].len);
673 bcopy ((char *) (crbs.Buf), (char *) (rbsi[i].Buf), rbsi[i].len);
674 BIO_Random_Seek (chunk_data[i].params_pos, &rbsi[i]);
675 }
676
677 if (chunks_read > 1)
678 {
679 int j;
680 chks = Xmalloc ((chunks_read - 1) * sizeof (random_bitio_state));
681 ChangeMemInUse ((chunks_read - 1) * sizeof (random_bitio_state));
682 BIO_Random_Start (chunks, RND_BUF_SIZE, &chks[0]);
683 ChangeMemInUse (RND_BUF_SIZE);
684 for (j = 1; j < chunks_read - 1; j++)
685 {
686 chks[j] = chks[0];
687 chks[j].Buf = Xmalloc (chks[0].len);
688 ChangeMemInUse (chks[0].len);
689 bcopy ((char *) (chks[0].Buf), (char *) (chks[j].Buf), chks[0].len);
690 }
691 }
692 for (i = 0; i < dict_size; i++)
693 {
694 int j;
695 invf_state_rec *isr = in_cache (i);
696 register int B;
697
698 BIO_Random_Seek_X (isr->Disk_Ptr, &rbs); /* Position in invf file */
699
700 B = isr->Disk_B;
701
702 for (j = 0; j < chunks_read; j++)
703 {
704 int p;
705 p = BIO_Random_Gamma_Decode (&rbsi[j], NULL) - 1;
706
707 if (p)
708 {
709 int ptr, b;
710 chunk_ptrs[j] += BIO_Random_Delta_Decode (&rbsi[j], NULL) - 1;
711 ptr = chunk_ptrs[j];
712 b = 1 << floorlog_2 (BIO_Bblock_Init_W (chunk_data[j].N, p));
713
714 if (j == chunks_read - 1)
715 {
716 int k, CurrDoc;
717 DECODE_START ((u_char *) MemoryBuffer, ptr)
718 CurrDoc = isr->Disk_Last;
719 for (k = 0; k < p; k++)
720 {
721 register unsigned long x, tf;
722 BBLOCK_DECODE (x, b);
723 if (k == 0)
724 x = x + chunk_data[j].start_doc - isr->Disk_Last;
725 CurrDoc += x;
726 BIO_Random_Bblock_Encode (x, B, &rbs, NULL);
727 if (InvfLevel >= 2)
728 {
729 UNARY_DECODE (tf);
730 BIO_Random_Gamma_Encode (tf, &rbs, NULL);
731 }
732 }
733 DECODE_DONE
734 isr->Disk_Last = CurrDoc;
735 }
736 else
737 {
738 int k, CurrDoc;
739 random_bitio_state *Chks = chks + j;
740 BIO_Random_Seek (chunk_data[j].disk_pos + ptr, Chks);
741 CurrDoc = isr->Disk_Last;
742 for (k = 0; k < p; k++)
743 {
744 register unsigned long x, tf;
745 x = BIO_Random_Bblock_Decode (b, Chks, NULL);
746 if (k == 0)
747 x = x + chunk_data[j].start_doc - isr->Disk_Last;
748 CurrDoc += x;
749 BIO_Random_Bblock_Encode (x, B, &rbs, NULL);
750 if (InvfLevel >= 2)
751 {
752 tf = BIO_Random_Unary_Decode (Chks, NULL);
753 BIO_Random_Gamma_Encode (tf, &rbs, NULL);
754 }
755 }
756 isr->Disk_Last = CurrDoc;
757 }
758 }
759 }
760
761 isr->Disk_Ptr = BIO_Random_Tell_X (&rbs);
762
763 }
764 if (chunks_read > 1)
765 {
766 int j;
767 for (j = 0; j < chunks_read - 1; j++)
768 {
769 Xfree (chks[j].Buf);
770 ChangeMemInUse (-chks[j].len);
771 }
772 Xfree (chks);
773 ChangeMemInUse (-(chunks_read - 1) * sizeof (random_bitio_state));
774 }
775
776 for (i = 0; i < chunks_read; i++)
777 {
778 Xfree (rbsi[i].Buf);
779 ChangeMemInUse (-rbsi[i].len);
780 }
781 Xfree (rbsi);
782 ChangeMemInUse (-chunks_read * sizeof (random_bitio_state));
783 chunks_read = 0;
784 Xfree (chunk_ptrs);
785 ChangeMemInUse (-chunks_read * sizeof (unsigned long));
786 Disk_pos = 0;
787 BIO_Random_Seek (0, &crbs);
788}
789
790static void
791MergeIn (void)
792{
793 static int disk_chunks = 0;
794 static header = 0;
795 if (!header)
796 {
797 fprintf (stderr, "ivf.pass2 : ");
798 header = 1;
799 }
800 if (disk_chunks == ChunkLimit || next_docs_left == 0)
801 {
802 fprintf (stderr, "M");
803 DiskMerge ();
804 disk_chunks = 0;
805 }
806 else
807 {
808 fprintf (stderr, "-");
809 DumpChunk ();
810 disk_chunks++;
811 }
812 if (next_docs_left == 0)
813 fprintf (stderr, "\n");
814}
815
816
/*
 * wl_comp -- qsort() comparison function for word_list, whose elements
 * are unsigned long word numbers.
 *
 * The elements are compared at their real type.  Reading them through an
 * int pointer only looks at part of each element where long is wider than
 * int, and subtracting the values can overflow.
 *
 * Returns a negative, zero or positive value when *a is less than, equal
 * to or greater than *b.
 */
static int
wl_comp (const void *a, const void *b)
{
  unsigned long x = *(const unsigned long *) a;
  unsigned long y = *(const unsigned long *) b;

  return (x > y) - (x < y);
}
822
823static int
824process_doc (u_char * s_in, int l_in)
825{
826 int res;
827 u_char *end = s_in + l_in - 1;
828 unsigned long tocode;
829 unsigned long wl_pos = 0;
830
831 if (!docs_left)
832 LoadCounts ();
833
834 callnum++;
835
836 if (!inaword (s_in, end))
837 if (SkipSGML)
838 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
839 else
840 PARSE_NON_STEM_WORD (s_in, end);
841
842 while (s_in <= end)
843 {
844 u_char Word[MAXSTEMLEN + 1];
845
846 PARSE_STEM_WORD (Word, s_in, end);
847 stemmer (idh.stem_method, idh.stemmer_num, Word);
848 if (SkipSGML)
849 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
850 else
851 PARSE_NON_STEM_WORD (s_in, end);
852
853 if (*Word == 0)
854 continue;
855
856 res = perf_hash (phd, Word);
857
858 {
859 word_rec *arr = &WordRecs[res];
860 int b = 1 << lg_bs[res];
861 wordnum++;
862
863 tocode = callnum;
864
865 ENCODE_START ((u_char *) MemoryBuffer, arr->ptr)
866
867 if (tocode > arr->last)
868 {
869 register int x;
870 x = tocode - arr->last - 1;
871 BBLOCK_ENCODE (x + 1, b);
872 if (InvfLevel >= 2)
873 ENCODE_BIT (1);
874 no_of_ptrs++;
875 arr->last = tocode;
876 }
877 else if (InvfLevel >= 2)
878 {
879 __pos--;
880 ENCODE_BIT (0);
881 ENCODE_BIT (1);
882 }
883 arr->ptr = __pos;
884 ENCODE_DONE
885 }
886
887 if (MakeWeights)
888 {
889 if (wl_pos >= wl_size)
890 {
891 wl_size += (wl_size >> 1);
892 word_list = Xrealloc (word_list, sizeof (*word_list) * wl_size);
893 }
894 word_list[wl_pos++] = res;
895 }
896 }
897 if (MakeWeights)
898 {
899 float doc_weight = 0.0;
900 if (wl_pos)
901 {
902 unsigned long *wl = word_list;
903 unsigned long i, count, val;
904 qsort (wl, wl_pos, sizeof (*wl), wl_comp);
905 count = 1;
906 val = *wl++;
907 for (i = 1; i <= wl_pos; i++, wl++)
908 if (i == wl_pos || val != *wl)
909 {
910 double weight = count * idf[val];
911 doc_weight += weight * weight;
912 count = 1;
913 val = *wl;
914 }
915 else
916 count++;
917 }
918 HTONF(doc_weight); /* [RPAP - Jan 97: Endian Ordering] */
919 fwrite ((char *) &doc_weight, sizeof (doc_weight), 1, weights);
920 }
921 docs_left--;
922 if (!docs_left)
923 MergeIn ();
924
925 return COMPALLOK;
926}
927
928int
929process_ivf_2 (u_char * s_in, int l_in)
930{
931 if (InvfLevel <= 2)
932 return process_doc (s_in, l_in);
933 else
934 {
935 int count = 0;
936 int pos = 0;
937 u_char *start = s_in;
938 while (pos < l_in)
939 {
940 if (s_in[pos] == TERMPARAGRAPH)
941 {
942 int len = pos + s_in + 1 - start;
943 if (process_doc (start, len) != COMPALLOK)
944 return (COMPERROR);
945 start = s_in + pos + 1;
946 count++;
947 }
948 pos++;
949 }
950 if (start < s_in + pos)
951 {
952 if (process_doc (start, pos + s_in - start) != COMPALLOK)
953 return (COMPERROR);
954 count++;
955 }
956 HTONSI(count); /* [RPAP - Jan 97: Endian Ordering] */
957 fwrite ((char *) &count, sizeof (count), 1, invf_para);
958 }
959 return COMPALLOK;
960}
961
962
963
964
965
966static void
967stats (unsigned long len)
968{
969#ifndef SILENT
970 fseek (count, 0, 2);
971 fseek (count_trans, 0, 2);
972 fseek (invf_state, 0, 2);
973 fseek (invf, 0, 0);
974 fseek (invf, 0, 2);
975 fseek (chunks, 0, 2);
976 fseek (chunk_state, 0, 2);
977 Message ("File sizes\n");
978 Message (" Chunk desc : %10u bytes\n", ftell (count));
979 Message (" Chunk trans : %10u bytes\n", ftell (count_trans));
980 Message (" Chunks : %10u bytes\n", ftell (chunks));
981 Message (" Chunk state : %10u bytes\n", ftell (chunk_state));
982 Message (" Invf state : %10u bytes\n", ftell (invf_state));
983 Message (" Peak invf : %10u bytes\n", len);
984 Message (" Final invf : %10u bytes\n", ftell (invf));
985 Message ("Peak disk usage : %10.2f %%\n",
986 (double) (ftell (count) + ftell (count_trans) +
987 ftell (invf_state) + ftell (chunks) +
988 ftell (chunk_state) + len) / ftell (invf) * 100.0);
989#endif
990}
991
992
993/* ARGSUSED */
994int
995done_ivf_2 (char *FileName)
996{
997 long i;
998 mg_ullong totalIbits;
999 unsigned long invf_len;
1000 unsigned long bytes_output;
1001 struct invf_file_header ifh;
1002
1003 if (weights)
1004 fclose (weights);
1005 if (invf_para)
1006 fclose (invf_para);
1007
1008 free_perf_hash (phd);
1009
1010 free (MemoryBuffer);
1011 ChangeMemInUse (-MemBufSize);
1012
1013 BIO_Random_Done (&rbs);
1014 BIO_Random_Done (&rbsp);
1015 fflush (invf);
1016
1017 fseek (invf, 0, 2);
1018 invf_len = ftell (invf);
1019
1020 fseek (invf_out, sizeof (long), 0);
1021 /* [RPAP - Jan 97: Endian Ordering] */
1022 HTONUL2(dict_size, ifh.no_of_words);
1023 HTONUL2(no_of_ptrs, ifh.no_of_ptrs);
1024 ifh.skip_mode = 0;
1025 bzero ((char *) ifh.params, sizeof (ifh.params));
1026 HTONUL2(InvfLevel, ifh.InvfLevel);
1027 fwrite ((char *) &ifh, sizeof (ifh), 1, invf_out);
1028
1029 bytes_output = ftell (invf_out);
1030
1031 totalIbits = sizeof (unsigned long) * 8; /* The magic number */
1032 totalIbits += 8 * 200; /* A 200 byte gap */
1033
1034 /* find the right place in the file to start reading p values */
1035 fseek (dict, sizeof (unsigned long) + sizeof (struct invf_dict_header), 0);
1036 for (i = 0; i < dict_size; i++)
1037 {
1038 invf_state_rec *isr;
1039 unsigned long fcnt, wcnt, s, e;
1040 register unsigned long p;
1041 u_char dummy1, dummy2[MAXSTEMLEN + 1];
1042
1043 /* output location to the invf_idx */
1044 HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1045 fwrite ((char *) &bytes_output, sizeof (bytes_output), 1, invf_idx);
1046 NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1047
1048 /* read an entry for a word, just to get p value */
1049 dummy1 = fgetc (dict);
1050 dummy1 = fgetc (dict);
1051 fread (dummy2, sizeof (u_char), dummy1, dict);
1052 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
1053 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
1054
1055 /* [RPAP - Jan 97: Endian Ordering] */
1056 NTOHUL(fcnt);
1057 NTOHUL(wcnt);
1058
1059 p = fcnt;
1060
1061 isr = in_cache (i);
1062
1063 e = (isr->Disk_Ptr + 7ul) >> 3ul;
1064 s = totalIbits >> 3;
1065
1066 fseek (invf_in, s, 0);
1067 while (s < e)
1068 {
1069 u_char c = getc (invf_in);
1070 if (s == e - 1)
1071 {
1072 u_char ands[8] =
1073 {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1074 c &= ands[isr->Disk_Ptr & 7ul];
1075 }
1076 putc (c, invf_out);
1077 bytes_output++;
1078 s++;
1079 }
1080
1081 totalIbits += BIO_Bblock_Bound_b (N, p, isr->Disk_B);
1082 if (InvfLevel >= 2)
1083 totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
1084#ifdef USE_LONG_LONG
1085 totalIbits = (totalIbits + 7ull) & 0xfffffffffffffff8ull;
1086#else
1087 totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
1088#endif
1089
1090 }
1091
1092 fclose (invf_in);
1093
1094 /* [RPAP - Feb 97: WIN32 Port] */
1095#ifdef __WIN32__
1096 if (!(_chsize (_fileno (invf_out), bytes_output)))
1097 Message ("Could not truncate invf.");
1098#else
1099 ftruncate (fileno (invf_out), bytes_output);
1100#endif
1101
1102 fclose (invf_out);
1103
1104 HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1105 fwrite ((char *) &bytes_output, sizeof (bytes_output), 1, invf_idx);
1106 NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1107
1108 fclose (invf_idx);
1109
1110#ifndef SILENT
1111 {
1112 char *temp_str = msg_prefix;
1113 unsigned long total;
1114 msg_prefix = "ivf.pass2";
1115 stats (invf_len);
1116 Message ("Pass two data structures : %6.3f Mbyte\n",
1117 (double) totalDbytes / 1024 / 1024);
1118 total = totalDbytes;
1119 Message ("Pass two hash structure(s) : %6.3f Mbyte\n",
1120 (double) totalHbytes / 1024 / 1024);
1121 total += totalHbytes;
1122 Message ("Peak extra memory in use : %6.3f Mbyte\n",
1123 (double) MaxMemInUse / 1024 / 1024);
1124 total += MaxMemInUse;
1125 Message ("Peak total memory in use : %6.3f Mbyte\n",
1126 (double) total / 1024 / 1024);
1127 msg_prefix = temp_str;
1128 }
1129#endif
1130
1131 Xfree (WordRecs);
1132 Xfree (lg_bs);
1133
1134 /* Free the memory allocated for the BIO_Random */
1135 occur_to_lexical (-1);
1136
1137 BIO_Random_Done (&crbs);
1138
1139 fclose (invf);
1140 fclose (dict);
1141 fclose (hash);
1142 fclose (count);
1143 fclose (count_trans);
1144 fclose (chunk_state);
1145 fclose (chunks);
1146 fclose (invf_state);
1147 return (COMPALLOK);
1148}
Note: See TracBrowser for help on using the repository browser.