source: gsdl/trunk/trunk/mg/src/text/ivf.pass2.c@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change committed in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 29.5 KB
Line 
/**************************************************************************
 *
 * ivf.pass2.c -- Memory efficient pass 2 inversion
 * Copyright (C) 1994 Neil Sharman
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: ivf.pass2.c 16583 2008-07-29 10:20:36Z davidb $
 *
 **************************************************************************/

/*
 $Log$
 Revision 1.3 2004/06/10 05:07:43 kjdon
 have to declare vars before calling functions!!

 Revision 1.2 2004/06/10 03:02:05 kjdon
 fixed the bug that was causing it not to be able to create a second index using jni - basically had to reset all the static variables at the start of each pass. the tricky thing to find was the static variables in occur_to_lexical in ivf.pass2

 Revision 1.1 2003/02/20 21:18:23 mdewsnip
 Addition of MG package for search and retrieval

 Revision 1.2 2001/09/21 12:46:42 kjm18
 updated mg to be in line with mg_1.3f. Now uses long long for some variables
 to enable indexing of very large collections.

 * Revision 1.2 1997/08/02 05:01:57 wew
 * changed literal values of 32 for the bit size of magic numbers of
 * files to sizeof (unsigned long) * 8, increased the gap at the start
 * of the invf during processing to 200 bytes

 Revision 1.1 1999/08/10 21:17:54 sjboddie
 renamed mg-1.3d directory mg

 Revision 1.3 1998/12/17 09:12:51 rjmcnab

 Altered mg to process utf-8 encoded Unicode. The main changes
 are in the parsing of the input, the casefolding, and the stemming.

 Revision 1.2 1998/11/25 07:55:43 rjmcnab

 Modified mg so that you can specify the stemmer you want
 to use via a command line option. You specify it to
 mg_passes during the build process. The number of the
 stemmer that you used is stored within the inverted
 dictionary header and the stemmed dictionary header so
 the correct stemmer is used in later stages of building
 and querying.

 Revision 1.1 1998/11/17 09:34:45 rjmcnab
 *** empty log message ***

 * Revision 1.3 1994/10/20 03:56:49 tes
 * I have rewritten the boolean query optimiser and abstracted out the
 * components of the boolean query.
 *
 * Revision 1.2 1994/09/20 04:41:35 tes
 * For version 1.1
 *
 */

/*
 * Modified:
 * - long long disk pointers and bit counts for inverted file
 * (1999-08-03 Tim Bell <[email protected]>)
 * Code provided by Owen de Kretser <[email protected]>
 */

/* Keyword-expanded revision identification string for this file.  It points
   at a string literal, so the pointee is const-qualified. */
static const char *RCSID = "$Id: ivf.pass2.c 16583 2008-07-29 10:20:36Z davidb $";
82
83#include "local_strings.h"
84#include "sysfuncs.h"
85#include "memlib.h"
86#include "messages.h"
87#include "stemmer.h"
88#include "perf_hash.h"
89#include "bitio_m.h"
90#include "bitio_m_mems.h"
91#include "bitio_gen.h"
92#include "bitio_random.h"
93#include "bitio_stdio.h"
94#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
95
96#include "mg_files.h"
97#include "invf.h"
98#include "locallib.h"
99#include "mg.h"
100#include "build.h"
101#include "words.h"
102#include "hash.h"
103
104#include "longlong.h"
105
/*
 * Pick the seek/tell primitives used on the inverted-file bit stream:
 * the _LL variants when USE_LONG_LONG is defined, the plain ones
 * otherwise.
 */
#if defined (USE_LONG_LONG)
#  define BIO_Random_Seek_X BIO_Random_Seek_LL
#  define BIO_Random_Tell_X BIO_Random_Tell_LL
#else
#  define BIO_Random_Seek_X BIO_Random_Seek
#  define BIO_Random_Tell_X BIO_Random_Tell
#endif
113
114/* [RPAP - Feb 97: WIN32 Port] */
115#ifdef __WIN32__
116#include <io.h>
117#endif
118
/*
 * Buffer size handed to BIO_Random_Start() and charged to the memory
 * accounting.  The expansion is parenthesised so the macro can be used
 * safely inside larger expressions (e.g. `x % RND_BUF_SIZE').
 * (A value of 128 was previously kept here as a commented-out alternative.)
 */
#ifndef RND_BUF_SIZE
#define RND_BUF_SIZE (8 * 1024)
#endif
123
/*
 * print_fsize(file) -- report the current size of the open stdio stream
 * `file' through Message().  The size is obtained with fstat() on the
 * stream's descriptor; nothing is printed if fstat() fails.
 *
 * The stream passed in is the one measured (the macro used to ignore its
 * argument and always stat invf_out); the message label is unchanged.
 */
#define print_fsize(file)\
do\
  {\
    struct stat file_state;\
    if (fstat (fileno (file), &file_state) == 0)\
      Message ("len(invf) = %ld", (long) file_state.st_size);\
  }while(0)
131
/* Per-word state while postings are encoded into the in-memory buffer. */
typedef struct word_rec {
  unsigned long ptr;		/* position in MemoryBuffer handed to
				   ENCODE_START for this word's next posting */
  unsigned long last;		/* document number (callnum) most recently
				   recorded for this word */
} word_rec;
138
139typedef struct invf_state_rec
140 {
141 mg_ullong Disk_Ptr;
142 mg_ullong Disk_Last;
143 unsigned long Disk_B;
144 }
145invf_state_rec;
146
/* Description of one chunk of documents awaiting the disk merge. */
typedef struct chunk {
  unsigned long start_doc;	/* value of callnum when the chunk was loaded */
  unsigned long params_pos;	/* position of the chunk's parameters in the
				   chunk_state bit stream */
  unsigned long disk_pos;	/* Disk_pos << 3 at the time the chunk was
				   dumped to the chunks file */
  unsigned long N;		/* number of documents in the chunk */
} chunk;
155
156
157static FILE *dict; /* Stemmed dictionary file */
158static FILE *hash; /* Stemmed dictionary hash file */
159static FILE *invf; /* Inverted file */
160static FILE *invf_in; /* Inverted file */
161static FILE *invf_out; /* Inverted file */
162static FILE *invf_idx; /* Inverted index file */
163static FILE *count; /* Count file */
164static FILE *count_trans; /* Count translation file */
165static FILE *invf_state; /* Inverted file State */
166static FILE *chunk_state; /* Chunk state */
167static FILE *chunks; /* Chunk state */
168static FILE *invf_para = NULL; /* Paragraph counts file */
169static FILE *weights = NULL; /* Weights file */
170
171static stdio_bitio_state sbs;
172static random_bitio_state crbs;
173static chunk *chunk_data = NULL;
174static random_bitio_state rbs, rbsp;
175
176static int docs_left = 0, next_docs_left = 0;
177static unsigned long N;
178
179static word_rec *WordRecs;
180static u_char *lg_bs;
181static float *idf = NULL;
182
183static char *MemoryBuffer = NULL;
184static unsigned long MemBufSize;
185static unsigned long BufToUse;
186static struct invf_dict_header idh;
187
188static perf_hash_data *phd;
189
190static unsigned long *word_list = NULL;
191static unsigned long wl_size = 0;
192
193static unsigned long dict_size;
194static unsigned long no_of_ptrs = 0;
195static unsigned long chunks_read = 0;
196static unsigned long Disk_pos = 0;
197static unsigned long callnum = 0;
198static unsigned long wordnum = 0;
199
200static unsigned long totalIbytes = 0;
201static unsigned long totalDbytes = 0;
202static unsigned long totalHbytes = 0;
203
204static unsigned long MemInUse = 0;
205static unsigned long MaxMemInUse = 0;
206static unsigned long max_buffer_len;
207
208void
209ChangeMemInUse (int mem)
210{
211 MemInUse += mem;
212 if (MemInUse > MaxMemInUse)
213 MaxMemInUse = MemInUse;
214}
215
216void
217ResetStaticI2Vars()
218{
219 docs_left = 0;
220 next_docs_left = 0;
221 N = 0;
222 MemBufSize=0;
223 BufToUse=0;
224 memset(&idh, 0, sizeof(idh));
225 wl_size = 0;
226
227 dict_size = 0;
228 no_of_ptrs = 0;
229 chunks_read = 0;
230 Disk_pos = 0;
231 callnum = 0;
232 wordnum = 0;
233
234 totalIbytes = 0;
235 totalDbytes = 0;
236 totalHbytes = 0;
237
238 MemInUse = 0;
239 MaxMemInUse = 0;
240 max_buffer_len = 0;
241
242}
243
244
245static int
246open_files (char *file_name)
247{
248 char FName[200];
249
250 if (!(dict = open_file (file_name, INVF_DICT_SUFFIX, "rb",
251 MAGIC_STEM_BUILD, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
252 return (COMPERROR);
253
254 if (!(hash = open_file (file_name, INVF_DICT_HASH_SUFFIX, "rb",
255 MAGIC_HASH, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
256 return (COMPERROR);
257
258 if (!(count = open_file (file_name, INVF_CHUNK_SUFFIX, "rb",
259 MAGIC_CHUNK, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
260 return (COMPERROR);
261 fread (&max_buffer_len, sizeof (max_buffer_len), 1, count);
262 NTOHUL(max_buffer_len); /* [RPAP - Jan 97: Endian Ordering] */
263
264 BIO_Stdio_Decode_Start (count, &sbs);
265 next_docs_left = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
266
267 if (!(count_trans = open_file (file_name, INVF_CHUNK_TRANS_SUFFIX, "rb",
268 MAGIC_CHUNK_TRANS, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
269 return (COMPERROR);
270
271 if (!(invf = create_file (file_name, INVF_SUFFIX, "w+b",
272 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
273 return (COMPERROR);
274 fflush (invf);
275 if (!(invf_in = open_file (file_name, INVF_SUFFIX, "rb",
276 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
277 return (COMPERROR);
278 if (!(invf_out = create_file (file_name, INVF_SUFFIX, "wb",
279 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
280 return (COMPERROR);
281 BIO_Random_Start (invf, RND_BUF_SIZE, &rbs);
282 BIO_Random_Start (invf, RND_BUF_SIZE, &rbsp);
283 ChangeMemInUse (RND_BUF_SIZE * 2);
284
285 if (!(invf_idx = create_file (file_name, INVF_IDX_SUFFIX, "wb",
286 MAGIC_INVI, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
287 return (COMPERROR);
288
289 if (InvfLevel == 3)
290 if (!(invf_para = create_file (file_name, INVF_PARAGRAPH_SUFFIX, "wb",
291 MAGIC_PARAGRAPH, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
292 return (COMPERROR);
293
294 sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
295 ".invf.state", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
296 if (!(invf_state = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
297 {
298 Message ("Unable to create \"%s\"", FName);
299 return (COMPERROR);
300 }
301 unlink (FName);
302
303 sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
304 ".chunk.state", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
305 if (!(chunk_state = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
306 {
307 Message ("Unable to create \"%s\"", FName);
308 return (COMPERROR);
309 }
310 unlink (FName);
311 BIO_Random_Start (chunk_state, RND_BUF_SIZE, &crbs);
312 ChangeMemInUse (RND_BUF_SIZE);
313
314 sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
315 ".chunks", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
316 if (!(chunks = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
317 {
318 Message ("Unable to create \"%s\"", FName);
319 return (COMPERROR);
320 }
321 unlink (FName);
322
323 return (COMPALLOK);
324}
325
326
327
328
329
330#define ISR_CACHE 1024
331#define ISR_ENTRY_SIZE (sizeof(mg_ullong)*2 + sizeof(unsigned long))
332
333invf_state_rec *
334in_cache (int pos)
335{
336 static char isr_data[ISR_CACHE * ISR_ENTRY_SIZE];
337 static invf_state_rec isr;
338 static int isr_base = 0, isr_num = -1, isr_pos = -1;
339 if (isr_pos >= 0)
340 bcopy ((char *) &isr, &isr_data[isr_pos * ISR_ENTRY_SIZE], ISR_ENTRY_SIZE);
341 if (pos < isr_base || pos >= isr_base + isr_num)
342 {
343 if (isr_num >= 0)
344 {
345 fseek (invf_state, isr_base * ISR_ENTRY_SIZE, 0);
346 fwrite (isr_data, 1, ISR_ENTRY_SIZE * isr_num, invf_state);
347 }
348 isr_base = pos;
349 fseek (invf_state, isr_base * ISR_ENTRY_SIZE, 0);
350 fread (isr_data, 1, ISR_ENTRY_SIZE * ISR_CACHE, invf_state);
351 isr_num = ISR_CACHE;
352 }
353 isr_pos = pos - isr_base;
354 bcopy (&isr_data[isr_pos * ISR_ENTRY_SIZE], (char *) &isr, ISR_ENTRY_SIZE);
355 return &isr;
356}
357
358
359
360
361
362unsigned long
363occur_to_lexical (long occ, int clear_state)
364{
365 static long pos = -1;
366 static random_bitio_state rbs;
367 static int val = 0;
368 if (clear_state) {
369 pos = -1;
370 val = 0;
371 return 0;
372 }
373 if (pos == -1)
374 {
375 BIO_Random_Start (count_trans, RND_BUF_SIZE, &rbs);
376 pos = 0x7fffffff;
377 }
378 if (occ < pos)
379 {
380 if (occ == -1)
381 {
382 BIO_Random_Done (&rbs);
383 return 0;
384 }
385 BIO_Random_Seek_X (sizeof (unsigned long) * 8, &rbs);
386 pos = 0;
387 }
388 while (pos <= occ)
389 {
390 val = BIO_Random_Binary_Decode (dict_size + 1, &rbs, NULL) - 1;
391 pos++;
392 }
393 return (val);
394}
395
396
397void
398add_chunk_state (unsigned long pos, unsigned long start_doc,
399 unsigned long N)
400{
401 chunk_data[chunks_read].params_pos = pos;
402 chunk_data[chunks_read].start_doc = start_doc;
403 chunk_data[chunks_read].N = N;
404 chunks_read++;
405}
406
407
408int
409init_ivf_2 (char *file_name)
410{
411 u_char prev[MAXSTEMLEN + 1];
412 int i;
413 mg_ullong totalIbits;
414 mg_ullong lasttotalIbits;
415 double logN = 0.0;
416
417 ResetStaticI2Vars(); /* clear the global statics */
418 occur_to_lexical(0, 1); /* clear the statics in here*/
419
420 if (open_files (file_name) == COMPERROR)
421 return COMPERROR;
422
423
424 /* Read in the stemmed dictionary file header */
425 fread ((char *) &idh, sizeof (idh), 1, dict);
426
427 /* [RPAP - Jan 97: Endian Ordering] */
428 NTOHUL(idh.lookback);
429 NTOHUL(idh.dict_size);
430 NTOHUL(idh.total_bytes);
431 NTOHUL(idh.index_string_bytes);
432 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
433 NTOHUL(idh.num_of_docs);
434 NTOHUL(idh.static_num_of_docs);
435 NTOHUL(idh.num_of_words);
436 NTOHUL(idh.stemmer_num);
437 NTOHUL(idh.stem_method);
438
439 dict_size = idh.dict_size;
440
441 N = idh.num_of_docs;
442
443 if (!(phd = read_perf_hash_data (hash)))
444 {
445 Message ("Unable to read in hash data");
446 return COMPERROR;
447 }
448 totalHbytes = sizeof (perf_hash_data) + sizeof (u_char) * 256 +
449 sizeof (int) * (phd->MAX_N + 1) + sizeof (int *) * 3 * phd->MAX_CH +
450 sizeof (long) * phd->MAX_CH * phd->MAX_L;
451
452 if (!(WordRecs = Xmalloc (sizeof (word_rec) * idh.dict_size)))
453 {
454 Message ("No memory for word entries");
455 return COMPERROR;
456 }
457 totalDbytes += sizeof (word_rec) * idh.dict_size;
458 /* separate storage for the log(b) values, one byte each */
459 if (!(lg_bs = Xmalloc (sizeof (u_char) * idh.dict_size)))
460 {
461 Message ("No memory for lg b's");
462 return COMPERROR;
463 }
464 totalDbytes += sizeof (u_char) * idh.dict_size;
465
466 if (MakeWeights)
467 {
468 /* separate storage for the idf values, one single each */
469 if (!(idf = Xmalloc (sizeof (float) * idh.dict_size)))
470 {
471 Message ("No memory for idf's");
472 return COMPERROR;
473 }
474 totalDbytes += sizeof (float) * idh.dict_size;
475
476 if (!(weights = create_file (file_name, WEIGHTS_SUFFIX, "wb",
477 MAGIC_WGHT, MG_CONTINUE))) { /* [RPAP - Feb 97: WIN32 Port] */
478 Message ("Couldn't open weights file for writing");
479 return (COMPERROR);
480 }
481 }
482 else
483 {
484 unlink (make_name (file_name, WEIGHTS_SUFFIX, NULL));
485 }
486
487 chunk_data = Xmalloc (sizeof (chunk) * (ChunkLimit + 2));
488 totalDbytes += sizeof (chunk) * (ChunkLimit + 2);
489
490 totalIbits = sizeof (unsigned long) * 8; /* The magic number */
491 totalIbits += 8 * 200; /* A 200 byte gap */
492
493 if (MakeWeights)
494 {
495 wl_size = 1024;
496 if (!(word_list = Xmalloc (sizeof (*word_list) * wl_size)))
497 {
498 Message ("No memory for word_list");
499 return COMPERROR;
500 }
501
502 logN = log ((double) N);
503 }
504
505 for (i = 0; i < idh.dict_size; i++)
506 {
507 invf_state_rec *isr;
508 register unsigned long copy, suff, p;
509 unsigned long fcnt, wcnt;
510
511 lasttotalIbits = totalIbits;
512
513 copy = fgetc (dict);
514 suff = fgetc (dict);
515 *prev = copy + suff;
516 fread (prev + copy + 1, sizeof (u_char), suff, dict);
517
518 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
519 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
520
521 /* [RPAP - Jan 97: Endian Ordering] */
522 NTOHUL(fcnt);
523 NTOHUL(wcnt);
524
525 WordRecs[i].last = 0;
526 WordRecs[i].ptr = 0;
527
528 p = fcnt;
529
530 if (MakeWeights)
531 idf[i] = logN - log ((double) fcnt);
532
533
534 isr = in_cache (i);
535
536 isr->Disk_Last = 0;
537 isr->Disk_Ptr = totalIbits;
538
539 isr->Disk_B = BIO_Bblock_Init (N, p);
540
541 totalIbits += BIO_Bblock_Bound_b (N, p, isr->Disk_B);
542
543 if (InvfLevel >= 2)
544 totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
545
546#ifdef USE_LONG_LONG
547 totalIbits = (totalIbits + 7ull) & 0xfffffffffffffff8ull;
548#else
549 totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
550#endif
551
552 if (totalIbits < lasttotalIbits) {
553 fprintf(stderr, "ERROR: The totalIbits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
554 if (sizeof (mg_ullong) < 8) {
555 fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
556 }
557 fprintf(stderr, " Build aborted.\n");
558 exit(1);
559 }
560 }
561
562
563 /* now convert to bytes, and actually get the space */
564#ifdef USE_LONG_LONG
565 totalIbytes = (totalIbits + 7ull) >> 3ull;
566#else
567 totalIbytes = (totalIbits + 7ul) >> 3ul;
568#endif
569 return (COMPALLOK);
570
571}
572
573
574
575
576
577static void
578LoadCounts (void)
579{
580 unsigned long numwords, i, last_total;
581 static unsigned long local_N = 0;
582 unsigned long totalIbits, crbs_pos;
583 word_rec *wr;
584 unsigned long *counts;
585
586 if (MemoryBuffer == NULL)
587 {
588 MemBufSize = sizeof (unsigned long) * dict_size;
589 if (max_buffer_len > MemBufSize)
590 MemBufSize = max_buffer_len;
591 if (!(MemoryBuffer = Xmalloc (MemBufSize)))
592 FatalError (1, "Unable to allocate memory for buffer");
593 ChangeMemInUse (MemBufSize);
594 }
595 counts = (unsigned long *) MemoryBuffer;
596/* bzero ((char *) counts, sizeof (unsigned long) * dict_size); */
597 bzero ((char *) counts, MemBufSize);
598 docs_left = next_docs_left;
599 if (!docs_left)
600 FatalError (1, "The number of docs in the current chunk is 0");
601
602 BufToUse = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
603
604 numwords = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
605
606 local_N = docs_left;
607
608 for (wr = WordRecs, i = 0; i < dict_size; i++, wr++)
609 wr->ptr = 0;
610
611 bzero ((char *) lg_bs, dict_size);
612
613 for (i = 0; i < numwords; i++)
614 {
615 unsigned long word_num, wcnt, fcnt, p;
616 word_num = occur_to_lexical (i,0);
617
618 wr = &WordRecs[word_num];
619
620 wcnt = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
621 if (wcnt >= 2)
622 fcnt = BIO_Stdio_Gamma_Decode (&sbs, NULL);
623 else
624 fcnt = wcnt;
625
626
627 p = fcnt;
628 if (wcnt)
629 {
630 register unsigned long length;
631 counts[word_num] = p;
632 length = BIO_Bblock_Bound (local_N, p);
633 if (InvfLevel >= 2)
634 length += wcnt;
635 wr->ptr = length;
636 lg_bs[word_num] = floorlog_2 (BIO_Bblock_Init_W (local_N, p));
637 }
638
639
640 }
641 crbs_pos = BIO_Random_Tell (&crbs);
642 totalIbits = 0;
643 last_total = 0;
644 for (wr = WordRecs, i = 0; i < dict_size; i++, wr++)
645 {
646 register unsigned long length;
647 length = wr->ptr;
648 wr->last = callnum;
649 BIO_Random_Gamma_Encode (counts[i] + 1, &crbs, NULL);
650 if (counts[i])
651 {
652 if (i)
653 BIO_Random_Delta_Encode (totalIbits - last_total + 1, &crbs, NULL);
654 else
655 BIO_Random_Delta_Encode (1, &crbs, NULL);
656
657 last_total = totalIbits;
658 }
659 wr->ptr = totalIbits;
660 totalIbits += length;
661 }
662 add_chunk_state (crbs_pos, callnum, local_N);
663
664 if ((totalIbits + 7ul) >> 3ul > BufToUse)
665 FatalError (1, "Pointers exceed buffer size");
666
667 next_docs_left = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
668}
669
670
671
672
673static void
674DumpChunk (void)
675{
676 chunk_data[chunks_read - 1].disk_pos = Disk_pos << 3;
677 fseek (chunks, Disk_pos, 0);
678 fwrite (MemoryBuffer, sizeof (char), BufToUse, chunks);
679 Disk_pos += BufToUse;
680}
681
682
683
684
685static void
686DiskMerge (void)
687{
688 random_bitio_state *rbsi;
689 random_bitio_state *chks = NULL;
690 unsigned long *chunk_ptrs;
691 int i;
692
693 BIO_Random_Flush (&crbs);
694
695 chunk_ptrs = Xmalloc (chunks_read * sizeof (unsigned long));
696 ChangeMemInUse (chunks_read * sizeof (unsigned long));
697 bzero ((char *) chunk_ptrs, chunks_read * sizeof (unsigned long));
698
699 rbsi = Xmalloc (chunks_read * sizeof (random_bitio_state));
700 ChangeMemInUse (chunks_read * sizeof (random_bitio_state));
701 for (i = 0; i < chunks_read; i++)
702 {
703 rbsi[i] = crbs;
704 rbsi[i].Buf = Xmalloc (rbsi[i].len);
705 ChangeMemInUse (rbsi[i].len);
706 bcopy ((char *) (crbs.Buf), (char *) (rbsi[i].Buf), rbsi[i].len);
707 BIO_Random_Seek (chunk_data[i].params_pos, &rbsi[i]);
708 }
709
710 if (chunks_read > 1)
711 {
712 int j;
713 chks = Xmalloc ((chunks_read - 1) * sizeof (random_bitio_state));
714 ChangeMemInUse ((chunks_read - 1) * sizeof (random_bitio_state));
715 BIO_Random_Start (chunks, RND_BUF_SIZE, &chks[0]);
716 ChangeMemInUse (RND_BUF_SIZE);
717 for (j = 1; j < chunks_read - 1; j++)
718 {
719 chks[j] = chks[0];
720 chks[j].Buf = Xmalloc (chks[0].len);
721 ChangeMemInUse (chks[0].len);
722 bcopy ((char *) (chks[0].Buf), (char *) (chks[j].Buf), chks[0].len);
723 }
724 }
725 for (i = 0; i < dict_size; i++)
726 {
727 int j;
728 invf_state_rec *isr = in_cache (i);
729 register int B;
730
731 BIO_Random_Seek_X (isr->Disk_Ptr, &rbs); /* Position in invf file */
732
733 B = isr->Disk_B;
734
735 for (j = 0; j < chunks_read; j++)
736 {
737 int p;
738 p = BIO_Random_Gamma_Decode (&rbsi[j], NULL) - 1;
739
740 if (p)
741 {
742 int ptr, b;
743 chunk_ptrs[j] += BIO_Random_Delta_Decode (&rbsi[j], NULL) - 1;
744 ptr = chunk_ptrs[j];
745 b = 1 << floorlog_2 (BIO_Bblock_Init_W (chunk_data[j].N, p));
746
747 if (j == chunks_read - 1)
748 {
749 int k, CurrDoc;
750 DECODE_START ((u_char *) MemoryBuffer, ptr)
751 CurrDoc = isr->Disk_Last;
752 for (k = 0; k < p; k++)
753 {
754 register unsigned long x, tf;
755 BBLOCK_DECODE (x, b);
756 if (k == 0)
757 x = x + chunk_data[j].start_doc - isr->Disk_Last;
758 CurrDoc += x;
759 BIO_Random_Bblock_Encode (x, B, &rbs, NULL);
760 if (InvfLevel >= 2)
761 {
762 UNARY_DECODE (tf);
763 BIO_Random_Gamma_Encode (tf, &rbs, NULL);
764 }
765 }
766 DECODE_DONE
767 isr->Disk_Last = CurrDoc;
768 }
769 else
770 {
771 int k, CurrDoc;
772 random_bitio_state *Chks = chks + j;
773 BIO_Random_Seek (chunk_data[j].disk_pos + ptr, Chks);
774 CurrDoc = isr->Disk_Last;
775 for (k = 0; k < p; k++)
776 {
777 register unsigned long x, tf;
778 x = BIO_Random_Bblock_Decode (b, Chks, NULL);
779 if (k == 0)
780 x = x + chunk_data[j].start_doc - isr->Disk_Last;
781 CurrDoc += x;
782 BIO_Random_Bblock_Encode (x, B, &rbs, NULL);
783 if (InvfLevel >= 2)
784 {
785 tf = BIO_Random_Unary_Decode (Chks, NULL);
786 BIO_Random_Gamma_Encode (tf, &rbs, NULL);
787 }
788 }
789 isr->Disk_Last = CurrDoc;
790 }
791 }
792 }
793
794 isr->Disk_Ptr = BIO_Random_Tell_X (&rbs);
795
796 }
797 if (chunks_read > 1)
798 {
799 int j;
800 for (j = 0; j < chunks_read - 1; j++)
801 {
802 Xfree (chks[j].Buf);
803 ChangeMemInUse (-chks[j].len);
804 }
805 Xfree (chks);
806 ChangeMemInUse (-(chunks_read - 1) * sizeof (random_bitio_state));
807 }
808
809 for (i = 0; i < chunks_read; i++)
810 {
811 Xfree (rbsi[i].Buf);
812 ChangeMemInUse (-rbsi[i].len);
813 }
814 Xfree (rbsi);
815 ChangeMemInUse (-chunks_read * sizeof (random_bitio_state));
816/* chunks_read = 0; */
817 Xfree (chunk_ptrs);
818 ChangeMemInUse (-chunks_read * sizeof (unsigned long));
819 chunks_read = 0;
820 Disk_pos = 0;
821 BIO_Random_Seek (0, &crbs);
822}
823
824static void
825MergeIn (void)
826{
827 static int disk_chunks = 0;
828 static header = 0;
829 if (!header)
830 {
831 fprintf (stderr, "ivf.pass2 : ");
832 header = 1;
833 }
834 if (disk_chunks == ChunkLimit || next_docs_left == 0)
835 {
836 fprintf (stderr, "M");
837 DiskMerge ();
838 disk_chunks = 0;
839 }
840 else
841 {
842 fprintf (stderr, "-");
843 DumpChunk ();
844 disk_chunks++;
845 }
846 if (next_docs_left == 0)
847 fprintf (stderr, "\n");
848}
849
850
/*
 * wl_comp -- qsort() comparison function for the word list, whose
 * elements are unsigned long word numbers.
 *
 * Returns a negative, zero or positive value as *a is less than, equal
 * to or greater than *b.  The elements are read with their real type;
 * reading them through int pointers looks at only part of each element
 * where long is wider than int, and subtracting can overflow.
 */
static int
wl_comp (const void *a, const void *b)
{
  unsigned long lhs = *(const unsigned long *) a;
  unsigned long rhs = *(const unsigned long *) b;

  return (lhs > rhs) - (lhs < rhs);
}
856
857static int
858process_doc (u_char * s_in, int l_in)
859{
860 int res;
861 u_char *end = s_in + l_in - 1;
862 unsigned long tocode;
863 unsigned long wl_pos = 0;
864
865 if (!docs_left)
866 LoadCounts ();
867
868 callnum++;
869
870 if (!inaword (s_in, end))
871 if (SkipSGML)
872 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
873 else
874 PARSE_NON_STEM_WORD (s_in, end);
875
876 while (s_in <= end)
877 {
878 u_char Word[MAXSTEMLEN + 1];
879
880 PARSE_STEM_WORD (Word, s_in, end);
881 stemmer (idh.stem_method, idh.stemmer_num, Word);
882 if (SkipSGML)
883 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
884 else
885 PARSE_NON_STEM_WORD (s_in, end);
886
887 if (*Word == 0)
888 continue;
889
890 res = perf_hash (phd, Word);
891
892 {
893 word_rec *arr = &WordRecs[res];
894 int b = 1 << lg_bs[res];
895 wordnum++;
896
897 tocode = callnum;
898
899 ENCODE_START ((u_char *) MemoryBuffer, arr->ptr)
900
901 if (tocode > arr->last)
902 {
903 register int x;
904 x = tocode - arr->last - 1;
905 BBLOCK_ENCODE (x + 1, b);
906 if (InvfLevel >= 2)
907 ENCODE_BIT (1);
908 no_of_ptrs++;
909 arr->last = tocode;
910 }
911 else if (InvfLevel >= 2)
912 {
913 __pos--;
914 ENCODE_BIT (0);
915 ENCODE_BIT (1);
916 }
917 arr->ptr = __pos;
918 ENCODE_DONE
919 }
920
921 if (MakeWeights)
922 {
923 if (wl_pos >= wl_size)
924 {
925 wl_size += (wl_size >> 1);
926 word_list = Xrealloc (word_list, sizeof (*word_list) * wl_size);
927 }
928 word_list[wl_pos++] = res;
929 }
930 }
931 if (MakeWeights)
932 {
933 float doc_weight = 0.0;
934 if (wl_pos)
935 {
936 unsigned long *wl = word_list;
937 unsigned long i, count, val;
938 qsort (wl, wl_pos, sizeof (*wl), wl_comp);
939 count = 1;
940 val = *wl++;
941 for (i = 1; i <= wl_pos; i++, wl++)
942 if (i == wl_pos || val != *wl)
943 {
944 double weight = count * idf[val];
945 doc_weight += weight * weight;
946 count = 1;
947 val = *wl;
948 }
949 else
950 count++;
951 }
952 HTONF(doc_weight); /* [RPAP - Jan 97: Endian Ordering] */
953 fwrite ((char *) &doc_weight, sizeof (doc_weight), 1, weights);
954 }
955 docs_left--;
956 if (!docs_left)
957 MergeIn ();
958
959 return COMPALLOK;
960}
961
962int
963process_ivf_2 (u_char * s_in, int l_in)
964{
965 if (InvfLevel <= 2)
966 return process_doc (s_in, l_in);
967 else
968 {
969 int count = 0;
970 int pos = 0;
971 u_char *start = s_in;
972 while (pos < l_in)
973 {
974 if (s_in[pos] == TERMPARAGRAPH)
975 {
976 int len = pos + s_in + 1 - start;
977 if (process_doc (start, len) != COMPALLOK)
978 return (COMPERROR);
979 start = s_in + pos + 1;
980 count++;
981 }
982 pos++;
983 }
984 if (start < s_in + pos)
985 {
986 if (process_doc (start, pos + s_in - start) != COMPALLOK)
987 return (COMPERROR);
988 count++;
989 }
990 HTONSI(count); /* [RPAP - Jan 97: Endian Ordering] */
991 fwrite ((char *) &count, sizeof (count), 1, invf_para);
992 }
993 return COMPALLOK;
994}
995
996
997
998
999
1000static void
1001stats (unsigned long len)
1002{
1003#ifndef SILENT
1004 fseek (count, 0, 2);
1005 fseek (count_trans, 0, 2);
1006 fseek (invf_state, 0, 2);
1007 fseek (invf, 0, 0);
1008 fseek (invf, 0, 2);
1009 fseek (chunks, 0, 2);
1010 fseek (chunk_state, 0, 2);
1011 Message ("File sizes\n");
1012 Message (" Chunk desc : %10u bytes\n", ftell (count));
1013 Message (" Chunk trans : %10u bytes\n", ftell (count_trans));
1014 Message (" Chunks : %10u bytes\n", ftell (chunks));
1015 Message (" Chunk state : %10u bytes\n", ftell (chunk_state));
1016 Message (" Invf state : %10u bytes\n", ftell (invf_state));
1017 Message (" Peak invf : %10u bytes\n", len);
1018 Message (" Final invf : %10u bytes\n", ftell (invf));
1019 Message ("Peak disk usage : %10.2f %%\n",
1020 (double) (ftell (count) + ftell (count_trans) +
1021 ftell (invf_state) + ftell (chunks) +
1022 ftell (chunk_state) + len) / ftell (invf) * 100.0);
1023#endif
1024}
1025
1026
1027/* ARGSUSED */
1028int
1029done_ivf_2 (char *FileName)
1030{
1031 long i;
1032 mg_ullong totalIbits;
1033 unsigned long invf_len;
1034 unsigned long bytes_output;
1035 struct invf_file_header ifh;
1036
1037 if (weights)
1038 fclose (weights);
1039 if (invf_para)
1040 fclose (invf_para);
1041
1042 free_perf_hash (phd);
1043 phd = NULL;
1044
1045 Xfree (MemoryBuffer);
1046 MemoryBuffer = NULL;
1047 ChangeMemInUse (-MemBufSize);
1048
1049 BIO_Random_Done (&rbs);
1050 BIO_Random_Done (&rbsp);
1051 fflush (invf);
1052
1053 fseek (invf, 0, 2);
1054 invf_len = ftell (invf);
1055
1056 fseek (invf_out, sizeof (long), 0);
1057 /* [RPAP - Jan 97: Endian Ordering] */
1058 HTONUL2(dict_size, ifh.no_of_words);
1059 HTONUL2(no_of_ptrs, ifh.no_of_ptrs);
1060 ifh.skip_mode = 0;
1061 bzero ((char *) ifh.params, sizeof (ifh.params));
1062 HTONUL2(InvfLevel, ifh.InvfLevel);
1063 fwrite ((char *) &ifh, sizeof (ifh), 1, invf_out);
1064
1065 bytes_output = ftell (invf_out);
1066
1067 totalIbits = sizeof (unsigned long) * 8; /* The magic number */
1068 totalIbits += 8 * 200; /* A 200 byte gap */
1069
1070 /* find the right place in the file to start reading p values */
1071 fseek (dict, sizeof (unsigned long) + sizeof (struct invf_dict_header), 0);
1072 for (i = 0; i < dict_size; i++)
1073 {
1074 invf_state_rec *isr;
1075 unsigned long fcnt, wcnt, s, e;
1076 register unsigned long p;
1077 u_char dummy1, dummy2[MAXSTEMLEN + 1];
1078
1079 /* output location to the invf_idx */
1080 HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1081 fwrite ((char *) &bytes_output, sizeof (bytes_output), 1, invf_idx);
1082 NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1083
1084 /* read an entry for a word, just to get p value */
1085 dummy1 = fgetc (dict);
1086 dummy1 = fgetc (dict);
1087 fread (dummy2, sizeof (u_char), dummy1, dict);
1088 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
1089 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
1090
1091 /* [RPAP - Jan 97: Endian Ordering] */
1092 NTOHUL(fcnt);
1093 NTOHUL(wcnt);
1094
1095 p = fcnt;
1096
1097 isr = in_cache (i);
1098
1099 e = (isr->Disk_Ptr + 7ul) >> 3ul;
1100 s = totalIbits >> 3;
1101
1102 fseek (invf_in, s, 0);
1103 while (s < e)
1104 {
1105 u_char c = getc (invf_in);
1106 if (s == e - 1)
1107 {
1108 u_char ands[8] =
1109 {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1110 c &= ands[isr->Disk_Ptr & 7ul];
1111 }
1112 putc (c, invf_out);
1113 bytes_output++;
1114 s++;
1115 }
1116
1117 totalIbits += BIO_Bblock_Bound_b (N, p, isr->Disk_B);
1118 if (InvfLevel >= 2)
1119 totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
1120#ifdef USE_LONG_LONG
1121 totalIbits = (totalIbits + 7ull) & 0xfffffffffffffff8ull;
1122#else
1123 totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
1124#endif
1125
1126 }
1127
1128 fclose (invf_in);
1129
1130 /* [RPAP - Feb 97: WIN32 Port] */
1131#ifdef __WIN32__
1132 if (!(_chsize (_fileno (invf_out), bytes_output)))
1133 Message ("Could not truncate invf.");
1134#else
1135 ftruncate (fileno (invf_out), bytes_output);
1136#endif
1137
1138 fclose (invf_out);
1139
1140 HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1141 fwrite ((char *) &bytes_output, sizeof (bytes_output), 1, invf_idx);
1142 NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1143
1144 fclose (invf_idx);
1145
1146#ifndef SILENT
1147 {
1148 char *temp_str = msg_prefix;
1149 unsigned long total;
1150 msg_prefix = "ivf.pass2";
1151 stats (invf_len);
1152 Message ("Pass two data structures : %6.3f Mbyte\n",
1153 (double) totalDbytes / 1024 / 1024);
1154 total = totalDbytes;
1155 Message ("Pass two hash structure(s) : %6.3f Mbyte\n",
1156 (double) totalHbytes / 1024 / 1024);
1157 total += totalHbytes;
1158 Message ("Peak extra memory in use : %6.3f Mbyte\n",
1159 (double) MaxMemInUse / 1024 / 1024);
1160 total += MaxMemInUse;
1161 Message ("Peak total memory in use : %6.3f Mbyte\n",
1162 (double) total / 1024 / 1024);
1163 msg_prefix = temp_str;
1164 }
1165#endif
1166
1167 Xfree(chunk_data);
1168 chunk_data = NULL;
1169 Xfree (WordRecs);
1170 WordRecs = NULL;
1171 Xfree (lg_bs);
1172 lg_bs = NULL;
1173 Xfree (idf);
1174 idf = NULL;
1175 Xfree (word_list);
1176 word_list = NULL;
1177 /* Free the memory allocated for the BIO_Random */
1178 occur_to_lexical (-1,1);
1179
1180 BIO_Random_Done (&crbs);
1181
1182 fclose (invf);
1183 fclose (dict);
1184 fclose (hash);
1185 fclose (count);
1186 fclose (count_trans);
1187 fclose (chunk_state);
1188 fclose (chunks);
1189 fclose (invf_state);
1190 return (COMPALLOK);
1191}
Note: See TracBrowser for help on using the repository browser.