source: trunk/gsdl/packages/mg/src/text/ivf.pass2.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 26.6 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass2.c -- Memory efficient pass 2 inversion
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: ivf.pass2.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24/*
25 $Log$
26 Revision 1.1 1999/08/10 21:17:54 sjboddie
27 renamed mg-1.3d directory mg
28
29 Revision 1.3 1998/12/17 09:12:51 rjmcnab
30
31 Altered mg to process utf-8 encoded Unicode. The main changes
32 are in the parsing of the input, the casefolding, and the stemming.
33
34 Revision 1.2 1998/11/25 07:55:43 rjmcnab
35
36 Modified mg so that you can specify the stemmer you want
37 to use via a command line option. You specify it to
38 mg_passes during the build process. The number of the
39 stemmer that you used is stored within the inverted
40 dictionary header and the stemmed dictionary header so
41 the correct stemmer is used in later stages of building
42 and querying.
43
44 Revision 1.1 1998/11/17 09:34:45 rjmcnab
45 *** empty log message ***
46
47 * Revision 1.3 1994/10/20 03:56:49 tes
48 * I have rewritten the boolean query optimiser and abstracted out the
49 * components of the boolean query.
50 *
51 * Revision 1.2 1994/09/20 04:41:35 tes
52 * For version 1.1
53 *
54 */
55
/* Revision-control identification string for this translation unit. */
static char *RCSID =
  "$Id: ivf.pass2.c 439 1999-08-10 21:23:37Z sjboddie $";
57
58#include "local_strings.h"
59#include "sysfuncs.h"
60#include "memlib.h"
61#include "messages.h"
62#include "stemmer.h"
63#include "perf_hash.h"
64#include "bitio_m.h"
65#include "bitio_m_mems.h"
66#include "bitio_gen.h"
67#include "bitio_random.h"
68#include "bitio_stdio.h"
69#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
70
71#include "mg_files.h"
72#include "invf.h"
73#include "locallib.h"
74#include "mg.h"
75#include "build.h"
76#include "words.h"
77#include "hash.h"
78
79/* [RPAP - Feb 97: WIN32 Port] */
80#ifdef __WIN32__
81#include <io.h>
82#endif
83
/*
 * Size in bytes of each random-access bit I/O buffer handed to
 * BIO_Random_Start.  May be overridden from the compiler command line.
 * The default is parenthesised so that the macro behaves as a single
 * operand inside larger expressions (e.g. `x / RND_BUF_SIZE').
 */
#ifndef RND_BUF_SIZE
#define RND_BUF_SIZE (8*1024)
/*#define RND_BUF_SIZE 128 */
#endif
88
/*
 * Debug helper: report the current on-disk length of the stdio stream
 * `file' through Message().  The argument is now actually used (it was
 * previously ignored in favour of the global invf_out), nothing is
 * printed when fstat() fails, and st_size is cast to match "%ld".
 */
#define print_fsize(file)\
do\
  {\
    struct stat file_state;\
    if (fstat(fileno(file), &file_state) == 0)\
      Message("len(invf) = %ld", (long) file_state.st_size);\
  }while(0)
96
/*
 * Per-word state kept in memory while a chunk is being inverted.
 */
typedef struct word_rec {
  unsigned long ptr;		/* bit offset of this word's list in MemoryBuffer */
  unsigned long last;		/* last document number coded for this word */
} word_rec;
103
/*
 * Per-word state of the inverted file on disk.  Records of this type are
 * paged to and from the invf_state scratch file by in_cache(), which copies
 * ISR_ENTRY_SIZE bytes per record, so the field order is significant.
 */
typedef struct invf_state_rec {
  unsigned long Disk_Ptr;	/* bit position in the inverted file to write next */
  unsigned long Disk_Last;	/* last document number written for this word */
  unsigned long Disk_B;		/* coding parameter returned by BIO_Bblock_Init */
} invf_state_rec;
111
/*
 * Description of one chunk awaiting a disk merge.
 */
typedef struct chunk {
  unsigned long start_doc;	/* value of callnum when the chunk was loaded */
  unsigned long params_pos;	/* position of the chunk's parameters in chunk_state */
  unsigned long disk_pos;	/* bit offset of the chunk's data in the chunks file */
  unsigned long N;		/* number of documents in the chunk */
} chunk;
120
121
122static FILE *dict; /* Stemmed dictionary file */
123static FILE *hash; /* Stemmed dictionary hash file */
124static FILE *invf; /* Inverted file */
125static FILE *invf_in; /* Inverted file */
126static FILE *invf_out; /* Inverted file */
127static FILE *invf_idx; /* Inverted index file */
128static FILE *count; /* Count file */
129static FILE *count_trans; /* Count translation file */
130static FILE *invf_state; /* Inverted file State */
131static FILE *chunk_state; /* Chunk state */
132static FILE *chunks; /* Chunk state */
133static FILE *invf_para = NULL; /* Paragraph counts file */
134static FILE *weights = NULL; /* Weights file */
135
136static stdio_bitio_state sbs;
137static random_bitio_state crbs;
138static chunk *chunk_data = NULL;
139static random_bitio_state rbs, rbsp;
140
141static int docs_left = 0, next_docs_left = 0;
142static unsigned long N;
143
144static word_rec *WordRecs;
145static u_char *lg_bs;
146static float *idf = NULL;
147
148static char *MemoryBuffer = NULL;
149static unsigned long MemBufSize;
150static unsigned long BufToUse;
151static struct invf_dict_header idh;
152
153static perf_hash_data *phd;
154
155static unsigned long *word_list = NULL;
156static unsigned long wl_size = 0;
157
158static unsigned long dict_size;
159static unsigned long no_of_ptrs = 0;
160static unsigned long chunks_read = 0;
161static unsigned long Disk_pos = 0;
162static unsigned long callnum = 0;
163static unsigned long wordnum = 0;
164
165static unsigned long totalIbytes = 0;
166static unsigned long totalDbytes = 0;
167static unsigned long totalHbytes = 0;
168
169static unsigned long MemInUse = 0;
170static unsigned long MaxMemInUse = 0;
171static unsigned long max_buffer_len;
172
173void
174ChangeMemInUse (int mem)
175{
176 MemInUse += mem;
177 if (MemInUse > MaxMemInUse)
178 MaxMemInUse = MemInUse;
179}
180
181
182
183
184static int
185open_files (char *file_name)
186{
187 char FName[200];
188
189 if (!(dict = open_file (file_name, INVF_DICT_SUFFIX, "rb",
190 MAGIC_STEM_BUILD, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
191 return (COMPERROR);
192
193 if (!(hash = open_file (file_name, INVF_DICT_HASH_SUFFIX, "rb",
194 MAGIC_HASH, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
195 return (COMPERROR);
196
197 if (!(count = open_file (file_name, INVF_CHUNK_SUFFIX, "rb",
198 MAGIC_CHUNK, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
199 return (COMPERROR);
200 fread (&max_buffer_len, sizeof (max_buffer_len), 1, count);
201 NTOHUL(max_buffer_len); /* [RPAP - Jan 97: Endian Ordering] */
202
203 BIO_Stdio_Decode_Start (count, &sbs);
204 next_docs_left = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
205
206 if (!(count_trans = open_file (file_name, INVF_CHUNK_TRANS_SUFFIX, "rb",
207 MAGIC_CHUNK_TRANS, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
208 return (COMPERROR);
209
210 if (!(invf = create_file (file_name, INVF_SUFFIX, "w+b",
211 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
212 return (COMPERROR);
213 fflush (invf);
214 if (!(invf_in = open_file (file_name, INVF_SUFFIX, "rb",
215 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
216 return (COMPERROR);
217 if (!(invf_out = create_file (file_name, INVF_SUFFIX, "wb",
218 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
219 return (COMPERROR);
220 BIO_Random_Start (invf, RND_BUF_SIZE, &rbs);
221 BIO_Random_Start (invf, RND_BUF_SIZE, &rbsp);
222 ChangeMemInUse (RND_BUF_SIZE * 2);
223
224 if (!(invf_idx = create_file (file_name, INVF_IDX_SUFFIX, "wb",
225 MAGIC_INVI, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
226 return (COMPERROR);
227
228 if (InvfLevel == 3)
229 if (!(invf_para = create_file (file_name, INVF_PARAGRAPH_SUFFIX, "wb",
230 MAGIC_PARAGRAPH, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
231 return (COMPERROR);
232
233 sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
234 ".invf.state", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
235 if (!(invf_state = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
236 {
237 Message ("Unable to create \"%s\"", FName);
238 return (COMPERROR);
239 }
240 unlink (FName);
241
242 sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
243 ".chunk.state", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
244 if (!(chunk_state = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
245 {
246 Message ("Unable to create \"%s\"", FName);
247 return (COMPERROR);
248 }
249 unlink (FName);
250 BIO_Random_Start (chunk_state, RND_BUF_SIZE, &crbs);
251 ChangeMemInUse (RND_BUF_SIZE);
252
253 sprintf (FName, FILE_NAME_FORMAT ".%ld", get_basepath (), file_name,
254 ".chunks", (long) getpid ()); /* [RPAP - Feb 97: WIN32 Port] */
255 if (!(chunks = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
256 {
257 Message ("Unable to create \"%s\"", FName);
258 return (COMPERROR);
259 }
260 unlink (FName);
261
262 return (COMPALLOK);
263}
264
265
266
267
268
/* Number of invf_state_rec entries held in the in_cache() window. */
#define ISR_CACHE 1024
/* Bytes per cached entry: the three unsigned long fields of invf_state_rec. */
#define ISR_ENTRY_SIZE (3 * sizeof(unsigned long))
271
272invf_state_rec *
273in_cache (int pos)
274{
275 static char isr_data[ISR_CACHE * ISR_ENTRY_SIZE];
276 static invf_state_rec isr;
277 static int isr_base = 0, isr_num = -1, isr_pos = -1;
278 if (isr_pos >= 0)
279 bcopy ((char *) &isr, &isr_data[isr_pos * ISR_ENTRY_SIZE], ISR_ENTRY_SIZE);
280 if (pos < isr_base || pos >= isr_base + isr_num)
281 {
282 if (isr_num >= 0)
283 {
284 fseek (invf_state, isr_base * ISR_ENTRY_SIZE, 0);
285 fwrite (isr_data, 1, ISR_ENTRY_SIZE * isr_num, invf_state);
286 }
287 isr_base = pos;
288 fseek (invf_state, isr_base * ISR_ENTRY_SIZE, 0);
289 fread (isr_data, 1, ISR_ENTRY_SIZE * ISR_CACHE, invf_state);
290 isr_num = ISR_CACHE;
291 }
292 isr_pos = pos - isr_base;
293 bcopy (&isr_data[isr_pos * ISR_ENTRY_SIZE], (char *) &isr, ISR_ENTRY_SIZE);
294 return &isr;
295}
296
297
298
299
300
301unsigned long
302occur_to_lexical (long occ)
303{
304 static long pos = -1;
305 static random_bitio_state rbs;
306 static int val = 0;
307 if (pos == -1)
308 {
309 BIO_Random_Start (count_trans, RND_BUF_SIZE, &rbs);
310 pos = 0x7fffffff;
311 }
312 if (occ < pos)
313 {
314 if (occ == -1)
315 {
316 BIO_Random_Done (&rbs);
317 return 0;
318 }
319 BIO_Random_Seek (32, &rbs);
320 pos = 0;
321 }
322 while (pos <= occ)
323 {
324 val = BIO_Random_Binary_Decode (dict_size + 1, &rbs, NULL) - 1;
325 pos++;
326 }
327 return (val);
328}
329
330
331void
332add_chunk_state (unsigned long pos, unsigned long start_doc,
333 unsigned long N)
334{
335 chunk_data[chunks_read].params_pos = pos;
336 chunk_data[chunks_read].start_doc = start_doc;
337 chunk_data[chunks_read].N = N;
338 chunks_read++;
339}
340
341
342int
343init_ivf_2 (char *file_name)
344{
345 u_char prev[MAXSTEMLEN + 1];
346 int i;
347 unsigned long totalIbits;
348 double logN = 0.0;
349
350 if (open_files (file_name) == COMPERROR)
351 return COMPERROR;
352
353
354 /* Read in the stemmed dictionary file header */
355 fread ((char *) &idh, sizeof (idh), 1, dict);
356
357 /* [RPAP - Jan 97: Endian Ordering] */
358 NTOHUL(idh.lookback);
359 NTOHUL(idh.dict_size);
360 NTOHUL(idh.total_bytes);
361 NTOHUL(idh.index_string_bytes);
362 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
363 NTOHUL(idh.num_of_docs);
364 NTOHUL(idh.static_num_of_docs);
365 NTOHUL(idh.num_of_words);
366 NTOHUL(idh.stemmer_num);
367 NTOHUL(idh.stem_method);
368
369 dict_size = idh.dict_size;
370
371 N = idh.num_of_docs;
372
373 if (!(phd = read_perf_hash_data (hash)))
374 {
375 Message ("Unable to read in hash data");
376 return COMPERROR;
377 }
378 totalHbytes = sizeof (perf_hash_data) + sizeof (u_char) * 256 +
379 sizeof (int) * (phd->MAX_N + 1) + sizeof (int *) * 3 * phd->MAX_CH +
380 sizeof (long) * phd->MAX_CH * phd->MAX_L;
381
382 if (!(WordRecs = Xmalloc (sizeof (word_rec) * idh.dict_size)))
383 {
384 Message ("No memory for word entries");
385 return COMPERROR;
386 }
387 totalDbytes += sizeof (word_rec) * idh.dict_size;
388
389 /* separate storage for the log(b) values, one byte each */
390 if (!(lg_bs = Xmalloc (sizeof (u_char) * idh.dict_size)))
391 {
392 Message ("No memory for lg b's");
393 return COMPERROR;
394 }
395 totalDbytes += sizeof (u_char) * idh.dict_size;
396
397 if (MakeWeights)
398 {
399 /* separate storage for the idf values, one single each */
400 if (!(idf = Xmalloc (sizeof (float) * idh.dict_size)))
401 {
402 Message ("No memory for idf's");
403 return COMPERROR;
404 }
405 totalDbytes += sizeof (float) * idh.dict_size;
406
407 if (!(weights = create_file (file_name, WEIGHTS_SUFFIX, "wb",
408 MAGIC_WGHT, MG_CONTINUE))) { /* [RPAP - Feb 97: WIN32 Port] */
409 Message ("Couldn't open weights file for writing");
410 return (COMPERROR);
411 }
412 }
413 else
414 {
415 unlink (make_name (file_name, WEIGHTS_SUFFIX, NULL));
416 }
417
418 chunk_data = Xmalloc (sizeof (chunk) * (ChunkLimit + 2));
419 totalDbytes += sizeof (chunk) * (ChunkLimit + 2);
420
421 totalIbits = 32; /* The magic number */
422 totalIbits += 8 * 100; /* A 100 byte gap */
423
424 if (MakeWeights)
425 {
426 wl_size = 1024;
427 if (!(word_list = Xmalloc (sizeof (*word_list) * wl_size)))
428 {
429 Message ("No memory for word_list");
430 return COMPERROR;
431 }
432
433 logN = log ((double) N);
434 }
435
436 for (i = 0; i < idh.dict_size; i++)
437 {
438 invf_state_rec *isr;
439 register unsigned long copy, suff, p;
440 unsigned long fcnt, wcnt;
441
442 copy = fgetc (dict);
443 suff = fgetc (dict);
444 *prev = copy + suff;
445 fread (prev + copy + 1, sizeof (u_char), suff, dict);
446
447 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
448 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
449
450 /* [RPAP - Jan 97: Endian Ordering] */
451 NTOHUL(fcnt);
452 NTOHUL(wcnt);
453
454 WordRecs[i].last = 0;
455 WordRecs[i].ptr = 0;
456
457 p = fcnt;
458
459 if (MakeWeights)
460 idf[i] = logN - log ((double) fcnt);
461
462
463 isr = in_cache (i);
464
465 isr->Disk_Last = 0;
466 isr->Disk_Ptr = totalIbits;
467
468 isr->Disk_B = BIO_Bblock_Init (N, p);
469
470 totalIbits += BIO_Bblock_Bound_b (N, p, isr->Disk_B);
471
472 if (InvfLevel >= 2)
473 totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
474
475 totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
476
477 }
478
479
480 /* now convert to bytes, and actually get the space */
481 totalIbytes = (totalIbits + 7ul) >> 3ul;
482
483
484 return (COMPALLOK);
485
486}
487
488
489
490
491
492static void
493LoadCounts (void)
494{
495 unsigned long numwords, i, last_total;
496 static unsigned long local_N = 0;
497 unsigned long totalIbits, crbs_pos;
498 word_rec *wr;
499 unsigned long *counts;
500
501 if (MemoryBuffer == NULL)
502 {
503 MemBufSize = sizeof (unsigned long) * dict_size;
504 if (max_buffer_len > MemBufSize)
505 MemBufSize = max_buffer_len;
506 if (!(MemoryBuffer = Xmalloc (MemBufSize)))
507 FatalError (1, "Unable to allocate memory for buffer");
508 ChangeMemInUse (MemBufSize);
509 }
510
511 counts = (unsigned long *) MemoryBuffer;
512 bzero ((char *) counts, sizeof (unsigned long) * dict_size);
513
514 docs_left = next_docs_left;
515 if (!docs_left)
516 FatalError (1, "The number of docs in the current chunk is 0");
517
518 BufToUse = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
519
520 numwords = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
521
522 local_N = docs_left;
523
524
525
526 for (wr = WordRecs, i = 0; i < dict_size; i++, wr++)
527 wr->ptr = 0;
528
529 bzero ((char *) lg_bs, dict_size);
530
531 for (i = 0; i < numwords; i++)
532 {
533 unsigned long word_num, wcnt, fcnt, p;
534 word_num = occur_to_lexical (i);
535
536 wr = &WordRecs[word_num];
537
538 wcnt = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
539 if (wcnt >= 2)
540 fcnt = BIO_Stdio_Gamma_Decode (&sbs, NULL);
541 else
542 fcnt = wcnt;
543
544 p = fcnt;
545
546 if (wcnt)
547 {
548 register unsigned long length;
549 counts[word_num] = p;
550 length = BIO_Bblock_Bound (local_N, p);
551 if (InvfLevel >= 2)
552 length += wcnt;
553 wr->ptr = length;
554 lg_bs[word_num] = floorlog_2 (BIO_Bblock_Init_W (local_N, p));
555 }
556
557 }
558
559 crbs_pos = BIO_Random_Tell (&crbs);
560
561 totalIbits = 0;
562 last_total = 0;
563 for (wr = WordRecs, i = 0; i < dict_size; i++, wr++)
564 {
565 register unsigned long length;
566 length = wr->ptr;
567 wr->last = callnum;
568 BIO_Random_Gamma_Encode (counts[i] + 1, &crbs, NULL);
569 if (counts[i])
570 {
571 if (i)
572 BIO_Random_Delta_Encode (totalIbits - last_total + 1, &crbs, NULL);
573 else
574 BIO_Random_Delta_Encode (1, &crbs, NULL);
575
576 last_total = totalIbits;
577 }
578 wr->ptr = totalIbits;
579 totalIbits += length;
580 }
581 add_chunk_state (crbs_pos, callnum, local_N);
582
583 if ((totalIbits + 7ul) >> 3ul > BufToUse)
584 FatalError (1, "Pointers exceed buffer size");
585
586 next_docs_left = BIO_Stdio_Gamma_Decode (&sbs, NULL) - 1;
587}
588
589
590
591
592static void
593DumpChunk (void)
594{
595 chunk_data[chunks_read - 1].disk_pos = Disk_pos << 3;
596 fseek (chunks, Disk_pos, 0);
597 fwrite (MemoryBuffer, sizeof (char), BufToUse, chunks);
598 Disk_pos += BufToUse;
599}
600
601
602
603
604static void
605DiskMerge (void)
606{
607 random_bitio_state *rbsi;
608 random_bitio_state *chks = NULL;
609 unsigned long *chunk_ptrs;
610 int i;
611
612 BIO_Random_Flush (&crbs);
613
614 chunk_ptrs = Xmalloc (chunks_read * sizeof (unsigned long));
615 ChangeMemInUse (chunks_read * sizeof (unsigned long));
616 bzero ((char *) chunk_ptrs, chunks_read * sizeof (unsigned long));
617
618 rbsi = Xmalloc (chunks_read * sizeof (random_bitio_state));
619 ChangeMemInUse (chunks_read * sizeof (random_bitio_state));
620 for (i = 0; i < chunks_read; i++)
621 {
622 rbsi[i] = crbs;
623 rbsi[i].Buf = Xmalloc (rbsi[i].len);
624 ChangeMemInUse (rbsi[i].len);
625 bcopy ((char *) (crbs.Buf), (char *) (rbsi[i].Buf), rbsi[i].len);
626 BIO_Random_Seek (chunk_data[i].params_pos, &rbsi[i]);
627 }
628
629 if (chunks_read > 1)
630 {
631 int j;
632 chks = Xmalloc ((chunks_read - 1) * sizeof (random_bitio_state));
633 ChangeMemInUse ((chunks_read - 1) * sizeof (random_bitio_state));
634 BIO_Random_Start (chunks, RND_BUF_SIZE, &chks[0]);
635 ChangeMemInUse (RND_BUF_SIZE);
636 for (j = 1; j < chunks_read - 1; j++)
637 {
638 chks[j] = chks[0];
639 chks[j].Buf = Xmalloc (chks[0].len);
640 ChangeMemInUse (chks[0].len);
641 bcopy ((char *) (chks[0].Buf), (char *) (chks[j].Buf), chks[0].len);
642 }
643 }
644 for (i = 0; i < dict_size; i++)
645 {
646 int j;
647 invf_state_rec *isr = in_cache (i);
648 register int B;
649
650 BIO_Random_Seek (isr->Disk_Ptr, &rbs); /* Position in invf file */
651
652 B = isr->Disk_B;
653
654 for (j = 0; j < chunks_read; j++)
655 {
656 int p;
657 p = BIO_Random_Gamma_Decode (&rbsi[j], NULL) - 1;
658
659 if (p)
660 {
661 int ptr, b;
662 chunk_ptrs[j] += BIO_Random_Delta_Decode (&rbsi[j], NULL) - 1;
663 ptr = chunk_ptrs[j];
664 b = 1 << floorlog_2 (BIO_Bblock_Init_W (chunk_data[j].N, p));
665
666 if (j == chunks_read - 1)
667 {
668 int k, CurrDoc;
669 DECODE_START ((u_char *) MemoryBuffer, ptr)
670 CurrDoc = isr->Disk_Last;
671 for (k = 0; k < p; k++)
672 {
673 register unsigned long x, tf;
674 BBLOCK_DECODE (x, b);
675 if (k == 0)
676 x = x + chunk_data[j].start_doc - isr->Disk_Last;
677 CurrDoc += x;
678 BIO_Random_Bblock_Encode (x, B, &rbs, NULL);
679 if (InvfLevel >= 2)
680 {
681 UNARY_DECODE (tf);
682 BIO_Random_Gamma_Encode (tf, &rbs, NULL);
683 }
684 }
685 DECODE_DONE
686 isr->Disk_Last = CurrDoc;
687 }
688 else
689 {
690 int k, CurrDoc;
691 random_bitio_state *Chks = chks + j;
692 BIO_Random_Seek (chunk_data[j].disk_pos + ptr, Chks);
693 CurrDoc = isr->Disk_Last;
694 for (k = 0; k < p; k++)
695 {
696 register unsigned long x, tf;
697 x = BIO_Random_Bblock_Decode (b, Chks, NULL);
698 if (k == 0)
699 x = x + chunk_data[j].start_doc - isr->Disk_Last;
700 CurrDoc += x;
701 BIO_Random_Bblock_Encode (x, B, &rbs, NULL);
702 if (InvfLevel >= 2)
703 {
704 tf = BIO_Random_Unary_Decode (Chks, NULL);
705 BIO_Random_Gamma_Encode (tf, &rbs, NULL);
706 }
707 }
708 isr->Disk_Last = CurrDoc;
709 }
710 }
711 }
712
713 isr->Disk_Ptr = BIO_Random_Tell (&rbs);
714
715 }
716 if (chunks_read > 1)
717 {
718 int j;
719 for (j = 0; j < chunks_read - 1; j++)
720 {
721 Xfree (chks[j].Buf);
722 ChangeMemInUse (-chks[j].len);
723 }
724 Xfree (chks);
725 ChangeMemInUse (-(chunks_read - 1) * sizeof (random_bitio_state));
726 }
727
728 for (i = 0; i < chunks_read; i++)
729 {
730 Xfree (rbsi[i].Buf);
731 ChangeMemInUse (-rbsi[i].len);
732 }
733 Xfree (rbsi);
734 ChangeMemInUse (-chunks_read * sizeof (random_bitio_state));
735 chunks_read = 0;
736 Xfree (chunk_ptrs);
737 ChangeMemInUse (-chunks_read * sizeof (unsigned long));
738 Disk_pos = 0;
739 BIO_Random_Seek (0, &crbs);
740}
741
742static void
743MergeIn (void)
744{
745 static int disk_chunks = 0;
746 static header = 0;
747 if (!header)
748 {
749 fprintf (stderr, "ivf.pass2 : ");
750 header = 1;
751 }
752 if (disk_chunks == ChunkLimit || next_docs_left == 0)
753 {
754 fprintf (stderr, "M");
755 DiskMerge ();
756 disk_chunks = 0;
757 }
758 else
759 {
760 fprintf (stderr, "-");
761 DumpChunk ();
762 disk_chunks++;
763 }
764 if (next_docs_left == 0)
765 fprintf (stderr, "\n");
766}
767
768
/*
 * qsort() comparison function for the word_list array.
 *
 * The elements are unsigned long, so they are compared as such.  The
 * former version read them through `int *' and subtracted, which looks at
 * the wrong half of each element on big-endian machines where long is
 * wider than int, and can overflow.
 *
 * Returns a negative, zero or positive value as *a is less than, equal to
 * or greater than *b.
 */
static int
wl_comp (const void *a, const void *b)
{
  unsigned long lhs = *((const unsigned long *) a);
  unsigned long rhs = *((const unsigned long *) b);

  return (lhs > rhs) - (lhs < rhs);
}
774
775static int
776process_doc (u_char * s_in, int l_in)
777{
778 int res;
779 u_char *end = s_in + l_in - 1;
780 unsigned long tocode;
781 unsigned long wl_pos = 0;
782
783 if (!docs_left)
784 LoadCounts ();
785
786 callnum++;
787
788 if (!inaword (s_in, end))
789 if (SkipSGML)
790 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
791 else
792 PARSE_NON_STEM_WORD (s_in, end);
793
794 while (s_in <= end)
795 {
796 u_char Word[MAXSTEMLEN + 1];
797
798 PARSE_STEM_WORD (Word, s_in, end);
799 stemmer (idh.stem_method, idh.stemmer_num, Word);
800 if (SkipSGML)
801 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
802 else
803 PARSE_NON_STEM_WORD (s_in, end);
804
805 if (*Word == 0)
806 continue;
807
808 res = perf_hash (phd, Word);
809
810 {
811 word_rec *arr = &WordRecs[res];
812 int b = 1 << lg_bs[res];
813 wordnum++;
814
815 tocode = callnum;
816
817 ENCODE_START ((u_char *) MemoryBuffer, arr->ptr)
818
819 if (tocode > arr->last)
820 {
821 register int x;
822 x = tocode - arr->last - 1;
823 BBLOCK_ENCODE (x + 1, b);
824 if (InvfLevel >= 2)
825 ENCODE_BIT (1);
826 no_of_ptrs++;
827 arr->last = tocode;
828 }
829 else if (InvfLevel >= 2)
830 {
831 __pos--;
832 ENCODE_BIT (0);
833 ENCODE_BIT (1);
834 }
835 arr->ptr = __pos;
836 ENCODE_DONE
837 }
838
839 if (MakeWeights)
840 {
841 if (wl_pos >= wl_size)
842 {
843 wl_size += (wl_size >> 1);
844 word_list = Xrealloc (word_list, sizeof (*word_list) * wl_size);
845 }
846 word_list[wl_pos++] = res;
847 }
848 }
849 if (MakeWeights)
850 {
851 float doc_weight = 0.0;
852 if (wl_pos)
853 {
854 unsigned long *wl = word_list;
855 unsigned long i, count, val;
856 qsort (wl, wl_pos, sizeof (*wl), wl_comp);
857 count = 1;
858 val = *wl++;
859 for (i = 1; i <= wl_pos; i++, wl++)
860 if (i == wl_pos || val != *wl)
861 {
862 double weight = count * idf[val];
863 doc_weight += weight * weight;
864 count = 1;
865 val = *wl;
866 }
867 else
868 count++;
869 }
870 HTONF(doc_weight); /* [RPAP - Jan 97: Endian Ordering] */
871 fwrite ((char *) &doc_weight, sizeof (doc_weight), 1, weights);
872 }
873 docs_left--;
874 if (!docs_left)
875 MergeIn ();
876
877 return COMPALLOK;
878}
879
880int
881process_ivf_2 (u_char * s_in, int l_in)
882{
883 if (InvfLevel <= 2)
884 return process_doc (s_in, l_in);
885 else
886 {
887 int count = 0;
888 int pos = 0;
889 u_char *start = s_in;
890 while (pos < l_in)
891 {
892 if (s_in[pos] == TERMPARAGRAPH)
893 {
894 int len = pos + s_in + 1 - start;
895 if (process_doc (start, len) != COMPALLOK)
896 return (COMPERROR);
897 start = s_in + pos + 1;
898 count++;
899 }
900 pos++;
901 }
902 if (start < s_in + pos)
903 {
904 if (process_doc (start, pos + s_in - start) != COMPALLOK)
905 return (COMPERROR);
906 count++;
907 }
908 HTONSI(count); /* [RPAP - Jan 97: Endian Ordering] */
909 fwrite ((char *) &count, sizeof (count), 1, invf_para);
910 }
911 return COMPALLOK;
912}
913
914
915
916
917
918static void
919stats (unsigned long len)
920{
921#ifndef SILENT
922 fseek (count, 0, 2);
923 fseek (count_trans, 0, 2);
924 fseek (invf_state, 0, 2);
925 fseek (invf, 0, 0);
926 fseek (invf, 0, 2);
927 fseek (chunks, 0, 2);
928 fseek (chunk_state, 0, 2);
929 Message ("File sizes\n");
930 Message (" Chunk desc : %10u bytes\n", ftell (count));
931 Message (" Chunk trans : %10u bytes\n", ftell (count_trans));
932 Message (" Chunks : %10u bytes\n", ftell (chunks));
933 Message (" Chunk state : %10u bytes\n", ftell (chunk_state));
934 Message (" Invf state : %10u bytes\n", ftell (invf_state));
935 Message (" Peak invf : %10u bytes\n", len);
936 Message (" Final invf : %10u bytes\n", ftell (invf));
937 Message ("Peak disk usage : %10.2f %%\n",
938 (double) (ftell (count) + ftell (count_trans) +
939 ftell (invf_state) + ftell (chunks) +
940 ftell (chunk_state) + len) / ftell (invf) * 100.0);
941#endif
942}
943
944
945/* ARGSUSED */
946int
947done_ivf_2 (char *FileName)
948{
949 long i;
950 unsigned long totalIbits;
951 unsigned long invf_len;
952 unsigned long bytes_output;
953 struct invf_file_header ifh;
954
955 if (weights)
956 fclose (weights);
957 if (invf_para)
958 fclose (invf_para);
959
960 free_perf_hash (phd);
961
962 free (MemoryBuffer);
963 ChangeMemInUse (-MemBufSize);
964
965 BIO_Random_Done (&rbs);
966 BIO_Random_Done (&rbsp);
967 fflush (invf);
968
969 fseek (invf, 0, 2);
970 invf_len = ftell (invf);
971
972 fseek (invf_out, sizeof (long), 0);
973 /* [RPAP - Jan 97: Endian Ordering] */
974 HTONUL2(dict_size, ifh.no_of_words);
975 HTONUL2(no_of_ptrs, ifh.no_of_ptrs);
976 ifh.skip_mode = 0;
977 bzero ((char *) ifh.params, sizeof (ifh.params));
978 HTONUL2(InvfLevel, ifh.InvfLevel);
979 fwrite ((char *) &ifh, sizeof (ifh), 1, invf_out);
980
981 bytes_output = ftell (invf_out);
982
983 totalIbits = 32; /* The magic number */
984 totalIbits += 8 * 100; /* A 100 byte gap */
985
986 /* find the right place in the file to start reading p values */
987 fseek (dict, sizeof (unsigned long) + sizeof (struct invf_dict_header), 0);
988 for (i = 0; i < dict_size; i++)
989 {
990 invf_state_rec *isr;
991 unsigned long fcnt, wcnt, s, e;
992 register unsigned long p;
993 u_char dummy1, dummy2[MAXSTEMLEN + 1];
994
995 /* output location to the invf_idx */
996 HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
997 fwrite ((char *) &bytes_output, sizeof (bytes_output), 1, invf_idx);
998 NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
999
1000 /* read an entry for a word, just to get p value */
1001 dummy1 = fgetc (dict);
1002 dummy1 = fgetc (dict);
1003 fread (dummy2, sizeof (u_char), dummy1, dict);
1004 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
1005 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
1006
1007 /* [RPAP - Jan 97: Endian Ordering] */
1008 NTOHUL(fcnt);
1009 NTOHUL(wcnt);
1010
1011 p = fcnt;
1012
1013 isr = in_cache (i);
1014
1015 e = (isr->Disk_Ptr + 7ul) >> 3ul;
1016 s = totalIbits >> 3;
1017
1018 fseek (invf_in, s, 0);
1019 while (s < e)
1020 {
1021 u_char c = getc (invf_in);
1022 if (s == e - 1)
1023 {
1024 u_char ands[8] =
1025 {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1026 c &= ands[isr->Disk_Ptr & 7ul];
1027 }
1028 putc (c, invf_out);
1029 bytes_output++;
1030 s++;
1031 }
1032
1033 totalIbits += BIO_Bblock_Bound_b (N, p, isr->Disk_B);
1034 if (InvfLevel >= 2)
1035 totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
1036 totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
1037
1038 }
1039
1040 fclose (invf_in);
1041
1042 /* [RPAP - Feb 97: WIN32 Port] */
1043#ifdef __WIN32__
1044 if (!(_chsize (_fileno (invf_out), bytes_output)))
1045 Message ("Could not truncate invf.");
1046#else
1047 ftruncate (fileno (invf_out), bytes_output);
1048#endif
1049
1050 fclose (invf_out);
1051
1052 HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1053 fwrite ((char *) &bytes_output, sizeof (bytes_output), 1, invf_idx);
1054 NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
1055
1056 fclose (invf_idx);
1057
1058#ifndef SILENT
1059 {
1060 char *temp_str = msg_prefix;
1061 unsigned long total;
1062 msg_prefix = "ivf.pass2";
1063 stats (invf_len);
1064 Message ("Pass two data structures : %6.3f Mbyte\n",
1065 (double) totalDbytes / 1024 / 1024);
1066 total = totalDbytes;
1067 Message ("Pass two hash structure(s) : %6.3f Mbyte\n",
1068 (double) totalHbytes / 1024 / 1024);
1069 total += totalHbytes;
1070 Message ("Peak extra memory in use : %6.3f Mbyte\n",
1071 (double) MaxMemInUse / 1024 / 1024);
1072 total += MaxMemInUse;
1073 Message ("Peak total memory in use : %6.3f Mbyte\n",
1074 (double) total / 1024 / 1024);
1075 msg_prefix = temp_str;
1076 }
1077#endif
1078
1079 Xfree (WordRecs);
1080 Xfree (lg_bs);
1081
1082 /* Free the memory allocated for the BIO_Random */
1083 occur_to_lexical (-1);
1084
1085 BIO_Random_Done (&crbs);
1086
1087 fclose (invf);
1088 fclose (dict);
1089 fclose (hash);
1090 fclose (count);
1091 fclose (count_trans);
1092 fclose (chunk_state);
1093 fclose (chunks);
1094 fclose (invf_state);
1095 return (COMPALLOK);
1096}
Note: See TracBrowser for help on using the repository browser.