source: gsdl/trunk/trunk/mg/src/text/mg_invf_merge.c@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change commited in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 24.8 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_merge.c -- merge the lexicons and inverted files of an old and a new collection
4 * Copyright (C) 1995 Shane Hudson ([email protected])
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * Last edited: 31 March, 1995
21 * $Id: mg_invf_merge.c 16583 2008-07-29 10:20:36Z davidb $
22 *
23 **************************************************************************/
24
25#include "sysfuncs.h"
26
27#include "memlib.h"
28#include "locallib.h"
29#include "local_strings.h"
30#include "messages.h"
31#include "timing.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "bitio_m.h"
35#include "bitio_gen.h"
36#include "bitio_stdio.h"
37
38#include "mg.h"
39#include "mg_merge.h"
40#include "mg_files.h"
41#include "invf.h"
42#include "words.h"
43
44typedef char FileName[256];
45
46
47/*
48 * The merge_info struct contains all the data for an inverted file and
49 * lexicon. We declare 3 merge_info variables, m[OLD] m[NEW] and m[MERGE].
50 */
51typedef struct merge_info_type
52 {
53 FileName fname;
54 FILE *invf;
55 FILE *dict;
56 FILE *idx;
57 FILE *para; /* [RPAP - Feb 97: Level 3 Merge] */
58 struct invf_dict_header idh;
59 struct invf_file_header ifh;
60 u_long nDocs;
61
62 /* Lexicon processing variables */
63 u_long fcnt, wcnt;
64 int suff, pref;
65 unsigned char term[MAXSTEMLEN + 1];
66 u_long term_count;
67 int done;
68
69 /* Variables used for merging */
70 struct stdio_bitio_state sbs; /* for bit-based I/O on inverted files */
71 u_long nTerms; /* = number of invf entries */
72 int *fcntlist; /* store fcnt values read in from the lexicons */
73
74 }
75merge_info;
76
77
78/***************************************************
79 **** GLOBALS ****
80 ***************************************************/
81
82merge_info m[3]; /* m[OLD], m[NEW] and m[MERGE] */
83u_long magicsize; /* how big is magic number? currently always 4 */
84u_long Nstatic; /* static # of documents in file */
85
86float *DocWeightBuffer;
87int weightOption = 0; /* true if weights file is to be appended */
88
89/*** Globals for processing the lexicons -- see readTerm() and writeTerm()
90 ***/
91ProgTime start; /* for printing elapsed time */
92unsigned char prevTerm[MAXSTEMLEN + 1];
93
94/*** Globals for merging -- see processEntry()
95 ***/
96char *mergedata; /* stores value OLD/NEW/MERGE for each term */
97int fastMerge = 1; /* True if fast merge selected. Default = true */
98
99u_long oldIdxValue; /* stores last read value from m[OLD].idx
100 Used to calculate entry lengths */
101u_long bytes_output; /* for writing .invf.idx file */
102u_long oldOnlyBits = 0; /* entries in old only */
103u_long oldMergeBits = 0; /* entries in old that are merged with
104 a new entry */
105
106/**************************************************************************/
107
108
109/*=======================================================================
110 * init_merge_invf(): open files, initialise globals, etc
111 *=======================================================================*/
112int
113init_merge_invf ()
114{
115 prevTerm[0] = prevTerm[1] = 0;
116
117 /***
118 * open .dict files
119 ***/
120 m[OLD].dict = open_file (m[OLD].fname, INVF_DICT_SUFFIX, "rb",
121 MAGIC_STEM_BUILD, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
122 magicsize = ftell (m[OLD].dict);
123 fread (&(m[OLD].idh), sizeof (m[OLD].idh), 1, m[OLD].dict);
124
125 /* [RPAP - Jan 97: Endian Ordering] */
126 NTOHUL(m[OLD].idh.lookback);
127 NTOHUL(m[OLD].idh.dict_size);
128 NTOHUL(m[OLD].idh.total_bytes);
129 NTOHUL(m[OLD].idh.index_string_bytes);
130 NTOHD(m[OLD].idh.input_bytes); /* [RJM 07/97: 4G limit] */
131 NTOHUL(m[OLD].idh.num_of_docs);
132 NTOHUL(m[OLD].idh.static_num_of_docs);
133 NTOHUL(m[OLD].idh.num_of_words);
134 NTOHUL(m[OLD].idh.stemmer_num);
135 NTOHUL(m[OLD].idh.stem_method);
136
137 m[NEW].dict = open_file (m[NEW].fname, INVF_DICT_SUFFIX, "rb",
138 MAGIC_STEM_BUILD, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
139 fread (&(m[NEW].idh), sizeof (m[NEW].idh), 1, m[NEW].dict);
140
141 /* [RPAP - Jan 97: Endian Ordering] */
142 NTOHUL(m[NEW].idh.lookback);
143 NTOHUL(m[NEW].idh.dict_size);
144 NTOHUL(m[NEW].idh.total_bytes);
145 NTOHUL(m[NEW].idh.index_string_bytes);
146 NTOHD(m[NEW].idh.input_bytes); /* [RJM 07/97: 4G limit] */
147 NTOHUL(m[NEW].idh.num_of_docs);
148 NTOHUL(m[NEW].idh.static_num_of_docs);
149 NTOHUL(m[NEW].idh.num_of_words);
150 NTOHUL(m[NEW].idh.stemmer_num);
151 NTOHUL(m[NEW].idh.stem_method);
152
153 m[MERGE].dict = create_file (m[MERGE].fname, INVF_DICT_SUFFIX, "wb",
154 MAGIC_STEM_BUILD, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
155
156 /* write space for header */
157 fwrite ((char *) &(m[OLD].idh), sizeof (m[OLD].idh), 1, m[MERGE].dict);
158
159 m[OLD].nDocs = m[OLD].idh.num_of_docs;
160 m[NEW].nDocs = m[NEW].idh.num_of_docs;
161 Nstatic = m[OLD].idh.static_num_of_docs;
162 m[OLD].nTerms = m[OLD].idh.dict_size;
163 m[NEW].nTerms = m[NEW].idh.dict_size;
164 m[MERGE].nDocs = m[OLD].nDocs + m[NEW].nDocs;
165 m[MERGE].nTerms = 0;
166
167 /* Set up weight buffer for weights of new documents */
168 if (!(DocWeightBuffer = Xmalloc (m[NEW].nDocs * sizeof (*DocWeightBuffer))))
169 {
170 Message ("Insufficient memory\n");
171 return COMPERROR;
172 }
173 bzero ((char *) DocWeightBuffer, m[NEW].nDocs * sizeof (*DocWeightBuffer));
174
175 /***
176 * open .invf files
177 * Try ".ORG" extension first for m[OLD].invf since an inverted file
178 * with skips may have been created.
179 ****/
180 if (!(m[OLD].invf = open_file (m[OLD].fname, INVF_SUFFIX ".ORG", "rb",
181 MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
182 m[OLD].invf = open_file (m[OLD].fname, INVF_SUFFIX, "rb",
183 MAGIC_INVF, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
184 fread (&(m[OLD].ifh), sizeof (m[OLD].ifh), 1, m[OLD].invf);
185
186 /* [RPAP - Jan 97: Endian Ordering] */
187 NTOHUL(m[OLD].ifh.no_of_words);
188 NTOHUL(m[OLD].ifh.no_of_ptrs);
189 NTOHUL(m[OLD].ifh.skip_mode);
190 {
191 int i;
192 for (i = 0; i < 16; i++)
193 NTOHUL(m[OLD].ifh.params[i]);
194 }
195 NTOHUL(m[OLD].ifh.InvfLevel);
196
197 m[NEW].invf = open_file (m[NEW].fname, INVF_SUFFIX, "rb",
198 MAGIC_INVF, MG_ABORT);
199 fread (&(m[NEW].ifh), sizeof (m[NEW].ifh), 1, m[NEW].invf);
200
201 /* [RPAP - Jan 97: Endian Ordering] */
202 NTOHUL(m[NEW].ifh.no_of_words);
203 NTOHUL(m[NEW].ifh.no_of_ptrs);
204 NTOHUL(m[NEW].ifh.skip_mode);
205 {
206 int i;
207 for (i = 0; i < 16; i++)
208 NTOHUL(m[NEW].ifh.params[i]);
209 }
210 NTOHUL(m[NEW].ifh.InvfLevel);
211
212 if (m[OLD].ifh.skip_mode != 0)
213 FatalError (1, "The old invf file contains skips. Unable to merge.");
214
215 if (m[OLD].ifh.InvfLevel != m[NEW].ifh.InvfLevel)
216 FatalError (1, "The two invf files have different inversion levels!");
217
218 /********** [RPAP - Feb 97: Level 3 Merge]
219
220 if ((m[OLD].ifh.InvfLevel > 2) || (m[NEW].ifh.InvfLevel > 2))
221 FatalError (1, "mgmerge cannot handle level 3 inverted files!");
222
223 *************/
224
225 m[MERGE].invf = create_file (m[MERGE].fname, INVF_SUFFIX, "wb",
226 MAGIC_INVF, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
227
228 /* write space for header in inverted file */
229 m[MERGE].ifh.no_of_words = m[MERGE].ifh.no_of_ptrs = 0;
230 m[MERGE].ifh.skip_mode = 0;
231 m[MERGE].ifh.InvfLevel = 2;
232 {
233 int i;
234 for (i = 0; i < 16; i++)
235 m[MERGE].ifh.params[i] = 0;
236 }
237 fwrite ((char *) &(m[MERGE].ifh), sizeof (m[MERGE].ifh), 1, m[MERGE].invf);
238
239 /***
240 * open .invf.idx files
241 ***/
242 m[OLD].idx = open_file (m[OLD].fname, INVF_IDX_SUFFIX, "rb",
243 MAGIC_INVI, MG_ABORT);
244 fread (&oldIdxValue, sizeof (u_long), 1, m[OLD].idx);
245 NTOHUL(oldIdxValue); /* [RPAP - Jan 97: Endian Ordering] */
246
247 m[NEW].idx = open_file (m[NEW].fname, INVF_IDX_SUFFIX, "rb",
248 MAGIC_INVI, MG_ABORT);
249
250 m[MERGE].idx = create_file (m[MERGE].fname, INVF_IDX_SUFFIX, "wb",
251 MAGIC_INVI, MG_ABORT);
252
253
254 /* [RPAP - Feb 97: Level 3 Merge] */
255
256 /***
257 * open .invf.parargraph files
258 ***/
259 if (m[OLD].ifh.InvfLevel == 3)
260 {
261 m[OLD].para = NULL;
262 m[NEW].para = open_file (m[NEW].fname, INVF_PARAGRAPH_SUFFIX, "rb",
263 MAGIC_PARAGRAPH, MG_ABORT);
264 m[MERGE].para = open_file (m[MERGE].fname, INVF_PARAGRAPH_SUFFIX, "rb+",
265 MAGIC_PARAGRAPH, MG_ABORT);
266 }
267 else
268 {
269 m[OLD].para = NULL;
270 m[NEW].para = NULL;
271 m[MERGE].para = NULL;
272 }
273
274 return OK;
275}
276
277
278/*=======================================================================
279 * procEntry_slow(): merge an entry, slow method
280 *=======================================================================*/
281void
282procEntry_slow (int tnum, int type, u_long oldN, u_long newN, u_long mergeN)
283/*
284 * Process an invf entry.
285 * Slow Merge. (Always decode/re-encode).
286 *
287 * tnum is the entry number, type = OLD or NEW or MERGE.
288 * oldN, newN and mergeN are the N parameters for Bblock coding.
289 * They are passed so other procEntry_fast can call this one
290 * when a decode/recode is needed to save repeating code.
291 */
292{
293 int doc = 0;
294 int fcntOLD = m[OLD].fcntlist[tnum];
295 int fcntNEW = m[NEW].fcntlist[tnum];
296 int inblk, outblk;
297 u_long inbits = 0, outbits = 0;
298
299 /* write .invf.idx pointer */
300 HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
301 fwrite (&bytes_output, sizeof (u_long), 1, m[MERGE].idx);
302 NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
303
304 outblk = BIO_Bblock_Init (mergeN, (fcntOLD + fcntNEW));
305
306/*** OLD entry -> merge entry ***/
307 if (type != NEW)
308 {
309 int i;
310 inblk = BIO_Bblock_Init (oldN, fcntOLD);
311 for (i = 0; i < fcntOLD; i++)
312 {
313 int num;
314 num = BIO_Stdio_Bblock_Decode (inblk, &(m[OLD].sbs), &inbits);
315 doc += num;
316 BIO_Stdio_Bblock_Encode (num, outblk, &(m[MERGE].sbs), &outbits);
317 if (m[OLD].ifh.InvfLevel >= 2) /* frequencies */
318 {
319 num = BIO_Stdio_Gamma_Decode (&(m[OLD].sbs), &inbits);
320 BIO_Stdio_Gamma_Encode (num, &(m[MERGE].sbs), &outbits);
321 }
322 }
323 if (type == OLD)
324 oldOnlyBits += (inbits + m[OLD].sbs.Btg);
325 else
326 oldMergeBits += (inbits + m[OLD].sbs.Btg);
327 }
328
329/*** NEW entry -> merge entry ***/
330 if (type != OLD)
331 {
332 int i;
333 int offset;
334 double logN = log ((double) m[MERGE].nDocs);
335 double idf = logN - log ((double) (fcntOLD + fcntNEW));
336
337 inblk = BIO_Bblock_Init (newN, fcntNEW);
338 offset = (m[OLD].nDocs - doc); /* amount to be added to 1ST pointer */
339 for (i = 0; i < fcntNEW; i++)
340 {
341 register int num;
342 register double weight;
343 num = BIO_Stdio_Bblock_Decode (inblk, &(m[NEW].sbs), &inbits);
344 if (i == 0)
345 num += offset;
346 doc += num;
347 BIO_Stdio_Bblock_Encode (num, outblk, &(m[MERGE].sbs), &outbits);
348 if (m[OLD].ifh.InvfLevel >= 2) /* frequencies */
349 {
350 num = BIO_Stdio_Gamma_Decode (&(m[NEW].sbs), &inbits);
351 BIO_Stdio_Gamma_Encode (num, &(m[MERGE].sbs), &outbits);
352 weight = num * idf;
353 DocWeightBuffer[doc - m[OLD].nDocs - 1] += weight * weight;
354 }
355 }
356 }
357/*** Now the padding bits ***/
358 while (m[OLD].sbs.Btg)
359 BIO_Stdio_Decode_Bit (&(m[OLD].sbs));
360 while (m[NEW].sbs.Btg)
361 BIO_Stdio_Decode_Bit (&(m[NEW].sbs));
362 while (m[MERGE].sbs.Btg != 8)
363 {
364 BIO_Stdio_Encode_Bit (0, &(m[MERGE].sbs));
365 outbits++;
366 }
367 bytes_output += (outbits >> 3);
368 return;
369}
370
371
372/*=======================================================================
373 * procEntry_fast(): merge an entry, WITHOUT decoding if it's in the
374 * old inverted file only
375 *=======================================================================*/
376void
377procEntry_fast (int tnum, int type)
378/*
379 * Merge an invf entry.
380 * Faster Method : Copy (don't re-code) entries for terms only in IFold.
381 * The N parameter is Nstatic, from m[OLD].idh.static_num_of_docs.
382 * If entry type isnt OLD, just call procEntry_slow()
383 */
384{
385 u_long newIdxValue, len = 0;
386
387 /* Calculate IFold entry length in bytes if not NEW type */
388 if (type != NEW) /* read in an index number from m[OLD].idx */
389 {
390 fread (&newIdxValue, sizeof (u_long), 1, m[OLD].idx);
391 NTOHUL(newIdxValue); /* [RPAP - Jan 97: Endian Ordering] */
392 len = newIdxValue - oldIdxValue;
393 oldIdxValue = newIdxValue;
394 }
395
396 if (type != OLD) /* must decode/recode */
397 procEntry_slow (tnum, type, Nstatic, m[NEW].nDocs, Nstatic);
398 else
399 {
400 /* write .invf.idx pointer */
401 HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
402 fwrite (&bytes_output, sizeof (u_long), 1, m[MERGE].idx);
403 NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
404
405 /* copy entry */
406 {
407 int i;
408 char c;
409 for (i = 0; i < len; i++)
410 {
411 c = fgetc (m[OLD].invf);
412 fputc (c, m[MERGE].invf);
413 }
414 }
415 bytes_output += len;
416 oldOnlyBits += (len * 8);
417 }
418 return;
419}
420
421
422/*=======================================================================
423 * processEntry(): merge an entry. Call either procEntry_slow() or
424 * procEntry_fast.
425 *=======================================================================*/
426void
427processEntry (int tnum, int type)
428/*
429 * Read an entry from IFold and/or IFnew, and write to IFmerge.
430 * tnum is the term number, type is OLD or NEW or MERGE
431 */
432{
433 if (fastMerge)
434 procEntry_fast (tnum, type);
435 else
436 procEntry_slow (tnum, type, Nstatic, m[NEW].nDocs, m[MERGE].nDocs);
437 return;
438}
439
440
441/*=======================================================================
442 * readTerm()
443 *=======================================================================*/
444void
445readTerm (int x)
446/*
447 * Read the next term from the appropriate ".invf.dict" file.
448 * x should be either OLD or NEW.
449 * Sets m[x].done to true when that lexicon has been completely read.
450 */
451{
452 int i;
453
454 if (m[x].term_count >= m[x].nTerms)
455 {
456 m[x].term[0] = 0;
457 m[x].done = 1;
458 return;
459 }
460 m[x].pref = fgetc (m[x].dict);
461 m[x].suff = fgetc (m[x].dict);
462 m[x].term[0] = m[x].pref + m[x].suff;
463
464 for (i = 0; i < m[x].suff; i++)
465 {
466 m[x].term[m[x].pref + i + 1] = fgetc (m[x].dict);
467 }
468 m[x].term_count = m[x].term_count + 1;
469
470 fread ((char *) &(m[x].fcnt), sizeof (u_long), 1, m[x].dict);
471 fread ((char *) &(m[x].wcnt), sizeof (u_long), 1, m[x].dict);
472
473 /* [RPAP - Jan 97: Endian Ordering] */
474 NTOHUL(m[x].fcnt);
475 NTOHUL(m[x].wcnt);
476
477 return;
478}
479
480
481/*=======================================================================
482 * writeTerm()
483 *=======================================================================*/
484void
485writeTerm (int x)
486/*
487 * Write the current term to m[MERGE].dict file.
488 * x is OLD, NEW or MERGE.
489 */
490{
491 unsigned char i, prefix, suffix;
492 if (x == MERGE)
493 {
494 m[MERGE].fcnt = m[OLD].fcnt + m[NEW].fcnt;
495 m[MERGE].wcnt = m[OLD].wcnt + m[NEW].wcnt;
496 m[MERGE].pref = m[OLD].pref;
497 m[MERGE].suff = m[OLD].suff;
498 memcpy (m[MERGE].term, m[OLD].term, m[OLD].term[0] + 1);
499 }
500 if (x == OLD)
501 {
502 m[MERGE].fcnt = m[OLD].fcnt;
503 m[MERGE].wcnt = m[OLD].wcnt;
504 }
505 if (x == NEW)
506 {
507 m[MERGE].fcnt = m[NEW].fcnt;
508 m[MERGE].wcnt = m[NEW].wcnt;
509 }
510 prefix = prefixlen (prevTerm, m[x].term);
511 suffix = m[x].term[0] - prefix;
512 fputc (prefix, m[MERGE].dict); /* prefix length */
513 fputc (suffix, m[MERGE].dict); /* suffix length */
514 for (i = 0; i < suffix; i++)
515 fputc (m[x].term[prefix + i + 1], m[MERGE].dict);
516
517 /* [RPAP - Jan 97: Endian Ordering] */
518 HTONUL(m[MERGE].fcnt);
519 HTONUL(m[MERGE].wcnt);
520
521 fwrite ((char *) &(m[MERGE].fcnt), sizeof (m[MERGE].fcnt), 1, m[MERGE].dict);
522 fwrite ((char *) &(m[MERGE].wcnt), sizeof (m[MERGE].wcnt), 1, m[MERGE].dict);
523
524 /* [RPAP - Jan 97: Endian Ordering] */
525 NTOHUL(m[MERGE].fcnt);
526 NTOHUL(m[MERGE].wcnt);
527
528 memcpy (prevTerm, m[x].term, prefix + suffix + 1);
529 m[MERGE].idh.total_bytes += m[x].term[0] + 1;
530 m[MERGE].idh.index_string_bytes += m[x].term[0] + 2 - prefix;
531 fflush (m[MERGE].dict);
532}
533
534
535/*=======================================================================
536 * process_merge_invf(): The main loop
537 *=======================================================================*/
538int
539process_merge_invf (void)
540/*
541 * The main function to merge the lexicons and inverted files.
542 */
543{
544/*** Some extra Initialisation stuff first ***/
545
546 m[OLD].term[0] = m[NEW].term[0] = '\0';
547 m[OLD].term_count = m[NEW].term_count = 0;
548 m[OLD].done = m[NEW].done = 0;
549 m[MERGE].idh.total_bytes = m[MERGE].idh.index_string_bytes = 0;
550
551/*** malloc arrays: mergedata, m[OLD].fcntlist, m[NEW].fcntlist ***/
552 mergedata = malloc ((m[OLD].nTerms + m[NEW].nTerms) * sizeof (char));
553 if (mergedata == 0)
554 {
555 fprintf (stderr, "MALLOC error!\n");
556 exit (1);
557 }
558 {
559 int i;
560 for (i = 0; i < 2; i++)
561 {
562 m[i].fcntlist = malloc ((m[OLD].nTerms + m[NEW].nTerms) * sizeof (int));
563 if (m[i].fcntlist == 0)
564 {
565 fprintf (stderr, "MALLOC error!\n");
566 exit (1);
567 }
568 }
569 }
570
571/*========================*/
572/*** LEXICON PASS ***/
573/*========================*/
574
575/*** read first terms ***/
576 readTerm (OLD);
577 readTerm (NEW);
578
579 m[MERGE].nTerms = 0;
580 while (m[OLD].done == 0 || m[NEW].done == 0)
581 {
582 int i;
583 if (m[OLD].done)
584 i = 1; /* NEW will always be greater */
585 else if (m[NEW].done)
586 i = -1; /* OLD will always be greater */
587 else
588 i = casecompare (m[OLD].term, m[NEW].term);
589
590 if (i < 0)
591 { /* term in OLD only */
592 mergedata[m[MERGE].nTerms] = (char) OLD;
593 m[OLD].fcntlist[m[MERGE].nTerms] = m[OLD].fcnt;
594 m[NEW].fcntlist[m[MERGE].nTerms] = 0;
595 m[MERGE].nTerms++;
596 writeTerm (OLD);
597 readTerm (OLD);
598 }
599 if (i == 0)
600 { /* term in both lexions */
601 mergedata[m[MERGE].nTerms] = (char) MERGE;
602 m[OLD].fcntlist[m[MERGE].nTerms] = m[OLD].fcnt;
603 m[NEW].fcntlist[m[MERGE].nTerms] = m[NEW].fcnt;
604 m[MERGE].nTerms++;
605 writeTerm (MERGE);
606 readTerm (OLD);
607 readTerm (NEW);
608 }
609 if (i > 0)
610 { /* term in NEW only */
611 mergedata[m[MERGE].nTerms] = (char) NEW;
612 m[NEW].fcntlist[m[MERGE].nTerms] = m[NEW].fcnt;
613 m[OLD].fcntlist[m[MERGE].nTerms] = 0;
614 m[MERGE].nTerms++;
615 writeTerm (NEW);
616 readTerm (NEW);
617 }
618 }/*while*/
619 Message ("%s\n", ElapsedTime (&start, NULL));
620 /* print some results about terms */
621 {
622 fprintf (stderr, " Terms: OLD = %ld, NEW = %ld, MERGE = %ld\n",
623 m[OLD].nTerms, m[NEW].nTerms, m[MERGE].nTerms);
624 fprintf (stderr, " In OLD only: %ld, In NEW only: %ld, BOTH: %ld\n",
625 (m[MERGE].nTerms - m[NEW].nTerms),
626 (m[MERGE].nTerms - m[OLD].nTerms),
627 (m[OLD].nTerms + m[NEW].nTerms - m[MERGE].nTerms));
628 }
629
630
631/*========================*/
632/*** INVF PASS ***/
633/*========================*/
634
635 bytes_output = ftell (m[MERGE].invf);
636 BIO_Stdio_Decode_Start (m[OLD].invf, &m[OLD].sbs);
637 BIO_Stdio_Decode_Start (m[NEW].invf, &m[NEW].sbs);
638 BIO_Stdio_Encode_Start (m[MERGE].invf, &m[MERGE].sbs);
639 {
640 int i;
641 for (i = 0; i < m[MERGE].nTerms; i++)
642 {
643 processEntry (i, mergedata[i]);
644 }
645 /* write final .invf.idx pointer */
646 HTONUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
647 fwrite (&bytes_output, sizeof (u_long), 1, m[MERGE].idx);
648 NTOHUL(bytes_output); /* [RPAP - Jan 97: Endian Ordering] */
649
650 fprintf (stderr, "Old invf: old only bits: %ld, merged bits: %ld\n",
651 oldOnlyBits, oldMergeBits);
652 }
653
654 return OK;
655}
656
657
658/*=======================================================================
659 * done_merge_invf(): write headers, close files, append weights to
660 * weights file
661 *=======================================================================*/
662int
663done_merge_invf (void)
664{
665/*** write new lexicon header ***/
666 m[MERGE].idh.dict_size = m[MERGE].nTerms;
667 m[MERGE].idh.lookback = m[OLD].idh.lookback;
668 m[MERGE].idh.input_bytes = m[OLD].idh.input_bytes + m[NEW].idh.input_bytes;
669 m[MERGE].idh.total_bytes = m[OLD].idh.total_bytes + m[NEW].idh.total_bytes;
670 m[MERGE].idh.num_of_docs = m[MERGE].nDocs;
671 m[MERGE].idh.num_of_words = m[OLD].idh.num_of_words + m[NEW].idh.num_of_words;
672 m[MERGE].idh.stemmer_num = m[OLD].idh.stemmer_num;
673 m[MERGE].idh.stem_method = m[OLD].idh.stem_method;
674
675 /* if fastMerge, static num of docs stays the same! */
676 if (fastMerge)
677 m[MERGE].idh.static_num_of_docs = m[OLD].idh.static_num_of_docs;
678 else
679 m[MERGE].idh.static_num_of_docs = m[MERGE].nDocs;
680
681 /* [RPAP - Jan 97: Endian Ordering] */
682 HTONUL(m[MERGE].idh.lookback);
683 HTONUL(m[MERGE].idh.dict_size);
684 HTONUL(m[MERGE].idh.total_bytes);
685 HTONUL(m[MERGE].idh.index_string_bytes);
686 HTOND(m[MERGE].idh.input_bytes); /* [RJM 07/97: 4G limit] */
687 HTONUL(m[MERGE].idh.num_of_docs);
688 HTONUL(m[MERGE].idh.static_num_of_docs);
689 HTONUL(m[MERGE].idh.num_of_words);
690 HTONUL(m[MERGE].idh.stemmer_num);
691 HTONUL(m[MERGE].idh.stem_method);
692
693 fseek (m[MERGE].dict, magicsize, 0);
694 fwrite (&(m[MERGE].idh), sizeof (struct invf_dict_header), 1, m[MERGE].dict);
695
696/*** write new inverted file header ***/
697 m[MERGE].ifh.no_of_words = m[MERGE].nTerms;
698 m[MERGE].ifh.no_of_ptrs = m[OLD].ifh.no_of_ptrs + m[NEW].ifh.no_of_ptrs;
699 m[MERGE].ifh.skip_mode = m[OLD].ifh.skip_mode;
700 m[MERGE].ifh.InvfLevel = m[OLD].ifh.InvfLevel;
701
702 /* ifh.params[16] -- I have NO IDEA what these do!!!!! */
703 /* only set if a inverted file is in skipped format, I think */
704 {
705 int i;
706 for (i = 0; i < 16; i++)
707 m[MERGE].ifh.params[i] = m[OLD].ifh.params[i];
708 }
709
710 /* [RPAP - Jan 97: Endian Ordering] */
711 HTONUL(m[MERGE].ifh.no_of_words);
712 HTONUL(m[MERGE].ifh.no_of_ptrs);
713 HTONUL(m[MERGE].ifh.skip_mode);
714 {
715 int i;
716 for (i = 0; i < 16; i++)
717 HTONUL(m[MERGE].ifh.params[i]);
718 }
719 HTONUL(m[MERGE].ifh.InvfLevel);
720
721 fseek (m[MERGE].invf, magicsize, 0); /* go to start of header */
722 fwrite (&(m[MERGE].ifh), sizeof (struct invf_file_header), 1, m[MERGE].invf);
723
724/*** Write new document weights --- leave old weights alone ***/
725 if ((weightOption) && (m[OLD].ifh.InvfLevel >= 2))
726 {
727 FILE *weightfile = open_file (m[MERGE].fname, WEIGHTS_SUFFIX, "r+b",
728 MAGIC_WGHT, MG_CONTINUE); /* [RPAP - Feb 97: WIN32 Port] */
729 if (weightfile)
730 {
731 u_long i; /* [RPAP - Jan 97: Endian Ordering] */
732
733 fprintf (stderr, "writing new weights: Nnew = %ld\n", m[NEW].nDocs);
734 fseek (weightfile, 0, 2); /* end of file */
735
736 /* [RPAP - Jan 97: Endian Ordering] */
737 for (i = 0; i < m[NEW].nDocs; i++)
738 HTONF(DocWeightBuffer[i]);
739
740 fwrite ((char *) DocWeightBuffer, sizeof (float), m[NEW].nDocs,
741 weightfile);
742 }
743 fclose (weightfile);
744 }
745
746
747 /* [RPAP - Feb 97: Level 3 Merge] */
748 if (m[OLD].ifh.InvfLevel == 3)
749 {
750 /* Update paragraph file */
751 int buffer_size = 1000;
752 u_char *buffer;
753 int num;
754
755 if (!(buffer = (u_char *) Xmalloc (buffer_size)))
756 FatalError (1, "Could not allocate memory for paragraph buffer");
757
758 fseek (m[MERGE].para, 0, SEEK_END);
759 while ((num = fread ((char *) buffer, sizeof (*buffer), buffer_size, m[NEW].para)) != 0)
760 fwrite ((char *) buffer, sizeof (*buffer), num, m[MERGE].para);
761
762 Xfree (buffer);
763 }
764
765/*** close files ***/
766 fclose (m[MERGE].invf);
767 fclose (m[OLD].invf);
768 fclose (m[NEW].invf);
769 fclose (m[MERGE].dict);
770 fclose (m[OLD].dict);
771 fclose (m[NEW].dict);
772 fclose (m[MERGE].idx);
773 fclose (m[OLD].idx);
774 fclose (m[NEW].idx);
775
776 /* [RPAP - Feb 97: Level 3 Merge] */
777 if (m[OLD].ifh.InvfLevel == 3)
778 {
779 fclose (m[NEW].para);
780 fclose (m[MERGE].para);
781 }
782
783 return OK;
784}
785
786
787/*=======================================================================
788 * usage()
789 *=======================================================================*/
/* Print the command line synopsis and an example invocation to stderr,
   then terminate the program with exit status 1. */
void
usage (char *progname)
{
  FILE *out = stderr;

  fprintf (out, "usage: %s [-s] [-w] [-d directory] -f mg_collection\n",
           progname);
  fprintf (out, " example: %s -s -w -f MERGE/mailfiles\n", progname);
  exit (1);
}
798
799
800/*=======================================================================
801 * main()
802 *=======================================================================*/
803int
804main (int argc, char *argv[])
805{
806 int ch;
807 char *progname;
808 progname = argv[0];
809 m[MERGE].fname[0] = '\0';
810
811 /* message and timing information */
812 msg_prefix = argv[0];
813
814 while ((ch = getopt (argc, argv, "f:d:swh")) != -1)
815 switch (ch)
816 {
817 case 'f':
818 strcpy (m[MERGE].fname, optarg);
819 break;
820 case 'd':
821 set_basepath (optarg);
822 break;
823 case 's':
824 fastMerge = 0;
825 break;
826 case 'w':
827 weightOption = 1;
828 break;
829 case 'h':
830 case '?':
831 default:
832 usage (progname);
833 }
834
835 if (m[MERGE].fname[0] == '\0')
836 usage (progname);
837 strcpy (m[OLD].fname, m[MERGE].fname);
838 strcat (m[OLD].fname, ".old");
839 strcpy (m[NEW].fname, m[MERGE].fname);
840 strcat (m[NEW].fname, ".new");
841
842 GetTime (&start);
843 init_merge_invf ();
844 process_merge_invf ();
845 done_merge_invf ();
846 Message ("%s\n", ElapsedTime (&start, NULL));
847 exit (0);
848}
849
850/*************************************************************************/
851/* EOF: mg_invf_merge.c */
852/*************************************************************************/
Note: See TracBrowser for help on using the repository browser.