source: trunk/indexers/mg/src/text/mg_invf_rebuild.c@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.3 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_rebuild.c -- Program to rebuild an inverted file with skipping
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_rebuild.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "timing.h"
29#include "bitio_m.h"
30#include "bitio_m_stdio.h"
31#include "bitio_gen.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "mg_files.h"
35#include "invf.h"
36#include "locallib.h"
37#include "words.h"
38#include "mg.h"
39
40#define LEN 40
41#define WRD 1
42
43typedef struct invf_info
44 {
45 unsigned long doc_num, count, bits_so_far, count_bits;
46 }
47invf_info;
48
49/*
50 $Log$
51 Revision 1.1 2003/02/20 21:18:24 mdewsnip
52 Addition of MG package for search and retrieval
53
54 Revision 1.1 1999/08/10 21:18:11 sjboddie
55 renamed mg-1.3d directory mg
56
57 Revision 1.2 1998/11/25 07:55:46 rjmcnab
58
59 Modified mg to that you can specify the stemmer you want
60 to use via a command line option. You specify it to
61 mg_passes during the build process. The number of the
62 stemmer that you used is stored within the inverted
63 dictionary header and the stemmed dictionary header so
64 the correct stemmer is used in later stages of building
65 and querying.
66
67 Revision 1.1 1998/11/17 09:35:10 rjmcnab
68 *** empty log message ***
69
70 * Revision 1.4 1994/11/29 00:32:03 tes
71 * Committing the new merged files and changes.
72 *
73 * Revision 1.3 1994/10/20 03:56:56 tes
74 * I have rewritten the boolean query optimiser and abstracted out the
75 * components of the boolean query.
76 *
77 * Revision 1.2 1994/09/20 04:41:51 tes
78 * For version 1.1
79 *
80 */
81
82static char *RCSID = "$Id: mg_invf_rebuild.c 3745 2003-02-20 21:20:24Z mdewsnip $";
83
84static char pathname[256];
85
86static void process_files (char *filename);
87
88static int k = -1;
89static int mode = 0;
90static int max_nodes = -1;
91static int mins = -1;
92
93void
94usage (char *pgname)
95{
96 fprintf (stderr, "usage: %s [-f input_file]"
97 "[-d data directory] -0|-1|-2 [-k num] [-m num] [-s num]\n", pgname);
98 printf ("Mode 0: (the default)\n");
99 printf ("\tk, m, and s have no meaning and if specified produce an error\n");
100 printf ("Mode 1:\n");
101 printf ("\tm, and s have no meaning and if specified produce an error\n");
102 printf ("\tk The size of skips\n");
103 printf ("Mode 2:\n");
104 printf ("\tk has no meaning and if specified will produce an error\n");
105 printf ("\tm is the the number of accumulators that will be used for the\n");
106 printf ("\t ranking. The program builds an inverted file that is \n");
107 printf ("\t \"optimal\" for that number of accumulators.\n");
108 printf ("\ts is the minimum size for skips.\n");
109 exit (1);
110}
111
112
113
114int main (int argc, char **argv)
115{
116 ProgTime start;
117 char *dir_name, *file_name = "";
118 int ch;
119 msg_prefix = argv[0];
120 dir_name = getenv ("MGDATA");
121 strcpy (pathname, dir_name ? dir_name : ".");
122 opterr = 0;
123 msg_prefix = argv[0];
124 while ((ch = getopt (argc, argv, "012hf:d:k:b:s:m:")) != -1)
125 switch (ch)
126 {
127 case '0':
128 mode = 0;
129 break;
130 case '1':
131 mode = 1;
132 break;
133 case '2':
134 mode = 2;
135 break;
136 case 'f': /* input file */
137 file_name = optarg;
138 break;
139 case 'd':
140 strcpy (pathname, optarg);
141 break;
142 case 'k':
143 k = atoi (optarg);
144 break;
145 case 'm':
146 max_nodes = atoi (optarg);
147 break;
148 case 's':
149 mins = atoi (optarg);
150 if (mins < 1)
151 FatalError (1, "The number for the -m option must be greater"
152 " than or equal to 1");
153 break;
154 case 'h':
155 case '?':
156 usage (argv[0]);
157 }
158
159 if ((mode == 0 && (k != -1 || max_nodes != -1 || mins != -1)) ||
160 (mode == 1 && (max_nodes != -1 || mins != -1)) ||
161 (mode == 2 && (k != -1)))
162 {
163 Message ("Illegal parameters for mode");
164 usage (argv[0]);
165 }
166
167 if (mode == 1 && k == -1)
168 {
169 k = 8;
170 Message ("k is required for mode 1: defaulting k to %d", k);
171 }
172 if (mode == 2 && max_nodes == -1)
173 {
174 max_nodes = 1024;
175 Message ("m is required for mode 2: defaulting m to %d", max_nodes);
176 }
177 if (mode == 2 && mins == -1)
178 {
179 mins = 1;
180 Message ("s is required for mode 2: defaulting s to %d", mins);
181 }
182
183
184 GetTime (&start);
185 process_files (file_name);
186 Message ("%s\n", ElapsedTime (&start, NULL));
187 Message ("**** Don\'t forget to rebuild the stemmed dictionary with mg_invf_dict. ****\n");
188 Message ("**** If the collection was built with stem indexes don\'t forget to ****\n");
189 Message ("**** rebuild them with mg_stem_idx. ****\n");
190
191 return 0;
192}
193
194
195
196
197
198static void
199process_files (char *filename)
200{
201 FILE *in, *out, *idx, *odx, *dict;
202 unsigned long magic, outmode, N, in_k, out_k;
203 unsigned long bits_out, bytes_out, i, j;
204 stdio_bitio_state out_buf, in_buf;
205 struct invf_dict_header idh;
206 struct invf_file_header ifh_in, ifh_out;
207 char FName[256];
208
209 outmode = mode;
210
211 /* open .invf.ORG, rename .invf if have to */
212 sprintf (FName, FILE_NAME_FORMAT ".ORG", pathname, filename, INVF_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
213 if (!(in = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
214 {
215 char fname[256];
216 sprintf (fname, FILE_NAME_FORMAT, pathname, filename, INVF_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
217 rename (fname, FName);
218 if (!(in = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
219 FatalError (1, "Unable to open \"%s\".\n", FName);
220 }
221 else
222 {
223 char fname[256];
224 sprintf (fname, FILE_NAME_FORMAT, pathname, filename, INVF_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
225 unlink (fname);
226 }
227 Message ("Opening \"%s\"\n", FName);
228
229 /* check the magic number for .invf.ORG */
230 if (fread (&magic, sizeof (magic), 1, in) != 1 || NTOHUL(magic) != MAGIC_INVF) /* [RPAP - Jan 97: Endian Ordering] */
231 FatalError (1, "Bad magic number in \"%s\".\n", FName);
232
233
234 /* open .invf.idx.ORG, rename .invf.idx if have to */
235 sprintf (FName, FILE_NAME_FORMAT ".ORG", pathname, filename, INVF_IDX_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
236 if (!(idx = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
237 {
238 char fname[256];
239 sprintf (fname, FILE_NAME_FORMAT, pathname, filename, INVF_IDX_SUFFIX);
240 rename (fname, FName);
241 if (!(idx = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
242 FatalError (1, "Unable to open \"%s\".\n", FName);
243 }
244 else
245 {
246 char fname[256];
247 sprintf (fname, FILE_NAME_FORMAT, pathname, filename, INVF_IDX_SUFFIX);
248 unlink (fname);
249 }
250 Message ("Opening \"%s\"\n", FName);
251
252 /* check the magic number for .invf.idx.ORG */
253 if (fread (&magic, sizeof (magic), 1, idx) != 1 || NTOHUL(magic) != MAGIC_INVI) /* [RPAP - Jan 97: Endian Ordering] */
254 FatalError (1, "Bad magic number in \"%s\".\n", FName);
255
256 sprintf (FName, FILE_NAME_FORMAT, pathname, filename, INVF_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
257 Message ("Creating \"%s\"\n", FName);
258 if (!(out = fopen (FName, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
259 FatalError (1, "Unable to open \"%s\".\n", FName);
260
261 sprintf (FName, FILE_NAME_FORMAT, pathname, filename, INVF_IDX_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
262 Message ("Creating \"%s\"\n", FName);
263 if (!(odx = fopen (FName, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
264 FatalError (1, "Unable to open \"%s\".\n", FName);
265
266 sprintf (FName, FILE_NAME_FORMAT, pathname, filename, INVF_DICT_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
267 Message ("Opening \"%s\"\n", FName);
268 if (!(dict = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
269 FatalError (1, "Unable to open \"%s\".\n", FName);
270
271 if (fread (&magic, sizeof (magic), 1, dict) != 1 || NTOHUL(magic) != MAGIC_STEM_BUILD) /* [RPAP - Jan 97: Endian Ordering] */
272 FatalError (1, "Bad magic number in \"%s\".\n", FName);
273
274 fread ((char *) &idh, sizeof (idh), 1, dict);
275
276 /* [RPAP - Jan 97: Endian Ordering] */
277 NTOHUL(idh.lookback);
278 NTOHUL(idh.dict_size);
279 NTOHUL(idh.total_bytes);
280 NTOHUL(idh.index_string_bytes);
281 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
282 NTOHUL(idh.num_of_docs);
283 NTOHUL(idh.static_num_of_docs);
284 NTOHUL(idh.num_of_words);
285 NTOHUL(idh.stemmer_num);
286 NTOHUL(idh.stem_method);
287
288 HTONUL2(MAGIC_INVF, magic); /* [RPAP - Jan 97: Endian Ordering] */
289 fwrite ((char *) &magic, sizeof (magic), 1, out);
290
291 fread ((char *) &ifh_in, sizeof (ifh_in), 1, in);
292
293 /* [RPAP - Jan 97: Endian Ordering] */
294 NTOHUL(ifh_in.no_of_words);
295 NTOHUL(ifh_in.no_of_ptrs);
296 NTOHUL(ifh_in.skip_mode);
297 for (i = 0; i < 16; i++)
298 NTOHUL(ifh_in.params[i]);
299 NTOHUL(ifh_in.InvfLevel);
300
301 ifh_out = ifh_in;
302 ifh_out.skip_mode = outmode;
303 bzero ((char *) ifh_out.params, sizeof (ifh_out.params));
304 switch (outmode)
305 {
306 case 0:
307 break;
308 case 1:
309 ifh_out.params[0] = k;
310 break;
311 case 2:
312 ifh_out.params[0] = max_nodes;
313 ifh_out.params[1] = mins;
314 break;
315 }
316
317 /* [RPAP - Jan 97: Endian Ordering] */
318 HTONUL(ifh_out.no_of_words);
319 HTONUL(ifh_out.no_of_ptrs);
320 HTONUL(ifh_out.skip_mode);
321 for (i = 0; i < 16; i++)
322 HTONUL(ifh_out.params[i]);
323 HTONUL(ifh_out.InvfLevel);
324
325 fwrite ((char *) &ifh_out, sizeof (ifh_out), 1, out);
326
327 /* [RPAP - Jan 97: Endian Ordering] */
328 NTOHUL(ifh_out.no_of_words);
329 NTOHUL(ifh_out.no_of_ptrs);
330 NTOHUL(ifh_out.skip_mode);
331 for (i = 0; i < 16; i++)
332 NTOHUL(ifh_out.params[i]);
333 NTOHUL(ifh_out.InvfLevel);
334
335 Message ("The file is a level %d inverted file.\n", ifh_in.InvfLevel);
336
337 bits_out = ftell (out) * 8;
338
339
340 HTONUL2(MAGIC_INVI, magic); /* [RPAP - Jan 97: Endian Ordering] */
341 fwrite ((char *) &magic, sizeof (magic), 1, odx);
342
343 DECODE_START (in)
344 DECODE_PAUSE (in_buf)
345
346 ENCODE_START (out)
347 ENCODE_PAUSE (out_buf)
348
349 N = idh.num_of_docs;
350
351 for (i = 0; i < ifh_in.no_of_words; i++)
352 {
353 unsigned long blk, p;
354 unsigned long odn_blk = 0, olen_blk = 0;
355 unsigned long idn_blk = 0, ilen_blk = 0;
356 register unsigned long suff;
357 unsigned long fcnt, wcnt, doc_num, bits_so_far, last_major;
358 unsigned long next_mjr_dn, kd;
359 char dummy2[MAXSTEMLEN + 1];
360 invf_info *ii;
361
362 fgetc (dict);
363 suff = fgetc (dict);
364 fread (dummy2, sizeof (u_char), suff, dict);
365 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
366 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
367
368 /* [RPAP - Jan 97: Endian Ordering] */
369 NTOHUL(fcnt);
370 NTOHUL(wcnt);
371
372 HTONUL2(bits_out >> 3, bytes_out); /* [RPAP - Jan 97: Endian Ordering] */
373 fwrite ((char *) &bytes_out, sizeof (bytes_out), 1, odx);
374 NTOHUL(bytes_out); /* [RPAP - Jan 97: Endian Ordering] */
375
376 p = fcnt;
377 blk = BIO_Bblock_Init (idh.static_num_of_docs, p);
378 switch (outmode)
379 {
380 case 1:
381 {
382 unsigned long len;
383 if (p <= ifh_out.params[0])
384 out_k = 0;
385 else
386 {
387 out_k = ifh_out.params[0];
388 len = BIO_Bblock_Bound (N, p);
389 if (ifh_in.InvfLevel >= 2)
390 len += wcnt;
391 odn_blk = BIO_Bblock_Init (idh.num_of_docs, (p + out_k - 1) / out_k);
392 olen_blk = BIO_Bblock_Init (len, (p + out_k - 1) / out_k);
393 }
394 break;
395 }
396 case 2:
397 {
398 unsigned long len;
399 if (p <= mins)
400 out_k = 0;
401 else
402 {
403 out_k = (int) (2 * sqrt ((double) p / max_nodes));
404 if (out_k <= mins)
405 out_k = mins;
406 len = BIO_Bblock_Bound (N, p);
407 if (ifh_in.InvfLevel >= 2)
408 len += wcnt;
409 odn_blk = BIO_Bblock_Init (idh.num_of_docs,
410 (p + out_k - 1) / out_k);
411 olen_blk = BIO_Bblock_Init (len, (p + out_k - 1) / out_k);
412 }
413 break;
414 }
415 default:
416 out_k = 0;
417 }
418
419 switch (ifh_in.skip_mode)
420 {
421 case 1:
422 {
423 unsigned long len;
424 if (p <= ifh_in.params[0])
425 in_k = 0;
426 else
427 {
428 in_k = ifh_in.params[0];
429 len = BIO_Bblock_Bound (N, p);
430 if (ifh_in.InvfLevel >= 2)
431 len += wcnt;
432 idn_blk = BIO_Bblock_Init (idh.num_of_docs, (p + in_k - 1) / in_k);
433 ilen_blk = BIO_Bblock_Init (len, (p + in_k - 1) / in_k);
434 }
435 break;
436 }
437 case 2:
438 {
439 unsigned long len;
440 if (p <= ifh_in.params[1])
441 {
442 in_k = 0;
443 }
444 else
445 {
446 in_k = (int) (2 * sqrt ((double) p / ifh_in.params[0]));
447 if (in_k <= ifh_in.params[1])
448 in_k = ifh_in.params[1];
449 len = BIO_Bblock_Bound (N, p);
450 if (ifh_in.InvfLevel >= 2)
451 len += wcnt;
452 idn_blk = BIO_Bblock_Init (idh.num_of_docs,
453 (p + in_k - 1) / in_k);
454 ilen_blk = BIO_Bblock_Init (len, (p + in_k - 1) / in_k);
455 }
456 break;
457 }
458 default:
459 in_k = 0;
460 }
461
462 if (!(ii = Xmalloc (sizeof (invf_info) * p)))
463 FatalError (1, "Unable to allocate memory for \"ii\"\n");
464
465 doc_num = bits_so_far = 0;
466 next_mjr_dn = 0;
467 kd = 0;
468 DECODE_CONTINUE (in_buf)
469 for (j = 0; j < p; j++, kd++)
470 {
471 unsigned long doc_diff, count = 0;
472 if (kd == in_k)
473 kd = 0;
474 if (in_k && kd == 0 && j + in_k < p)
475 {
476 int temp;
477 BBLOCK_DECODE (next_mjr_dn, idn_blk);
478 next_mjr_dn += doc_num;
479 BBLOCK_DECODE (temp, ilen_blk);
480 }
481 ii[j].bits_so_far = bits_so_far;
482 if (in_k && kd == in_k - 1 && j != p - 1)
483 {
484 int count;
485 BBLOCK_LENGTH (next_mjr_dn - doc_num, blk, count);
486 bits_so_far += count;
487 doc_num = next_mjr_dn;
488 }
489 else
490 {
491 BBLOCK_DECODE_L (doc_diff, blk, bits_so_far);
492 doc_num += doc_diff;
493 }
494 ii[j].doc_num = doc_num;
495 if (ifh_in.InvfLevel >= 2)
496 {
497 int count_bits = 0;
498 GAMMA_DECODE_L (count, count_bits);
499 ii[j].count_bits = count_bits;
500 bits_so_far += count_bits;
501 ii[j].count = count;
502 }
503 }
504
505 /* read till a byte boundary */
506 while (__btg)
507 {
508 DECODE_BIT;
509 bits_so_far++;
510 }
511
512 DECODE_PAUSE (in_buf)
513
514 doc_num = bits_so_far = 0;
515 last_major = 0;
516 kd = 0;
517 ENCODE_CONTINUE (out_buf)
518 for (j = 0; j < p; j++, kd++)
519 {
520 if (kd == out_k)
521 kd = 0;
522 if (out_k && kd == 0)
523 {
524 if (j + out_k < p)
525 {
526 int num = ii[j + out_k - 1].doc_num - last_major;
527 BBLOCK_ENCODE_L (num, odn_blk, bits_out);
528 last_major = ii[j + out_k - 1].doc_num;
529
530 num = ii[j + out_k - 1].bits_so_far + ii[j + out_k - 1].count_bits -
531 bits_so_far;
532 BBLOCK_ENCODE_L (num, olen_blk, bits_out);
533 bits_so_far = ii[j + out_k].bits_so_far;
534 }
535 }
536 if (!(out_k && kd == out_k - 1 && j != p - 1))
537 BBLOCK_ENCODE_L (ii[j].doc_num - doc_num, blk, bits_out);
538 doc_num = ii[j].doc_num;
539 if (ifh_in.InvfLevel >= 2)
540 GAMMA_ENCODE_L (ii[j].count, bits_out);
541 }
542
543 /* write till a byte boundary */
544 while (__btg != 8)
545 {
546 ENCODE_BIT (0);
547 bits_out++;
548 }
549 ENCODE_PAUSE (out_buf)
550
551 Xfree (ii);
552
553 }
554 ENCODE_CONTINUE (out_buf)
555 ENCODE_DONE
556
557 HTONUL2(bits_out >> 3, bytes_out); /* [RPAP - Jan 97: Endian Ordering] */
558 fwrite ((char *) &bytes_out, sizeof (bytes_out), 1, odx);
559 NTOHUL(bytes_out);
560
561 fclose (idx);
562 fclose (odx);
563 fclose (in);
564 fclose (out);
565 fclose (dict);
566
567}
Note: See TracBrowser for help on using the repository browser.