source: trunk/gsdl/packages/mg-1.3d/src/text/mg_invf_rebuild.c@ 34

Last change on this file since 34 was 34, checked in by rjmcnab, 26 years ago

Modified mg so that you can specify the stemmer you want
to use via a command line option. You specify it to
mg_passes during the build process. The number of the
stemmer that you used is stored within the inverted
dictionary header and the stemmed dictionary header so
the correct stemmer is used in later stages of building
and querying.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.1 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_rebuild.c -- Program to rebuild an inverted file with skipping
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_rebuild.c 34 1998-11-25 07:55:52Z rjmcnab $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "timing.h"
29#include "bitio_m.h"
30#include "bitio_m_stdio.h"
31#include "bitio_gen.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "mg_files.h"
35#include "invf.h"
36#include "locallib.h"
37#include "words.h"
38#include "mg.h"
39
40#define LEN 40
41#define WRD 1
42
/*
 * One decoded posting of the inverted-file entry that is currently being
 * rewritten.  process_files () fills an array of these while decoding a
 * term and reads it back while encoding the same term.
 */
typedef struct invf_info
{
  unsigned long doc_num;      /* document number of this posting */
  unsigned long count;        /* gamma-decoded count (level >= 2 files) */
  unsigned long bits_so_far;  /* input bits consumed before this posting */
  unsigned long count_bits;   /* bits the encoded count occupied */
} invf_info;
48
49/*
50 $Log$
51 Revision 1.2 1998/11/25 07:55:46 rjmcnab
52
53 Modified mg so that you can specify the stemmer you want
54 to use via a command line option. You specify it to
55 mg_passes during the build process. The number of the
56 stemmer that you used is stored within the inverted
57 dictionary header and the stemmed dictionary header so
58 the correct stemmer is used in later stages of building
59 and querying.
60
61 Revision 1.1 1998/11/17 09:35:10 rjmcnab
62 *** empty log message ***
63
64 * Revision 1.4 1994/11/29 00:32:03 tes
65 * Committing the new merged files and changes.
66 *
67 * Revision 1.3 1994/10/20 03:56:56 tes
68 * I have rewritten the boolean query optimiser and abstracted out the
69 * components of the boolean query.
70 *
71 * Revision 1.2 1994/09/20 04:41:51 tes
72 * For version 1.1
73 *
74 */
75
/* RCS identification string for this source file. */
static char *RCSID = "$Id: mg_invf_rebuild.c 34 1998-11-25 07:55:52Z rjmcnab $";

/* Forward declaration of the routine that performs the rebuild. */
static void process_files (char *filename);

/* Directory part used when the collection file names are formatted. */
static char pathname[256];

/* Output skipping mode (0, 1 or 2) chosen on the command line. */
static int mode = 0;

/*
 * Numeric command-line settings.  -1 means "not given"; main () relies on
 * that value to detect illegal combinations and to apply defaults.
 */
static int k = -1, max_nodes = -1, mins = -1;
86
/*
 * usage -- print the command-line synopsis followed by a description of
 * the three skipping modes, then terminate the program with exit status 1.
 *
 * pgname is the program name shown in the synopsis.
 *
 * The complete text is written to stderr so that the synopsis and the mode
 * descriptions stay together when the output is redirected.
 */
void
usage (char *pgname)
{
  fprintf (stderr, "usage: %s [-f input_file] "
           "[-d data directory] -0|-1|-2 [-k num] [-m num] [-s num]\n",
           pgname);
  fprintf (stderr, "Mode 0: (the default)\n");
  fprintf (stderr,
           "\tk, m, and s have no meaning and if specified produce an error\n");
  fprintf (stderr, "Mode 1:\n");
  fprintf (stderr,
           "\tm, and s have no meaning and if specified produce an error\n");
  fprintf (stderr, "\tk The size of skips\n");
  fprintf (stderr, "Mode 2:\n");
  fprintf (stderr,
           "\tk has no meaning and if specified will produce an error\n");
  fprintf (stderr,
           "\tm is the number of accumulators that will be used for the\n");
  fprintf (stderr,
           "\t ranking. The program builds an inverted file that is \n");
  fprintf (stderr, "\t \"optimal\" for that number of accumulators.\n");
  fprintf (stderr, "\ts is the minimum size for skips.\n");
  exit (1);
}
105
106
107
108int main (int argc, char **argv)
109{
110 ProgTime start;
111 char *dir_name, *file_name = "";
112 int ch;
113 msg_prefix = argv[0];
114 dir_name = getenv ("MGDATA");
115 strcpy (pathname, dir_name ? dir_name : ".");
116 opterr = 0;
117 msg_prefix = argv[0];
118 while ((ch = getopt (argc, argv, "012hf:d:k:b:s:m:")) != -1)
119 switch (ch)
120 {
121 case '0':
122 mode = 0;
123 break;
124 case '1':
125 mode = 1;
126 break;
127 case '2':
128 mode = 2;
129 break;
130 case 'f': /* input file */
131 file_name = optarg;
132 break;
133 case 'd':
134 strcpy (pathname, optarg);
135 break;
136 case 'k':
137 k = atoi (optarg);
138 break;
139 case 'm':
140 max_nodes = atoi (optarg);
141 break;
142 case 's':
143 mins = atoi (optarg);
144 if (mins < 1)
145 FatalError (1, "The number for the -m option must be greater"
146 " than or equal to 1");
147 break;
148 case 'h':
149 case '?':
150 usage (argv[0]);
151 }
152
153 if ((mode == 0 && (k != -1 || max_nodes != -1 || mins != -1)) ||
154 (mode == 1 && (max_nodes != -1 || mins != -1)) ||
155 (mode == 2 && (k != -1)))
156 {
157 Message ("Illegal parameters for mode");
158 usage (argv[0]);
159 }
160
161 if (mode == 1 && k == -1)
162 {
163 k = 8;
164 Message ("k is required for mode 1: defaulting k to %d", k);
165 }
166 if (mode == 2 && max_nodes == -1)
167 {
168 max_nodes = 1024;
169 Message ("m is required for mode 2: defaulting m to %d", max_nodes);
170 }
171 if (mode == 2 && mins == -1)
172 {
173 mins = 1;
174 Message ("s is required for mode 2: defaulting s to %d", mins);
175 }
176
177
178 GetTime (&start);
179 process_files (file_name);
180 Message ("%s\n", ElapsedTime (&start, NULL));
181 Message ("**** Don\'t forget to rebuild the stemmed dictionary with mg_invf_dict. ****\n");
182 Message ("**** If the collection was built with stem indexes don\'t forget to ****\n");
183 Message ("**** rebuild them with mg_stem_idx. ****\n");
184
185 return 0;
186}
187
188
189
190
191
192static void
193process_files (char *filename)
194{
195 FILE *in, *out, *idx, *odx, *dict;
196 unsigned long magic, outmode, N, in_k, out_k;
197 unsigned long bits_out, bytes_out, i, j;
198 stdio_bitio_state out_buf, in_buf;
199 struct invf_dict_header idh;
200 struct invf_file_header ifh_in, ifh_out;
201 char FName[256];
202
203 outmode = mode;
204
205 /* open .invf.ORG, rename .invf if have to */
206 sprintf (FName, FILE_NAME_FORMAT ".ORG", pathname, filename, INVF_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
207 if (!(in = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
208 {
209 char fname[256];
210 sprintf (fname, FILE_NAME_FORMAT, pathname, filename, INVF_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
211 rename (fname, FName);
212 if (!(in = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
213 FatalError (1, "Unable to open \"%s\".\n", FName);
214 }
215 else
216 {
217 char fname[256];
218 sprintf (fname, FILE_NAME_FORMAT, pathname, filename, INVF_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
219 unlink (fname);
220 }
221 Message ("Opening \"%s\"\n", FName);
222
223 /* check the magic number for .invf.ORG */
224 if (fread (&magic, sizeof (magic), 1, in) != 1 || NTOHUL(magic) != MAGIC_INVF) /* [RPAP - Jan 97: Endian Ordering] */
225 FatalError (1, "Bad magic number in \"%s\".\n", FName);
226
227
228 /* open .invf.idx.ORG, rename .invf.idx if have to */
229 sprintf (FName, FILE_NAME_FORMAT ".ORG", pathname, filename, INVF_IDX_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
230 if (!(idx = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
231 {
232 char fname[256];
233 sprintf (fname, FILE_NAME_FORMAT, pathname, filename, INVF_IDX_SUFFIX);
234 rename (fname, FName);
235 if (!(idx = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
236 FatalError (1, "Unable to open \"%s\".\n", FName);
237 }
238 else
239 {
240 char fname[256];
241 sprintf (fname, FILE_NAME_FORMAT, pathname, filename, INVF_IDX_SUFFIX);
242 unlink (fname);
243 }
244 Message ("Opening \"%s\"\n", FName);
245
246 /* check the magic number for .invf.idx.ORG */
247 if (fread (&magic, sizeof (magic), 1, idx) != 1 || NTOHUL(magic) != MAGIC_INVI) /* [RPAP - Jan 97: Endian Ordering] */
248 FatalError (1, "Bad magic number in \"%s\".\n", FName);
249
250 sprintf (FName, FILE_NAME_FORMAT, pathname, filename, INVF_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
251 Message ("Creating \"%s\"\n", FName);
252 if (!(out = fopen (FName, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
253 FatalError (1, "Unable to open \"%s\".\n", FName);
254
255 sprintf (FName, FILE_NAME_FORMAT, pathname, filename, INVF_IDX_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
256 Message ("Creating \"%s\"\n", FName);
257 if (!(odx = fopen (FName, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
258 FatalError (1, "Unable to open \"%s\".\n", FName);
259
260 sprintf (FName, FILE_NAME_FORMAT, pathname, filename, INVF_DICT_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
261 Message ("Opening \"%s\"\n", FName);
262 if (!(dict = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
263 FatalError (1, "Unable to open \"%s\".\n", FName);
264
265 if (fread (&magic, sizeof (magic), 1, dict) != 1 || NTOHUL(magic) != MAGIC_STEM_BUILD) /* [RPAP - Jan 97: Endian Ordering] */
266 FatalError (1, "Bad magic number in \"%s\".\n", FName);
267
268 fread ((char *) &idh, sizeof (idh), 1, dict);
269
270 /* [RPAP - Jan 97: Endian Ordering] */
271 NTOHUL(idh.lookback);
272 NTOHUL(idh.dict_size);
273 NTOHUL(idh.total_bytes);
274 NTOHUL(idh.index_string_bytes);
275 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
276 NTOHUL(idh.num_of_docs);
277 NTOHUL(idh.static_num_of_docs);
278 NTOHUL(idh.num_of_words);
279 NTOHUL(idh.stemmer_num);
280 NTOHUL(idh.stem_method);
281
282 HTONUL2(MAGIC_INVF, magic); /* [RPAP - Jan 97: Endian Ordering] */
283 fwrite ((char *) &magic, sizeof (magic), 1, out);
284
285 fread ((char *) &ifh_in, sizeof (ifh_in), 1, in);
286
287 /* [RPAP - Jan 97: Endian Ordering] */
288 NTOHUL(ifh_in.no_of_words);
289 NTOHUL(ifh_in.no_of_ptrs);
290 NTOHUL(ifh_in.skip_mode);
291 for (i = 0; i < 16; i++)
292 NTOHUL(ifh_in.params[i]);
293 NTOHUL(ifh_in.InvfLevel);
294
295 ifh_out = ifh_in;
296 ifh_out.skip_mode = outmode;
297 bzero ((char *) ifh_out.params, sizeof (ifh_out.params));
298 switch (outmode)
299 {
300 case 0:
301 break;
302 case 1:
303 ifh_out.params[0] = k;
304 break;
305 case 2:
306 ifh_out.params[0] = max_nodes;
307 ifh_out.params[1] = mins;
308 break;
309 }
310
311 /* [RPAP - Jan 97: Endian Ordering] */
312 HTONUL(ifh_out.no_of_words);
313 HTONUL(ifh_out.no_of_ptrs);
314 HTONUL(ifh_out.skip_mode);
315 for (i = 0; i < 16; i++)
316 HTONUL(ifh_out.params[i]);
317 HTONUL(ifh_out.InvfLevel);
318
319 fwrite ((char *) &ifh_out, sizeof (ifh_out), 1, out);
320
321 /* [RPAP - Jan 97: Endian Ordering] */
322 NTOHUL(ifh_out.no_of_words);
323 NTOHUL(ifh_out.no_of_ptrs);
324 NTOHUL(ifh_out.skip_mode);
325 for (i = 0; i < 16; i++)
326 NTOHUL(ifh_out.params[i]);
327 NTOHUL(ifh_out.InvfLevel);
328
329 Message ("The file is a level %d inverted file.\n", ifh_in.InvfLevel);
330
331 bits_out = ftell (out) * 8;
332
333
334 HTONUL2(MAGIC_INVI, magic); /* [RPAP - Jan 97: Endian Ordering] */
335 fwrite ((char *) &magic, sizeof (magic), 1, odx);
336
337 DECODE_START (in)
338 DECODE_PAUSE (in_buf)
339
340 ENCODE_START (out)
341 ENCODE_PAUSE (out_buf)
342
343 N = idh.num_of_docs;
344
345 for (i = 0; i < ifh_in.no_of_words; i++)
346 {
347 unsigned long blk, p;
348 unsigned long odn_blk = 0, olen_blk = 0;
349 unsigned long idn_blk = 0, ilen_blk = 0;
350 register unsigned long suff;
351 unsigned long fcnt, wcnt, doc_num, bits_so_far, last_major;
352 unsigned long next_mjr_dn, kd;
353 char dummy2[MAXSTEMLEN + 1];
354 invf_info *ii;
355
356 fgetc (dict);
357 suff = fgetc (dict);
358 fread (dummy2, sizeof (u_char), suff, dict);
359 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
360 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
361
362 /* [RPAP - Jan 97: Endian Ordering] */
363 NTOHUL(fcnt);
364 NTOHUL(wcnt);
365
366 HTONUL2(bits_out >> 3, bytes_out); /* [RPAP - Jan 97: Endian Ordering] */
367 fwrite ((char *) &bytes_out, sizeof (bytes_out), 1, odx);
368 NTOHUL(bytes_out); /* [RPAP - Jan 97: Endian Ordering] */
369
370 p = fcnt;
371 blk = BIO_Bblock_Init (idh.static_num_of_docs, p);
372 switch (outmode)
373 {
374 case 1:
375 {
376 unsigned long len;
377 if (p <= ifh_out.params[0])
378 out_k = 0;
379 else
380 {
381 out_k = ifh_out.params[0];
382 len = BIO_Bblock_Bound (N, p);
383 if (ifh_in.InvfLevel >= 2)
384 len += wcnt;
385 odn_blk = BIO_Bblock_Init (idh.num_of_docs, (p + out_k - 1) / out_k);
386 olen_blk = BIO_Bblock_Init (len, (p + out_k - 1) / out_k);
387 }
388 break;
389 }
390 case 2:
391 {
392 unsigned long len;
393 if (p <= mins)
394 out_k = 0;
395 else
396 {
397 out_k = (int) (2 * sqrt ((double) p / max_nodes));
398 if (out_k <= mins)
399 out_k = mins;
400 len = BIO_Bblock_Bound (N, p);
401 if (ifh_in.InvfLevel >= 2)
402 len += wcnt;
403 odn_blk = BIO_Bblock_Init (idh.num_of_docs,
404 (p + out_k - 1) / out_k);
405 olen_blk = BIO_Bblock_Init (len, (p + out_k - 1) / out_k);
406 }
407 break;
408 }
409 default:
410 out_k = 0;
411 }
412
413 switch (ifh_in.skip_mode)
414 {
415 case 1:
416 {
417 unsigned long len;
418 if (p <= ifh_in.params[0])
419 in_k = 0;
420 else
421 {
422 in_k = ifh_in.params[0];
423 len = BIO_Bblock_Bound (N, p);
424 if (ifh_in.InvfLevel >= 2)
425 len += wcnt;
426 idn_blk = BIO_Bblock_Init (idh.num_of_docs, (p + in_k - 1) / in_k);
427 ilen_blk = BIO_Bblock_Init (len, (p + in_k - 1) / in_k);
428 }
429 break;
430 }
431 case 2:
432 {
433 unsigned long len;
434 if (p <= ifh_in.params[1])
435 {
436 in_k = 0;
437 }
438 else
439 {
440 in_k = (int) (2 * sqrt ((double) p / ifh_in.params[0]));
441 if (in_k <= ifh_in.params[1])
442 in_k = ifh_in.params[1];
443 len = BIO_Bblock_Bound (N, p);
444 if (ifh_in.InvfLevel >= 2)
445 len += wcnt;
446 idn_blk = BIO_Bblock_Init (idh.num_of_docs,
447 (p + in_k - 1) / in_k);
448 ilen_blk = BIO_Bblock_Init (len, (p + in_k - 1) / in_k);
449 }
450 break;
451 }
452 default:
453 in_k = 0;
454 }
455
456 if (!(ii = Xmalloc (sizeof (invf_info) * p)))
457 FatalError (1, "Unable to allocate memory for \"ii\"\n");
458
459 doc_num = bits_so_far = 0;
460 next_mjr_dn = 0;
461 kd = 0;
462 DECODE_CONTINUE (in_buf)
463 for (j = 0; j < p; j++, kd++)
464 {
465 unsigned long doc_diff, count = 0;
466 if (kd == in_k)
467 kd = 0;
468 if (in_k && kd == 0 && j + in_k < p)
469 {
470 int temp;
471 BBLOCK_DECODE (next_mjr_dn, idn_blk);
472 next_mjr_dn += doc_num;
473 BBLOCK_DECODE (temp, ilen_blk);
474 }
475 ii[j].bits_so_far = bits_so_far;
476 if (in_k && kd == in_k - 1 && j != p - 1)
477 {
478 int count;
479 BBLOCK_LENGTH (next_mjr_dn - doc_num, blk, count);
480 bits_so_far += count;
481 doc_num = next_mjr_dn;
482 }
483 else
484 {
485 BBLOCK_DECODE_L (doc_diff, blk, bits_so_far);
486 doc_num += doc_diff;
487 }
488 ii[j].doc_num = doc_num;
489 if (ifh_in.InvfLevel >= 2)
490 {
491 int count_bits = 0;
492 GAMMA_DECODE_L (count, count_bits);
493 ii[j].count_bits = count_bits;
494 bits_so_far += count_bits;
495 ii[j].count = count;
496 }
497 }
498
499 /* read till a byte boundary */
500 while (__btg)
501 {
502 DECODE_BIT;
503 bits_so_far++;
504 }
505
506 DECODE_PAUSE (in_buf)
507
508 doc_num = bits_so_far = 0;
509 last_major = 0;
510 kd = 0;
511 ENCODE_CONTINUE (out_buf)
512 for (j = 0; j < p; j++, kd++)
513 {
514 if (kd == out_k)
515 kd = 0;
516 if (out_k && kd == 0)
517 {
518 if (j + out_k < p)
519 {
520 int num = ii[j + out_k - 1].doc_num - last_major;
521 BBLOCK_ENCODE_L (num, odn_blk, bits_out);
522 last_major = ii[j + out_k - 1].doc_num;
523
524 num = ii[j + out_k - 1].bits_so_far + ii[j + out_k - 1].count_bits -
525 bits_so_far;
526 BBLOCK_ENCODE_L (num, olen_blk, bits_out);
527 bits_so_far = ii[j + out_k].bits_so_far;
528 }
529 }
530 if (!(out_k && kd == out_k - 1 && j != p - 1))
531 BBLOCK_ENCODE_L (ii[j].doc_num - doc_num, blk, bits_out);
532 doc_num = ii[j].doc_num;
533 if (ifh_in.InvfLevel >= 2)
534 GAMMA_ENCODE_L (ii[j].count, bits_out);
535 }
536
537 /* write till a byte boundary */
538 while (__btg != 8)
539 {
540 ENCODE_BIT (0);
541 bits_out++;
542 }
543 ENCODE_PAUSE (out_buf)
544
545 Xfree (ii);
546
547 }
548 ENCODE_CONTINUE (out_buf)
549 ENCODE_DONE
550
551 HTONUL2(bits_out >> 3, bytes_out); /* [RPAP - Jan 97: Endian Ordering] */
552 fwrite ((char *) &bytes_out, sizeof (bytes_out), 1, odx);
553 NTOHUL(bytes_out);
554
555 fclose (idx);
556 fclose (odx);
557 fclose (in);
558 fclose (out);
559 fclose (dict);
560
561}
Note: See TracBrowser for help on using the repository browser.