source: trunk/gsdl/packages/mg/src/text/mg_invf_rebuild.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.2 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_rebuild.c -- Program to rebuild an inverted file with skipping
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_rebuild.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "timing.h"
29#include "bitio_m.h"
30#include "bitio_m_stdio.h"
31#include "bitio_gen.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "mg_files.h"
35#include "invf.h"
36#include "locallib.h"
37#include "words.h"
38#include "mg.h"
39
40#define LEN 40
41#define WRD 1
42
43typedef struct invf_info
44 {
45 unsigned long doc_num, count, bits_so_far, count_bits;
46 }
47invf_info;
48
49/*
50 $Log$
51 Revision 1.1 1999/08/10 21:18:11 sjboddie
52 renamed mg-1.3d directory mg
53
54 Revision 1.2 1998/11/25 07:55:46 rjmcnab
55
56 Modified mg to that you can specify the stemmer you want
57 to use via a command line option. You specify it to
58 mg_passes during the build process. The number of the
59 stemmer that you used is stored within the inverted
60 dictionary header and the stemmed dictionary header so
61 the correct stemmer is used in later stages of building
62 and querying.
63
64 Revision 1.1 1998/11/17 09:35:10 rjmcnab
65 *** empty log message ***
66
67 * Revision 1.4 1994/11/29 00:32:03 tes
68 * Committing the new merged files and changes.
69 *
70 * Revision 1.3 1994/10/20 03:56:56 tes
71 * I have rewritten the boolean query optimiser and abstracted out the
72 * components of the boolean query.
73 *
74 * Revision 1.2 1994/09/20 04:41:51 tes
75 * For version 1.1
76 *
77 */
78
79static char *RCSID = "$Id: mg_invf_rebuild.c 439 1999-08-10 21:23:37Z sjboddie $";
80
81static char pathname[256];
82
83static void process_files (char *filename);
84
85static int k = -1;
86static int mode = 0;
87static int max_nodes = -1;
88static int mins = -1;
89
90void
91usage (char *pgname)
92{
93 fprintf (stderr, "usage: %s [-f input_file]"
94 "[-d data directory] -0|-1|-2 [-k num] [-m num] [-s num]\n", pgname);
95 printf ("Mode 0: (the default)\n");
96 printf ("\tk, m, and s have no meaning and if specified produce an error\n");
97 printf ("Mode 1:\n");
98 printf ("\tm, and s have no meaning and if specified produce an error\n");
99 printf ("\tk The size of skips\n");
100 printf ("Mode 2:\n");
101 printf ("\tk has no meaning and if specified will produce an error\n");
102 printf ("\tm is the the number of accumulators that will be used for the\n");
103 printf ("\t ranking. The program builds an inverted file that is \n");
104 printf ("\t \"optimal\" for that number of accumulators.\n");
105 printf ("\ts is the minimum size for skips.\n");
106 exit (1);
107}
108
109
110
111int main (int argc, char **argv)
112{
113 ProgTime start;
114 char *dir_name, *file_name = "";
115 int ch;
116 msg_prefix = argv[0];
117 dir_name = getenv ("MGDATA");
118 strcpy (pathname, dir_name ? dir_name : ".");
119 opterr = 0;
120 msg_prefix = argv[0];
121 while ((ch = getopt (argc, argv, "012hf:d:k:b:s:m:")) != -1)
122 switch (ch)
123 {
124 case '0':
125 mode = 0;
126 break;
127 case '1':
128 mode = 1;
129 break;
130 case '2':
131 mode = 2;
132 break;
133 case 'f': /* input file */
134 file_name = optarg;
135 break;
136 case 'd':
137 strcpy (pathname, optarg);
138 break;
139 case 'k':
140 k = atoi (optarg);
141 break;
142 case 'm':
143 max_nodes = atoi (optarg);
144 break;
145 case 's':
146 mins = atoi (optarg);
147 if (mins < 1)
148 FatalError (1, "The number for the -m option must be greater"
149 " than or equal to 1");
150 break;
151 case 'h':
152 case '?':
153 usage (argv[0]);
154 }
155
156 if ((mode == 0 && (k != -1 || max_nodes != -1 || mins != -1)) ||
157 (mode == 1 && (max_nodes != -1 || mins != -1)) ||
158 (mode == 2 && (k != -1)))
159 {
160 Message ("Illegal parameters for mode");
161 usage (argv[0]);
162 }
163
164 if (mode == 1 && k == -1)
165 {
166 k = 8;
167 Message ("k is required for mode 1: defaulting k to %d", k);
168 }
169 if (mode == 2 && max_nodes == -1)
170 {
171 max_nodes = 1024;
172 Message ("m is required for mode 2: defaulting m to %d", max_nodes);
173 }
174 if (mode == 2 && mins == -1)
175 {
176 mins = 1;
177 Message ("s is required for mode 2: defaulting s to %d", mins);
178 }
179
180
181 GetTime (&start);
182 process_files (file_name);
183 Message ("%s\n", ElapsedTime (&start, NULL));
184 Message ("**** Don\'t forget to rebuild the stemmed dictionary with mg_invf_dict. ****\n");
185 Message ("**** If the collection was built with stem indexes don\'t forget to ****\n");
186 Message ("**** rebuild them with mg_stem_idx. ****\n");
187
188 return 0;
189}
190
191
192
193
194
195static void
196process_files (char *filename)
197{
198 FILE *in, *out, *idx, *odx, *dict;
199 unsigned long magic, outmode, N, in_k, out_k;
200 unsigned long bits_out, bytes_out, i, j;
201 stdio_bitio_state out_buf, in_buf;
202 struct invf_dict_header idh;
203 struct invf_file_header ifh_in, ifh_out;
204 char FName[256];
205
206 outmode = mode;
207
208 /* open .invf.ORG, rename .invf if have to */
209 sprintf (FName, FILE_NAME_FORMAT ".ORG", pathname, filename, INVF_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
210 if (!(in = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
211 {
212 char fname[256];
213 sprintf (fname, FILE_NAME_FORMAT, pathname, filename, INVF_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
214 rename (fname, FName);
215 if (!(in = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
216 FatalError (1, "Unable to open \"%s\".\n", FName);
217 }
218 else
219 {
220 char fname[256];
221 sprintf (fname, FILE_NAME_FORMAT, pathname, filename, INVF_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
222 unlink (fname);
223 }
224 Message ("Opening \"%s\"\n", FName);
225
226 /* check the magic number for .invf.ORG */
227 if (fread (&magic, sizeof (magic), 1, in) != 1 || NTOHUL(magic) != MAGIC_INVF) /* [RPAP - Jan 97: Endian Ordering] */
228 FatalError (1, "Bad magic number in \"%s\".\n", FName);
229
230
231 /* open .invf.idx.ORG, rename .invf.idx if have to */
232 sprintf (FName, FILE_NAME_FORMAT ".ORG", pathname, filename, INVF_IDX_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
233 if (!(idx = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
234 {
235 char fname[256];
236 sprintf (fname, FILE_NAME_FORMAT, pathname, filename, INVF_IDX_SUFFIX);
237 rename (fname, FName);
238 if (!(idx = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
239 FatalError (1, "Unable to open \"%s\".\n", FName);
240 }
241 else
242 {
243 char fname[256];
244 sprintf (fname, FILE_NAME_FORMAT, pathname, filename, INVF_IDX_SUFFIX);
245 unlink (fname);
246 }
247 Message ("Opening \"%s\"\n", FName);
248
249 /* check the magic number for .invf.idx.ORG */
250 if (fread (&magic, sizeof (magic), 1, idx) != 1 || NTOHUL(magic) != MAGIC_INVI) /* [RPAP - Jan 97: Endian Ordering] */
251 FatalError (1, "Bad magic number in \"%s\".\n", FName);
252
253 sprintf (FName, FILE_NAME_FORMAT, pathname, filename, INVF_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
254 Message ("Creating \"%s\"\n", FName);
255 if (!(out = fopen (FName, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
256 FatalError (1, "Unable to open \"%s\".\n", FName);
257
258 sprintf (FName, FILE_NAME_FORMAT, pathname, filename, INVF_IDX_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
259 Message ("Creating \"%s\"\n", FName);
260 if (!(odx = fopen (FName, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
261 FatalError (1, "Unable to open \"%s\".\n", FName);
262
263 sprintf (FName, FILE_NAME_FORMAT, pathname, filename, INVF_DICT_SUFFIX); /* [RPAP - Feb 97: WIN32 Port] */
264 Message ("Opening \"%s\"\n", FName);
265 if (!(dict = fopen (FName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
266 FatalError (1, "Unable to open \"%s\".\n", FName);
267
268 if (fread (&magic, sizeof (magic), 1, dict) != 1 || NTOHUL(magic) != MAGIC_STEM_BUILD) /* [RPAP - Jan 97: Endian Ordering] */
269 FatalError (1, "Bad magic number in \"%s\".\n", FName);
270
271 fread ((char *) &idh, sizeof (idh), 1, dict);
272
273 /* [RPAP - Jan 97: Endian Ordering] */
274 NTOHUL(idh.lookback);
275 NTOHUL(idh.dict_size);
276 NTOHUL(idh.total_bytes);
277 NTOHUL(idh.index_string_bytes);
278 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
279 NTOHUL(idh.num_of_docs);
280 NTOHUL(idh.static_num_of_docs);
281 NTOHUL(idh.num_of_words);
282 NTOHUL(idh.stemmer_num);
283 NTOHUL(idh.stem_method);
284
285 HTONUL2(MAGIC_INVF, magic); /* [RPAP - Jan 97: Endian Ordering] */
286 fwrite ((char *) &magic, sizeof (magic), 1, out);
287
288 fread ((char *) &ifh_in, sizeof (ifh_in), 1, in);
289
290 /* [RPAP - Jan 97: Endian Ordering] */
291 NTOHUL(ifh_in.no_of_words);
292 NTOHUL(ifh_in.no_of_ptrs);
293 NTOHUL(ifh_in.skip_mode);
294 for (i = 0; i < 16; i++)
295 NTOHUL(ifh_in.params[i]);
296 NTOHUL(ifh_in.InvfLevel);
297
298 ifh_out = ifh_in;
299 ifh_out.skip_mode = outmode;
300 bzero ((char *) ifh_out.params, sizeof (ifh_out.params));
301 switch (outmode)
302 {
303 case 0:
304 break;
305 case 1:
306 ifh_out.params[0] = k;
307 break;
308 case 2:
309 ifh_out.params[0] = max_nodes;
310 ifh_out.params[1] = mins;
311 break;
312 }
313
314 /* [RPAP - Jan 97: Endian Ordering] */
315 HTONUL(ifh_out.no_of_words);
316 HTONUL(ifh_out.no_of_ptrs);
317 HTONUL(ifh_out.skip_mode);
318 for (i = 0; i < 16; i++)
319 HTONUL(ifh_out.params[i]);
320 HTONUL(ifh_out.InvfLevel);
321
322 fwrite ((char *) &ifh_out, sizeof (ifh_out), 1, out);
323
324 /* [RPAP - Jan 97: Endian Ordering] */
325 NTOHUL(ifh_out.no_of_words);
326 NTOHUL(ifh_out.no_of_ptrs);
327 NTOHUL(ifh_out.skip_mode);
328 for (i = 0; i < 16; i++)
329 NTOHUL(ifh_out.params[i]);
330 NTOHUL(ifh_out.InvfLevel);
331
332 Message ("The file is a level %d inverted file.\n", ifh_in.InvfLevel);
333
334 bits_out = ftell (out) * 8;
335
336
337 HTONUL2(MAGIC_INVI, magic); /* [RPAP - Jan 97: Endian Ordering] */
338 fwrite ((char *) &magic, sizeof (magic), 1, odx);
339
340 DECODE_START (in)
341 DECODE_PAUSE (in_buf)
342
343 ENCODE_START (out)
344 ENCODE_PAUSE (out_buf)
345
346 N = idh.num_of_docs;
347
348 for (i = 0; i < ifh_in.no_of_words; i++)
349 {
350 unsigned long blk, p;
351 unsigned long odn_blk = 0, olen_blk = 0;
352 unsigned long idn_blk = 0, ilen_blk = 0;
353 register unsigned long suff;
354 unsigned long fcnt, wcnt, doc_num, bits_so_far, last_major;
355 unsigned long next_mjr_dn, kd;
356 char dummy2[MAXSTEMLEN + 1];
357 invf_info *ii;
358
359 fgetc (dict);
360 suff = fgetc (dict);
361 fread (dummy2, sizeof (u_char), suff, dict);
362 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
363 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
364
365 /* [RPAP - Jan 97: Endian Ordering] */
366 NTOHUL(fcnt);
367 NTOHUL(wcnt);
368
369 HTONUL2(bits_out >> 3, bytes_out); /* [RPAP - Jan 97: Endian Ordering] */
370 fwrite ((char *) &bytes_out, sizeof (bytes_out), 1, odx);
371 NTOHUL(bytes_out); /* [RPAP - Jan 97: Endian Ordering] */
372
373 p = fcnt;
374 blk = BIO_Bblock_Init (idh.static_num_of_docs, p);
375 switch (outmode)
376 {
377 case 1:
378 {
379 unsigned long len;
380 if (p <= ifh_out.params[0])
381 out_k = 0;
382 else
383 {
384 out_k = ifh_out.params[0];
385 len = BIO_Bblock_Bound (N, p);
386 if (ifh_in.InvfLevel >= 2)
387 len += wcnt;
388 odn_blk = BIO_Bblock_Init (idh.num_of_docs, (p + out_k - 1) / out_k);
389 olen_blk = BIO_Bblock_Init (len, (p + out_k - 1) / out_k);
390 }
391 break;
392 }
393 case 2:
394 {
395 unsigned long len;
396 if (p <= mins)
397 out_k = 0;
398 else
399 {
400 out_k = (int) (2 * sqrt ((double) p / max_nodes));
401 if (out_k <= mins)
402 out_k = mins;
403 len = BIO_Bblock_Bound (N, p);
404 if (ifh_in.InvfLevel >= 2)
405 len += wcnt;
406 odn_blk = BIO_Bblock_Init (idh.num_of_docs,
407 (p + out_k - 1) / out_k);
408 olen_blk = BIO_Bblock_Init (len, (p + out_k - 1) / out_k);
409 }
410 break;
411 }
412 default:
413 out_k = 0;
414 }
415
416 switch (ifh_in.skip_mode)
417 {
418 case 1:
419 {
420 unsigned long len;
421 if (p <= ifh_in.params[0])
422 in_k = 0;
423 else
424 {
425 in_k = ifh_in.params[0];
426 len = BIO_Bblock_Bound (N, p);
427 if (ifh_in.InvfLevel >= 2)
428 len += wcnt;
429 idn_blk = BIO_Bblock_Init (idh.num_of_docs, (p + in_k - 1) / in_k);
430 ilen_blk = BIO_Bblock_Init (len, (p + in_k - 1) / in_k);
431 }
432 break;
433 }
434 case 2:
435 {
436 unsigned long len;
437 if (p <= ifh_in.params[1])
438 {
439 in_k = 0;
440 }
441 else
442 {
443 in_k = (int) (2 * sqrt ((double) p / ifh_in.params[0]));
444 if (in_k <= ifh_in.params[1])
445 in_k = ifh_in.params[1];
446 len = BIO_Bblock_Bound (N, p);
447 if (ifh_in.InvfLevel >= 2)
448 len += wcnt;
449 idn_blk = BIO_Bblock_Init (idh.num_of_docs,
450 (p + in_k - 1) / in_k);
451 ilen_blk = BIO_Bblock_Init (len, (p + in_k - 1) / in_k);
452 }
453 break;
454 }
455 default:
456 in_k = 0;
457 }
458
459 if (!(ii = Xmalloc (sizeof (invf_info) * p)))
460 FatalError (1, "Unable to allocate memory for \"ii\"\n");
461
462 doc_num = bits_so_far = 0;
463 next_mjr_dn = 0;
464 kd = 0;
465 DECODE_CONTINUE (in_buf)
466 for (j = 0; j < p; j++, kd++)
467 {
468 unsigned long doc_diff, count = 0;
469 if (kd == in_k)
470 kd = 0;
471 if (in_k && kd == 0 && j + in_k < p)
472 {
473 int temp;
474 BBLOCK_DECODE (next_mjr_dn, idn_blk);
475 next_mjr_dn += doc_num;
476 BBLOCK_DECODE (temp, ilen_blk);
477 }
478 ii[j].bits_so_far = bits_so_far;
479 if (in_k && kd == in_k - 1 && j != p - 1)
480 {
481 int count;
482 BBLOCK_LENGTH (next_mjr_dn - doc_num, blk, count);
483 bits_so_far += count;
484 doc_num = next_mjr_dn;
485 }
486 else
487 {
488 BBLOCK_DECODE_L (doc_diff, blk, bits_so_far);
489 doc_num += doc_diff;
490 }
491 ii[j].doc_num = doc_num;
492 if (ifh_in.InvfLevel >= 2)
493 {
494 int count_bits = 0;
495 GAMMA_DECODE_L (count, count_bits);
496 ii[j].count_bits = count_bits;
497 bits_so_far += count_bits;
498 ii[j].count = count;
499 }
500 }
501
502 /* read till a byte boundary */
503 while (__btg)
504 {
505 DECODE_BIT;
506 bits_so_far++;
507 }
508
509 DECODE_PAUSE (in_buf)
510
511 doc_num = bits_so_far = 0;
512 last_major = 0;
513 kd = 0;
514 ENCODE_CONTINUE (out_buf)
515 for (j = 0; j < p; j++, kd++)
516 {
517 if (kd == out_k)
518 kd = 0;
519 if (out_k && kd == 0)
520 {
521 if (j + out_k < p)
522 {
523 int num = ii[j + out_k - 1].doc_num - last_major;
524 BBLOCK_ENCODE_L (num, odn_blk, bits_out);
525 last_major = ii[j + out_k - 1].doc_num;
526
527 num = ii[j + out_k - 1].bits_so_far + ii[j + out_k - 1].count_bits -
528 bits_so_far;
529 BBLOCK_ENCODE_L (num, olen_blk, bits_out);
530 bits_so_far = ii[j + out_k].bits_so_far;
531 }
532 }
533 if (!(out_k && kd == out_k - 1 && j != p - 1))
534 BBLOCK_ENCODE_L (ii[j].doc_num - doc_num, blk, bits_out);
535 doc_num = ii[j].doc_num;
536 if (ifh_in.InvfLevel >= 2)
537 GAMMA_ENCODE_L (ii[j].count, bits_out);
538 }
539
540 /* write till a byte boundary */
541 while (__btg != 8)
542 {
543 ENCODE_BIT (0);
544 bits_out++;
545 }
546 ENCODE_PAUSE (out_buf)
547
548 Xfree (ii);
549
550 }
551 ENCODE_CONTINUE (out_buf)
552 ENCODE_DONE
553
554 HTONUL2(bits_out >> 3, bytes_out); /* [RPAP - Jan 97: Endian Ordering] */
555 fwrite ((char *) &bytes_out, sizeof (bytes_out), 1, odx);
556 NTOHUL(bytes_out);
557
558 fclose (idx);
559 fclose (odx);
560 fclose (in);
561 fclose (out);
562 fclose (dict);
563
564}
Note: See TracBrowser for help on using the repository browser.