source: trunk/indexers/mg/src/text/mg_weights_build.c@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
Line 
1/**************************************************************************
2 *
3 * mg_weights_build.c -- Program to build the document weights file
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_weights_build.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "memlib.h"
26#include "messages.h"
27#include "local_strings.h"
28#include "bitio_gen.h"
29#include "bitio_m.h"
30#include "bitio_m_stdio.h"
31#include "timing.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "mg_files.h"
35#include "locallib.h"
36#include "invf.h"
37#include "text.h"
38#include "words.h"
39
40#define MAXBITS (sizeof(unsigned long) * 8)
41
42/*
43 $Log$
44 Revision 1.1 2003/02/20 21:18:24 mdewsnip
45 Addition of MG package for search and retrieval
46
47 Revision 1.1 1999/08/10 21:18:16 sjboddie
48 renamed mg-1.3d directory mg
49
50 Revision 1.2 1998/11/25 07:55:49 rjmcnab
51
52 Modified mg so that you can specify the stemmer you want
53 to use via a command line option. You specify it to
54 mg_passes during the build process. The number of the
55 stemmer that you used is stored within the inverted
56 dictionary header and the stemmed dictionary header so
57 the correct stemmer is used in later stages of building
58 and querying.
59
60 Revision 1.1 1998/11/17 09:35:22 rjmcnab
61 *** empty log message ***
62
63 * Revision 1.4 1994/11/29 00:32:05 tes
64 * Committing the new merged files and changes.
65 *
66 * Revision 1.3 1994/10/20 03:57:00 tes
67 * I have rewritten the boolean query optimiser and abstracted out the
68 * components of the boolean query.
69 *
70 * Revision 1.2 1994/09/20 04:41:55 tes
71 * For version 1.1
72 *
73 */
74
/* RCS identification string for this source file. */
static char *RCSID = "$Id: mg_weights_build.c 3745 2003-02-20 21:20:24Z mdewsnip $";

/* Number of bits used for each approximate weight; set by -b
 * (main rejects values above 32). */
unsigned char bits = 8;

/* Base name of the collection files; set by -f. */
static char *file_name = "";

/* Base name used when opening the text index; set by -t, and also by -f
 * when no text file name has been given yet. */
static char *text_file_name = "";

/* Cached num_of_docs field of the .invf.dict header; 0 means "not loaded
 * yet" (see get_NumPara). */
static unsigned long NumPara = 0;

/* Cached static_num_of_docs field of the .invf.dict header; 0 means "not
 * loaded yet" (see get_StaticNumOfDocs). */
static unsigned long StaticNumOfDocs = 0;

unsigned long get_NumPara (void);
unsigned long get_StaticNumOfDocs (void);
void GenerateWeights (void);
void Make_weight_approx (void);
void Make_text_idx_wgt (void);
89
90int main (int argc, char **argv)
91{
92 ProgTime StartTime;
93 int ch;
94 opterr = 0;
95 msg_prefix = argv[0];
96 while ((ch = getopt (argc, argv, "f:t:d:b:sh")) != -1) /* [RJM 10/98 - Text Filename] */
97 switch (ch)
98 {
99 case 'f': /* input file */
100 file_name = optarg;
101 if (strlen(text_file_name) == 0) text_file_name = optarg;
102 break;
103 /* [RJM 10/98 - Text Filename] */
104 case 't': /* text input file */
105 text_file_name = optarg;
106 break;
107 case 'd':
108 set_basepath (optarg);
109 break;
110 case 'b':
111 bits = atoi (optarg);
112 if (bits > 32)
113 {
114 fprintf (stderr, "b may only take values 0-32\n");
115 exit (1);
116 }
117 break;
118 case 'h':
119 case '?':
120 fprintf (stderr, "usage: %s [-f input_file]"
121 "[-d data directory] [-b bits] [-s] [-h]\n", argv[0]);
122 exit (1);
123 }
124 GetTime (&StartTime);
125
126 GenerateWeights ();
127
128 Make_weight_approx ();
129
130 Make_text_idx_wgt ();
131
132 Message ("%s", ElapsedTime (&StartTime, NULL));
133
134 return 0;
135}
136
137
138
139
140unsigned long
141get_NumPara (void)
142{
143 struct invf_dict_header idh;
144 FILE *invf_dict;
145 if (NumPara)
146 return (NumPara);
147 invf_dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
148 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
149 fread ((char *) &idh, sizeof (idh), 1, invf_dict);
150 fclose (invf_dict);
151 NTOHUL2(idh.num_of_docs, NumPara); /* [RPAP - Jan 97: Endian Ordering] */
152 return NumPara;
153}
154
155
156
157unsigned long
158get_StaticNumOfDocs (void)
159/* the static number of documents is the N parameter used to
160 * decode document gaps in the inverted file encoded using
161 * the Bblock method.
162 */
163{
164 struct invf_dict_header idh;
165 FILE *invf_dict;
166 if (StaticNumOfDocs)
167 return (StaticNumOfDocs);
168 invf_dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
169 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
170 fread ((char *) &idh, sizeof (idh), 1, invf_dict);
171 fclose (invf_dict);
172 NTOHUL2(idh.static_num_of_docs, StaticNumOfDocs); /* [RPAP - Jan 97: Endian Ordering] */
173 return StaticNumOfDocs;
174}
175
176
177
178void GenerateWeights (void) {
179 FILE *dict, *invf, *f, *idx;
180 struct invf_dict_header idh;
181 struct invf_file_header ifh;
182 int i;
183 double logN;
184 float *DocWeights;
185
186 /* make sure the globals NumPara and StaticNumOfDocs are loaded */
187 get_NumPara ();
188 get_StaticNumOfDocs ();
189
190 /* check to see if the weights file has already been built */
191 if ((f = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
192 MG_CONTINUE)) != NULL) {
193 fclose (f);
194 return;
195 }
196 Message ("The file \"%s.weight\" does not exist.", file_name);
197 Message ("Building the weight data from the file \"%s.invf\".", file_name);
198
199 logN = log ((double) NumPara);
200
201 /* allocate memory for the weights */
202 if (!(DocWeights = Xmalloc (sizeof (float) * (NumPara + 1))))
203 FatalError (1, "No memory for doc weights");
204 bzero ((char *) DocWeights, sizeof (float) * (NumPara + 1));
205
206 /* open the .invf.dict file and read in its header */
207 dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
208 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
209 fread ((char *) &idh, sizeof (idh), 1, dict);
210
211 /* [RPAP - Jan 97: Endian Ordering] */
212 NTOHUL(idh.lookback);
213 NTOHUL(idh.dict_size);
214 NTOHUL(idh.total_bytes);
215 NTOHUL(idh.index_string_bytes);
216 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
217 NTOHUL(idh.num_of_docs);
218 NTOHUL(idh.static_num_of_docs);
219 NTOHUL(idh.num_of_words);
220 NTOHUL(idh.stemmer_num);
221 NTOHUL(idh.stem_method);
222
223 /* open .invf.idx */
224 idx = open_file (file_name, INVF_IDX_SUFFIX, "rb", MAGIC_INVI,
225 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
226
227 /* open .invf and read in its header */
228 invf = open_file (file_name, INVF_SUFFIX, "rb", MAGIC_INVF,
229 MG_ABORT);
230 fread ((char *) &ifh, sizeof (ifh), 1, invf);
231
232 /* [RPAP - Jan 97: Endian Ordering] */
233 NTOHUL(ifh.no_of_words);
234 NTOHUL(ifh.no_of_ptrs);
235 NTOHUL(ifh.skip_mode);
236 for (i = 0; i <= 15; i++)
237 NTOHUL(ifh.params[i]);
238 NTOHUL(ifh.InvfLevel);
239
240 /* make sure the inverted file does not contain skips and is not level 1 */
241 if (ifh.skip_mode != 0)
242 FatalError (0, "Can\'t make weights file from a skipped inverted file.");
243 if (ifh.InvfLevel == 1)
244 FatalError (0, "Can\'t make weights file from level 1 inverted file.");
245
246 DECODE_START (invf)
247
248 /* process each word adding its contributions to the document weights */
249 for (i = 0; i < ifh.no_of_words; i++)
250 {
251 u_char dummy1, dummy2[MAXSTEMLEN + 1];
252 unsigned long fcnt, wcnt, blk, CurrDoc, p, j;
253 float idf;
254
255 /* give a little feedback every 4096 words */
256 if ((i & 0xfff) == 0)
257 fprintf (stderr, ".");
258
259 /* read an entry for a word, just to get p value */
260 dummy1 = fgetc (dict);
261 dummy1 = fgetc (dict);
262 fread (dummy2, sizeof (u_char), dummy1, dict);
263 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
264 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
265
266 dummy2[dummy1] = '\0';
267
268 /* [RPAP - Jan 97: Endian Ordering] */
269 NTOHUL(fcnt);
270 NTOHUL(wcnt);
271
272 p = fcnt;
273
274 idf = logN - log ((double) fcnt);
275 blk = BIO_Bblock_Init (StaticNumOfDocs, p);
276 CurrDoc = 0;
277
278 /* check the inverted file index entry for this word */
279 {
280 unsigned long loc;
281 fread ((char *) &loc, sizeof (loc), 1, idx);
282 NTOHUL(loc); /* [RPAP - Jan 97: Endian Ordering] */
283 if (ftell (invf) != loc)
284 {
285 FatalError (1, "Word %d %d != %d", i, ftell (invf), loc);
286 }
287 }
288
289 for (j = 0; j < p; j++)
290 {
291 unsigned long x, tf;
292 BBLOCK_DECODE (x, blk);
293 CurrDoc += x;
294
295 if (CurrDoc > idh.num_of_docs) {
296 FatalError (1, "CurrDoc = %d, number of documents = %d",
297 CurrDoc, idh.num_of_docs);
298 }
299
300 if (ifh.InvfLevel >= 2)
301 {
302 double weight;
303 GAMMA_DECODE (tf);
304 weight = tf * idf;
305 DocWeights[CurrDoc - 1] += weight * weight;
306 }
307 }
308
309 while (__btg)
310 DECODE_BIT;
311 }
312
313 DECODE_DONE
314
315 fclose (dict);
316 fclose (invf);
317 fprintf (stderr, "\n");
318
319 /* [RPAP - Jan 97: Endian Ordering] */
320 for (i = 0; i < NumPara; i++)
321 HTONF(DocWeights[i]);
322
323 f = create_file (file_name, WEIGHTS_SUFFIX, "wb", MAGIC_WGHT,
324 MG_ABORT);
325
326 fwrite ((char *) DocWeights, sizeof (float), NumPara, f);
327 fclose (f);
328 Xfree (DocWeights);
329}
330
331
332
333
334
335
336
337
338
339
340
341void
342Make_weight_approx (void)
343{
344 int i, pos, max;
345 unsigned long buf;
346 double U, L, B;
347 FILE *approx, *exact;
348
349 exact = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
350 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
351
352 /* calculate U and L */
353 L = 1e300;
354 U = 0;
355 for (i = 0; i < NumPara; i++)
356 {
357 float wgt;
358 fread ((char *) &wgt, sizeof (wgt), 1, exact);
359 NTOHF(wgt); /* [RPAP - Jan 97: Endian Ordering] */
360 wgt = sqrt (wgt);
361 if (wgt > U)
362 U = wgt;
363 if (wgt > 0 && wgt < L)
364 L = wgt;
365
366 }
367 fseek (exact, sizeof (u_long), SEEK_SET);
368
369 B = pow (U / L, pow (2.0, -(double) bits));
370
371 fprintf (stderr, "L = %f\n", L);
372 fprintf (stderr, "U = %f\n", U);
373 fprintf (stderr, "B = %f\n", B);
374
375
376
377 approx = create_file (file_name, APPROX_WEIGHTS_SUFFIX, "wb",
378 MAGIC_WGHT_APPROX, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
379
380 fwrite ((char *) &bits, sizeof (bits), 1, approx);
381 HTOND(L); /* [RPAP - Jan 97: Endian Ordering] */
382 HTOND(B); /* [RPAP - Jan 97: Endian Ordering] */
383 fwrite ((char *) &L, sizeof (L), 1, approx);
384 fwrite ((char *) &B, sizeof (B), 1, approx);
385 NTOHD(L); /* [RPAP - Jan 97: Endian Ordering] */
386 NTOHD(B); /* [RPAP - Jan 97: Endian Ordering] */
387
388 max = bits == 32 ? 0xffffffff : (1 << bits) - 1;
389 for (buf = pos = i = 0; i < NumPara; i++)
390 {
391 unsigned long fx;
392 float wgt;
393 fread ((char *) &wgt, sizeof (wgt), 1, exact);
394 NTOHF(wgt); /* [RPAP - Jan 97: Endian Ordering] */
395 wgt = sqrt (wgt);
396 if (wgt == 0)
397 {
398 wgt = L;
399#ifndef QUIET
400 Message ("Warning: Document %d had a weight of 0.", i);
401#endif
402 }
403 fx = (int) floor (log (wgt / L) / log (B));
404
405 if (fx > max)
406 fx = max;
407
408 buf |= (fx << pos);
409 pos += bits;
410
411 if (pos >= MAXBITS)
412 {
413 HTONUL(buf);
414 fwrite ((char *) &buf, sizeof (buf), 1, approx);
415 buf = fx >> (bits - (pos - MAXBITS));
416 pos = pos - MAXBITS;
417 }
418 }
419 if (pos > 0)
420 {
421 /* [RPAP - Jan 97: Endian Ordering] */
422 HTONUL(buf);
423 fwrite ((char *) &buf, sizeof (buf), 1, approx);
424 }
425
426 fclose (approx);
427 fclose (exact);
428}
429
430
431
432
433
434void
435Make_text_idx_wgt (void)
436{
437 compressed_text_header cth;
438 int i;
439 FILE *idx_wgt, *idx, *para, *exact;
440
441 idx_wgt = create_file (file_name, TEXT_IDX_WGT_SUFFIX, "wb", MAGIC_TEXI_WGT,
442 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
443
444 /* [RJM 10/98 - Text Filename] */
445 idx = open_file (text_file_name, TEXT_IDX_SUFFIX, "rb", MAGIC_TEXI,
446 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
447 if (fread (&cth, sizeof (cth), 1, idx) != 1)
448 FatalError (1, "Unable to read header of index file");
449
450 /* [RPAP - Jan 97: Endian Ordering] */
451 NTOHUL(cth.num_of_docs);
452 NTOHD(cth.num_of_bytes); /* [RJM 07/97: 4G limit] */
453 NTOHUL(cth.num_of_words);
454 NTOHUL(cth.length_of_longest_doc);
455 NTOHD(cth.ratio);
456
457 exact = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
458 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
459
460 get_NumPara ();
461 if (cth.num_of_docs != NumPara)
462 {
463 Message ("The number of documents does not equal "
464 "the number of paragraphs.");
465 Message ("Using the \"%s.invf.paragraph\" file\n", file_name);
466 para = open_file (file_name, INVF_PARAGRAPH_SUFFIX, "rb", MAGIC_PARAGRAPH,
467 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
468 }
469 else
470 para = NULL;
471
472 {
473 struct
474 {
475 unsigned long Start;
476 float Weight;
477 }
478 data;
479 for (i = 0; i < cth.num_of_docs; i++)
480 {
481 int count;
482 fread ((char *) &data.Start, sizeof (unsigned long), 1, idx);
483 if (para && i < cth.num_of_docs)
484 {
485 /* [RPAP - Jan 97: Endian Ordering] */
486 fread ((char *) &count, sizeof (count), 1, para);
487 NTOHSI(count);
488 }
489 else
490 count = 1;
491 while (count--)
492 {
493 fread ((char *) &data.Weight, sizeof (float), 1, exact);
494 NTOHF(data.Weight); /* [RPAP - Jan 97: Endian Ordering] */
495 data.Weight = sqrt (data.Weight);
496 HTONF(data.Weight); /* [RPAP - Jan 97: Endian Ordering] */
497 fwrite ((char *) &data, sizeof (data), 1, idx_wgt);
498 }
499 }
500 /* Write out the extra entry for the idx file */
501 fread ((char *) &data.Start, sizeof (unsigned long), 1, idx);
502 data.Weight = 0;
503 fwrite((char*)&data, sizeof(data), 1, idx_wgt);
504 }
505
506 fclose (idx_wgt);
507 fclose (idx);
508 fclose (exact);
509 if (para)
510 fclose (para);
511}
Note: See TracBrowser for help on using the repository browser.