source: trunk/gsdl/packages/mg/src/text/mg_weights_build.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.9 KB
Line 
1/**************************************************************************
2 *
3 * mg_weights_build.c -- Program to build the document weights file
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_weights_build.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "memlib.h"
26#include "messages.h"
27#include "local_strings.h"
28#include "bitio_gen.h"
29#include "bitio_m.h"
30#include "bitio_m_stdio.h"
31#include "timing.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "mg_files.h"
35#include "locallib.h"
36#include "invf.h"
37#include "text.h"
38#include "words.h"
39
40#define MAXBITS (sizeof(unsigned long) * 8)
41
42/*
43 $Log$
44 Revision 1.1 1999/08/10 21:18:16 sjboddie
45 renamed mg-1.3d directory mg
46
47 Revision 1.2 1998/11/25 07:55:49 rjmcnab
48
49 Modified mg so that you can specify the stemmer you want
50 to use via a command line option. You specify it to
51 mg_passes during the build process. The number of the
52 stemmer that you used is stored within the inverted
53 dictionary header and the stemmed dictionary header so
54 the correct stemmer is used in later stages of building
55 and querying.
56
57 Revision 1.1 1998/11/17 09:35:22 rjmcnab
58 *** empty log message ***
59
60 * Revision 1.4 1994/11/29 00:32:05 tes
61 * Committing the new merged files and changes.
62 *
63 * Revision 1.3 1994/10/20 03:57:00 tes
64 * I have rewritten the boolean query optimiser and abstracted out the
65 * components of the boolean query.
66 *
67 * Revision 1.2 1994/09/20 04:41:55 tes
68 * For version 1.1
69 *
70 */
71
/* RCS/SVN identification keyword string for this source file. */
72static char *RCSID = "$Id: mg_weights_build.c 439 1999-08-10 21:23:37Z sjboddie $";
73
/* Number of bits used for each approximate weight. Defaults to 8 and is
 * overridden by the -b option in main(); Make_weight_approx() writes it to
 * the approximate weights file and uses it as the packing width. */
74unsigned char bits = 8;
/* Base name given with -f; passed to open_file()/create_file() together
 * with a suffix to locate the inverted-file and weight files. */
75static char *file_name = "";
/* Base name given with -t, used to open the text index file. When -f is
 * parsed while this is still empty, it takes the -f value. */
76static char *text_file_name = "";
/* Cached idh.num_of_docs from the inverted dictionary header
 * (see get_NumPara); 0 means "not loaded yet". */
77static unsigned long NumPara = 0;
/* Cached idh.static_num_of_docs from the inverted dictionary header
 * (see get_StaticNumOfDocs); 0 means "not loaded yet". */
78static unsigned long StaticNumOfDocs = 0;
79
/* Forward declarations of the functions defined later in this file. */
80unsigned long get_NumPara (void);
81unsigned long get_StaticNumOfDocs (void);
82void GenerateWeights (void);
83void Make_weight_approx (void);
84void Make_text_idx_wgt (void);
85
86
87int main (int argc, char **argv)
88{
89 ProgTime StartTime;
90 int ch;
91 opterr = 0;
92 msg_prefix = argv[0];
93 while ((ch = getopt (argc, argv, "f:t:d:b:sh")) != -1) /* [RJM 10/98 - Text Filename] */
94 switch (ch)
95 {
96 case 'f': /* input file */
97 file_name = optarg;
98 if (strlen(text_file_name) == 0) text_file_name = optarg;
99 break;
100 /* [RJM 10/98 - Text Filename] */
101 case 't': /* text input file */
102 text_file_name = optarg;
103 break;
104 case 'd':
105 set_basepath (optarg);
106 break;
107 case 'b':
108 bits = atoi (optarg);
109 if (bits > 32)
110 {
111 fprintf (stderr, "b may only take values 0-32\n");
112 exit (1);
113 }
114 break;
115 case 'h':
116 case '?':
117 fprintf (stderr, "usage: %s [-f input_file]"
118 "[-d data directory] [-b bits] [-s] [-h]\n", argv[0]);
119 exit (1);
120 }
121 GetTime (&StartTime);
122
123 GenerateWeights ();
124
125 Make_weight_approx ();
126
127 Make_text_idx_wgt ();
128
129 Message ("%s", ElapsedTime (&StartTime, NULL));
130
131 return 0;
132}
133
134
135
136
137unsigned long
138get_NumPara (void)
139{
140 struct invf_dict_header idh;
141 FILE *invf_dict;
142 if (NumPara)
143 return (NumPara);
144 invf_dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
145 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
146 fread ((char *) &idh, sizeof (idh), 1, invf_dict);
147 fclose (invf_dict);
148 NTOHUL2(idh.num_of_docs, NumPara); /* [RPAP - Jan 97: Endian Ordering] */
149 return NumPara;
150}
151
152
153
154unsigned long
155get_StaticNumOfDocs (void)
156/* the static number of documents is the N parameter used to
157 * decode document gaps in the inverted file encoded using
158 * the Bblock method.
159 */
160{
161 struct invf_dict_header idh;
162 FILE *invf_dict;
163 if (StaticNumOfDocs)
164 return (StaticNumOfDocs);
165 invf_dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
166 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
167 fread ((char *) &idh, sizeof (idh), 1, invf_dict);
168 fclose (invf_dict);
169 NTOHUL2(idh.static_num_of_docs, StaticNumOfDocs); /* [RPAP - Jan 97: Endian Ordering] */
170 return StaticNumOfDocs;
171}
172
173
174
175void GenerateWeights (void) {
176 FILE *dict, *invf, *f, *idx;
177 struct invf_dict_header idh;
178 struct invf_file_header ifh;
179 int i;
180 double logN;
181 float *DocWeights;
182
183 /* make sure the globals NumPara and StaticNumOfDocs are loaded */
184 get_NumPara ();
185 get_StaticNumOfDocs ();
186
187 /* check to see if the weights file has already been built */
188 if ((f = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
189 MG_CONTINUE)) != NULL) {
190 fclose (f);
191 return;
192 }
193 Message ("The file \"%s.weight\" does not exist.", file_name);
194 Message ("Building the weight data from the file \"%s.invf\".", file_name);
195
196 logN = log ((double) NumPara);
197
198 /* allocate memory for the weights */
199 if (!(DocWeights = Xmalloc (sizeof (float) * (NumPara + 1))))
200 FatalError (1, "No memory for doc weights");
201 bzero ((char *) DocWeights, sizeof (float) * (NumPara + 1));
202
203 /* open the .invf.dict file and read in its header */
204 dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
205 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
206 fread ((char *) &idh, sizeof (idh), 1, dict);
207
208 /* [RPAP - Jan 97: Endian Ordering] */
209 NTOHUL(idh.lookback);
210 NTOHUL(idh.dict_size);
211 NTOHUL(idh.total_bytes);
212 NTOHUL(idh.index_string_bytes);
213 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
214 NTOHUL(idh.num_of_docs);
215 NTOHUL(idh.static_num_of_docs);
216 NTOHUL(idh.num_of_words);
217 NTOHUL(idh.stemmer_num);
218 NTOHUL(idh.stem_method);
219
220 /* open .invf.idx */
221 idx = open_file (file_name, INVF_IDX_SUFFIX, "rb", MAGIC_INVI,
222 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
223
224 /* open .invf and read in its header */
225 invf = open_file (file_name, INVF_SUFFIX, "rb", MAGIC_INVF,
226 MG_ABORT);
227 fread ((char *) &ifh, sizeof (ifh), 1, invf);
228
229 /* [RPAP - Jan 97: Endian Ordering] */
230 NTOHUL(ifh.no_of_words);
231 NTOHUL(ifh.no_of_ptrs);
232 NTOHUL(ifh.skip_mode);
233 for (i = 0; i <= 15; i++)
234 NTOHUL(ifh.params[i]);
235 NTOHUL(ifh.InvfLevel);
236
237 /* make sure the inverted file does not contain skips and is not level 1 */
238 if (ifh.skip_mode != 0)
239 FatalError (0, "Can\'t make weights file from a skipped inverted file.");
240 if (ifh.InvfLevel == 1)
241 FatalError (0, "Can\'t make weights file from level 1 inverted file.");
242
243 DECODE_START (invf)
244
245 /* process each word adding its contributions to the document weights */
246 for (i = 0; i < ifh.no_of_words; i++)
247 {
248 u_char dummy1, dummy2[MAXSTEMLEN + 1];
249 unsigned long fcnt, wcnt, blk, CurrDoc, p, j;
250 float idf;
251
252 /* give a little feedback every 4096 words */
253 if ((i & 0xfff) == 0)
254 fprintf (stderr, ".");
255
256 /* read an entry for a word, just to get p value */
257 dummy1 = fgetc (dict);
258 dummy1 = fgetc (dict);
259 fread (dummy2, sizeof (u_char), dummy1, dict);
260 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
261 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
262
263 dummy2[dummy1] = '\0';
264
265 /* [RPAP - Jan 97: Endian Ordering] */
266 NTOHUL(fcnt);
267 NTOHUL(wcnt);
268
269 p = fcnt;
270
271 idf = logN - log ((double) fcnt);
272 blk = BIO_Bblock_Init (StaticNumOfDocs, p);
273 CurrDoc = 0;
274
275 /* check the inverted file index entry for this word */
276 {
277 unsigned long loc;
278 fread ((char *) &loc, sizeof (loc), 1, idx);
279 NTOHUL(loc); /* [RPAP - Jan 97: Endian Ordering] */
280 if (ftell (invf) != loc)
281 {
282 FatalError (1, "Word %d %d != %d", i, ftell (invf), loc);
283 }
284 }
285
286 for (j = 0; j < p; j++)
287 {
288 unsigned long x, tf;
289 BBLOCK_DECODE (x, blk);
290 CurrDoc += x;
291
292 if (CurrDoc > idh.num_of_docs) {
293 FatalError (1, "CurrDoc = %d, number of documents = %d",
294 CurrDoc, idh.num_of_docs);
295 }
296
297 if (ifh.InvfLevel >= 2)
298 {
299 double weight;
300 GAMMA_DECODE (tf);
301 weight = tf * idf;
302 DocWeights[CurrDoc - 1] += weight * weight;
303 }
304 }
305
306 while (__btg)
307 DECODE_BIT;
308 }
309
310 DECODE_DONE
311
312 fclose (dict);
313 fclose (invf);
314 fprintf (stderr, "\n");
315
316 /* [RPAP - Jan 97: Endian Ordering] */
317 for (i = 0; i < NumPara; i++)
318 HTONF(DocWeights[i]);
319
320 f = create_file (file_name, WEIGHTS_SUFFIX, "wb", MAGIC_WGHT,
321 MG_ABORT);
322
323 fwrite ((char *) DocWeights, sizeof (float), NumPara, f);
324 fclose (f);
325 Xfree (DocWeights);
326}
327
328
329
330
331
332
333
334
335
336
337
338void
339Make_weight_approx (void)
340{
341 int i, pos, max;
342 unsigned long buf;
343 double U, L, B;
344 FILE *approx, *exact;
345
346 exact = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
347 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
348
349 /* calculate U and L */
350 L = 1e300;
351 U = 0;
352 for (i = 0; i < NumPara; i++)
353 {
354 float wgt;
355 fread ((char *) &wgt, sizeof (wgt), 1, exact);
356 NTOHF(wgt); /* [RPAP - Jan 97: Endian Ordering] */
357 wgt = sqrt (wgt);
358 if (wgt > U)
359 U = wgt;
360 if (wgt > 0 && wgt < L)
361 L = wgt;
362
363 }
364 fseek (exact, sizeof (u_long), SEEK_SET);
365
366 B = pow (U / L, pow (2.0, -(double) bits));
367
368 fprintf (stderr, "L = %f\n", L);
369 fprintf (stderr, "U = %f\n", U);
370 fprintf (stderr, "B = %f\n", B);
371
372
373
374 approx = create_file (file_name, APPROX_WEIGHTS_SUFFIX, "wb",
375 MAGIC_WGHT_APPROX, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
376
377 fwrite ((char *) &bits, sizeof (bits), 1, approx);
378 HTOND(L); /* [RPAP - Jan 97: Endian Ordering] */
379 HTOND(B); /* [RPAP - Jan 97: Endian Ordering] */
380 fwrite ((char *) &L, sizeof (L), 1, approx);
381 fwrite ((char *) &B, sizeof (B), 1, approx);
382 NTOHD(L); /* [RPAP - Jan 97: Endian Ordering] */
383 NTOHD(B); /* [RPAP - Jan 97: Endian Ordering] */
384
385 max = bits == 32 ? 0xffffffff : (1 << bits) - 1;
386 for (buf = pos = i = 0; i < NumPara; i++)
387 {
388 unsigned long fx;
389 float wgt;
390 fread ((char *) &wgt, sizeof (wgt), 1, exact);
391 NTOHF(wgt); /* [RPAP - Jan 97: Endian Ordering] */
392 wgt = sqrt (wgt);
393 if (wgt == 0)
394 {
395 wgt = L;
396#ifndef QUIET
397 Message ("Warning: Document %d had a weight of 0.", i);
398#endif
399 }
400 fx = (int) floor (log (wgt / L) / log (B));
401
402 if (fx > max)
403 fx = max;
404
405 buf |= (fx << pos);
406 pos += bits;
407
408 if (pos >= MAXBITS)
409 {
410 HTONUL(buf);
411 fwrite ((char *) &buf, sizeof (buf), 1, approx);
412 buf = fx >> (bits - (pos - MAXBITS));
413 pos = pos - MAXBITS;
414 }
415 }
416 if (pos > 0)
417 {
418 /* [RPAP - Jan 97: Endian Ordering] */
419 HTONUL(buf);
420 fwrite ((char *) &buf, sizeof (buf), 1, approx);
421 }
422
423 fclose (approx);
424 fclose (exact);
425}
426
427
428
429
430
431void
432Make_text_idx_wgt (void)
433{
434 compressed_text_header cth;
435 int i;
436 FILE *idx_wgt, *idx, *para, *exact;
437
438 idx_wgt = create_file (file_name, TEXT_IDX_WGT_SUFFIX, "wb", MAGIC_TEXI_WGT,
439 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
440
441 /* [RJM 10/98 - Text Filename] */
442 idx = open_file (text_file_name, TEXT_IDX_SUFFIX, "rb", MAGIC_TEXI,
443 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
444 if (fread (&cth, sizeof (cth), 1, idx) != 1)
445 FatalError (1, "Unable to read header of index file");
446
447 /* [RPAP - Jan 97: Endian Ordering] */
448 NTOHUL(cth.num_of_docs);
449 NTOHD(cth.num_of_bytes); /* [RJM 07/97: 4G limit] */
450 NTOHUL(cth.num_of_words);
451 NTOHUL(cth.length_of_longest_doc);
452 NTOHD(cth.ratio);
453
454 exact = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
455 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
456
457 get_NumPara ();
458 if (cth.num_of_docs != NumPara)
459 {
460 Message ("The number of documents does not equal "
461 "the number of paragraphs.");
462 Message ("Using the \"%s.invf.paragraph\" file\n", file_name);
463 para = open_file (file_name, INVF_PARAGRAPH_SUFFIX, "rb", MAGIC_PARAGRAPH,
464 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
465 }
466 else
467 para = NULL;
468
469 {
470 struct
471 {
472 unsigned long Start;
473 float Weight;
474 }
475 data;
476 for (i = 0; i < cth.num_of_docs; i++)
477 {
478 int count;
479 fread ((char *) &data.Start, sizeof (unsigned long), 1, idx);
480 if (para && i < cth.num_of_docs)
481 {
482 /* [RPAP - Jan 97: Endian Ordering] */
483 fread ((char *) &count, sizeof (count), 1, para);
484 NTOHSI(count);
485 }
486 else
487 count = 1;
488 while (count--)
489 {
490 fread ((char *) &data.Weight, sizeof (float), 1, exact);
491 NTOHF(data.Weight); /* [RPAP - Jan 97: Endian Ordering] */
492 data.Weight = sqrt (data.Weight);
493 HTONF(data.Weight); /* [RPAP - Jan 97: Endian Ordering] */
494 fwrite ((char *) &data, sizeof (data), 1, idx_wgt);
495 }
496 }
497 /* Write out the extra entry for the idx file */
498 fread ((char *) &data.Start, sizeof (unsigned long), 1, idx);
499 data.Weight = 0;
500 fwrite((char*)&data, sizeof(data), 1, idx_wgt);
501 }
502
503 fclose (idx_wgt);
504 fclose (idx);
505 fclose (exact);
506 if (para)
507 fclose (para);
508}
Note: See TracBrowser for help on using the repository browser.