source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mg/src/text/mg_weights_build.c@ 23508

Last change on this file since 23508 was 23508, checked in by sjm84, 13 years ago

Committing 64 bit changes into the branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.1 KB
Line 
1/**************************************************************************
2 *
3 * mg_weights_build.c -- Program to build the document weights file
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_weights_build.c 23508 2010-12-17 01:04:10Z sjm84 $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "memlib.h"
26#include "messages.h"
27#include "local_strings.h"
28#include "bitio_gen.h"
29#include "bitio_m.h"
30#include "bitio_m_stdio.h"
31#include "timing.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "mg_files.h"
35#include "locallib.h"
36#include "invf.h"
37#include "text.h"
38#include "words.h"
39
40#define MAXBITS (sizeof(mg_u_long) * 8)
41
42/*
43 $Log$
44 Revision 1.2 2004/05/24 21:12:18 kjdon
45 changed a message
46
47 Revision 1.1 2003/02/20 21:18:24 mdewsnip
48 Addition of MG package for search and retrieval
49
50 Revision 1.1 1999/08/10 21:18:16 sjboddie
51 renamed mg-1.3d directory mg
52
53 Revision 1.2 1998/11/25 07:55:49 rjmcnab
54
55 Modified mg to that you can specify the stemmer you want
56 to use via a command line option. You specify it to
57 mg_passes during the build process. The number of the
58 stemmer that you used is stored within the inverted
59 dictionary header and the stemmed dictionary header so
60 the correct stemmer is used in later stages of building
61 and querying.
62
63 Revision 1.1 1998/11/17 09:35:22 rjmcnab
64 *** empty log message ***
65
66 * Revision 1.4 1994/11/29 00:32:05 tes
67 * Committing the new merged files and changes.
68 *
69 * Revision 1.3 1994/10/20 03:57:00 tes
70 * I have rewritten the boolean query optimiser and abstracted out the
71 * components of the boolean query.
72 *
73 * Revision 1.2 1994/09/20 04:41:55 tes
74 * For version 1.1
75 *
76 */
77
/* Revision-control identification string for this source file. */
78static char *RCSID = "$Id: mg_weights_build.c 23508 2010-12-17 01:04:10Z sjm84 $";
79
/* Number of bits used for each approximate weight; defaults to 8 and is
 * overridden by the -b option in main.  Written as the first byte of the
 * approximate weights file by Make_weight_approx. */
80unsigned char bits = 8;
/* Base name of the index files, set by the -f option. */
81static char *file_name = "";
/* Base name of the text files, set by -t; main also sets it from -f when it
 * is still empty at that point. */
82static char *text_file_name = "";
/* Cached result of get_NumPara (zero means "not loaded yet"). */
83static mg_u_long NumPara = 0;
/* Cached result of get_StaticNumOfDocs (zero means "not loaded yet"). */
84static mg_u_long StaticNumOfDocs = 0;
85
/* Forward declarations of the functions defined later in this file. */
86mg_u_long get_NumPara (void);
87mg_u_long get_StaticNumOfDocs (void);
88void GenerateWeights (void);
89void Make_weight_approx (void);
90void Make_text_idx_wgt (void);
91
92
93int main (int argc, char **argv)
94{
95 ProgTime StartTime;
96 int ch;
97 opterr = 0;
98 msg_prefix = argv[0];
99 while ((ch = getopt (argc, argv, "f:t:d:b:sh")) != -1) /* [RJM 10/98 - Text Filename] */
100 switch (ch)
101 {
102 case 'f': /* input file */
103 file_name = optarg;
104 if (strlen(text_file_name) == 0) text_file_name = optarg;
105 break;
106 /* [RJM 10/98 - Text Filename] */
107 case 't': /* text input file */
108 text_file_name = optarg;
109 break;
110 case 'd':
111 set_basepath (optarg);
112 break;
113 case 'b':
114 bits = atoi (optarg);
115 if (bits > 32)
116 {
117 fprintf (stderr, "b may only take values 0-32\n");
118 exit (1);
119 }
120 break;
121 case 'h':
122 case '?':
123 fprintf (stderr, "usage: %s [-f input_file]"
124 "[-d data directory] [-b bits] [-s] [-h]\n", argv[0]);
125 exit (1);
126 }
127 GetTime (&StartTime);
128
129 GenerateWeights ();
130
131 Make_weight_approx ();
132
133 Make_text_idx_wgt ();
134
135 Message ("%s", ElapsedTime (&StartTime, NULL));
136
137 return 0;
138}
139
140
141
142
143mg_u_long
144get_NumPara (void)
145{
146 struct invf_dict_header idh;
147 FILE *invf_dict;
148 if (NumPara)
149 return (NumPara);
150 invf_dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
151 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
152 fread ((char *) &idh, sizeof (idh), 1, invf_dict);
153 fclose (invf_dict);
154 NTOHUL2(idh.num_of_docs, NumPara); /* [RPAP - Jan 97: Endian Ordering] */
155 return NumPara;
156}
157
158
159
160mg_u_long
161get_StaticNumOfDocs (void)
162/* the static number of documents is the N parameter used to
163 * decode document gaps in the inverted file encoded using
164 * the Bblock method.
165 */
166{
167 struct invf_dict_header idh;
168 FILE *invf_dict;
169 if (StaticNumOfDocs)
170 return (StaticNumOfDocs);
171 invf_dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
172 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
173 fread ((char *) &idh, sizeof (idh), 1, invf_dict);
174 fclose (invf_dict);
175 NTOHUL2(idh.static_num_of_docs, StaticNumOfDocs); /* [RPAP - Jan 97: Endian Ordering] */
176 return StaticNumOfDocs;
177}
178
179
180
181void GenerateWeights (void) {
182 FILE *dict, *invf, *f, *idx;
183 struct invf_dict_header idh;
184 struct invf_file_header ifh;
185 int i;
186 double logN;
187 float *DocWeights;
188
189 /* make sure the globals NumPara and StaticNumOfDocs are loaded */
190 get_NumPara ();
191 get_StaticNumOfDocs ();
192
193 /* check to see if the weights file has already been built */
194 if ((f = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
195 MG_CONTINUE)) != NULL) {
196 fclose (f);
197 return;
198 }
199 Message ("The file \"%s.weight\" does not exist.", file_name);
200 Message ("Building the weight data from the file \"%s.invf\".", file_name);
201
202 logN = log ((double) NumPara);
203
204 /* allocate memory for the weights */
205 if (!(DocWeights = Xmalloc (sizeof (float) * (NumPara + 1))))
206 FatalError (1, "No memory for doc weights");
207 bzero ((char *) DocWeights, sizeof (float) * (NumPara + 1));
208
209 /* open the .invf.dict file and read in its header */
210 dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
211 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
212 fread ((char *) &idh, sizeof (idh), 1, dict);
213
214 /* [RPAP - Jan 97: Endian Ordering] */
215 NTOHUL(idh.lookback);
216 NTOHUL(idh.dict_size);
217 NTOHUL(idh.total_bytes);
218 NTOHUL(idh.index_string_bytes);
219 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
220 NTOHUL(idh.num_of_docs);
221 NTOHUL(idh.static_num_of_docs);
222 NTOHUL(idh.num_of_words);
223 NTOHUL(idh.stemmer_num);
224 NTOHUL(idh.stem_method);
225
226 /* open .invf.idx */
227 idx = open_file (file_name, INVF_IDX_SUFFIX, "rb", MAGIC_INVI,
228 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
229
230 /* open .invf and read in its header */
231 invf = open_file (file_name, INVF_SUFFIX, "rb", MAGIC_INVF,
232 MG_ABORT);
233 fread ((char *) &ifh, sizeof (ifh), 1, invf);
234
235 /* [RPAP - Jan 97: Endian Ordering] */
236 NTOHUL(ifh.no_of_words);
237 NTOHUL(ifh.no_of_ptrs);
238 NTOHUL(ifh.skip_mode);
239 for (i = 0; i <= 15; i++)
240 NTOHUL(ifh.params[i]);
241 NTOHUL(ifh.InvfLevel);
242
243 /* make sure the inverted file does not contain skips and is not level 1 */
244 if (ifh.skip_mode != 0)
245 FatalError (0, "Can\'t make weights file from a skipped inverted file.");
246 if (ifh.InvfLevel == 1)
247 FatalError (0, "Can\'t make weights file from level 1 inverted file.");
248
249 DECODE_START (invf)
250
251 /* process each word adding its contributions to the document weights */
252 for (i = 0; i < ifh.no_of_words; i++)
253 {
254 u_char dummy1, dummy2[MAXSTEMLEN + 1];
255 mg_u_long fcnt, wcnt, blk, CurrDoc, p, j;
256 float idf;
257
258 /* give a little feedback every 4096 words */
259 if ((i & 0xfff) == 0)
260 fprintf (stderr, ".");
261
262 /* read an entry for a word, just to get p value */
263 dummy1 = fgetc (dict);
264 dummy1 = fgetc (dict);
265 fread (dummy2, sizeof (u_char), dummy1, dict);
266 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
267 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
268
269 dummy2[dummy1] = '\0';
270
271 /* [RPAP - Jan 97: Endian Ordering] */
272 NTOHUL(fcnt);
273 NTOHUL(wcnt);
274
275 p = fcnt;
276
277 idf = logN - log ((double) fcnt);
278 blk = BIO_Bblock_Init (StaticNumOfDocs, p);
279 CurrDoc = 0;
280
281 /* check the inverted file index entry for this word */
282 {
283 mg_u_long loc;
284 fread ((char *) &loc, sizeof (loc), 1, idx);
285 NTOHUL(loc); /* [RPAP - Jan 97: Endian Ordering] */
286 if (ftell (invf) != loc)
287 {
288 FatalError (1, "Word %d %d != %d", i, ftell (invf), loc);
289 }
290 }
291
292 for (j = 0; j < p; j++)
293 {
294 mg_u_long x, tf;
295 BBLOCK_DECODE (x, blk);
296 CurrDoc += x;
297
298 if (CurrDoc > idh.num_of_docs) {
299 FatalError (1, "CurrDoc = %d, number of documents = %d",
300 CurrDoc, idh.num_of_docs);
301 }
302
303 if (ifh.InvfLevel >= 2)
304 {
305 double weight;
306 GAMMA_DECODE (tf);
307 weight = tf * idf;
308 DocWeights[CurrDoc - 1] += weight * weight;
309 }
310 }
311
312 while (__btg)
313 DECODE_BIT;
314 }
315
316 DECODE_DONE
317
318 fclose (dict);
319 fclose (invf);
320 fprintf (stderr, "\n");
321
322 /* [RPAP - Jan 97: Endian Ordering] */
323 for (i = 0; i < NumPara; i++)
324 HTONF(DocWeights[i]);
325
326 f = create_file (file_name, WEIGHTS_SUFFIX, "wb", MAGIC_WGHT,
327 MG_ABORT);
328
329 fwrite ((char *) DocWeights, sizeof (float), NumPara, f);
330 fclose (f);
331 Xfree (DocWeights);
332}
333
334
335
336
337
338
339
340
341
342
343
344void
345Make_weight_approx (void)
346{
347 int i, pos, max;
348 mg_u_long buf;
349 double U, L, B;
350 FILE *approx, *exact;
351
352 exact = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
353 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
354
355 /* calculate U and L */
356 L = 1e300;
357 U = 0;
358 for (i = 0; i < NumPara; i++)
359 {
360 float wgt;
361 fread ((char *) &wgt, sizeof (wgt), 1, exact);
362 NTOHF(wgt); /* [RPAP - Jan 97: Endian Ordering] */
363 wgt = sqrt (wgt);
364 if (wgt > U)
365 U = wgt;
366 if (wgt > 0 && wgt < L)
367 L = wgt;
368
369 }
370 fseek (exact, sizeof (mg_u_long), SEEK_SET);
371
372 B = pow (U / L, pow (2.0, -(double) bits));
373
374 fprintf (stderr, "L = %f\n", L);
375 fprintf (stderr, "U = %f\n", U);
376 fprintf (stderr, "B = %f\n", B);
377
378
379
380 approx = create_file (file_name, APPROX_WEIGHTS_SUFFIX, "wb",
381 MAGIC_WGHT_APPROX, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
382
383 fwrite ((char *) &bits, sizeof (bits), 1, approx);
384 HTOND(L); /* [RPAP - Jan 97: Endian Ordering] */
385 HTOND(B); /* [RPAP - Jan 97: Endian Ordering] */
386 fwrite ((char *) &L, sizeof (L), 1, approx);
387 fwrite ((char *) &B, sizeof (B), 1, approx);
388 NTOHD(L); /* [RPAP - Jan 97: Endian Ordering] */
389 NTOHD(B); /* [RPAP - Jan 97: Endian Ordering] */
390
391 max = bits == 32 ? 0xffffffff : (1 << bits) - 1;
392 for (buf = pos = i = 0; i < NumPara; i++)
393 {
394 mg_u_long fx;
395 float wgt;
396 fread ((char *) &wgt, sizeof (wgt), 1, exact);
397 NTOHF(wgt); /* [RPAP - Jan 97: Endian Ordering] */
398 wgt = sqrt (wgt);
399 if (wgt == 0)
400 {
401 wgt = L;
402#ifndef QUIET
403 Message ("Warning: Document %d had a weight of 0.", i);
404#endif
405 }
406 fx = (int) floor (log (wgt / L) / log (B));
407
408 if (fx > max)
409 fx = max;
410
411 buf |= (fx << pos);
412 pos += bits;
413
414 if (pos >= MAXBITS)
415 {
416 HTONUL(buf);
417 fwrite ((char *) &buf, sizeof (buf), 1, approx);
418 buf = fx >> (bits - (pos - MAXBITS));
419 pos = pos - MAXBITS;
420 }
421 }
422 if (pos > 0)
423 {
424 /* [RPAP - Jan 97: Endian Ordering] */
425 HTONUL(buf);
426 fwrite ((char *) &buf, sizeof (buf), 1, approx);
427 }
428
429 fclose (approx);
430 fclose (exact);
431}
432
433
434
435
436
437void
438Make_text_idx_wgt (void)
439{
440 compressed_text_header cth;
441 int i;
442 FILE *idx_wgt, *idx, *para, *exact;
443
444 idx_wgt = create_file (file_name, TEXT_IDX_WGT_SUFFIX, "wb", MAGIC_TEXI_WGT,
445 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
446
447 /* [RJM 10/98 - Text Filename] */
448 idx = open_file (text_file_name, TEXT_IDX_SUFFIX, "rb", MAGIC_TEXI,
449 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
450 if (fread (&cth, sizeof (cth), 1, idx) != 1)
451 FatalError (1, "Unable to read header of index file");
452
453 /* [RPAP - Jan 97: Endian Ordering] */
454 NTOHUL(cth.num_of_docs);
455 NTOHD(cth.num_of_bytes); /* [RJM 07/97: 4G limit] */
456 NTOHUL(cth.num_of_words);
457 NTOHUL(cth.length_of_longest_doc);
458 NTOHD(cth.ratio);
459
460 exact = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
461 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
462
463 get_NumPara ();
464 if (cth.num_of_docs != NumPara)
465 {
466 Message ("The number of documents %d does not equal "
467 "the number of paragraphs %d.", cth.num_of_docs, NumPara);
468 Message ("Using the \"%s.invf.paragraph\" file\n", file_name);
469 para = open_file (file_name, INVF_PARAGRAPH_SUFFIX, "rb", MAGIC_PARAGRAPH,
470 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
471 }
472 else
473 para = NULL;
474
475 {
476 struct
477 {
478 mg_u_long Start;
479 float Weight;
480 }
481 data;
482 for (i = 0; i < cth.num_of_docs; i++)
483 {
484 int count;
485 fread ((char *) &data.Start, sizeof (mg_u_long), 1, idx);
486 if (para && i < cth.num_of_docs)
487 {
488 /* [RPAP - Jan 97: Endian Ordering] */
489 fread ((char *) &count, sizeof (count), 1, para);
490 NTOHSI(count);
491 }
492 else
493 count = 1;
494 while (count--)
495 {
496 fread ((char *) &data.Weight, sizeof (float), 1, exact);
497 NTOHF(data.Weight); /* [RPAP - Jan 97: Endian Ordering] */
498 data.Weight = sqrt (data.Weight);
499 HTONF(data.Weight); /* [RPAP - Jan 97: Endian Ordering] */
500 fwrite ((char *) &data, sizeof (data), 1, idx_wgt);
501 }
502 }
503 /* Write out the extra entry for the idx file */
504 fread ((char *) &data.Start, sizeof (mg_u_long), 1, idx);
505 data.Weight = 0;
506 fwrite((char*)&data, sizeof(data), 1, idx_wgt);
507 }
508
509 fclose (idx_wgt);
510 fclose (idx);
511 fclose (exact);
512 if (para)
513 fclose (para);
514}
Note: See TracBrowser for help on using the repository browser.