root/gsdl/trunk/trunk/mg/src/text/mg_weights_build.c @ 16583

Revision 16583, 13.1 KB (checked in by davidb, 12 years ago)

Undoing change committed in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**************************************************************************
2 *
3 * mg_weights_build.c -- Program to build the document weights file
4 * Copyright (C) 1994  Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id$
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "memlib.h"
26#include "messages.h"
27#include "local_strings.h"
28#include "bitio_gen.h"
29#include "bitio_m.h"
30#include "bitio_m_stdio.h"
31#include "timing.h"
32#include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "mg_files.h"
35#include "locallib.h"
36#include "invf.h"
37#include "text.h"
38#include "words.h"
39
40#define MAXBITS (sizeof(unsigned long) * 8)
41
42/*
43   $Log$
44   Revision 1.2  2004/05/24 21:12:18  kjdon
45   changed a message
46
47   Revision 1.1  2003/02/20 21:18:24  mdewsnip
48   Addition of MG package for search and retrieval
49
50   Revision 1.1  1999/08/10 21:18:16  sjboddie
51   renamed mg-1.3d directory mg
52
53   Revision 1.2  1998/11/25 07:55:49  rjmcnab
54
55   Modified mg so that you can specify the stemmer you want
56   to use via a command line option. You specify it to
57   mg_passes during the build process. The number of the
58   stemmer that you used is stored within the inverted
59   dictionary header and the stemmed dictionary header so
60   the correct stemmer is used in later stages of building
61   and querying.
62
63   Revision 1.1  1998/11/17 09:35:22  rjmcnab
64   *** empty log message ***
65
66   * Revision 1.4  1994/11/29  00:32:05  tes
67   * Committing the new merged files and changes.
68   *
69   * Revision 1.3  1994/10/20  03:57:00  tes
70   * I have rewritten the boolean query optimiser and abstracted out the
71   * components of the boolean query.
72   *
73   * Revision 1.2  1994/09/20  04:41:55  tes
74   * For version 1.1
75   *
76 */
77
78static char *RCSID = "$Id$";  /* revision-control identification string */
79
80unsigned char bits = 8;  /* bits per approximate weight; overridden by the -b option */
81static char *file_name = "";  /* base name for the inverted-file family of files; set by -f */
82static char *text_file_name = "";  /* base name for the text index; set by -t, else taken from -f */
83static unsigned long NumPara = 0;  /* cached num_of_docs from the .invf.dict header; 0 = not loaded */
84static unsigned long StaticNumOfDocs = 0;  /* cached static_num_of_docs from the .invf.dict header; 0 = not loaded */
85
86unsigned long get_NumPara (void);  /* load (once) and return NumPara */
87unsigned long get_StaticNumOfDocs (void);  /* load (once) and return StaticNumOfDocs */
88void GenerateWeights (void);  /* build the exact weights file if it does not exist yet */
89void Make_weight_approx (void);  /* build the approximate (quantised) weights file */
90void Make_text_idx_wgt (void);  /* build the combined text-index/weight file */
91
92
93int main (int argc, char **argv)
94{
95  ProgTime StartTime;
96  int ch;
97  opterr = 0;
98  msg_prefix = argv[0];
99  while ((ch = getopt (argc, argv, "f:t:d:b:sh")) != -1) /* [RJM 10/98 - Text Filename] */
100    switch (ch)
101      {
102      case 'f':     /* input file */
103    file_name = optarg;
104    if (strlen(text_file_name) == 0) text_file_name = optarg;
105    break;
106      /* [RJM 10/98 - Text Filename] */
107      case 't':     /* text input file */
108    text_file_name = optarg;
109    break;
110      case 'd':
111    set_basepath (optarg);
112    break;
113      case 'b':
114    bits = atoi (optarg);
115    if (bits > 32)
116      {
117        fprintf (stderr, "b may only take values 0-32\n");
118        exit (1);
119      }
120    break;
121      case 'h':
122      case '?':
123    fprintf (stderr, "usage: %s [-f input_file]"
124         "[-d data directory] [-b bits] [-s] [-h]\n", argv[0]);
125    exit (1);
126      }
127  GetTime (&StartTime);
128
129  GenerateWeights ();
130
131  Make_weight_approx ();
132
133  Make_text_idx_wgt ();
134
135  Message ("%s", ElapsedTime (&StartTime, NULL));
136
137  return 0;
138}
139
140
141
142
143unsigned long
144get_NumPara (void)
145{
146  struct invf_dict_header idh;
147  FILE *invf_dict;
148  if (NumPara)
149    return (NumPara);
150  invf_dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
151             MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */
152  fread ((char *) &idh, sizeof (idh), 1, invf_dict);
153  fclose (invf_dict);
154  NTOHUL2(idh.num_of_docs, NumPara);  /* [RPAP - Jan 97: Endian Ordering] */
155  return NumPara;
156}
157
158
159
160unsigned long
161get_StaticNumOfDocs (void)
162/* the static number of documents is the N parameter used to
163 * decode document gaps in the inverted file encoded using
164 * the Bblock method.
165 */
166{
167  struct invf_dict_header idh;
168  FILE *invf_dict;
169  if (StaticNumOfDocs)
170    return (StaticNumOfDocs);
171  invf_dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
172             MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */
173  fread ((char *) &idh, sizeof (idh), 1, invf_dict);
174  fclose (invf_dict);
175  NTOHUL2(idh.static_num_of_docs, StaticNumOfDocs);  /* [RPAP - Jan 97: Endian Ordering] */
176  return StaticNumOfDocs;
177}
178
179
180
181void GenerateWeights (void) {
182  FILE *dict, *invf, *f, *idx;
183  struct invf_dict_header idh;
184  struct invf_file_header ifh;
185  int i;
186  double logN;
187  float *DocWeights;
188
189  /* make sure the globals NumPara and StaticNumOfDocs are loaded */
190  get_NumPara ();
191  get_StaticNumOfDocs ();
192
193  /* check to see if the weights file has already been built */
194  if ((f = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
195              MG_CONTINUE)) != NULL) {
196      fclose (f);
197      return;
198  }
199  Message ("The file \"%s.weight\" does not exist.", file_name);
200  Message ("Building the weight data from the file \"%s.invf\".", file_name);
201
202  logN = log ((double) NumPara);
203
204  /* allocate memory for the weights */
205  if (!(DocWeights = Xmalloc (sizeof (float) * (NumPara + 1))))
206      FatalError (1, "No memory for doc weights");
207  bzero ((char *) DocWeights, sizeof (float) * (NumPara + 1));
208
209  /* open the .invf.dict file and read in its header */
210  dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
211            MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */
212  fread ((char *) &idh, sizeof (idh), 1, dict);
213
214  /* [RPAP - Jan 97: Endian Ordering] */
215  NTOHUL(idh.lookback);
216  NTOHUL(idh.dict_size);
217  NTOHUL(idh.total_bytes);
218  NTOHUL(idh.index_string_bytes);
219  NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
220  NTOHUL(idh.num_of_docs);
221  NTOHUL(idh.static_num_of_docs);
222  NTOHUL(idh.num_of_words);
223  NTOHUL(idh.stemmer_num);
224  NTOHUL(idh.stem_method);
225
226  /* open .invf.idx */
227  idx = open_file (file_name, INVF_IDX_SUFFIX, "rb", MAGIC_INVI,
228           MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */
229
230  /* open .invf and read in its header */
231  invf = open_file (file_name, INVF_SUFFIX, "rb", MAGIC_INVF,
232            MG_ABORT);
233  fread ((char *) &ifh, sizeof (ifh), 1, invf);
234
235  /* [RPAP - Jan 97: Endian Ordering] */
236  NTOHUL(ifh.no_of_words);
237  NTOHUL(ifh.no_of_ptrs);
238  NTOHUL(ifh.skip_mode);
239  for (i = 0; i <= 15; i++)
240    NTOHUL(ifh.params[i]);
241  NTOHUL(ifh.InvfLevel);
242
243  /* make sure the inverted file does not contain skips and is not level 1 */
244  if (ifh.skip_mode != 0)
245    FatalError (0, "Can\'t make weights file from a skipped inverted file.");
246  if (ifh.InvfLevel == 1)
247    FatalError (0, "Can\'t make weights file from level 1 inverted file.");
248
249  DECODE_START (invf)
250
251    /* process each word adding its contributions to the document weights */
252    for (i = 0; i < ifh.no_of_words; i++)
253    {
254      u_char dummy1, dummy2[MAXSTEMLEN + 1];
255      unsigned long fcnt, wcnt, blk, CurrDoc, p, j;
256      float idf;
257
258      /* give a little feedback every 4096 words */
259      if ((i & 0xfff) == 0)
260    fprintf (stderr, ".");
261
262      /* read an entry for a word, just to get p value */
263      dummy1 = fgetc (dict);
264      dummy1 = fgetc (dict);
265      fread (dummy2, sizeof (u_char), dummy1, dict);
266      fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
267      fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
268
269      dummy2[dummy1] = '\0';
270
271      /* [RPAP - Jan 97: Endian Ordering] */
272      NTOHUL(fcnt);
273      NTOHUL(wcnt);
274
275      p = fcnt;
276
277      idf = logN - log ((double) fcnt);
278      blk = BIO_Bblock_Init (StaticNumOfDocs, p);
279      CurrDoc = 0;
280
281      /* check the inverted file index entry for this word */
282      {
283    unsigned long loc;
284    fread ((char *) &loc, sizeof (loc), 1, idx);
285    NTOHUL(loc);  /* [RPAP - Jan 97: Endian Ordering] */
286    if (ftell (invf) != loc)
287      {
288        FatalError (1, "Word %d  %d != %d", i, ftell (invf), loc);
289      }
290      }
291
292      for (j = 0; j < p; j++)
293    {
294      unsigned long x, tf;
295      BBLOCK_DECODE (x, blk);
296      CurrDoc += x;
297
298      if (CurrDoc > idh.num_of_docs) {
299        FatalError (1, "CurrDoc = %d, number of documents = %d",
300            CurrDoc, idh.num_of_docs);
301      }   
302
303      if (ifh.InvfLevel >= 2)
304        {
305          double weight;
306          GAMMA_DECODE (tf);
307          weight = tf * idf;
308          DocWeights[CurrDoc - 1] += weight * weight;
309        }
310    }
311     
312      while (__btg)
313    DECODE_BIT;
314    }
315
316  DECODE_DONE
317
318  fclose (dict);
319  fclose (invf);
320  fprintf (stderr, "\n");
321
322  /* [RPAP - Jan 97: Endian Ordering] */
323  for (i = 0; i < NumPara; i++)
324    HTONF(DocWeights[i]);
325
326  f = create_file (file_name, WEIGHTS_SUFFIX, "wb", MAGIC_WGHT,
327           MG_ABORT);
328
329  fwrite ((char *) DocWeights, sizeof (float), NumPara, f);
330  fclose (f);
331  Xfree (DocWeights);
332}
333
334
335
336
337
338
339
340
341
342
343
344void
345Make_weight_approx (void)
346{
347  int i, pos, max;
348  unsigned long buf;
349  double U, L, B;
350  FILE *approx, *exact;
351
352  exact = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
353             MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */
354
355  /* calculate U and L */
356  L = 1e300;
357  U = 0;
358  for (i = 0; i < NumPara; i++)
359    {
360      float wgt;
361      fread ((char *) &wgt, sizeof (wgt), 1, exact);
362      NTOHF(wgt);  /* [RPAP - Jan 97: Endian Ordering] */
363      wgt = sqrt (wgt);
364      if (wgt > U)
365    U = wgt;
366      if (wgt > 0 && wgt < L)
367    L = wgt;
368
369    }
370  fseek (exact, sizeof (u_long), SEEK_SET);
371
372  B = pow (U / L, pow (2.0, -(double) bits));
373
374  fprintf (stderr, "L = %f\n", L);
375  fprintf (stderr, "U = %f\n", U);
376  fprintf (stderr, "B = %f\n", B);
377
378
379
380  approx = create_file (file_name, APPROX_WEIGHTS_SUFFIX, "wb",
381            MAGIC_WGHT_APPROX, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */
382
383  fwrite ((char *) &bits, sizeof (bits), 1, approx);
384  HTOND(L);  /* [RPAP - Jan 97: Endian Ordering] */
385  HTOND(B);  /* [RPAP - Jan 97: Endian Ordering] */
386  fwrite ((char *) &L, sizeof (L), 1, approx);
387  fwrite ((char *) &B, sizeof (B), 1, approx);
388  NTOHD(L);  /* [RPAP - Jan 97: Endian Ordering] */
389  NTOHD(B);  /* [RPAP - Jan 97: Endian Ordering] */
390
391  max = bits == 32 ? 0xffffffff : (1 << bits) - 1;
392  for (buf = pos = i = 0; i < NumPara; i++)
393    {
394      unsigned long fx;
395      float wgt;
396      fread ((char *) &wgt, sizeof (wgt), 1, exact);
397      NTOHF(wgt);  /* [RPAP - Jan 97: Endian Ordering] */
398      wgt = sqrt (wgt);
399      if (wgt == 0)
400    {
401      wgt = L;
402#ifndef QUIET
403      Message ("Warning: Document %d had a weight of 0.", i);
404#endif
405    }
406      fx = (int) floor (log (wgt / L) / log (B));
407
408      if (fx > max)
409    fx = max;
410
411      buf |= (fx << pos);
412      pos += bits;
413
414      if (pos >= MAXBITS)
415    {
416      HTONUL(buf);
417      fwrite ((char *) &buf, sizeof (buf), 1, approx);
418      buf = fx >> (bits - (pos - MAXBITS));
419      pos = pos - MAXBITS;
420    }
421    }
422  if (pos > 0)
423    {
424      /* [RPAP - Jan 97: Endian Ordering] */
425      HTONUL(buf);
426      fwrite ((char *) &buf, sizeof (buf), 1, approx);
427    }
428
429  fclose (approx);
430  fclose (exact);
431}
432
433
434
435
436
437void
438Make_text_idx_wgt (void)
439{
440  compressed_text_header cth;
441  int i;
442  FILE *idx_wgt, *idx, *para, *exact;
443
444  idx_wgt = create_file (file_name, TEXT_IDX_WGT_SUFFIX, "wb", MAGIC_TEXI_WGT,
445             MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */
446
447  /* [RJM 10/98 - Text Filename] */
448  idx = open_file (text_file_name, TEXT_IDX_SUFFIX, "rb", MAGIC_TEXI,
449           MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */
450  if (fread (&cth, sizeof (cth), 1, idx) != 1)
451    FatalError (1, "Unable to read header of index file");
452
453  /* [RPAP - Jan 97: Endian Ordering] */
454  NTOHUL(cth.num_of_docs);
455  NTOHD(cth.num_of_bytes); /* [RJM 07/97: 4G limit] */
456  NTOHUL(cth.num_of_words);
457  NTOHUL(cth.length_of_longest_doc);
458  NTOHD(cth.ratio);
459
460  exact = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
461             MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */
462
463  get_NumPara ();
464  if (cth.num_of_docs != NumPara)
465    {
466      Message ("The number of documents %d does not equal "
467           "the number of paragraphs %d.", cth.num_of_docs, NumPara);
468      Message ("Using the \"%s.invf.paragraph\" file\n", file_name);
469      para = open_file (file_name, INVF_PARAGRAPH_SUFFIX, "rb", MAGIC_PARAGRAPH,
470            MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */
471    }
472  else
473    para = NULL;
474
475  {
476    struct
477      {
478    unsigned long Start;
479    float Weight;
480      }
481    data;
482    for (i = 0; i < cth.num_of_docs; i++)
483      {
484    int count;
485    fread ((char *) &data.Start, sizeof (unsigned long), 1, idx);
486    if (para && i < cth.num_of_docs)
487      {
488        /* [RPAP - Jan 97: Endian Ordering] */
489        fread ((char *) &count, sizeof (count), 1, para);
490        NTOHSI(count);
491      }
492    else
493      count = 1;
494    while (count--)
495      {
496        fread ((char *) &data.Weight, sizeof (float), 1, exact);
497        NTOHF(data.Weight);  /* [RPAP - Jan 97: Endian Ordering] */
498        data.Weight = sqrt (data.Weight);
499        HTONF(data.Weight);  /* [RPAP - Jan 97: Endian Ordering] */
500        fwrite ((char *) &data, sizeof (data), 1, idx_wgt);
501      }
502      }
503    /* Write out the extra entry for the idx file */
504    fread ((char *) &data.Start, sizeof (unsigned long), 1, idx);
505    data.Weight = 0;
506    fwrite((char*)&data, sizeof(data), 1, idx_wgt);
507  }
508
509  fclose (idx_wgt);
510  fclose (idx);
511  fclose (exact);
512  if (para)
513    fclose (para);
514}
Note: See TracBrowser for help on using the browser.