root/gsdl/trunk/trunk/mgpp/text/mgpp_weights_build.cpp @ 16583

Revision 16583, 10.8 KB (checked in by davidb, 12 years ago)

Undoing change committed in r16582

  • Property svn:keywords set to Author Date Id Revision
Line 
1/**************************************************************************
2 *
3 * mgpp_weights_build.cpp -- Program to build the document weights file
4 * Copyright (C) 1999  Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24
25/* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */
26#if defined (__WIN32__) || defined (__CYGWIN__)
27# include "getopt_old.h"
28#else
29# include <unistd.h>
30#endif
31
32#include "UCArray.h"
33#include "sysfuncs.h"
34#include "memlib.h"
35#include "messages.h"
36#include "local_strings.h"
37#include "bitio_gen.h"
38#include "bitio_m_stdio.h"
39#include "mg_files.h"
40#include "locallib.h"
41#include "invf.h"
42#include "FIvfLevelInfo.h"
43#include "FragLevelConvert.h"
44
45#if defined(GSDL_USE_OBJECTSPACE)
46#  include <ospace\std\map>
47#elif defined(GSDL_USE_STL_H)
48#  include <map.h>
49#else
50#  include <map>
51#endif
52
53#define MAXBITS (sizeof(unsigned long) * 8)
54
// Information kept for one tag: its number, its pointer value and its
// fragment occurrence count.
struct WBTagPtr {
  unsigned long tagNum;
  unsigned long tagPtr;
  unsigned long fragOccur;

  // every field starts out as zero
  WBTagPtr () : tagNum (0), tagPtr (0), fragOccur (0) {}
};
66
67// maps tags to tag information
68typedef map<UCArray, WBTagPtr, DictLTUCArray> WBTagDict;
69
70typedef vector<float> Weights;
71
72
73static void ReadTagDict (const invf_dict_header &idh,
74             FILE *dictFile,
75             FILE *invfIdxFile,
76             WBTagDict &tagDict) {
77  tagDict.erase (tagDict.begin(), tagDict.end());
78
79  // seek to the start of the tag information
80  fseek (dictFile, idh.tag_dict_start, SEEK_SET);
81  fseek (invfIdxFile, sizeof(unsigned long) +
82     idh.word_dict_size*sizeof(unsigned long), SEEK_SET);
83 
84  unsigned long tagNum;
85  unsigned long tagPtr;
86  dict_el thisEl;
87  for (tagNum = 0; tagNum < idh.tag_dict_size; ++tagNum) {
88    thisEl.Read (dictFile);
89    ReadUL (invfIdxFile, tagPtr);
90    tagDict[thisEl.el].tagNum = tagNum;
91    tagDict[thisEl.el].tagPtr = tagPtr;
92    tagDict[thisEl.el].fragOccur = thisEl.frag_occur;
93  }
94}
95
96static void AddWeight (Weights &w,
97               unsigned long levelDocNum,
98               unsigned long termFreq,
99               float idf) {
100  double weight = termFreq * idf;
101  w[levelDocNum-1] += weight * weight;
102}
103
104static void GenerateLevelWeights (const invf_dict_header &idh,
105                  const invf_file_header &ifh,
106                  unsigned long numLevelDocs,
107                  unsigned long levelNum,
108                  FILE *dictFile,
109                  FILE *invfFile,
110                  FILE *invfIdxFile,
111                  const FragLevelConvert &fragLevelConvert,
112                  Weights &w) {
113  // pre-allocate the right number of weights
114  w.erase (w.begin(), w.end());
115  w.insert (w.end(), (Weights::size_type)numLevelDocs, (float)0.0);
116
117  double logN = log ((double) numLevelDocs);
118
119  // reset the files
120  fseek (dictFile, idh.word_dict_start, SEEK_SET);
121  fseek (invfIdxFile, sizeof (unsigned long), SEEK_SET);
122 
123  // process each word adding its contributions to the document weights
124  unsigned long wordNum;
125  unsigned long wordStart;
126  word_dict_el wordEl;
127  wordEl.SetNumLevels (idh.num_levels);
128  for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
129#ifndef SILENT
130    // give a little feedback every 4096 words
131    if ((wordNum & 0xfff) == 0) fprintf (stderr, ".");
132#endif
133   
134    wordEl.Read (dictFile, idh.num_levels);
135    ReadUL (invfIdxFile, wordStart);
136
137    float idf = logN - log ((double) wordEl.levelFreqs[levelNum]);
138
139    // seek to the appropriate place in the inverted file
140    fseek (invfFile, wordStart, SEEK_SET);
141    stdio_bitio_buffer buffer (invfFile);
142   
143    unsigned long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
144    unsigned long fragNum = 0;
145    unsigned long levelDocNum = 0;
146    unsigned long lastLevelDocNum = 0;
147    unsigned long termFreq = 0;
148    unsigned long checkLevelFreq = 0;
149   
150    unsigned long count, i;
151    for (i=0; i<wordEl.frag_occur; ++i) {
152      fragNum += buffer.bblock_decode (B, NULL);
153      if (!ifh.word_level_index) count = buffer.gamma_decode (NULL);
154      else count = 1;
155
156      if (fragNum > idh.num_frags)
157    FatalError (1, "fragNum = %d, "
158            "number of fragments = %d\n"
159            "wordNum = %d\n"
160            "i = %d, frag_occur = %d\n",
161            fragNum, idh.num_frags, wordNum, i, wordEl.frag_occur);
162
163      if (!fragLevelConvert.FragToLevel (fragNum, levelDocNum))
164    FatalError (1, "could not convert fragment number %d in level %d", fragNum, levelNum);
165
166      if (levelDocNum == 0 || levelDocNum > numLevelDocs)
167    FatalError (1, "bad level document number %d in level %d", levelDocNum, levelNum);
168     
169      if (levelDocNum != lastLevelDocNum) {
170    // new level document
171    if (lastLevelDocNum > 0) {
172      AddWeight (w, lastLevelDocNum, termFreq, idf);
173      ++checkLevelFreq;
174    }
175    lastLevelDocNum = levelDocNum;
176    termFreq = 0;
177      }
178      termFreq += count;
179    }
180
181    if (lastLevelDocNum > 0) {
182      AddWeight (w, lastLevelDocNum, termFreq, idf);
183      ++checkLevelFreq;
184    }
185
186    if (checkLevelFreq != wordEl.levelFreqs[levelNum]) {
187      cerr << "bad level freq at level " <<levelNum<<" "<< checkLevelFreq << " != "
188       << wordEl.levelFreqs[levelNum] << ", word \""
189       << wordEl.el << "\" (" << wordNum << ")\n";
190      exit (1);
191    }
192   
193    buffer.done();
194  }
195
196  if (w.size() != numLevelDocs)
197    FatalError (1, "wrong number of weights created %d != %d",
198        w.size(), numLevelDocs);
199}
200
201static void WriteExactWeights (FILE *weightsFile,
202                   unsigned long &diskPtr,
203                   const Weights &w) {
204  diskPtr = ftell(weightsFile);
205
206  Weights::const_iterator here = w.begin();
207  Weights::const_iterator end = w.end();
208  while (here != end) {
209//      cout << *here << "\n";
210    WriteF (weightsFile, sqrt (*here));
211    ++here;
212  }
213}
214
215static void WriteApproxWeights (FILE *approxWeightsFile,
216                unsigned long &diskPtr,
217                const Weights &w,
218                unsigned char bits) {
219  diskPtr = ftell(approxWeightsFile);
220
221  // calculate L, U and B
222  double L = 1e300;
223  double U = 0;
224  float wgt;
225  Weights::const_iterator here = w.begin();
226  Weights::const_iterator end = w.end();
227  while (here != end) {
228    wgt = sqrt (*here);
229    if (wgt > U) U = wgt;
230    if (wgt > 0 && wgt < L) L = wgt;
231    ++here;
232  }
233
234  double B = pow (U / L, pow (2.0, -(double) bits));
235
236#ifndef SILENT
237  fprintf (stderr, "L = %f\n", L);
238  fprintf (stderr, "U = %f\n", U);
239  fprintf (stderr, "B = %f\n", B);
240#endif
241 
242  WriteUC (approxWeightsFile, bits);
243  WriteD (approxWeightsFile, L);
244  WriteD (approxWeightsFile, B);
245 
246
247  unsigned long max = (bits == 32) ? 0xffffffff : (1 << bits) - 1;;
248  unsigned long i=0, buf=0, pos=0;
249  here = w.begin();
250  end = w.end();
251  while (here != end) {
252    unsigned long fx;
253    wgt = sqrt (*here);
254    if (wgt == 0) {
255      wgt = L;
256#ifndef SILENT
257      Message ("Warning: Document %d had a weight of 0.", i);
258#endif
259    }
260    fx = (unsigned long) floor (log (wgt / L) / log (B));
261
262    if (fx > max) fx = max;
263
264    buf |= (fx << pos);
265    pos += bits;
266
267    if (pos >= MAXBITS) {
268      WriteUL (approxWeightsFile, buf);
269      buf = fx >> (bits - (pos - MAXBITS));
270      pos = pos - MAXBITS;
271    }
272
273    ++here; ++i;
274  }
275
276  // write out the last bits
277  if (pos > 0) WriteUL (approxWeightsFile, buf);
278}
279
280int main (int argc, char **argv) {
281  unsigned char bits = 8;
282  char *filename = "";
283  int ch;
284  opterr = 0;
285  msg_prefix = argv[0];
286
287  while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
288    switch (ch) {
289    case 'f':       // input file
290      filename = optarg;
291      break;
292    case 'd':
293      set_basepath (optarg);
294      break;
295    case 'b':
296      bits = atoi (optarg);
297      if (bits > 32) {
298    fprintf (stderr, "b may only take values 0-32\n");
299    exit (1);
300      }
301      break;
302    case 'h':
303    case '?':
304      fprintf (stderr, "usage: %s [-f input_file]"
305           "[-d data directory] [-b bits] [-h]\n", argv[0]);
306      exit (1);
307    }
308  }
309
310 
311  // open the dictionary
312  FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
313                  MAGIC_STEM_BUILD, MG_ABORT);
314  invf_dict_header idh;
315  idh.Read (dictFile);
316
317  // open the inverted file
318  FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
319            MAGIC_INVF, MG_ABORT);
320  invf_file_header ifh;
321  ifh.Read (invfFile);
322  if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
323    FatalError (1, "The invf file contains skips. Unable to create weights.");
324
325  // open the inverted index file
326  FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
327                 MAGIC_INVI, MG_ABORT);
328 
329  // read the level information
330  FILE *levelFile = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
331                   MAGIC_INVF_LEVELS, MG_ABORT);
332  FIvfLevel ivfLevel;
333  ivfLevel.Read (levelFile);
334  fclose (levelFile);
335
336  // read in the tag dictionary and inverted file pointers
337  WBTagDict tagDict;
338  ReadTagDict (idh, dictFile, invfIdxFile, tagDict);
339
340 
341  // create the weights file
342  FILE *weightsFile = create_file (filename, WEIGHTS_SUFFIX, "wb",
343                   MAGIC_WGHT, MG_ABORT);
344
345  // create the approx weights file
346  FILE *approxWeightsFile = create_file (filename, APPROX_WEIGHTS_SUFFIX, "wb",
347                     MAGIC_WGHT_APPROX, MG_ABORT);
348
349 
350  // create weights for each document level
351  FragLevelConvert fragLevelConvert;
352  Weights w;
353  IvfLevelInfoMap::iterator levelHere = ivfLevel.levelInfo.begin();
354  IvfLevelInfoMap::iterator levelEnd = ivfLevel.levelInfo.end();
355  unsigned long levelNum = 0;
356  while (levelHere != levelEnd) {
357    const UCArray &levelName = (*levelHere).first;
358   
359    // read the tag information about this level
360    fragLevelConvert.Read(invfFile, tagDict[levelName].tagPtr,
361              idh.num_frags, tagDict[levelName].fragOccur);
362   
363    // create the weights for this level
364    GenerateLevelWeights (idh, ifh, tagDict[levelName].fragOccur,
365              levelNum, dictFile, invfFile, invfIdxFile,
366              fragLevelConvert, w);
367
368    // write out the exact weights
369    WriteExactWeights (weightsFile,
370               (*levelHere).second.exactWeightsDiskPtr,
371               w);
372   
373    // write out the approximate weights
374    WriteApproxWeights (approxWeightsFile,
375            (*levelHere).second.approxWeightsDiskPtr,
376            w, bits);
377   
378    ++levelHere; ++levelNum;
379  }
380 
381
382  // close input files
383  fclose (dictFile);
384  fclose (invfFile);
385  fclose (invfIdxFile);
386
387  // update the level information
388  levelFile = create_file (filename, INVF_LEVEL_SUFFIX, "wb",
389               MAGIC_INVF_LEVELS, MG_ABORT);
390  ivfLevel.Write (levelFile);
391  fclose (levelFile);
392
393  // close output files
394  fclose (weightsFile);
395  fclose (approxWeightsFile);
396
397
398  return 0;
399}
Note: See TracBrowser for help on using the browser.