source: trunk/gsdl/src/mgpp/text/mg_weights_build.cpp@ 860

Last change on this file since 860 was 860, checked in by rjmcnab, 23 years ago

Fixed a couple of bugs and made building silent if needed.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.6 KB
Line 
1/**************************************************************************
2 *
3 * mg_weights_build.cpp -- Program to build the document weights file
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_weights_build.cpp 860 2000-01-18 03:53:24Z rjmcnab $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "memlib.h"
26#include "messages.h"
27#include "local_strings.h"
28#include "bitio_gen.h"
29#include "bitio_m_stdio.h"
30
31#include "mg_files.h"
32#include "locallib.h"
33#include "invf.h"
34// #include "WordData.h"
35#include "UCArray.h"
36#include "FIvfLevelInfo.h"
37#include "FragLevelConvert.h"
38
39#if defined(GSDL_USE_OBJECTSPACE)
40# include <ospace\std\map>
41#elif defined(GSDL_USE_STL_H)
42# include <map.h>
43#else
44# include <map>
45#endif
46
47
48/*
49 $Log$
50 Revision 1.2 2000/01/18 03:53:24 rjmcnab
51 Fixed a couple of bugs and made building silent if needed.
52
53 Revision 1.1 2000/01/14 02:26:21 sjboddie
54 Rodgers new C++ mg
55
56 Revision 1.1 1999/10/11 02:58:06 cs025
57 Base install of MG-PP
58
59 Revision 1.1 1999/08/10 21:18:16 sjboddie
60 renamed mg-1.3d directory mg
61
62 Revision 1.2 1998/11/25 07:55:49 rjmcnab
63
64 Modified mg so that you can specify the stemmer you want
65 to use via a command line option. You specify it to
66 mg_passes during the build process. The number of the
67 stemmer that you used is stored within the inverted
68 dictionary header and the stemmed dictionary header so
69 the correct stemmer is used in later stages of building
70 and querying.
71
72 Revision 1.1 1998/11/17 09:35:22 rjmcnab
73 *** empty log message ***
74
75 * Revision 1.4 1994/11/29 00:32:05 tes
76 * Committing the new merged files and changes.
77 *
78 * Revision 1.3 1994/10/20 03:57:00 tes
79 * I have rewritten the boolean query optimiser and abstracted out the
80 * components of the boolean query.
81 *
82 * Revision 1.2 1994/09/20 04:41:55 tes
83 * For version 1.1
84 *
85 */
86
// Number of bits in an unsigned long, computed as sizeof * 8 (i.e. assuming
// 8-bit bytes).  WriteApproxWeights flushes its unsigned long bit buffer
// whenever this many bits have been packed into it.
87#define MAXBITS (sizeof(unsigned long) * 8)
88
// Per-tag record kept in the tag dictionary: the tag's position in the
// on-disk tag dictionary, the pointer read for it from the inverted index
// file, and its fragment occurrence count.  A default-constructed record has
// every field set to zero.
struct WBTagPtr {
  unsigned long tagNum;
  unsigned long tagPtr;
  unsigned long fragOccur;

  WBTagPtr () : tagNum (0), tagPtr (0), fragOccur (0) {}
};
100
101// maps tags to tag information
// (ReadTagDict fills one WBTagPtr per tag element read from the dictionary;
// keys are ordered by DictLTUCArray)
102typedef map<UCArray, WBTagPtr, DictLTUCArray> WBTagDict;
103
// One accumulator per document of a level.  AddWeight indexes it with
// (document number - 1) and adds squared term weights; WriteApproxWeights
// takes the square root of each entry before encoding it.
104typedef vector<float> Weights;
105
106
107static void ReadTagDict (const invf_dict_header &idh,
108 FILE *dictFile,
109 FILE *invfIdxFile,
110 WBTagDict &tagDict) {
111 tagDict.erase (tagDict.begin(), tagDict.end());
112
113 // seek to the start of the tag information
114 fseek (dictFile, idh.tag_dict_start, SEEK_SET);
115 fseek (invfIdxFile, sizeof(unsigned long) +
116 idh.word_dict_size*sizeof(unsigned long), SEEK_SET);
117
118 unsigned long tagNum;
119 unsigned long tagPtr;
120 dict_el thisEl;
121 for (tagNum = 0; tagNum < idh.tag_dict_size; tagNum++) {
122 thisEl.Read (dictFile);
123 ReadUL (invfIdxFile, tagPtr);
124 tagDict[thisEl.el].tagNum = tagNum;
125 tagDict[thisEl.el].tagPtr = tagPtr;
126 tagDict[thisEl.el].fragOccur = thisEl.frag_occur;
127 }
128}
129
130static void AddWeight (Weights &w,
131 unsigned long levelDocNum,
132 unsigned long termFreq,
133 float idf) {
134 double weight = termFreq * idf;
135 w[levelDocNum-1] += weight * weight;
136}
137
138static void GenerateLevelWeights (const invf_dict_header &idh,
139 const invf_file_header &ifh,
140 unsigned long numLevelDocs,
141 unsigned long levelNum,
142 FILE *dictFile,
143 FILE *invfFile,
144 FILE *invfIdxFile,
145 const FragLevelConvert &fragLevelConvert,
146 Weights &w) {
147 // pre-allocate the right number of weights
148 w.erase (w.begin(), w.end());
149 w.insert (w.end(), (Weights::size_type)numLevelDocs, (float)0.0);
150
151 double logN = log ((double) numLevelDocs);
152
153 // reset the files
154 fseek (dictFile, idh.word_dict_start, SEEK_SET);
155 fseek (invfIdxFile, sizeof (unsigned long), SEEK_SET);
156
157 // process each word adding its contributions to the document weights
158 unsigned long wordNum;
159 unsigned long wordStart;
160 word_dict_el wordEl;
161 wordEl.SetNumLevels (idh.num_levels);
162 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
163#ifndef SILENT
164 // give a little feedback every 4096 words
165 if ((wordNum & 0xfff) == 0) fprintf (stderr, ".");
166#endif
167
168 wordEl.Read (dictFile, idh.num_levels);
169 ReadUL (invfIdxFile, wordStart);
170
171 float idf = logN - log ((double) wordEl.levelFreqs[levelNum]);
172
173 // seek to the appropriate place in the inverted file
174 fseek (invfFile, wordStart, SEEK_SET);
175 stdio_bitio_buffer buffer (invfFile);
176
177 unsigned long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
178 unsigned long fragNum = 0;
179 unsigned long levelDocNum = 0;
180 unsigned long lastLevelDocNum = 0;
181 unsigned long termFreq = 0;
182 unsigned long checkLevelFreq = 0;
183
184 unsigned long count, i;
185 for (i=0; i<wordEl.frag_occur; i++) {
186 fragNum += buffer.bblock_decode (B, NULL);
187 if (!ifh.word_level_index) count = buffer.gamma_decode (NULL);
188 else count = 1;
189
190 if (fragNum > idh.num_frags)
191 FatalError (1, "fragNum = %d, "
192 "number of fragments = %d\n"
193 "wordNum = %d\n"
194 "i = %d, frag_occur = %d\n",
195 fragNum, idh.num_frags, wordNum, i, wordEl.frag_occur);
196
197 if (!fragLevelConvert.FragToLevel (fragNum, levelDocNum))
198 FatalError (1, "could not convert fragment number %d", fragNum);
199
200 if (levelDocNum == 0 || levelDocNum > numLevelDocs)
201 FatalError (1, "bad level document number %d", levelDocNum);
202
203 if (levelDocNum != lastLevelDocNum) {
204 // new level document
205 if (lastLevelDocNum > 0) {
206 AddWeight (w, lastLevelDocNum, termFreq, idf);
207 checkLevelFreq++;
208 }
209 lastLevelDocNum = levelDocNum;
210 termFreq = 0;
211 }
212 termFreq += count;
213 }
214
215 if (lastLevelDocNum > 0) {
216 AddWeight (w, lastLevelDocNum, termFreq, idf);
217 checkLevelFreq++;
218 }
219
220 if (checkLevelFreq != wordEl.levelFreqs[levelNum]) {
221 cerr << "bad level freq " << checkLevelFreq << " != "
222 << wordEl.levelFreqs[levelNum] << ", word \""
223 << wordEl.el << "\" (" << wordNum << ")\n";
224 exit (1);
225 }
226
227 buffer.done();
228 }
229
230 if (w.size() != numLevelDocs)
231 FatalError (1, "wrong number of weights created %d != %d",
232 w.size(), numLevelDocs);
233}
234
235static void WriteExactWeights (FILE *weightsFile,
236 unsigned long &diskPtr,
237 const Weights &w) {
238 diskPtr = ftell(weightsFile);
239
240 Weights::const_iterator here = w.begin();
241 Weights::const_iterator end = w.end();
242 while (here != end) {
243// cout << *here << "\n";
244 WriteF (weightsFile, *here);
245 here++;
246 }
247}
248
249static void WriteApproxWeights (FILE *approxWeightsFile,
250 unsigned long &diskPtr,
251 const Weights &w,
252 unsigned char bits) {
253 diskPtr = ftell(approxWeightsFile);
254
255 // calculate L, U and B
256 double L = 1e300;
257 double U = 0;
258 float wgt;
259 Weights::const_iterator here = w.begin();
260 Weights::const_iterator end = w.end();
261 while (here != end) {
262 wgt = sqrt (*here);
263 if (wgt > U) U = wgt;
264 if (wgt > 0 && wgt < L) L = wgt;
265 here++;
266 }
267
268 double B = pow (U / L, pow (2.0, -(double) bits));
269
270#ifndef SILENT
271 fprintf (stderr, "L = %f\n", L);
272 fprintf (stderr, "U = %f\n", U);
273 fprintf (stderr, "B = %f\n", B);
274#endif
275
276 WriteUC (approxWeightsFile, bits);
277 WriteD (approxWeightsFile, L);
278 WriteD (approxWeightsFile, B);
279
280
281 unsigned long max = (bits == 32) ? 0xffffffff : (1 << bits) - 1;;
282 unsigned long i=0, buf=0, pos=0;
283 here = w.begin();
284 end = w.end();
285 while (here != end) {
286 unsigned long fx;
287 wgt = sqrt (*here);
288 if (wgt == 0) {
289 wgt = L;
290#ifndef SILENT
291 Message ("Warning: Document %d had a weight of 0.", i);
292#endif
293 }
294 fx = (unsigned long) floor (log (wgt / L) / log (B));
295
296 if (fx > max) fx = max;
297
298 buf |= (fx << pos);
299 pos += bits;
300
301 if (pos >= MAXBITS) {
302 WriteUL (approxWeightsFile, buf);
303 buf = fx >> (bits - (pos - MAXBITS));
304 pos = pos - MAXBITS;
305 }
306
307 here++; i++;
308 }
309
310 // write out the last bits
311 if (pos > 0) WriteUL (approxWeightsFile, buf);
312}
313
314int main (int argc, char **argv) {
315 unsigned char bits = 8;
316 char *filename = "";
317 int ch;
318 opterr = 0;
319 msg_prefix = argv[0];
320
321 while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
322 switch (ch) {
323 case 'f': // input file
324 filename = optarg;
325 break;
326 case 'd':
327 set_basepath (optarg);
328 break;
329 case 'b':
330 bits = atoi (optarg);
331 if (bits > 32) {
332 fprintf (stderr, "b may only take values 0-32\n");
333 exit (1);
334 }
335 break;
336 case 'h':
337 case '?':
338 fprintf (stderr, "usage: %s [-f input_file]"
339 "[-d data directory] [-b bits] [-h]\n", argv[0]);
340 exit (1);
341 }
342 }
343
344
345 // open the dictionary
346 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
347 MAGIC_STEM_BUILD, MG_ABORT);
348 invf_dict_header idh;
349 idh.Read (dictFile);
350
351 // open the inverted file
352 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
353 MAGIC_INVF, MG_ABORT);
354 invf_file_header ifh;
355 ifh.Read (invfFile);
356 if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
357 FatalError (1, "The invf file contains skips. Unable to create weights.");
358
359 // open the inverted index file
360 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
361 MAGIC_INVI, MG_ABORT);
362
363 // read the level information
364 FILE *levelFile = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
365 MAGIC_INVF_LEVELS, MG_ABORT);
366 FIvfLevel ivfLevel;
367 ivfLevel.Read (levelFile);
368 fclose (levelFile);
369
370 // read in the tag dictionary and inverted file pointers
371 WBTagDict tagDict;
372 ReadTagDict (idh, dictFile, invfIdxFile, tagDict);
373
374
375 // create the weights file
376 FILE *weightsFile = create_file (filename, WEIGHTS_SUFFIX, "wb",
377 MAGIC_WGHT, MG_ABORT);
378
379 // create the approx weights file
380 FILE *approxWeightsFile = create_file (filename, APPROX_WEIGHTS_SUFFIX, "wb",
381 MAGIC_WGHT_APPROX, MG_ABORT);
382
383
384 // create weights for each document level
385 FragLevelConvert fragLevelConvert;
386 Weights w;
387 IvfLevelInfoMap::iterator levelHere = ivfLevel.levelInfo.begin();
388 IvfLevelInfoMap::iterator levelEnd = ivfLevel.levelInfo.end();
389 unsigned long levelNum = 0;
390 while (levelHere != levelEnd) {
391 const UCArray &levelName = (*levelHere).first;
392
393 // read the tag information about this level
394 fragLevelConvert.Read(invfFile, tagDict[levelName].tagPtr,
395 idh.num_frags, tagDict[levelName].fragOccur);
396
397 // create the weights for this level
398 GenerateLevelWeights (idh, ifh, tagDict[levelName].fragOccur,
399 levelNum, dictFile, invfFile, invfIdxFile,
400 fragLevelConvert, w);
401
402 // write out the exact weights
403 WriteExactWeights (weightsFile,
404 (*levelHere).second.exactWeightsDiskPtr,
405 w);
406
407 // write out the approximate weights
408 WriteApproxWeights (approxWeightsFile,
409 (*levelHere).second.approxWeightsDiskPtr,
410 w, bits);
411
412 levelHere++; levelNum++;
413 }
414
415
416 // close input files
417 fclose (dictFile);
418 fclose (invfFile);
419 fclose (invfIdxFile);
420
421 // update the level information
422 levelFile = create_file (filename, INVF_LEVEL_SUFFIX, "wb",
423 MAGIC_INVF_LEVELS, MG_ABORT);
424 ivfLevel.Write (levelFile);
425 fclose (levelFile);
426
427 // close output files
428 fclose (weightsFile);
429 fclose (approxWeightsFile);
430
431
432 return 0;
433}
Note: See TracBrowser for help on using the repository browser.