source: trunk/gsdl/src/mgpp/text/mg_weights_build.cpp@ 861

Last change on this file since 861 was 861, checked in by rjmcnab, 24 years ago

fixed a few more bugs

  • Property svn:keywords set to Author Date Id Revision
File size: 11.7 KB
Line 
1/**************************************************************************
2 *
3 * mg_weights_build.cpp -- Program to build the document weights file
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_weights_build.cpp 861 2000-01-18 23:24:19Z rjmcnab $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "memlib.h"
26#include "messages.h"
27#include "local_strings.h"
28#include "bitio_gen.h"
29#include "bitio_m_stdio.h"
30
31#include "mg_files.h"
32#include "locallib.h"
33#include "invf.h"
34// #include "WordData.h"
35#include "UCArray.h"
36#include "FIvfLevelInfo.h"
37#include "FragLevelConvert.h"
38
39#if defined(GSDL_USE_OBJECTSPACE)
40# include <ospace\std\map>
41#elif defined(GSDL_USE_STL_H)
42# include <map.h>
43#else
44# include <map>
45#endif
46
47
48/*
49 $Log$
50 Revision 1.3 2000/01/18 23:24:19 rjmcnab
51 fixed a few more bugs
52
53 Revision 1.2 2000/01/18 03:53:24 rjmcnab
54 Fixed a couple of bugs and made building silent if needed.
55
56 Revision 1.1 2000/01/14 02:26:21 sjboddie
57 Rodgers new C++ mg
58
59 Revision 1.1 1999/10/11 02:58:06 cs025
60 Base install of MG-PP
61
62 Revision 1.1 1999/08/10 21:18:16 sjboddie
63 renamed mg-1.3d directory mg
64
65 Revision 1.2 1998/11/25 07:55:49 rjmcnab
66
67 Modified mg so that you can specify the stemmer you want
68 to use via a command line option. You specify it to
69 mg_passes during the build process. The number of the
70 stemmer that you used is stored within the inverted
71 dictionary header and the stemmed dictionary header so
72 the correct stemmer is used in later stages of building
73 and querying.
74
75 Revision 1.1 1998/11/17 09:35:22 rjmcnab
76 *** empty log message ***
77
78 * Revision 1.4 1994/11/29 00:32:05 tes
79 * Committing the new merged files and changes.
80 *
81 * Revision 1.3 1994/10/20 03:57:00 tes
82 * I have rewritten the boolean query optimiser and abstracted out the
83 * components of the boolean query.
84 *
85 * Revision 1.2 1994/09/20 04:41:55 tes
86 * For version 1.1
87 *
88 */
89
// number of bits in one unsigned long (the unit the approximate
// weights are packed into before being written)
#define MAXBITS (8 * sizeof (unsigned long))
91
// Information kept for one tag of the tag dictionary.
struct WBTagPtr {
  unsigned long tagNum;     // position of the tag within the tag dictionary
  unsigned long tagPtr;     // pointer read from the inverted index file
  unsigned long fragOccur;  // frag_occur value of the tag's dictionary entry

  // every field starts out as zero
  WBTagPtr () : tagNum (0), tagPtr (0), fragOccur (0) {}
};
103
104// maps tags to tag information
105typedef map<UCArray, WBTagPtr, DictLTUCArray> WBTagDict;
106
107typedef vector<float> Weights;
108
109
110static void ReadTagDict (const invf_dict_header &idh,
111 FILE *dictFile,
112 FILE *invfIdxFile,
113 WBTagDict &tagDict) {
114 tagDict.erase (tagDict.begin(), tagDict.end());
115
116 // seek to the start of the tag information
117 fseek (dictFile, idh.tag_dict_start, SEEK_SET);
118 fseek (invfIdxFile, sizeof(unsigned long) +
119 idh.word_dict_size*sizeof(unsigned long), SEEK_SET);
120
121 unsigned long tagNum;
122 unsigned long tagPtr;
123 dict_el thisEl;
124 for (tagNum = 0; tagNum < idh.tag_dict_size; tagNum++) {
125 thisEl.Read (dictFile);
126 ReadUL (invfIdxFile, tagPtr);
127 tagDict[thisEl.el].tagNum = tagNum;
128 tagDict[thisEl.el].tagPtr = tagPtr;
129 tagDict[thisEl.el].fragOccur = thisEl.frag_occur;
130 }
131}
132
133static void AddWeight (Weights &w,
134 unsigned long levelDocNum,
135 unsigned long termFreq,
136 float idf) {
137 double weight = termFreq * idf;
138 w[levelDocNum-1] += weight * weight;
139}
140
141static void GenerateLevelWeights (const invf_dict_header &idh,
142 const invf_file_header &ifh,
143 unsigned long numLevelDocs,
144 unsigned long levelNum,
145 FILE *dictFile,
146 FILE *invfFile,
147 FILE *invfIdxFile,
148 const FragLevelConvert &fragLevelConvert,
149 Weights &w) {
150 // pre-allocate the right number of weights
151 w.erase (w.begin(), w.end());
152 w.insert (w.end(), (Weights::size_type)numLevelDocs, (float)0.0);
153
154 double logN = log ((double) numLevelDocs);
155
156 // reset the files
157 fseek (dictFile, idh.word_dict_start, SEEK_SET);
158 fseek (invfIdxFile, sizeof (unsigned long), SEEK_SET);
159
160 // process each word adding its contributions to the document weights
161 unsigned long wordNum;
162 unsigned long wordStart;
163 word_dict_el wordEl;
164 wordEl.SetNumLevels (idh.num_levels);
165 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
166#ifndef SILENT
167 // give a little feedback every 4096 words
168 if ((wordNum & 0xfff) == 0) fprintf (stderr, ".");
169#endif
170
171 wordEl.Read (dictFile, idh.num_levels);
172 ReadUL (invfIdxFile, wordStart);
173
174 float idf = logN - log ((double) wordEl.levelFreqs[levelNum]);
175
176 // seek to the appropriate place in the inverted file
177 fseek (invfFile, wordStart, SEEK_SET);
178 stdio_bitio_buffer buffer (invfFile);
179
180 unsigned long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
181 unsigned long fragNum = 0;
182 unsigned long levelDocNum = 0;
183 unsigned long lastLevelDocNum = 0;
184 unsigned long termFreq = 0;
185 unsigned long checkLevelFreq = 0;
186
187 unsigned long count, i;
188 for (i=0; i<wordEl.frag_occur; i++) {
189 fragNum += buffer.bblock_decode (B, NULL);
190 if (!ifh.word_level_index) count = buffer.gamma_decode (NULL);
191 else count = 1;
192
193 if (fragNum > idh.num_frags)
194 FatalError (1, "fragNum = %d, "
195 "number of fragments = %d\n"
196 "wordNum = %d\n"
197 "i = %d, frag_occur = %d\n",
198 fragNum, idh.num_frags, wordNum, i, wordEl.frag_occur);
199
200 if (!fragLevelConvert.FragToLevel (fragNum, levelDocNum))
201 FatalError (1, "could not convert fragment number %d", fragNum);
202
203 if (levelDocNum == 0 || levelDocNum > numLevelDocs)
204 FatalError (1, "bad level document number %d", levelDocNum);
205
206 if (levelDocNum != lastLevelDocNum) {
207 // new level document
208 if (lastLevelDocNum > 0) {
209 AddWeight (w, lastLevelDocNum, termFreq, idf);
210 checkLevelFreq++;
211 }
212 lastLevelDocNum = levelDocNum;
213 termFreq = 0;
214 }
215 termFreq += count;
216 }
217
218 if (lastLevelDocNum > 0) {
219 AddWeight (w, lastLevelDocNum, termFreq, idf);
220 checkLevelFreq++;
221 }
222
223 if (checkLevelFreq != wordEl.levelFreqs[levelNum]) {
224 cerr << "bad level freq " << checkLevelFreq << " != "
225 << wordEl.levelFreqs[levelNum] << ", word \""
226 << wordEl.el << "\" (" << wordNum << ")\n";
227 exit (1);
228 }
229
230 buffer.done();
231 }
232
233 if (w.size() != numLevelDocs)
234 FatalError (1, "wrong number of weights created %d != %d",
235 w.size(), numLevelDocs);
236}
237
238static void WriteExactWeights (FILE *weightsFile,
239 unsigned long &diskPtr,
240 const Weights &w) {
241 diskPtr = ftell(weightsFile);
242
243 Weights::const_iterator here = w.begin();
244 Weights::const_iterator end = w.end();
245 while (here != end) {
246// cout << *here << "\n";
247 WriteF (weightsFile, sqrt (*here));
248 here++;
249 }
250}
251
252static void WriteApproxWeights (FILE *approxWeightsFile,
253 unsigned long &diskPtr,
254 const Weights &w,
255 unsigned char bits) {
256 diskPtr = ftell(approxWeightsFile);
257
258 // calculate L, U and B
259 double L = 1e300;
260 double U = 0;
261 float wgt;
262 Weights::const_iterator here = w.begin();
263 Weights::const_iterator end = w.end();
264 while (here != end) {
265 wgt = sqrt (*here);
266 if (wgt > U) U = wgt;
267 if (wgt > 0 && wgt < L) L = wgt;
268 here++;
269 }
270
271 double B = pow (U / L, pow (2.0, -(double) bits));
272
273#ifndef SILENT
274 fprintf (stderr, "L = %f\n", L);
275 fprintf (stderr, "U = %f\n", U);
276 fprintf (stderr, "B = %f\n", B);
277#endif
278
279 WriteUC (approxWeightsFile, bits);
280 WriteD (approxWeightsFile, L);
281 WriteD (approxWeightsFile, B);
282
283
284 unsigned long max = (bits == 32) ? 0xffffffff : (1 << bits) - 1;;
285 unsigned long i=0, buf=0, pos=0;
286 here = w.begin();
287 end = w.end();
288 while (here != end) {
289 unsigned long fx;
290 wgt = sqrt (*here);
291 if (wgt == 0) {
292 wgt = L;
293#ifndef SILENT
294 Message ("Warning: Document %d had a weight of 0.", i);
295#endif
296 }
297 fx = (unsigned long) floor (log (wgt / L) / log (B));
298
299 if (fx > max) fx = max;
300
301 buf |= (fx << pos);
302 pos += bits;
303
304 if (pos >= MAXBITS) {
305 WriteUL (approxWeightsFile, buf);
306 buf = fx >> (bits - (pos - MAXBITS));
307 pos = pos - MAXBITS;
308 }
309
310 here++; i++;
311 }
312
313 // write out the last bits
314 if (pos > 0) WriteUL (approxWeightsFile, buf);
315}
316
317int main (int argc, char **argv) {
318 unsigned char bits = 8;
319 char *filename = "";
320 int ch;
321 opterr = 0;
322 msg_prefix = argv[0];
323
324 while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
325 switch (ch) {
326 case 'f': // input file
327 filename = optarg;
328 break;
329 case 'd':
330 set_basepath (optarg);
331 break;
332 case 'b':
333 bits = atoi (optarg);
334 if (bits > 32) {
335 fprintf (stderr, "b may only take values 0-32\n");
336 exit (1);
337 }
338 break;
339 case 'h':
340 case '?':
341 fprintf (stderr, "usage: %s [-f input_file]"
342 "[-d data directory] [-b bits] [-h]\n", argv[0]);
343 exit (1);
344 }
345 }
346
347
348 // open the dictionary
349 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
350 MAGIC_STEM_BUILD, MG_ABORT);
351 invf_dict_header idh;
352 idh.Read (dictFile);
353
354 // open the inverted file
355 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
356 MAGIC_INVF, MG_ABORT);
357 invf_file_header ifh;
358 ifh.Read (invfFile);
359 if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
360 FatalError (1, "The invf file contains skips. Unable to create weights.");
361
362 // open the inverted index file
363 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
364 MAGIC_INVI, MG_ABORT);
365
366 // read the level information
367 FILE *levelFile = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
368 MAGIC_INVF_LEVELS, MG_ABORT);
369 FIvfLevel ivfLevel;
370 ivfLevel.Read (levelFile);
371 fclose (levelFile);
372
373 // read in the tag dictionary and inverted file pointers
374 WBTagDict tagDict;
375 ReadTagDict (idh, dictFile, invfIdxFile, tagDict);
376
377
378 // create the weights file
379 FILE *weightsFile = create_file (filename, WEIGHTS_SUFFIX, "wb",
380 MAGIC_WGHT, MG_ABORT);
381
382 // create the approx weights file
383 FILE *approxWeightsFile = create_file (filename, APPROX_WEIGHTS_SUFFIX, "wb",
384 MAGIC_WGHT_APPROX, MG_ABORT);
385
386
387 // create weights for each document level
388 FragLevelConvert fragLevelConvert;
389 Weights w;
390 IvfLevelInfoMap::iterator levelHere = ivfLevel.levelInfo.begin();
391 IvfLevelInfoMap::iterator levelEnd = ivfLevel.levelInfo.end();
392 unsigned long levelNum = 0;
393 while (levelHere != levelEnd) {
394 const UCArray &levelName = (*levelHere).first;
395
396 // read the tag information about this level
397 fragLevelConvert.Read(invfFile, tagDict[levelName].tagPtr,
398 idh.num_frags, tagDict[levelName].fragOccur);
399
400 // create the weights for this level
401 GenerateLevelWeights (idh, ifh, tagDict[levelName].fragOccur,
402 levelNum, dictFile, invfFile, invfIdxFile,
403 fragLevelConvert, w);
404
405 // write out the exact weights
406 WriteExactWeights (weightsFile,
407 (*levelHere).second.exactWeightsDiskPtr,
408 w);
409
410 // write out the approximate weights
411 WriteApproxWeights (approxWeightsFile,
412 (*levelHere).second.approxWeightsDiskPtr,
413 w, bits);
414
415 levelHere++; levelNum++;
416 }
417
418
419 // close input files
420 fclose (dictFile);
421 fclose (invfFile);
422 fclose (invfIdxFile);
423
424 // update the level information
425 levelFile = create_file (filename, INVF_LEVEL_SUFFIX, "wb",
426 MAGIC_INVF_LEVELS, MG_ABORT);
427 ivfLevel.Write (levelFile);
428 fclose (levelFile);
429
430 // close output files
431 fclose (weightsFile);
432 fclose (approxWeightsFile);
433
434
435 return 0;
436}
Note: See TracBrowser for help on using the repository browser.