source: trunk/gsdl/src/mgpp/text/mg_weights_build.cpp@ 856

Last change on this file since 856 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:keywords set to Author Date Id Revision
File size: 11.5 KB
Line 
1/**************************************************************************
2 *
3 * mg_weights_build.cpp -- Program to build the document weights file
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_weights_build.cpp 856 2000-01-14 02:26:25Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "memlib.h"
26#include "messages.h"
27#include "local_strings.h"
28#include "bitio_gen.h"
29#include "bitio_m_stdio.h"
30
31#include "mg_files.h"
32#include "locallib.h"
33#include "invf.h"
34// #include "WordData.h"
35#include "UCArray.h"
36#include "FIvfLevelInfo.h"
37#include "FragLevelConvert.h"
38
39#if defined(GSDL_USE_OBJECTSPACE)
40# include <ospace\std\map>
41#elif defined(GSDL_USE_STL_H)
42# include <map.h>
43#else
44# include <map>
45#endif
46
47
48/*
49 $Log$
50 Revision 1.1 2000/01/14 02:26:21 sjboddie
51 Rodgers new C++ mg
52
53 Revision 1.1 1999/10/11 02:58:06 cs025
54 Base install of MG-PP
55
56 Revision 1.1 1999/08/10 21:18:16 sjboddie
57 renamed mg-1.3d directory mg
58
59 Revision 1.2 1998/11/25 07:55:49 rjmcnab
60
61 Modified mg so that you can specify the stemmer you want
62 to use via a command line option. You specify it to
63 mg_passes during the build process. The number of the
64 stemmer that you used is stored within the inverted
65 dictionary header and the stemmed dictionary header so
66 the correct stemmer is used in later stages of building
67 and querying.
68
69 Revision 1.1 1998/11/17 09:35:22 rjmcnab
70 *** empty log message ***
71
72 * Revision 1.4 1994/11/29 00:32:05 tes
73 * Committing the new merged files and changes.
74 *
75 * Revision 1.3 1994/10/20 03:57:00 tes
76 * I have rewritten the boolean query optimiser and abstracted out the
77 * components of the boolean query.
78 *
79 * Revision 1.2 1994/09/20 04:41:55 tes
80 * For version 1.1
81 *
82 */
83
84#define MAXBITS (sizeof(unsigned long) * 8)
85
// Record kept for each tag: its ordinal in the tag dictionary (tagNum),
// the value read for it from the inverted index file (tagPtr) and its
// fragment occurrence count (fragOccur).
struct WBTagPtr {
  unsigned long tagNum;
  unsigned long tagPtr;
  unsigned long fragOccur;

  // a freshly constructed record has every field set to zero
  WBTagPtr ()
    : tagNum (0), tagPtr (0), fragOccur (0) {
  }
};
97
98// maps tags to tag information
99typedef map<UCArray, WBTagPtr, DictLTUCArray> WBTagDict;
100
101typedef vector<float> Weights;
102
103
104static void ReadTagDict (const invf_dict_header &idh,
105 FILE *dictFile,
106 FILE *invfIdxFile,
107 WBTagDict &tagDict) {
108 tagDict.erase (tagDict.begin(), tagDict.end());
109
110 // seek to the start of the tag information
111 fseek (dictFile, idh.tag_dict_start, SEEK_SET);
112 fseek (invfIdxFile, sizeof(unsigned long) +
113 idh.word_dict_size*sizeof(unsigned long), SEEK_SET);
114
115 unsigned long tagNum;
116 unsigned long tagPtr;
117 dict_el thisEl;
118 for (tagNum = 0; tagNum < idh.tag_dict_size; tagNum++) {
119 thisEl.Read (dictFile);
120 ReadUL (invfIdxFile, tagPtr);
121 tagDict[thisEl.el].tagNum = tagNum;
122 tagDict[thisEl.el].tagPtr = tagPtr;
123 tagDict[thisEl.el].fragOccur = thisEl.frag_occur;
124 }
125}
126
127static void AddWeight (Weights &w,
128 unsigned long levelDocNum,
129 unsigned long termFreq,
130 float idf) {
131 double weight = termFreq * idf;
132 w[levelDocNum-1] += weight * weight;
133}
134
135static void GenerateLevelWeights (const invf_dict_header &idh,
136 const invf_file_header &ifh,
137 unsigned long numLevelDocs,
138 unsigned long levelNum,
139 FILE *dictFile,
140 FILE *invfFile,
141 FILE *invfIdxFile,
142 const FragLevelConvert &fragLevelConvert,
143 Weights &w) {
144 // pre-allocate the right number of weights
145 w.erase (w.begin(), w.end());
146 w.insert (w.end(), (Weights::size_type)numLevelDocs, (float)0.0);
147
148 double logN = log ((double) numLevelDocs);
149
150 // reset the files
151 fseek (dictFile, idh.word_dict_start, SEEK_SET);
152 fseek (invfIdxFile, sizeof (unsigned long), SEEK_SET);
153
154 // process each word adding its contributions to the document weights
155 unsigned long wordNum;
156 unsigned long wordStart;
157 word_dict_el wordEl;
158 wordEl.SetNumLevels (idh.num_levels);
159 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
160 // give a little feedback every 4096 words
161 if ((wordNum & 0xfff) == 0) fprintf (stderr, ".");
162
163 wordEl.Read (dictFile, idh.num_levels);
164 ReadUL (invfIdxFile, wordStart);
165
166 float idf = logN - log ((double) wordEl.levelFreqs[levelNum]);
167
168 // seek to the appropriate place in the inverted file
169 fseek (invfFile, wordStart, SEEK_SET);
170 stdio_bitio_buffer buffer (invfFile);
171
172 unsigned long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
173 unsigned long fragNum = 0;
174 unsigned long levelDocNum = 0;
175 unsigned long lastLevelDocNum = 0;
176 unsigned long termFreq = 0;
177 unsigned long checkLevelFreq = 0;
178
179 unsigned long count, i;
180 for (i=0; i<wordEl.frag_occur; i++) {
181 fragNum += buffer.bblock_decode (B, NULL);
182 if (!ifh.word_level_index) count = buffer.gamma_decode (NULL);
183 else count = 1;
184
185 if (fragNum > idh.num_frags)
186 FatalError (1, "fragNum = %d, "
187 "number of fragments = %d\n"
188 "wordNum = %d\n"
189 "i = %d, frag_occur = %d\n",
190 fragNum, idh.num_frags, wordNum, i, wordEl.frag_occur);
191
192 if (!fragLevelConvert.FragToLevel (fragNum, levelDocNum))
193 FatalError (1, "could not convert fragment number %d", fragNum);
194
195 if (levelDocNum == 0 || levelDocNum > numLevelDocs)
196 FatalError (1, "bad level document number %d", levelDocNum);
197
198 if (levelDocNum != lastLevelDocNum) {
199 // new level document
200 if (lastLevelDocNum > 0) {
201 AddWeight (w, lastLevelDocNum, termFreq, idf);
202 checkLevelFreq++;
203 }
204 lastLevelDocNum = levelDocNum;
205 termFreq = 0;
206 }
207 termFreq += count;
208 }
209
210 if (lastLevelDocNum > 0) {
211 AddWeight (w, lastLevelDocNum, termFreq, idf);
212 checkLevelFreq++;
213 }
214
215 if (checkLevelFreq != wordEl.levelFreqs[levelNum]) {
216 cerr << "bad level freq " << checkLevelFreq << " != "
217 << wordEl.levelFreqs[levelNum] << ", word \""
218 << wordEl.el << "\" (" << wordNum << ")\n";
219 exit (1);
220 }
221
222 buffer.done();
223 }
224
225 if (w.size() != numLevelDocs)
226 FatalError (1, "wrong number of weights created %d != %d",
227 w.size(), numLevelDocs);
228}
229
230static void WriteExactWeights (FILE *weightsFile,
231 unsigned long &diskPtr,
232 const Weights &w) {
233 diskPtr = ftell(weightsFile);
234
235 Weights::const_iterator here = w.begin();
236 Weights::const_iterator end = w.end();
237 while (here != end) {
238// cout << *here << "\n";
239 WriteF (weightsFile, *here);
240 here++;
241 }
242}
243
244static void WriteApproxWeights (FILE *approxWeightsFile,
245 unsigned long &diskPtr,
246 const Weights &w,
247 unsigned char bits) {
248 diskPtr = ftell(approxWeightsFile);
249
250 // calculate L, U and B
251 double L = 1e300;
252 double U = 0;
253 float wgt;
254 Weights::const_iterator here = w.begin();
255 Weights::const_iterator end = w.end();
256 while (here != end) {
257 wgt = sqrt (*here);
258 if (wgt > U) U = wgt;
259 if (wgt > 0 && wgt < L) L = wgt;
260 here++;
261 }
262
263 double B = pow (U / L, pow (2.0, -(double) bits));
264
265 fprintf (stderr, "L = %f\n", L);
266 fprintf (stderr, "U = %f\n", U);
267 fprintf (stderr, "B = %f\n", B);
268
269 WriteUC (approxWeightsFile, bits);
270 WriteD (approxWeightsFile, L);
271 WriteD (approxWeightsFile, B);
272
273
274 unsigned long max = (bits == 32) ? 0xffffffff : (1 << bits) - 1;;
275 unsigned long i=0, buf=0, pos=0;
276 here = w.begin();
277 end = w.end();
278 while (here != end) {
279 unsigned long fx;
280 wgt = sqrt (*here);
281 if (wgt == 0) {
282 wgt = L;
283#ifndef QUIET
284 Message ("Warning: Document %d had a weight of 0.", i);
285#endif
286 }
287 fx = (unsigned long) floor (log (wgt / L) / log (B));
288
289 if (fx > max) fx = max;
290
291 buf |= (fx << pos);
292 pos += bits;
293
294 if (pos >= MAXBITS) {
295 WriteUL (approxWeightsFile, buf);
296 buf = fx >> (bits - (pos - MAXBITS));
297 pos = pos - MAXBITS;
298 }
299
300 here++; i++;
301 }
302
303 // write out the last bits
304 if (pos > 0) WriteUL (approxWeightsFile, buf);
305}
306
307int main (int argc, char **argv) {
308 unsigned char bits = 8;
309 char *filename = "";
310 int ch;
311 opterr = 0;
312 msg_prefix = argv[0];
313
314 while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
315 switch (ch) {
316 case 'f': // input file
317 filename = optarg;
318 break;
319 case 'd':
320 set_basepath (optarg);
321 break;
322 case 'b':
323 bits = atoi (optarg);
324 if (bits > 32) {
325 fprintf (stderr, "b may only take values 0-32\n");
326 exit (1);
327 }
328 break;
329 case 'h':
330 case '?':
331 fprintf (stderr, "usage: %s [-f input_file]"
332 "[-d data directory] [-b bits] [-h]\n", argv[0]);
333 exit (1);
334 }
335 }
336
337
338 // open the dictionary
339 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
340 MAGIC_STEM_BUILD, MG_ABORT);
341 invf_dict_header idh;
342 idh.Read (dictFile);
343
344 // open the inverted file
345 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
346 MAGIC_INVF, MG_ABORT);
347 invf_file_header ifh;
348 ifh.Read (invfFile);
349 if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
350 FatalError (1, "The invf file contains skips. Unable to create weights.");
351
352 // open the inverted index file
353 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
354 MAGIC_INVI, MG_ABORT);
355
356 // read the level information
357 FILE *levelFile = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
358 MAGIC_INVF_LEVELS, MG_ABORT);
359 FIvfLevel ivfLevel;
360 ivfLevel.Read (levelFile);
361 fclose (levelFile);
362
363 // read in the tag dictionary and inverted file pointers
364 WBTagDict tagDict;
365 ReadTagDict (idh, dictFile, invfIdxFile, tagDict);
366
367
368 // create the weights file
369 FILE *weightsFile = create_file (filename, WEIGHTS_SUFFIX, "wb",
370 MAGIC_WGHT, MG_ABORT);
371
372 // create the approx weights file
373 FILE *approxWeightsFile = create_file (filename, APPROX_WEIGHTS_SUFFIX, "wb",
374 MAGIC_WGHT_APPROX, MG_ABORT);
375
376
377 // create weights for each document level
378 FragLevelConvert fragLevelConvert;
379 Weights w;
380 IvfLevelInfoMap::iterator levelHere = ivfLevel.levelInfo.begin();
381 IvfLevelInfoMap::iterator levelEnd = ivfLevel.levelInfo.end();
382 unsigned long levelNum = 0;
383 while (levelHere != levelEnd) {
384 const UCArray &levelName = (*levelHere).first;
385
386 // read the tag information about this level
387 fragLevelConvert.Read(invfFile, tagDict[levelName].tagPtr,
388 idh.num_frags, tagDict[levelName].fragOccur);
389
390 // create the weights for this level
391 GenerateLevelWeights (idh, ifh, tagDict[levelName].fragOccur,
392 levelNum, dictFile, invfFile, invfIdxFile,
393 fragLevelConvert, w);
394
395 // write out the exact weights
396 WriteExactWeights (weightsFile,
397 (*levelHere).second.exactWeightsDiskPtr,
398 w);
399
400 // write out the approximate weights
401 WriteApproxWeights (approxWeightsFile,
402 (*levelHere).second.approxWeightsDiskPtr,
403 w, bits);
404
405 levelHere++; levelNum++;
406 }
407
408
409 // close input files
410 fclose (dictFile);
411 fclose (invfFile);
412 fclose (invfIdxFile);
413
414 // update the level information
415 levelFile = create_file (filename, INVF_LEVEL_SUFFIX, "wb",
416 MAGIC_INVF_LEVELS, MG_ABORT);
417 ivfLevel.Write (levelFile);
418 fclose (levelFile);
419
420 // close output files
421 fclose (weightsFile);
422 fclose (approxWeightsFile);
423
424
425 return 0;
426}
Note: See TracBrowser for help on using the repository browser.