source: trunk/gsdl/src/mgpp/text/mg_weights_build.cpp@ 2468

Last change on this file since 2468 was 2468, checked in by sjboddie, 23 years ago

Fiddled about with mgpp to get it compiling on Windows under VC++ 6.0. I
still can't get it to compile under VC++ 4.2 because of some weird
behaviour in STLport.

Also tidied up a little and removed some of the old log information
that was scattered about in some of the files.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.6 KB
Line 
1/**************************************************************************
2 *
3 * mg_weights_build.cpp -- Program to build the document weights file
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24
25#if defined (__WIN32__)
26# include "getopt.h"
27#else
28# include <unistd.h>
29#endif
30
31#include "UCArray.h"
32#include "sysfuncs.h"
33#include "memlib.h"
34#include "messages.h"
35#include "local_strings.h"
36#include "bitio_gen.h"
37#include "bitio_m_stdio.h"
38#include "mg_files.h"
39#include "locallib.h"
40#include "invf.h"
41#include "FIvfLevelInfo.h"
42#include "FragLevelConvert.h"
43
44#if defined(GSDL_USE_OBJECTSPACE)
45# include <ospace\std\map>
46#elif defined(GSDL_USE_STL_H)
47# include <map.h>
48#else
49# include <map>
50#endif
51
52#define MAXBITS (sizeof(unsigned long) * 8)
53
// Information kept for one tag while the weights are being built:
// the tag's number, its pointer value, and its fragment occurrence count.
struct WBTagPtr {
  unsigned long tagNum;
  unsigned long tagPtr;
  unsigned long fragOccur;

  // a freshly constructed record has every field set to zero
  WBTagPtr () : tagNum (0), tagPtr (0), fragOccur (0) {}
};
65
66// maps tags to tag information
67typedef map<UCArray, WBTagPtr, DictLTUCArray> WBTagDict;
68
69typedef vector<float> Weights;
70
71
72static void ReadTagDict (const invf_dict_header &idh,
73 FILE *dictFile,
74 FILE *invfIdxFile,
75 WBTagDict &tagDict) {
76 tagDict.erase (tagDict.begin(), tagDict.end());
77
78 // seek to the start of the tag information
79 fseek (dictFile, idh.tag_dict_start, SEEK_SET);
80 fseek (invfIdxFile, sizeof(unsigned long) +
81 idh.word_dict_size*sizeof(unsigned long), SEEK_SET);
82
83 unsigned long tagNum;
84 unsigned long tagPtr;
85 dict_el thisEl;
86 for (tagNum = 0; tagNum < idh.tag_dict_size; tagNum++) {
87 thisEl.Read (dictFile);
88 ReadUL (invfIdxFile, tagPtr);
89 tagDict[thisEl.el].tagNum = tagNum;
90 tagDict[thisEl.el].tagPtr = tagPtr;
91 tagDict[thisEl.el].fragOccur = thisEl.frag_occur;
92 }
93}
94
95static void AddWeight (Weights &w,
96 unsigned long levelDocNum,
97 unsigned long termFreq,
98 float idf) {
99 double weight = termFreq * idf;
100 w[levelDocNum-1] += weight * weight;
101}
102
103static void GenerateLevelWeights (const invf_dict_header &idh,
104 const invf_file_header &ifh,
105 unsigned long numLevelDocs,
106 unsigned long levelNum,
107 FILE *dictFile,
108 FILE *invfFile,
109 FILE *invfIdxFile,
110 const FragLevelConvert &fragLevelConvert,
111 Weights &w) {
112 // pre-allocate the right number of weights
113 w.erase (w.begin(), w.end());
114 w.insert (w.end(), (Weights::size_type)numLevelDocs, (float)0.0);
115
116 double logN = log ((double) numLevelDocs);
117
118 // reset the files
119 fseek (dictFile, idh.word_dict_start, SEEK_SET);
120 fseek (invfIdxFile, sizeof (unsigned long), SEEK_SET);
121
122 // process each word adding its contributions to the document weights
123 unsigned long wordNum;
124 unsigned long wordStart;
125 word_dict_el wordEl;
126 wordEl.SetNumLevels (idh.num_levels);
127 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
128#ifndef SILENT
129 // give a little feedback every 4096 words
130 if ((wordNum & 0xfff) == 0) fprintf (stderr, ".");
131#endif
132
133 wordEl.Read (dictFile, idh.num_levels);
134 ReadUL (invfIdxFile, wordStart);
135
136 float idf = logN - log ((double) wordEl.levelFreqs[levelNum]);
137
138 // seek to the appropriate place in the inverted file
139 fseek (invfFile, wordStart, SEEK_SET);
140 stdio_bitio_buffer buffer (invfFile);
141
142 unsigned long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
143 unsigned long fragNum = 0;
144 unsigned long levelDocNum = 0;
145 unsigned long lastLevelDocNum = 0;
146 unsigned long termFreq = 0;
147 unsigned long checkLevelFreq = 0;
148
149 unsigned long count, i;
150 for (i=0; i<wordEl.frag_occur; i++) {
151 fragNum += buffer.bblock_decode (B, NULL);
152 if (!ifh.word_level_index) count = buffer.gamma_decode (NULL);
153 else count = 1;
154
155 if (fragNum > idh.num_frags)
156 FatalError (1, "fragNum = %d, "
157 "number of fragments = %d\n"
158 "wordNum = %d\n"
159 "i = %d, frag_occur = %d\n",
160 fragNum, idh.num_frags, wordNum, i, wordEl.frag_occur);
161
162 if (!fragLevelConvert.FragToLevel (fragNum, levelDocNum))
163 FatalError (1, "could not convert fragment number %d", fragNum);
164
165 if (levelDocNum == 0 || levelDocNum > numLevelDocs)
166 FatalError (1, "bad level document number %d", levelDocNum);
167
168 if (levelDocNum != lastLevelDocNum) {
169 // new level document
170 if (lastLevelDocNum > 0) {
171 AddWeight (w, lastLevelDocNum, termFreq, idf);
172 checkLevelFreq++;
173 }
174 lastLevelDocNum = levelDocNum;
175 termFreq = 0;
176 }
177 termFreq += count;
178 }
179
180 if (lastLevelDocNum > 0) {
181 AddWeight (w, lastLevelDocNum, termFreq, idf);
182 checkLevelFreq++;
183 }
184
185 if (checkLevelFreq != wordEl.levelFreqs[levelNum]) {
186 cerr << "bad level freq " << checkLevelFreq << " != "
187 << wordEl.levelFreqs[levelNum] << ", word \""
188 << wordEl.el << "\" (" << wordNum << ")\n";
189 exit (1);
190 }
191
192 buffer.done();
193 }
194
195 if (w.size() != numLevelDocs)
196 FatalError (1, "wrong number of weights created %d != %d",
197 w.size(), numLevelDocs);
198}
199
200static void WriteExactWeights (FILE *weightsFile,
201 unsigned long &diskPtr,
202 const Weights &w) {
203 diskPtr = ftell(weightsFile);
204
205 Weights::const_iterator here = w.begin();
206 Weights::const_iterator end = w.end();
207 while (here != end) {
208// cout << *here << "\n";
209 WriteF (weightsFile, sqrt (*here));
210 here++;
211 }
212}
213
214static void WriteApproxWeights (FILE *approxWeightsFile,
215 unsigned long &diskPtr,
216 const Weights &w,
217 unsigned char bits) {
218 diskPtr = ftell(approxWeightsFile);
219
220 // calculate L, U and B
221 double L = 1e300;
222 double U = 0;
223 float wgt;
224 Weights::const_iterator here = w.begin();
225 Weights::const_iterator end = w.end();
226 while (here != end) {
227 wgt = sqrt (*here);
228 if (wgt > U) U = wgt;
229 if (wgt > 0 && wgt < L) L = wgt;
230 here++;
231 }
232
233 double B = pow (U / L, pow (2.0, -(double) bits));
234
235#ifndef SILENT
236 fprintf (stderr, "L = %f\n", L);
237 fprintf (stderr, "U = %f\n", U);
238 fprintf (stderr, "B = %f\n", B);
239#endif
240
241 WriteUC (approxWeightsFile, bits);
242 WriteD (approxWeightsFile, L);
243 WriteD (approxWeightsFile, B);
244
245
246 unsigned long max = (bits == 32) ? 0xffffffff : (1 << bits) - 1;;
247 unsigned long i=0, buf=0, pos=0;
248 here = w.begin();
249 end = w.end();
250 while (here != end) {
251 unsigned long fx;
252 wgt = sqrt (*here);
253 if (wgt == 0) {
254 wgt = L;
255#ifndef SILENT
256 Message ("Warning: Document %d had a weight of 0.", i);
257#endif
258 }
259 fx = (unsigned long) floor (log (wgt / L) / log (B));
260
261 if (fx > max) fx = max;
262
263 buf |= (fx << pos);
264 pos += bits;
265
266 if (pos >= MAXBITS) {
267 WriteUL (approxWeightsFile, buf);
268 buf = fx >> (bits - (pos - MAXBITS));
269 pos = pos - MAXBITS;
270 }
271
272 here++; i++;
273 }
274
275 // write out the last bits
276 if (pos > 0) WriteUL (approxWeightsFile, buf);
277}
278
279int main (int argc, char **argv) {
280 unsigned char bits = 8;
281 char *filename = "";
282 int ch;
283 opterr = 0;
284 msg_prefix = argv[0];
285
286 while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
287 switch (ch) {
288 case 'f': // input file
289 filename = optarg;
290 break;
291 case 'd':
292 set_basepath (optarg);
293 break;
294 case 'b':
295 bits = atoi (optarg);
296 if (bits > 32) {
297 fprintf (stderr, "b may only take values 0-32\n");
298 exit (1);
299 }
300 break;
301 case 'h':
302 case '?':
303 fprintf (stderr, "usage: %s [-f input_file]"
304 "[-d data directory] [-b bits] [-h]\n", argv[0]);
305 exit (1);
306 }
307 }
308
309
310 // open the dictionary
311 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
312 MAGIC_STEM_BUILD, MG_ABORT);
313 invf_dict_header idh;
314 idh.Read (dictFile);
315
316 // open the inverted file
317 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
318 MAGIC_INVF, MG_ABORT);
319 invf_file_header ifh;
320 ifh.Read (invfFile);
321 if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
322 FatalError (1, "The invf file contains skips. Unable to create weights.");
323
324 // open the inverted index file
325 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
326 MAGIC_INVI, MG_ABORT);
327
328 // read the level information
329 FILE *levelFile = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
330 MAGIC_INVF_LEVELS, MG_ABORT);
331 FIvfLevel ivfLevel;
332 ivfLevel.Read (levelFile);
333 fclose (levelFile);
334
335 // read in the tag dictionary and inverted file pointers
336 WBTagDict tagDict;
337 ReadTagDict (idh, dictFile, invfIdxFile, tagDict);
338
339
340 // create the weights file
341 FILE *weightsFile = create_file (filename, WEIGHTS_SUFFIX, "wb",
342 MAGIC_WGHT, MG_ABORT);
343
344 // create the approx weights file
345 FILE *approxWeightsFile = create_file (filename, APPROX_WEIGHTS_SUFFIX, "wb",
346 MAGIC_WGHT_APPROX, MG_ABORT);
347
348
349 // create weights for each document level
350 FragLevelConvert fragLevelConvert;
351 Weights w;
352 IvfLevelInfoMap::iterator levelHere = ivfLevel.levelInfo.begin();
353 IvfLevelInfoMap::iterator levelEnd = ivfLevel.levelInfo.end();
354 unsigned long levelNum = 0;
355 while (levelHere != levelEnd) {
356 const UCArray &levelName = (*levelHere).first;
357
358 // read the tag information about this level
359 fragLevelConvert.Read(invfFile, tagDict[levelName].tagPtr,
360 idh.num_frags, tagDict[levelName].fragOccur);
361
362 // create the weights for this level
363 GenerateLevelWeights (idh, ifh, tagDict[levelName].fragOccur,
364 levelNum, dictFile, invfFile, invfIdxFile,
365 fragLevelConvert, w);
366
367 // write out the exact weights
368 WriteExactWeights (weightsFile,
369 (*levelHere).second.exactWeightsDiskPtr,
370 w);
371
372 // write out the approximate weights
373 WriteApproxWeights (approxWeightsFile,
374 (*levelHere).second.approxWeightsDiskPtr,
375 w, bits);
376
377 levelHere++; levelNum++;
378 }
379
380
381 // close input files
382 fclose (dictFile);
383 fclose (invfFile);
384 fclose (invfIdxFile);
385
386 // update the level information
387 levelFile = create_file (filename, INVF_LEVEL_SUFFIX, "wb",
388 MAGIC_INVF_LEVELS, MG_ABORT);
389 ivfLevel.Write (levelFile);
390 fclose (levelFile);
391
392 // close output files
393 fclose (weightsFile);
394 fclose (approxWeightsFile);
395
396
397 return 0;
398}
Note: See TracBrowser for help on using the repository browser.