source: gsdl/trunk/trunk/mgpp/text/mgpp_weights_build.cpp@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change committed in r16582

  • Property svn:keywords set to Author Date Id Revision
File size: 10.8 KB
Line 
1/**************************************************************************
2 *
3 * mgpp_weights_build.cpp -- Program to build the document weights file
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24
25/* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */
26#if defined (__WIN32__) || defined (__CYGWIN__)
27# include "getopt_old.h"
28#else
29# include <unistd.h>
30#endif
31
32#include "UCArray.h"
33#include "sysfuncs.h"
34#include "memlib.h"
35#include "messages.h"
36#include "local_strings.h"
37#include "bitio_gen.h"
38#include "bitio_m_stdio.h"
39#include "mg_files.h"
40#include "locallib.h"
41#include "invf.h"
42#include "FIvfLevelInfo.h"
43#include "FragLevelConvert.h"
44
45#if defined(GSDL_USE_OBJECTSPACE)
46# include <ospace\std\map>
47#elif defined(GSDL_USE_STL_H)
48# include <map.h>
49#else
50# include <map>
51#endif
52
53#define MAXBITS (sizeof(unsigned long) * 8)
54
// Per-tag bookkeeping gathered from the dictionary file and the inverted
// index file.
struct WBTagPtr {
  unsigned long tagNum;     // position of the tag within the tag dictionary
  unsigned long tagPtr;     // value read from the inverted index file for this tag
  unsigned long fragOccur;  // the tag's frag_occur count from the dictionary

  // Zero every field so that an entry default-inserted into a map is well
  // defined.
  WBTagPtr () : tagNum (0), tagPtr (0), fragOccur (0) {}
};
66
67// maps tags to tag information
68typedef map<UCArray, WBTagPtr, DictLTUCArray> WBTagDict;
69
70typedef vector<float> Weights;
71
72
73static void ReadTagDict (const invf_dict_header &idh,
74 FILE *dictFile,
75 FILE *invfIdxFile,
76 WBTagDict &tagDict) {
77 tagDict.erase (tagDict.begin(), tagDict.end());
78
79 // seek to the start of the tag information
80 fseek (dictFile, idh.tag_dict_start, SEEK_SET);
81 fseek (invfIdxFile, sizeof(unsigned long) +
82 idh.word_dict_size*sizeof(unsigned long), SEEK_SET);
83
84 unsigned long tagNum;
85 unsigned long tagPtr;
86 dict_el thisEl;
87 for (tagNum = 0; tagNum < idh.tag_dict_size; ++tagNum) {
88 thisEl.Read (dictFile);
89 ReadUL (invfIdxFile, tagPtr);
90 tagDict[thisEl.el].tagNum = tagNum;
91 tagDict[thisEl.el].tagPtr = tagPtr;
92 tagDict[thisEl.el].fragOccur = thisEl.frag_occur;
93 }
94}
95
96static void AddWeight (Weights &w,
97 unsigned long levelDocNum,
98 unsigned long termFreq,
99 float idf) {
100 double weight = termFreq * idf;
101 w[levelDocNum-1] += weight * weight;
102}
103
104static void GenerateLevelWeights (const invf_dict_header &idh,
105 const invf_file_header &ifh,
106 unsigned long numLevelDocs,
107 unsigned long levelNum,
108 FILE *dictFile,
109 FILE *invfFile,
110 FILE *invfIdxFile,
111 const FragLevelConvert &fragLevelConvert,
112 Weights &w) {
113 // pre-allocate the right number of weights
114 w.erase (w.begin(), w.end());
115 w.insert (w.end(), (Weights::size_type)numLevelDocs, (float)0.0);
116
117 double logN = log ((double) numLevelDocs);
118
119 // reset the files
120 fseek (dictFile, idh.word_dict_start, SEEK_SET);
121 fseek (invfIdxFile, sizeof (unsigned long), SEEK_SET);
122
123 // process each word adding its contributions to the document weights
124 unsigned long wordNum;
125 unsigned long wordStart;
126 word_dict_el wordEl;
127 wordEl.SetNumLevels (idh.num_levels);
128 for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
129#ifndef SILENT
130 // give a little feedback every 4096 words
131 if ((wordNum & 0xfff) == 0) fprintf (stderr, ".");
132#endif
133
134 wordEl.Read (dictFile, idh.num_levels);
135 ReadUL (invfIdxFile, wordStart);
136
137 float idf = logN - log ((double) wordEl.levelFreqs[levelNum]);
138
139 // seek to the appropriate place in the inverted file
140 fseek (invfFile, wordStart, SEEK_SET);
141 stdio_bitio_buffer buffer (invfFile);
142
143 unsigned long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
144 unsigned long fragNum = 0;
145 unsigned long levelDocNum = 0;
146 unsigned long lastLevelDocNum = 0;
147 unsigned long termFreq = 0;
148 unsigned long checkLevelFreq = 0;
149
150 unsigned long count, i;
151 for (i=0; i<wordEl.frag_occur; ++i) {
152 fragNum += buffer.bblock_decode (B, NULL);
153 if (!ifh.word_level_index) count = buffer.gamma_decode (NULL);
154 else count = 1;
155
156 if (fragNum > idh.num_frags)
157 FatalError (1, "fragNum = %d, "
158 "number of fragments = %d\n"
159 "wordNum = %d\n"
160 "i = %d, frag_occur = %d\n",
161 fragNum, idh.num_frags, wordNum, i, wordEl.frag_occur);
162
163 if (!fragLevelConvert.FragToLevel (fragNum, levelDocNum))
164 FatalError (1, "could not convert fragment number %d in level %d", fragNum, levelNum);
165
166 if (levelDocNum == 0 || levelDocNum > numLevelDocs)
167 FatalError (1, "bad level document number %d in level %d", levelDocNum, levelNum);
168
169 if (levelDocNum != lastLevelDocNum) {
170 // new level document
171 if (lastLevelDocNum > 0) {
172 AddWeight (w, lastLevelDocNum, termFreq, idf);
173 ++checkLevelFreq;
174 }
175 lastLevelDocNum = levelDocNum;
176 termFreq = 0;
177 }
178 termFreq += count;
179 }
180
181 if (lastLevelDocNum > 0) {
182 AddWeight (w, lastLevelDocNum, termFreq, idf);
183 ++checkLevelFreq;
184 }
185
186 if (checkLevelFreq != wordEl.levelFreqs[levelNum]) {
187 cerr << "bad level freq at level " <<levelNum<<" "<< checkLevelFreq << " != "
188 << wordEl.levelFreqs[levelNum] << ", word \""
189 << wordEl.el << "\" (" << wordNum << ")\n";
190 exit (1);
191 }
192
193 buffer.done();
194 }
195
196 if (w.size() != numLevelDocs)
197 FatalError (1, "wrong number of weights created %d != %d",
198 w.size(), numLevelDocs);
199}
200
201static void WriteExactWeights (FILE *weightsFile,
202 unsigned long &diskPtr,
203 const Weights &w) {
204 diskPtr = ftell(weightsFile);
205
206 Weights::const_iterator here = w.begin();
207 Weights::const_iterator end = w.end();
208 while (here != end) {
209// cout << *here << "\n";
210 WriteF (weightsFile, sqrt (*here));
211 ++here;
212 }
213}
214
215static void WriteApproxWeights (FILE *approxWeightsFile,
216 unsigned long &diskPtr,
217 const Weights &w,
218 unsigned char bits) {
219 diskPtr = ftell(approxWeightsFile);
220
221 // calculate L, U and B
222 double L = 1e300;
223 double U = 0;
224 float wgt;
225 Weights::const_iterator here = w.begin();
226 Weights::const_iterator end = w.end();
227 while (here != end) {
228 wgt = sqrt (*here);
229 if (wgt > U) U = wgt;
230 if (wgt > 0 && wgt < L) L = wgt;
231 ++here;
232 }
233
234 double B = pow (U / L, pow (2.0, -(double) bits));
235
236#ifndef SILENT
237 fprintf (stderr, "L = %f\n", L);
238 fprintf (stderr, "U = %f\n", U);
239 fprintf (stderr, "B = %f\n", B);
240#endif
241
242 WriteUC (approxWeightsFile, bits);
243 WriteD (approxWeightsFile, L);
244 WriteD (approxWeightsFile, B);
245
246
247 unsigned long max = (bits == 32) ? 0xffffffff : (1 << bits) - 1;;
248 unsigned long i=0, buf=0, pos=0;
249 here = w.begin();
250 end = w.end();
251 while (here != end) {
252 unsigned long fx;
253 wgt = sqrt (*here);
254 if (wgt == 0) {
255 wgt = L;
256#ifndef SILENT
257 Message ("Warning: Document %d had a weight of 0.", i);
258#endif
259 }
260 fx = (unsigned long) floor (log (wgt / L) / log (B));
261
262 if (fx > max) fx = max;
263
264 buf |= (fx << pos);
265 pos += bits;
266
267 if (pos >= MAXBITS) {
268 WriteUL (approxWeightsFile, buf);
269 buf = fx >> (bits - (pos - MAXBITS));
270 pos = pos - MAXBITS;
271 }
272
273 ++here; ++i;
274 }
275
276 // write out the last bits
277 if (pos > 0) WriteUL (approxWeightsFile, buf);
278}
279
280int main (int argc, char **argv) {
281 unsigned char bits = 8;
282 char *filename = "";
283 int ch;
284 opterr = 0;
285 msg_prefix = argv[0];
286
287 while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
288 switch (ch) {
289 case 'f': // input file
290 filename = optarg;
291 break;
292 case 'd':
293 set_basepath (optarg);
294 break;
295 case 'b':
296 bits = atoi (optarg);
297 if (bits > 32) {
298 fprintf (stderr, "b may only take values 0-32\n");
299 exit (1);
300 }
301 break;
302 case 'h':
303 case '?':
304 fprintf (stderr, "usage: %s [-f input_file]"
305 "[-d data directory] [-b bits] [-h]\n", argv[0]);
306 exit (1);
307 }
308 }
309
310
311 // open the dictionary
312 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
313 MAGIC_STEM_BUILD, MG_ABORT);
314 invf_dict_header idh;
315 idh.Read (dictFile);
316
317 // open the inverted file
318 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
319 MAGIC_INVF, MG_ABORT);
320 invf_file_header ifh;
321 ifh.Read (invfFile);
322 if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
323 FatalError (1, "The invf file contains skips. Unable to create weights.");
324
325 // open the inverted index file
326 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
327 MAGIC_INVI, MG_ABORT);
328
329 // read the level information
330 FILE *levelFile = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
331 MAGIC_INVF_LEVELS, MG_ABORT);
332 FIvfLevel ivfLevel;
333 ivfLevel.Read (levelFile);
334 fclose (levelFile);
335
336 // read in the tag dictionary and inverted file pointers
337 WBTagDict tagDict;
338 ReadTagDict (idh, dictFile, invfIdxFile, tagDict);
339
340
341 // create the weights file
342 FILE *weightsFile = create_file (filename, WEIGHTS_SUFFIX, "wb",
343 MAGIC_WGHT, MG_ABORT);
344
345 // create the approx weights file
346 FILE *approxWeightsFile = create_file (filename, APPROX_WEIGHTS_SUFFIX, "wb",
347 MAGIC_WGHT_APPROX, MG_ABORT);
348
349
350 // create weights for each document level
351 FragLevelConvert fragLevelConvert;
352 Weights w;
353 IvfLevelInfoMap::iterator levelHere = ivfLevel.levelInfo.begin();
354 IvfLevelInfoMap::iterator levelEnd = ivfLevel.levelInfo.end();
355 unsigned long levelNum = 0;
356 while (levelHere != levelEnd) {
357 const UCArray &levelName = (*levelHere).first;
358
359 // read the tag information about this level
360 fragLevelConvert.Read(invfFile, tagDict[levelName].tagPtr,
361 idh.num_frags, tagDict[levelName].fragOccur);
362
363 // create the weights for this level
364 GenerateLevelWeights (idh, ifh, tagDict[levelName].fragOccur,
365 levelNum, dictFile, invfFile, invfIdxFile,
366 fragLevelConvert, w);
367
368 // write out the exact weights
369 WriteExactWeights (weightsFile,
370 (*levelHere).second.exactWeightsDiskPtr,
371 w);
372
373 // write out the approximate weights
374 WriteApproxWeights (approxWeightsFile,
375 (*levelHere).second.approxWeightsDiskPtr,
376 w, bits);
377
378 ++levelHere; ++levelNum;
379 }
380
381
382 // close input files
383 fclose (dictFile);
384 fclose (invfFile);
385 fclose (invfIdxFile);
386
387 // update the level information
388 levelFile = create_file (filename, INVF_LEVEL_SUFFIX, "wb",
389 MAGIC_INVF_LEVELS, MG_ABORT);
390 ivfLevel.Write (levelFile);
391 fclose (levelFile);
392
393 // close output files
394 fclose (weightsFile);
395 fclose (approxWeightsFile);
396
397
398 return 0;
399}
Note: See TracBrowser for help on using the repository browser.