source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mgpp/text/mgpp_weights_build.cpp@ 23508

Last change on this file since 23508 was 23508, checked in by sjm84, 13 years ago

Committing 64 bit changes into the branch

  • Property svn:keywords set to Author Date Id Revision
File size: 10.7 KB
Line 
1/**************************************************************************
2 *
3 * mgpp_weights_build.cpp -- Program to build the document weights file
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23// This was added for Solaris, but it makes things worse on Solaris for me...
24// #define _XOPEN_SOURCE_EXTENDED 1
25
26/* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */
27#if defined (__WIN32__) || defined (__CYGWIN__)
28# include "getopt_old.h"
29#else
30# include <unistd.h>
31#endif
32
33#include "UCArray.h"
34#include "sysfuncs.h"
35#include "memlib.h"
36#include "messages.h"
37#include "local_strings.h"
38#include "bitio_gen.h"
39#include "bitio_m_stdio.h"
40#include "mg_files.h"
41#include "locallib.h"
42#include "invf.h"
43#include "FIvfLevelInfo.h"
44#include "FragLevelConvert.h"
45
46#if defined(GSDL_USE_OBJECTSPACE)
47# include <ospace\std\map>
48#elif defined(GSDL_USE_STL_H)
49# include <map.h>
50#else
51# include <map>
52#endif
53
54#define MAXBITS (sizeof(mg_u_long) * 8)
55
56struct WBTagPtr {
57 mg_u_long tagNum;
58 mg_u_long tagPtr;
59 mg_u_long fragOccur;
60
61 WBTagPtr () {
62 tagNum = 0;
63 tagPtr = 0;
64 fragOccur = 0;
65 }
66};
67
68// maps tags to tag information
69typedef map<UCArray, WBTagPtr, DictLTUCArray> WBTagDict;
70
71typedef vector<float> Weights;
72
73
74static void ReadTagDict (const invf_dict_header &idh,
75 FILE *dictFile,
76 FILE *invfIdxFile,
77 WBTagDict &tagDict) {
78 tagDict.erase (tagDict.begin(), tagDict.end());
79
80 // seek to the start of the tag information
81 fseek (dictFile, idh.tag_dict_start, SEEK_SET);
82 fseek (invfIdxFile, sizeof(mg_u_long) +
83 idh.word_dict_size*sizeof(mg_u_long), SEEK_SET);
84
85 mg_u_long tagNum;
86 mg_u_long tagPtr;
87 dict_el thisEl;
88 for (tagNum = 0; tagNum < idh.tag_dict_size; ++tagNum) {
89 thisEl.Read (dictFile);
90 ReadUL (invfIdxFile, tagPtr);
91 tagDict[thisEl.el].tagNum = tagNum;
92 tagDict[thisEl.el].tagPtr = tagPtr;
93 tagDict[thisEl.el].fragOccur = thisEl.frag_occur;
94 }
95}
96
97static void AddWeight (Weights &w,
98 mg_u_long levelDocNum,
99 mg_u_long termFreq,
100 float idf) {
101 double weight = termFreq * idf;
102 w[levelDocNum-1] += weight * weight;
103}
104
105static void GenerateLevelWeights (const invf_dict_header &idh,
106 const invf_file_header &ifh,
107 mg_u_long numLevelDocs,
108 mg_u_long levelNum,
109 FILE *dictFile,
110 FILE *invfFile,
111 FILE *invfIdxFile,
112 const FragLevelConvert &fragLevelConvert,
113 Weights &w) {
114 // pre-allocate the right number of weights
115 w.erase (w.begin(), w.end());
116 w.insert (w.end(), (Weights::size_type)numLevelDocs, (float)0.0);
117
118 double logN = log ((double) numLevelDocs);
119
120 // reset the files
121 fseek (dictFile, idh.word_dict_start, SEEK_SET);
122 fseek (invfIdxFile, sizeof (mg_u_long), SEEK_SET);
123
124 // process each word adding its contributions to the document weights
125 mg_u_long wordNum;
126 mg_u_long wordStart;
127 word_dict_el wordEl;
128 wordEl.SetNumLevels (idh.num_levels);
129 for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
130#ifndef SILENT
131 // give a little feedback every 4096 words
132 if ((wordNum & 0xfff) == 0) fprintf (stderr, ".");
133#endif
134
135 wordEl.Read (dictFile, idh.num_levels);
136 ReadUL (invfIdxFile, wordStart);
137
138 float idf = logN - log ((double) wordEl.levelFreqs[levelNum]);
139
140 // seek to the appropriate place in the inverted file
141 fseek (invfFile, wordStart, SEEK_SET);
142 stdio_bitio_buffer buffer (invfFile);
143
144 mg_u_long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
145 mg_u_long fragNum = 0;
146 mg_u_long levelDocNum = 0;
147 mg_u_long lastLevelDocNum = 0;
148 mg_u_long termFreq = 0;
149 mg_u_long checkLevelFreq = 0;
150
151 mg_u_long count, i;
152 for (i=0; i<wordEl.frag_occur; ++i) {
153 fragNum += buffer.bblock_decode (B, NULL);
154 if (!ifh.word_level_index) count = buffer.gamma_decode (NULL);
155 else count = 1;
156
157 if (fragNum > idh.num_frags)
158 FatalError (1, "fragNum = %d, "
159 "number of fragments = %d\n"
160 "wordNum = %d\n"
161 "i = %d, frag_occur = %d\n",
162 fragNum, idh.num_frags, wordNum, i, wordEl.frag_occur);
163
164 if (!fragLevelConvert.FragToLevel (fragNum, levelDocNum))
165 FatalError (1, "could not convert fragment number %d in level %d", fragNum, levelNum);
166
167 if (levelDocNum == 0 || levelDocNum > numLevelDocs)
168 FatalError (1, "bad level document number %d in level %d", levelDocNum, levelNum);
169
170 if (levelDocNum != lastLevelDocNum) {
171 // new level document
172 if (lastLevelDocNum > 0) {
173 AddWeight (w, lastLevelDocNum, termFreq, idf);
174 ++checkLevelFreq;
175 }
176 lastLevelDocNum = levelDocNum;
177 termFreq = 0;
178 }
179 termFreq += count;
180 }
181
182 if (lastLevelDocNum > 0) {
183 AddWeight (w, lastLevelDocNum, termFreq, idf);
184 ++checkLevelFreq;
185 }
186
187 if (checkLevelFreq != wordEl.levelFreqs[levelNum]) {
188 cerr << "bad level freq at level " <<levelNum<<" "<< checkLevelFreq << " != "
189 << wordEl.levelFreqs[levelNum] << ", word \""
190 << wordEl.el << "\" (" << wordNum << ")\n";
191 exit (1);
192 }
193
194 buffer.done();
195 }
196
197 if (w.size() != numLevelDocs)
198 FatalError (1, "wrong number of weights created %d != %d",
199 w.size(), numLevelDocs);
200}
201
202static void WriteExactWeights (FILE *weightsFile,
203 mg_u_long &diskPtr,
204 const Weights &w) {
205 diskPtr = ftell(weightsFile);
206
207 Weights::const_iterator here = w.begin();
208 Weights::const_iterator end = w.end();
209 while (here != end) {
210// cout << *here << "\n";
211 WriteF (weightsFile, sqrt (*here));
212 ++here;
213 }
214}
215
216static void WriteApproxWeights (FILE *approxWeightsFile,
217 mg_u_long &diskPtr,
218 const Weights &w,
219 unsigned char bits) {
220 diskPtr = ftell(approxWeightsFile);
221
222 // calculate L, U and B
223 double L = 1e300;
224 double U = 0;
225 float wgt;
226 Weights::const_iterator here = w.begin();
227 Weights::const_iterator end = w.end();
228 while (here != end) {
229 wgt = sqrt (*here);
230 if (wgt > U) U = wgt;
231 if (wgt > 0 && wgt < L) L = wgt;
232 ++here;
233 }
234
235 double B = pow (U / L, pow (2.0, -(double) bits));
236
237#ifndef SILENT
238 fprintf (stderr, "L = %f\n", L);
239 fprintf (stderr, "U = %f\n", U);
240 fprintf (stderr, "B = %f\n", B);
241#endif
242
243 WriteUC (approxWeightsFile, bits);
244 WriteD (approxWeightsFile, L);
245 WriteD (approxWeightsFile, B);
246
247
248 mg_u_long max = (bits == 32) ? 0xffffffff : (1 << bits) - 1;;
249 mg_u_long i=0, buf=0, pos=0;
250 here = w.begin();
251 end = w.end();
252 while (here != end) {
253 mg_u_long fx;
254 wgt = sqrt (*here);
255 if (wgt == 0) {
256 wgt = L;
257#ifndef SILENT
258 Message ("Warning: Document %d had a weight of 0.", i);
259#endif
260 }
261 fx = (mg_u_long) floor (log (wgt / L) / log (B));
262
263 if (fx > max) fx = max;
264
265 buf |= (fx << pos);
266 pos += bits;
267
268 if (pos >= MAXBITS) {
269 WriteUL (approxWeightsFile, buf);
270 buf = fx >> (bits - (pos - MAXBITS));
271 pos = pos - MAXBITS;
272 }
273
274 ++here; ++i;
275 }
276
277 // write out the last bits
278 if (pos > 0) WriteUL (approxWeightsFile, buf);
279}
280
281int main (int argc, char **argv) {
282 unsigned char bits = 8;
283 char *filename = (char*)"";
284 int ch;
285 opterr = 0;
286 msg_prefix = argv[0];
287
288 while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
289 switch (ch) {
290 case 'f': // input file
291 filename = optarg;
292 break;
293 case 'd':
294 set_basepath (optarg);
295 break;
296 case 'b':
297 bits = atoi (optarg);
298 if (bits > 32) {
299 fprintf (stderr, "b may only take values 0-32\n");
300 exit (1);
301 }
302 break;
303 case 'h':
304 case '?':
305 fprintf (stderr, "usage: %s [-f input_file]"
306 "[-d data directory] [-b bits] [-h]\n", argv[0]);
307 exit (1);
308 }
309 }
310
311
312 // open the dictionary
313 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
314 MAGIC_STEM_BUILD, MG_ABORT);
315 invf_dict_header idh;
316 idh.Read (dictFile);
317
318 // open the inverted file
319 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
320 MAGIC_INVF, MG_ABORT);
321 invf_file_header ifh;
322 ifh.Read (invfFile);
323 if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
324 FatalError (1, "The invf file contains skips. Unable to create weights.");
325
326 // open the inverted index file
327 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
328 MAGIC_INVI, MG_ABORT);
329
330 // read the level information
331 FILE *levelFile = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
332 MAGIC_INVF_LEVELS, MG_ABORT);
333 FIvfLevel ivfLevel;
334 ivfLevel.Read (levelFile);
335 fclose (levelFile);
336
337 // read in the tag dictionary and inverted file pointers
338 WBTagDict tagDict;
339 ReadTagDict (idh, dictFile, invfIdxFile, tagDict);
340
341
342 // create the weights file
343 FILE *weightsFile = create_file (filename, WEIGHTS_SUFFIX, "wb",
344 MAGIC_WGHT, MG_ABORT);
345
346 // create the approx weights file
347 FILE *approxWeightsFile = create_file (filename, APPROX_WEIGHTS_SUFFIX, "wb",
348 MAGIC_WGHT_APPROX, MG_ABORT);
349
350
351 // create weights for each document level
352 FragLevelConvert fragLevelConvert;
353 Weights w;
354 IvfLevelInfoMap::iterator levelHere = ivfLevel.levelInfo.begin();
355 IvfLevelInfoMap::iterator levelEnd = ivfLevel.levelInfo.end();
356 mg_u_long levelNum = 0;
357 while (levelHere != levelEnd) {
358 const UCArray &levelName = (*levelHere).first;
359
360 // read the tag information about this level
361 fragLevelConvert.Read(invfFile, tagDict[levelName].tagPtr,
362 idh.num_frags, tagDict[levelName].fragOccur);
363
364 // create the weights for this level
365 GenerateLevelWeights (idh, ifh, tagDict[levelName].fragOccur,
366 levelNum, dictFile, invfFile, invfIdxFile,
367 fragLevelConvert, w);
368
369 // write out the exact weights
370 WriteExactWeights (weightsFile,
371 (*levelHere).second.exactWeightsDiskPtr,
372 w);
373
374 // write out the approximate weights
375 WriteApproxWeights (approxWeightsFile,
376 (*levelHere).second.approxWeightsDiskPtr,
377 w, bits);
378
379 ++levelHere; ++levelNum;
380 }
381
382
383 // close input files
384 fclose (dictFile);
385 fclose (invfFile);
386 fclose (invfIdxFile);
387
388 // update the level information
389 levelFile = create_file (filename, INVF_LEVEL_SUFFIX, "wb",
390 MAGIC_INVF_LEVELS, MG_ABORT);
391 ivfLevel.Write (levelFile);
392 fclose (levelFile);
393
394 // close output files
395 fclose (weightsFile);
396 fclose (approxWeightsFile);
397
398
399 return 0;
400}
Note: See TracBrowser for help on using the repository browser.