/************************************************************************** * * mgpp_weights_build.cpp -- Program to build the document weights file * Copyright (C) 1999 Rodger McNab * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * **************************************************************************/ #define _XOPEN_SOURCE 1 #define _XOPEN_SOURCE_EXTENDED 1 /* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */ #if defined (__WIN32__) || defined (__CYGWIN__) # include "getopt_old.h" #else # include #endif #include "UCArray.h" #include "sysfuncs.h" #include "memlib.h" #include "messages.h" #include "local_strings.h" #include "bitio_gen.h" #include "bitio_m_stdio.h" #include "mg_files.h" #include "locallib.h" #include "invf.h" #include "FIvfLevelInfo.h" #include "FragLevelConvert.h" #if defined(GSDL_USE_OBJECTSPACE) # include #elif defined(GSDL_USE_STL_H) # include #else # include #endif #define MAXBITS (sizeof(unsigned long) * 8) struct WBTagPtr { unsigned long tagNum; unsigned long tagPtr; unsigned long fragOccur; WBTagPtr () { tagNum = 0; tagPtr = 0; fragOccur = 0; } }; // maps tags to tag information typedef map WBTagDict; typedef vector Weights; static void ReadTagDict (const invf_dict_header &idh, FILE *dictFile, FILE *invfIdxFile, WBTagDict &tagDict) { tagDict.erase (tagDict.begin(), tagDict.end()); // seek to the start of the tag information fseek (dictFile, idh.tag_dict_start, SEEK_SET); fseek (invfIdxFile, sizeof(unsigned long) + idh.word_dict_size*sizeof(unsigned long), SEEK_SET); unsigned long tagNum; unsigned long tagPtr; dict_el thisEl; for (tagNum = 0; tagNum < idh.tag_dict_size; ++tagNum) { thisEl.Read (dictFile); ReadUL (invfIdxFile, tagPtr); tagDict[thisEl.el].tagNum = tagNum; tagDict[thisEl.el].tagPtr = tagPtr; tagDict[thisEl.el].fragOccur = thisEl.frag_occur; } } static void AddWeight (Weights &w, unsigned long levelDocNum, unsigned long termFreq, float idf) { double weight = termFreq * idf; w[levelDocNum-1] += weight * weight; } static void GenerateLevelWeights (const invf_dict_header &idh, const invf_file_header &ifh, unsigned long numLevelDocs, unsigned long levelNum, FILE *dictFile, FILE *invfFile, FILE *invfIdxFile, const FragLevelConvert &fragLevelConvert, Weights &w) { // pre-allocate the right number of weights w.erase (w.begin(), w.end()); w.insert (w.end(), (Weights::size_type)numLevelDocs, (float)0.0); double logN = log ((double) numLevelDocs); // reset the files fseek (dictFile, idh.word_dict_start, SEEK_SET); fseek (invfIdxFile, sizeof (unsigned long), SEEK_SET); // process each word adding its contributions to the document weights unsigned long wordNum; unsigned long wordStart; word_dict_el wordEl; wordEl.SetNumLevels (idh.num_levels); for (wordNum=0; wordNum idh.num_frags) FatalError (1, "fragNum = %d, " "number of fragments = %d\n" "wordNum = %d\n" "i = %d, frag_occur = %d\n", fragNum, idh.num_frags, wordNum, i, wordEl.frag_occur); if (!fragLevelConvert.FragToLevel (fragNum, levelDocNum)) FatalError (1, "could not convert fragment number %d in level %d", fragNum, levelNum); if (levelDocNum == 0 || levelDocNum > numLevelDocs) FatalError (1, "bad level document number %d in level %d", levelDocNum, levelNum); if (levelDocNum != lastLevelDocNum) { // new level document if (lastLevelDocNum > 0) { AddWeight (w, lastLevelDocNum, termFreq, idf); ++checkLevelFreq; } lastLevelDocNum = levelDocNum; termFreq = 0; } termFreq += count; } if (lastLevelDocNum > 0) { AddWeight (w, lastLevelDocNum, termFreq, idf); ++checkLevelFreq; } if (checkLevelFreq != wordEl.levelFreqs[levelNum]) { cerr << "bad level freq at level " < U) U = wgt; if (wgt > 0 && wgt < L) L = wgt; ++here; } double B = pow (U / L, pow (2.0, -(double) bits)); #ifndef SILENT fprintf (stderr, "L = %f\n", L); fprintf (stderr, "U = %f\n", U); fprintf (stderr, "B = %f\n", B); #endif WriteUC (approxWeightsFile, bits); WriteD (approxWeightsFile, L); WriteD (approxWeightsFile, B); unsigned long max = (bits == 32) ? 0xffffffff : (1 << bits) - 1;; unsigned long i=0, buf=0, pos=0; here = w.begin(); end = w.end(); while (here != end) { unsigned long fx; wgt = sqrt (*here); if (wgt == 0) { wgt = L; #ifndef SILENT Message ("Warning: Document %d had a weight of 0.", i); #endif } fx = (unsigned long) floor (log (wgt / L) / log (B)); if (fx > max) fx = max; buf |= (fx << pos); pos += bits; if (pos >= MAXBITS) { WriteUL (approxWeightsFile, buf); buf = fx >> (bits - (pos - MAXBITS)); pos = pos - MAXBITS; } ++here; ++i; } // write out the last bits if (pos > 0) WriteUL (approxWeightsFile, buf); } int main (int argc, char **argv) { unsigned char bits = 8; char *filename = ""; int ch; opterr = 0; msg_prefix = argv[0]; while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) { switch (ch) { case 'f': // input file filename = optarg; break; case 'd': set_basepath (optarg); break; case 'b': bits = atoi (optarg); if (bits > 32) { fprintf (stderr, "b may only take values 0-32\n"); exit (1); } break; case 'h': case '?': fprintf (stderr, "usage: %s [-f input_file]" "[-d data directory] [-b bits] [-h]\n", argv[0]); exit (1); } } // open the dictionary FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD, MG_ABORT); invf_dict_header idh; idh.Read (dictFile); // open the inverted file FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb", MAGIC_INVF, MG_ABORT); invf_file_header ifh; ifh.Read (invfFile); if (ifh.skip_mode != SKIP_MODE_NO_SKIPS) FatalError (1, "The invf file contains skips. Unable to create weights."); // open the inverted index file FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb", MAGIC_INVI, MG_ABORT); // read the level information FILE *levelFile = open_file (filename, INVF_LEVEL_SUFFIX, "rb", MAGIC_INVF_LEVELS, MG_ABORT); FIvfLevel ivfLevel; ivfLevel.Read (levelFile); fclose (levelFile); // read in the tag dictionary and inverted file pointers WBTagDict tagDict; ReadTagDict (idh, dictFile, invfIdxFile, tagDict); // create the weights file FILE *weightsFile = create_file (filename, WEIGHTS_SUFFIX, "wb", MAGIC_WGHT, MG_ABORT); // create the approx weights file FILE *approxWeightsFile = create_file (filename, APPROX_WEIGHTS_SUFFIX, "wb", MAGIC_WGHT_APPROX, MG_ABORT); // create weights for each document level FragLevelConvert fragLevelConvert; Weights w; IvfLevelInfoMap::iterator levelHere = ivfLevel.levelInfo.begin(); IvfLevelInfoMap::iterator levelEnd = ivfLevel.levelInfo.end(); unsigned long levelNum = 0; while (levelHere != levelEnd) { const UCArray &levelName = (*levelHere).first; // read the tag information about this level fragLevelConvert.Read(invfFile, tagDict[levelName].tagPtr, idh.num_frags, tagDict[levelName].fragOccur); // create the weights for this level GenerateLevelWeights (idh, ifh, tagDict[levelName].fragOccur, levelNum, dictFile, invfFile, invfIdxFile, fragLevelConvert, w); // write out the exact weights WriteExactWeights (weightsFile, (*levelHere).second.exactWeightsDiskPtr, w); // write out the approximate weights WriteApproxWeights (approxWeightsFile, (*levelHere).second.approxWeightsDiskPtr, w, bits); ++levelHere; ++levelNum; } // close input files fclose (dictFile); fclose (invfFile); fclose (invfIdxFile); // update the level information levelFile = create_file (filename, INVF_LEVEL_SUFFIX, "wb", MAGIC_INVF_LEVELS, MG_ABORT); ivfLevel.Write (levelFile); fclose (levelFile); // close output files fclose (weightsFile); fclose (approxWeightsFile); return 0; }