/************************************************************************** * * mgpp_invf_dict.cpp -- Program to build the blocked stemmed dictionary * Copyright (C) 1999 Rodger McNab * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * **************************************************************************/ #define _XOPEN_SOURCE 1 #define _XOPEN_SOURCE_EXTENDED 1 // need this to avoid bizarre compiler problems under VC++ 6.0 #if defined (__WIN32__) && !defined (GSDL_USE_IOS_H) # include #endif /* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */ #if defined (__WIN32__) || defined (__CYGWIN__) # include "getopt_old.h" #else # include #endif #include "sysfuncs.h" #include "messages.h" #include "mg_files.h" #include "invf.h" static void process_files (char *filename, unsigned long entriesPerBlock) { // open the dictionary FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD, MG_ABORT); invf_dict_header idh; idh.Read (dictFile); // open the inverted index file FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb", MAGIC_INVI, MG_ABORT); // create the blocked dictionary FILE *blockDictFile = create_file (filename, INVF_DICT_BLOCKED_SUFFIX, "wb", MAGIC_STEM, MG_ABORT); block_dict_header bdh; bdh.lookback = idh.lookback; bdh.word_dict_start = idh.word_dict_start; bdh.word_dict_size = idh.word_dict_size; bdh.tag_dict_start = idh.tag_dict_start; bdh.tag_dict_size = idh.tag_dict_size; bdh.num_docs = idh.num_docs; bdh.num_frags = idh.num_frags; bdh.num_words = idh.num_words; bdh.total_bytes = idh.total_bytes; bdh.index_string_bytes = idh.index_string_bytes; bdh.num_levels = idh.num_levels; bdh.Write (blockDictFile); // write out the word part of the dictionary bdh.entries_per_wblk = entriesPerBlock; bdh.max_wblk_size = 0; bdh.wblk_start = ftell (blockDictFile); fseek (dictFile, idh.word_dict_start, SEEK_SET); block_idx wordIdx; word_block_dict_el wordBlockEl; wordBlockEl.SetNumLevels (idh.num_levels); unsigned long wordNum; unsigned long wordInvfPtr; UCArray lastEl; word_dict_el wordEl; wordEl.SetNumLevels (idh.num_levels); for (wordNum=0; wordNum 0) { unsigned long blockLen = elIdx.block_ptr - (*(wordIdx.end()-1)).block_ptr; if (blockLen > bdh.max_wblk_size) bdh.max_wblk_size = blockLen; } wordIdx.push_back (elIdx); lastEl.erase (lastEl.begin(), lastEl.end()); // output full word } // copy the information for this word wordBlockEl.el = wordEl.el; wordBlockEl.frag_occur = wordEl.frag_occur; wordBlockEl.freq = wordEl.freq; wordBlockEl.invf_ptr = wordInvfPtr; unsigned long tempI; for (tempI=0; tempI 0) { unsigned long blockLen = elIdx.block_ptr - (*(tagIdx.end()-1)).block_ptr; if (blockLen > bdh.max_tblk_size) bdh.max_tblk_size = blockLen; } tagIdx.push_back (elIdx); lastEl.erase (lastEl.begin(), lastEl.end()); // output full word } // copy the information for this tag tagBlockEl.el = tagEl.el; tagBlockEl.frag_occur = tagEl.frag_occur; tagBlockEl.freq = tagEl.freq; tagBlockEl.invf_ptr = tagInvfPtr; // write out the tag tagBlockEl.Write (blockDictFile, &lastEl); lastEl = tagBlockEl.el; } // write out the element indexes bdh.num_wblks = wordIdx.size(); bdh.wblk_idx_start = ftell (blockDictFile); WriteBlockIdx (blockDictFile, wordIdx); bdh.num_tblks = tagIdx.size(); bdh.tblk_idx_start = ftell (blockDictFile); WriteBlockIdx (blockDictFile, tagIdx); // write out the blocked dictionary header fseek (blockDictFile, sizeof(unsigned long), SEEK_SET); bdh.Write (blockDictFile); // close open files fclose (blockDictFile); fclose (invfIdxFile); fclose (dictFile); // print out information #ifndef SILENT Message ("Max word block size = %d\n", bdh.max_wblk_size); Message ("Max tag block size = %d\n", bdh.max_tblk_size); Message ("Number of word blocks written = %d\n", bdh.num_wblks); Message ("Number of tag blocks written = %d\n", bdh.num_tblks); #endif } int main (int argc, char **argv) { unsigned long entriesPerBlock = 16; char *filename = ""; int ch; msg_prefix = argv[0]; opterr = 0; while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) { switch (ch) { case 'f': // input file filename = optarg; break; case 'd': set_basepath (optarg); break; case 'b': entriesPerBlock = atoi (optarg); break; case 'h': case '?': fprintf (stderr, "usage: %s [-f input_file] " "[-d data directory] [-b entries-per-block] " "[-h]\n", argv[0]); exit (1); } } process_files (filename, entriesPerBlock); return 0; }