/**************************************************************************
 *
 * mg_stem_idx.cpp -- stem index builder
 * Copyright (C) 1999 Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#include "sysfuncs.h"
#include "messages.h"
#include "mg_files.h"
#include "invf.h"
#include "UCArray.h"
#include "words.h"
#include "stemmer.h"

// NOTE(review): DAMAGED TEXT. The header name after each "# include" below
// is missing from this copy of the file (everything between '<' and '>' has
// been stripped). Presumably each branch included that library's vector and
// map headers, since the typedefs further down use both -- restore the exact
// names from the upstream file.
#if defined(GSDL_USE_OBJECTSPACE)
# include
# include
#elif defined(GSDL_USE_STL_H)
# include
# include
#else
# include
# include
#endif

/*
   $Log$
   Revision 1.1 2000/01/14 02:26:20 sjboddie
   Rodgers new C++ mg

 */

// NOTE(review): DAMAGED TEXT. The template arguments of both typedefs are
// missing. From the uses below, WordNumList has unsigned long word numbers
// pushed onto it, and StemMapDict is indexed by a UCArray stem and yields a
// WordNumList. Whether the map also names a comparator cannot be told from
// this copy -- restore from the upstream file.
typedef vector WordNumList;
typedef map StemMapDict;


// CreateStemDict empties stemDict and refills it from the inverted file
// dictionary of the collection called filename, appending each word number
// to the list stored under a stem.
//
//   filename   - collection name handed to open_file together with
//                INVF_DICT_SUFFIX
//   stemDict   - output; any previous contents are erased first
//   stemMethod - not referenced in the surviving text (see note below)
//   stemmerNum - not referenced in the surviving text (see note below)
void CreateStemDict (char *filename, StemMapDict &stemDict,
                     int stemMethod, int stemmerNum) {
  stemDict.erase (stemDict.begin(), stemDict.end());

  // open the dictionary
  FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
                              MAGIC_STEM_BUILD, MG_ABORT);
  invf_dict_header idh;
  idh.Read (dictFile);

  // move to the position the header records as the start of the words
  fseek (dictFile, idh.word_dict_start, SEEK_SET);

  unsigned long wordNum;
  u_char mgWord[MAXSTEMLEN + 1];
  word_dict_el wordEl;
  UCArray stemEl;
  wordEl.SetNumLevels (idh.num_levels);

  // NOTE(review): DAMAGED TEXT. Everything between the '<' of the loop
  // condition and the "->" of what looks like a debug print near the end of
  // the loop body is missing, so the line below is kept exactly as found and
  // does not compile. The missing part presumably held the loop bound and
  // the steps that read wordEl and filled stemEl (mgWord, stemMethod and
  // stemmerNum are otherwise unused here) -- restore from the upstream file.
  for (wordNum=0; wordNum \"" << wordEl.el << "\"\n";

    // add this word number to the list of word numbers for this word
    stemDict[stemEl].push_back (wordNum);
  }

  fclose (dictFile);
}


// WriteStemDict writes stemDict out as a blocked stem index file.
//
// File layout, in the order written: the header (a place-holder first,
// rewritten once the totals are known), the stem entries in map order, and
// then the block index, which holds the first stem and the file offset of
// every block.
//
//   filename        - collection name handed to create_file
//   stemDict        - stems to write, each with its list of word numbers
//   stemMethod      - 1, 2 or 3; selects the file suffix and magic number
//                     and is stored in the header
//   stemmerNum      - stored in the header
//   entriesPerBlock - number of stems per block; must be at least 1
void WriteStemDict (char *filename, StemMapDict &stemDict,
                    int stemMethod, int stemmerNum,
                    unsigned long entriesPerBlock) {
  // the block boundary test below takes stemNum modulo entriesPerBlock,
  // so a zero block size must be rejected before it divides by zero
  if (entriesPerBlock == 0)
    FatalError (1, "Entries per block must be at least 1");

  // Create appropriate stem index file
  FILE *stemDictFile = NULL;
  switch (stemMethod) {
  case 1:
    stemDictFile = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX, "wb",
                                MAGIC_STEM_1, MG_ABORT);
    break;
  case 2:
    stemDictFile = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX, "wb",
                                MAGIC_STEM_2, MG_ABORT);
    break;
  case 3:
    stemDictFile = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX, "wb",
                                MAGIC_STEM_3, MG_ABORT);
    break;
  default:
    // presumably does not return (the code after it needs an open file)
    FatalError (1, "Unknown stem method %d", stemMethod);
  }

  stem_idx_header sih;
  sih.lookback = 0;
  sih.dict_size = stemDict.size();
  sih.entries_per_block = entriesPerBlock;
  sih.max_block_size = 0;
  sih.stemmer_num = stemmerNum;
  sih.stem_method = stemMethod;

  // write out a place-holder version of the header
  sih.Write (stemDictFile);

  sih.blocks_start = ftell (stemDictFile);

  block_idx stemIdx;
  unsigned long stemNum = 0;
  stem_block_dict_el stemEl;
  UCArray lastEl;

  StemMapDict::const_iterator here = stemDict.begin();
  StemMapDict::const_iterator end = stemDict.end();
  while (here != end) {
    // remember this stem (and position) if this is the start
    // of a new block
    if (stemNum % entriesPerBlock == 0) {
      block_idx_info elIdx;
      elIdx.el = (*here).first;
      elIdx.block_ptr = ftell (stemDictFile);

      // the block that has just ended: see if it is the longest so far
      if (stemIdx.size() > 0) {
        unsigned long blockLen = elIdx.block_ptr -
          (*(stemIdx.end()-1)).block_ptr;
        if (blockLen > sih.max_block_size) sih.max_block_size = blockLen;
      }

      stemIdx.push_back (elIdx);
      lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
    }

    // copy the information for this stem
    stemEl.el = (*here).first;
    stemEl.equivWords = (*here).second;

    // write out the stem
    stemEl.Write (stemDictFile, &lastEl);

    ++here;
    ++stemNum;
  }

  // write out the element indexes
  sih.num_blocks = stemIdx.size();
  sih.block_idx_start = ftell (stemDictFile);

  // The loop above only measures a block when the next one starts, so the
  // final block has not been measured yet; it ends where the block index
  // begins. Without this a dictionary that fits in one block would record
  // a maximum block size of 0.
  if (stemIdx.size() > 0) {
    unsigned long lastBlockLen = sih.block_idx_start -
      (*(stemIdx.end()-1)).block_ptr;
    if (lastBlockLen > sih.max_block_size) sih.max_block_size = lastBlockLen;
  }

  WriteBlockIdx (stemDictFile, stemIdx);

  // write out the stem dictionary header
  // (seeks past the first unsigned long of the file -- presumably the magic
  // number written by create_file -- and overwrites the place-holder)
  fseek (stemDictFile, sizeof(unsigned long), SEEK_SET);
  sih.Write (stemDictFile);

  // close open files
  fclose (stemDictFile);

  // print out information
  Message ("Num word stems = %d\n", sih.dict_size);
  Message ("Max stem block size = %d\n", sih.max_block_size);
  Message ("Number of stem blocks written = %d\n", sih.num_blocks);
}


// Command line driver: builds the in-memory stem dictionary for a
// collection and writes it out as a blocked stem index.
//
//   -f name      collection name
//   -d dir       passed to set_basepath
//   -b n         entries per block (default 16, must be at least 1)
//   -s 1|2|3     stem method (required)
//   -a stemmer   stemmer, translated by stemmernumber
//   -h           print usage and exit
int main (int argc, char **argv) {
  unsigned long entriesPerBlock = 16;
  // writable empty string: the functions below take a plain char *, and a
  // string literal cannot be converted to char * in current C++
  char emptyName[] = "";
  char *filename = emptyName;
  int ch;
  int stemMethod = 0; // illegal value (no translation)
  int stemmerNum = 0; // English stemmer

  msg_prefix = argv[0];
  opterr = 0;

  // "a:" must be listed here, otherwise getopt reports -a as an unknown
  // option and the 'a' case below can never run
  while ((ch = getopt (argc, argv, "f:d:b:s:a:h")) != -1) {
    switch (ch) {
    case 'f': // input file
      filename = optarg;
      break;
    case 'd':
      set_basepath (optarg);
      break;
    case 'b': {
      // reject zero, negative and non-numeric values: zero would divide by
      // zero in WriteStemDict and a negative value would wrap to a huge
      // unsigned block size
      int requested = atoi (optarg);
      if (requested < 1)
        FatalError (1, "Entries per block must be at least 1");
      entriesPerBlock = requested;
      break;
    }
    case 's':
      stemMethod = atoi (optarg);
      break;
    case 'a':
      stemmerNum = stemmernumber ((unsigned char *) optarg);
      break;
    case 'h':
    case '?':
      fprintf (stderr, "usage: %s [-d directory] "
               "[-b entries-per-block] [-h] -s 1|2|3 "
               "[-a stemmer-method] -f name\n", argv[0]);
      exit (1);
    }
  }

  if (stemMethod < 1 || stemMethod > 3)
    FatalError (1, "Stem method must be 1, 2 or 3");

  // read in the dictionary and create the in memory dictionary
  StemMapDict stemDict;
  CreateStemDict (filename, stemDict, stemMethod, stemmerNum);

  // write out the dictionary as a blocked file
  WriteStemDict (filename, stemDict, stemMethod, stemmerNum, entriesPerBlock);

  return 0;
}