/************************************************************************** * * mgpp_stem_idx.cpp -- stem index builder * Copyright (C) 1999 Rodger McNab * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * **************************************************************************/ #define _XOPEN_SOURCE 1 // This was added for Solaris, but it makes things worse on Solaris for me... // #define _XOPEN_SOURCE_EXTENDED 1 /* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */ #if defined (__WIN32__) || defined (__CYGWIN__) # include "getopt_old.h" #else # include #endif #include "UCArray.h" #include "sysfuncs.h" #include "messages.h" #include "mg_files.h" #include "invf.h" #include "words.h" #include "stemmer.h" #if defined(GSDL_USE_OBJECTSPACE) # include # include #elif defined(GSDL_USE_STL_H) # include # include #else # include # include #endif typedef vector WordNumList; typedef map StemMapDict; void CreateStemDict (char *filename, StemMapDict &stemDict, int stemMethod, int stemmerNum) { stemDict.erase (stemDict.begin(), stemDict.end()); // open the dictionary FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD, MG_ABORT); invf_dict_header idh; idh.Read (dictFile); fseek (dictFile, idh.word_dict_start, SEEK_SET); unsigned long wordNum; u_char mgWord[MAXSTEMLEN + 1]; word_dict_el wordEl; UCArray stemEl; wordEl.SetNumLevels (idh.num_levels); for (wordNum=0; wordNum \"" << wordEl.el << "\"\n"; // add this word number to the list of word numbers for this word stemDict[stemEl].push_back (wordNum); } fclose (dictFile); } void WriteStemDict (char *filename, StemMapDict &stemDict, int stemMethod, int stemmerNum, unsigned long entriesPerBlock) { /* [JFG - Mar 06: Accent folding patch] */ // Create appropriate stem index file FILE *stemDictFile = NULL; if (stemMethod >= STEM_MIN && stemMethod <= STEM_MAX) { char *suffix = make_suffix (INVF_DICT_BLOCKED_SUFFIX_PAT, stemMethod, NULL); stemDictFile = create_file (filename, suffix, "wb", MAGIC_STEM_GEN(stemMethod + '0'), MG_ABORT); } else { FatalError (1, "Unknown stem method %d", stemMethod); } stem_idx_header sih; sih.lookback = 0; sih.dict_size = stemDict.size(); sih.entries_per_block = entriesPerBlock; sih.max_block_size = 0; sih.stemmer_num = stemmerNum; sih.stem_method = stemMethod; // write out a place-holder version of the header sih.Write (stemDictFile); sih.blocks_start = ftell (stemDictFile); block_idx stemIdx; unsigned long stemNum = 0; stem_block_dict_el stemEl; UCArray lastEl; StemMapDict::const_iterator here = stemDict.begin(); StemMapDict::const_iterator end = stemDict.end(); while (here != end) { // remember this stem (and position) if this is the start // of a new block if (stemNum % entriesPerBlock == 0) { block_idx_info elIdx; elIdx.el = (*here).first; elIdx.block_ptr = ftell (stemDictFile); // see if this block is the longest so far if (stemIdx.size() > 0) { unsigned long blockLen = elIdx.block_ptr - (*(stemIdx.end()-1)).block_ptr; if (blockLen > sih.max_block_size) sih.max_block_size = blockLen; } stemIdx.push_back (elIdx); lastEl.erase (lastEl.begin(), lastEl.end()); // output full word } // copy the information for this stem stemEl.el = (*here).first; stemEl.equivWords = (*here).second; // write out the stem stemEl.Write (stemDictFile, &lastEl); ++here; ++stemNum; } // write out the element indexes sih.num_blocks = stemIdx.size(); sih.block_idx_start = ftell (stemDictFile); WriteBlockIdx (stemDictFile, stemIdx); // write out the stem dictionary header fseek (stemDictFile, sizeof(unsigned long), SEEK_SET); sih.Write (stemDictFile); // close open files fclose (stemDictFile); // print out information #ifndef SILENT Message ("Num word stems = %d\n", sih.dict_size); Message ("Max stem block size = %d\n", sih.max_block_size); Message ("Number of stem blocks written = %d\n", sih.num_blocks); #endif } int main (int argc, char **argv) { unsigned long entriesPerBlock = 16; char *filename = ""; int ch; int stemMethod = 0; // illegal value (no translation) int stemmerNum = 0; // English stemmer msg_prefix = argv[0]; opterr = 0; while ((ch = getopt (argc, argv, "f:d:b:s:h:a:")) != -1) { switch (ch) { case 'f': // input file filename = optarg; break; case 'd': set_basepath (optarg); break; case 'b': entriesPerBlock = atoi (optarg); break; case 's': stemMethod = atoi (optarg); break; case 'a': stemmerNum = mgpp_stemmernumber ((unsigned char *) optarg); break; case 'h': case '?': fprintf (stderr, "usage: %s [-d directory] " "[-b entries-per-block] [-h] -s 1|2|3", argv[0]); #ifdef ENABLE_ACCENTFOLD fprintf (stderr, "|4|5|6|7"); #endif fprintf (stderr, " [-a stemmer-method] -f name\n"); exit (1); } } /* [JFG - Mar 06: Accent folding patch] */ if (stemMethod < STEM_MIN || stemMethod > STEM_MAX) FatalError (1, "Stem method must be between %d and %d", STEM_MIN, STEM_MAX); #ifndef ENABLE_ACCENTFOLD if (stemMethod & STEM_AccentFolding) { // accent folding not enabled return 2; } #endif // read in the dictionary and create the in memory dictionary StemMapDict stemDict; CreateStemDict (filename, stemDict, stemMethod, stemmerNum); // write out the dictionary as a blocked file WriteStemDict (filename, stemDict, stemMethod, stemmerNum, entriesPerBlock); return 0; }