source: indexers/trunk/mgpp/text/mgpp_stem_idx.cpp@ 18773

Last change on this file since 18773 was 18773, checked in by kjdon, 15 years ago

The -a option (stemmer method) was missing from the option list passed to getopt in main

  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1/**************************************************************************
2 *
3 * mgpp_stem_idx.cpp -- stem index builder
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24
25/* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */
26#if defined (__WIN32__) || defined (__CYGWIN__)
27# include "getopt_old.h"
28#else
29# include <unistd.h>
30#endif
31
32#include "UCArray.h"
33#include "sysfuncs.h"
34#include "messages.h"
35#include "mg_files.h"
36#include "invf.h"
37#include "words.h"
38#include "stemmer.h"
39
40#if defined(GSDL_USE_OBJECTSPACE)
41# include <ospace\std\map>
42# include <ospace\std\vector>
43#elif defined(GSDL_USE_STL_H)
44# include <map.h>
45# include <vector.h>
46#else
47# include <map>
48# include <vector>
49#endif
50
// List of word numbers that share one stem.  A word number is the index
// of a word in the order it is read from the word dictionary.
51typedef vector<unsigned long> WordNumList;
// In-memory stem dictionary: stem -> numbers of all words that stem to it.
// Entries are ordered by the DictLTUCArray comparator.
52typedef map<UCArray, WordNumList, DictLTUCArray> StemMapDict;
53
54void CreateStemDict (char *filename,
55 StemMapDict &stemDict,
56 int stemMethod,
57 int stemmerNum) {
58 stemDict.erase (stemDict.begin(), stemDict.end());
59
60 // open the dictionary
61 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
62 MAGIC_STEM_BUILD, MG_ABORT);
63 invf_dict_header idh;
64 idh.Read (dictFile);
65
66 fseek (dictFile, idh.word_dict_start, SEEK_SET);
67
68 unsigned long wordNum;
69 u_char mgWord[MAXSTEMLEN + 1];
70 word_dict_el wordEl;
71 UCArray stemEl;
72 wordEl.SetNumLevels (idh.num_levels);
73 for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
74 // read in the next word
75 wordEl.Read (dictFile, idh.num_levels);
76
77 // convert the word to an "mg word"
78 mgWord[0] = wordEl.el.size();
79 memcpy((char *)&mgWord[1], &(wordEl.el[0]), wordEl.el.size());
80
81 // stem the word
82 mgpp_stemmer (stemMethod, stemmerNum, mgWord);
83
84 // convert the result back to a UCArray
85 stemEl.erase (stemEl.begin(), stemEl.end());
86 stemEl.insert (stemEl.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
87
88// cout << "\"" << stemEl << "\" -> \"" << wordEl.el << "\"\n";
89
90 // add this word number to the list of word numbers for this word
91 stemDict[stemEl].push_back (wordNum);
92 }
93
94 fclose (dictFile);
95}
96
97
98void WriteStemDict (char *filename,
99 StemMapDict &stemDict,
100 int stemMethod,
101 int stemmerNum,
102 unsigned long entriesPerBlock) {
103
104 /* [JFG - Mar 06: Accent folding patch] */
105 // Create appropriate stem index file
106 FILE *stemDictFile = NULL;
107 if (stemMethod >= STEM_MIN && stemMethod <= STEM_MAX) {
108 char *suffix = make_suffix (INVF_DICT_BLOCKED_SUFFIX_PAT, stemMethod, NULL);
109 stemDictFile = create_file (filename, suffix,
110 "wb", MAGIC_STEM_GEN(stemMethod + '0'), MG_ABORT);
111 }
112 else {
113 FatalError (1, "Unknown stem method %d", stemMethod);
114 }
115
116 stem_idx_header sih;
117 sih.lookback = 0;
118 sih.dict_size = stemDict.size();
119 sih.entries_per_block = entriesPerBlock;
120 sih.max_block_size = 0;
121
122 sih.stemmer_num = stemmerNum;
123 sih.stem_method = stemMethod;
124
125 // write out a place-holder version of the header
126 sih.Write (stemDictFile);
127
128 sih.blocks_start = ftell (stemDictFile);
129
130 block_idx stemIdx;
131 unsigned long stemNum = 0;
132 stem_block_dict_el stemEl;
133 UCArray lastEl;
134
135 StemMapDict::const_iterator here = stemDict.begin();
136 StemMapDict::const_iterator end = stemDict.end();
137 while (here != end) {
138 // remember this stem (and position) if this is the start
139 // of a new block
140 if (stemNum % entriesPerBlock == 0) {
141 block_idx_info elIdx;
142 elIdx.el = (*here).first;
143 elIdx.block_ptr = ftell (stemDictFile);
144
145 // see if this block is the longest so far
146 if (stemIdx.size() > 0) {
147 unsigned long blockLen = elIdx.block_ptr -
148 (*(stemIdx.end()-1)).block_ptr;
149 if (blockLen > sih.max_block_size) sih.max_block_size = blockLen;
150 }
151
152 stemIdx.push_back (elIdx);
153 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
154 }
155
156 // copy the information for this stem
157 stemEl.el = (*here).first;
158 stemEl.equivWords = (*here).second;
159
160 // write out the stem
161 stemEl.Write (stemDictFile, &lastEl);
162
163 ++here; ++stemNum;
164 }
165
166
167 // write out the element indexes
168 sih.num_blocks = stemIdx.size();
169 sih.block_idx_start = ftell (stemDictFile);
170 WriteBlockIdx (stemDictFile, stemIdx);
171
172 // write out the stem dictionary header
173 fseek (stemDictFile, sizeof(unsigned long), SEEK_SET);
174 sih.Write (stemDictFile);
175
176
177 // close open files
178 fclose (stemDictFile);
179
180 // print out information
181#ifndef SILENT
182 Message ("Num word stems = %d\n", sih.dict_size);
183 Message ("Max stem block size = %d\n", sih.max_block_size);
184 Message ("Number of stem blocks written = %d\n", sih.num_blocks);
185#endif
186}
187
188
189int main (int argc, char **argv) {
190 unsigned long entriesPerBlock = 16;
191 char *filename = "";
192 int ch;
193 int stemMethod = 0; // illegal value (no translation)
194 int stemmerNum = 0; // English stemmer
195 msg_prefix = argv[0];
196 opterr = 0;
197
198 while ((ch = getopt (argc, argv, "f:d:b:s:h:a:")) != -1) {
199 switch (ch) {
200 case 'f': // input file
201 filename = optarg;
202 break;
203 case 'd':
204 set_basepath (optarg);
205 break;
206 case 'b':
207 entriesPerBlock = atoi (optarg);
208 break;
209 case 's':
210 stemMethod = atoi (optarg);
211 break;
212 case 'a':
213 stemmerNum = mgpp_stemmernumber ((unsigned char *) optarg);
214 break;
215 case 'h':
216 case '?':
217 fprintf (stderr, "usage: %s [-d directory] "
218 "[-b entries-per-block] [-h] -s 1|2|3", argv[0]);
219#ifdef ENABLE_ACCENTFOLD
220 fprintf (stderr, "|4|5|6|7");
221#endif
222 fprintf (stderr, " [-a stemmer-method] -f name\n");
223 exit (1);
224 }
225 }
226
227 /* [JFG - Mar 06: Accent folding patch] */
228 if (stemMethod < STEM_MIN || stemMethod > STEM_MAX)
229 FatalError (1, "Stem method must be between %d and %d", STEM_MIN, STEM_MAX);
230#ifndef ENABLE_ACCENTFOLD
231 if (stemMethod & STEM_AccentFolding) {
232 // accent folding not enabled
233 return 2;
234 }
235#endif
236 // read in the dictionary and create the in memory dictionary
237 StemMapDict stemDict;
238 CreateStemDict (filename, stemDict, stemMethod, stemmerNum);
239
240 // write out the dictionary as a blocked file
241 WriteStemDict (filename, stemDict, stemMethod, stemmerNum, entriesPerBlock);
242
243 return 0;
244}
Note: See TracBrowser for help on using the repository browser.