source: trunk/indexers/mgpp/text/mgpp_stem_idx.cpp@ 9613

Last change on this file since 9613 was 9613, checked in by kjdon, 19 years ago

added in x++ -> ++x changes submitted by Emanuel Dejanu

  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
1/**************************************************************************
2 *
3 * mgpp_stem_idx.cpp -- stem index builder
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24
25/* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */
26#if defined (__WIN32__) || defined (__CYGWIN__)
27# include "getopt_old.h"
28#else
29# include <unistd.h>
30#endif
31
32#include "UCArray.h"
33#include "sysfuncs.h"
34#include "messages.h"
35#include "mg_files.h"
36#include "invf.h"
37#include "words.h"
38#include "stemmer.h"
39
40#if defined(GSDL_USE_OBJECTSPACE)
41# include <ospace\std\map>
42# include <ospace\std\vector>
43#elif defined(GSDL_USE_STL_H)
44# include <map.h>
45# include <vector.h>
46#else
47# include <map>
48# include <vector>
49#endif
50
// List of word numbers; CreateStemDict appends the running index of each
// word it reads from the word dictionary.
51typedef vector<unsigned long> WordNumList;
// Map from a stem to the word numbers that stem to it, with keys ordered by
// DictLTUCArray (declared elsewhere -- presumably dictionary order; confirm).
52typedef map<UCArray, WordNumList, DictLTUCArray> StemMapDict;
53
54void CreateStemDict (char *filename,
55 StemMapDict &stemDict,
56 int stemMethod,
57 int stemmerNum) {
58 stemDict.erase (stemDict.begin(), stemDict.end());
59
60 // open the dictionary
61 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
62 MAGIC_STEM_BUILD, MG_ABORT);
63 invf_dict_header idh;
64 idh.Read (dictFile);
65
66 fseek (dictFile, idh.word_dict_start, SEEK_SET);
67
68 unsigned long wordNum;
69 u_char mgWord[MAXSTEMLEN + 1];
70 word_dict_el wordEl;
71 UCArray stemEl;
72 wordEl.SetNumLevels (idh.num_levels);
73 for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
74 // read in the next word
75 wordEl.Read (dictFile, idh.num_levels);
76
77 // convert the word to an "mg word"
78 mgWord[0] = wordEl.el.size();
79 memcpy((char *)&mgWord[1], &(wordEl.el[0]), wordEl.el.size());
80
81 // stem the word
82 stemmer (stemMethod, stemmerNum, mgWord);
83
84 // convert the result back to a UCArray
85 stemEl.erase (stemEl.begin(), stemEl.end());
86 stemEl.insert (stemEl.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
87
88// cout << "\"" << stemEl << "\" -> \"" << wordEl.el << "\"\n";
89
90 // add this word number to the list of word numbers for this word
91 stemDict[stemEl].push_back (wordNum);
92 }
93
94 fclose (dictFile);
95}
96
97
98void WriteStemDict (char *filename,
99 StemMapDict &stemDict,
100 int stemMethod,
101 int stemmerNum,
102 unsigned long entriesPerBlock) {
103 // Create appropriate stem index file
104 FILE *stemDictFile = NULL;
105 if (stemMethod == 1) {
106 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX,
107 "wb", MAGIC_STEM_1, MG_ABORT);
108 } else if (stemMethod == 2) {
109 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX,
110 "wb", MAGIC_STEM_2, MG_ABORT);
111 } else if (stemMethod == 3) {
112 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX,
113 "wb", MAGIC_STEM_3, MG_ABORT);
114 } else {
115 FatalError (1, "Unknown stem method %d", stemMethod);
116 }
117
118 stem_idx_header sih;
119 sih.lookback = 0;
120 sih.dict_size = stemDict.size();
121 sih.entries_per_block = entriesPerBlock;
122 sih.max_block_size = 0;
123
124 sih.stemmer_num = stemmerNum;
125 sih.stem_method = stemMethod;
126
127 // write out a place-holder version of the header
128 sih.Write (stemDictFile);
129
130 sih.blocks_start = ftell (stemDictFile);
131
132 block_idx stemIdx;
133 unsigned long stemNum = 0;
134 stem_block_dict_el stemEl;
135 UCArray lastEl;
136
137 StemMapDict::const_iterator here = stemDict.begin();
138 StemMapDict::const_iterator end = stemDict.end();
139 while (here != end) {
140 // remember this stem (and position) if this is the start
141 // of a new block
142 if (stemNum % entriesPerBlock == 0) {
143 block_idx_info elIdx;
144 elIdx.el = (*here).first;
145 elIdx.block_ptr = ftell (stemDictFile);
146
147 // see if this block is the longest so far
148 if (stemIdx.size() > 0) {
149 unsigned long blockLen = elIdx.block_ptr -
150 (*(stemIdx.end()-1)).block_ptr;
151 if (blockLen > sih.max_block_size) sih.max_block_size = blockLen;
152 }
153
154 stemIdx.push_back (elIdx);
155 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
156 }
157
158 // copy the information for this stem
159 stemEl.el = (*here).first;
160 stemEl.equivWords = (*here).second;
161
162 // write out the stem
163 stemEl.Write (stemDictFile, &lastEl);
164
165 ++here; ++stemNum;
166 }
167
168
169 // write out the element indexes
170 sih.num_blocks = stemIdx.size();
171 sih.block_idx_start = ftell (stemDictFile);
172 WriteBlockIdx (stemDictFile, stemIdx);
173
174 // write out the stem dictionary header
175 fseek (stemDictFile, sizeof(unsigned long), SEEK_SET);
176 sih.Write (stemDictFile);
177
178
179 // close open files
180 fclose (stemDictFile);
181
182 // print out information
183#ifndef SILENT
184 Message ("Num word stems = %d\n", sih.dict_size);
185 Message ("Max stem block size = %d\n", sih.max_block_size);
186 Message ("Number of stem blocks written = %d\n", sih.num_blocks);
187#endif
188}
189
190
191int main (int argc, char **argv) {
192 unsigned long entriesPerBlock = 16;
193 char *filename = "";
194 int ch;
195 int stemMethod = 0; // illegal value (no translation)
196 int stemmerNum = 0; // English stemmer
197 msg_prefix = argv[0];
198 opterr = 0;
199
200 while ((ch = getopt (argc, argv, "f:d:b:s:h")) != -1) {
201 switch (ch) {
202 case 'f': // input file
203 filename = optarg;
204 break;
205 case 'd':
206 set_basepath (optarg);
207 break;
208 case 'b':
209 entriesPerBlock = atoi (optarg);
210 break;
211 case 's':
212 stemMethod = atoi (optarg);
213 break;
214 case 'a':
215 stemmerNum = stemmernumber ((unsigned char *) optarg);
216 break;
217 case 'h':
218 case '?':
219 fprintf (stderr, "usage: %s [-d directory] "
220 "[-b entries-per-block] [-h] -s 1|2|3 "
221 "[-a stemmer-method] -f name\n", argv[0]);
222 exit (1);
223 }
224 }
225
226 if (stemMethod < 1 || stemMethod > 3)
227 FatalError (1, "Stem method must be 1, 2 or 3");
228
229 // read in the dictionary and create the in memory dictionary
230 StemMapDict stemDict;
231 CreateStemDict (filename, stemDict, stemMethod, stemmerNum);
232
233 // write out the dictionary as a blocked file
234 WriteStemDict (filename, stemDict, stemMethod, stemmerNum, entriesPerBlock);
235
236 return 0;
237}
Note: See TracBrowser for help on using the repository browser.