source: trunk/gsdl/src/mgpp/text/mg_stem_idx.cpp@ 860

Last change on this file since 860 was 860, checked in by rjmcnab, 24 years ago

Fixed a couple of bugs and made building silent if needed.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.7 KB
RevLine 
[856]1/**************************************************************************
2 *
3 * mg_stem_idx.cpp -- stem index builder
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22
23#include "sysfuncs.h"
24#include "messages.h"
25
26#include "mg_files.h"
27#include "invf.h"
28#include "UCArray.h"
29#include "words.h"
30
31#include "stemmer.h"
32
33
34#if defined(GSDL_USE_OBJECTSPACE)
35# include <ospace\std\map>
36# include <ospace\std\vector>
37#elif defined(GSDL_USE_STL_H)
38# include <map.h>
39# include <vector.h>
40#else
41# include <map>
42# include <vector>
43#endif
44
45
46/*
47 $Log$
[860]48 Revision 1.3 2000/01/18 03:53:24 rjmcnab
49 Fixed a couple of bugs and made building silent if needed.
50
[858]51 Revision 1.2 2000/01/14 02:45:51 sjboddie
52 fixed compiler warning
53
[856]54 Revision 1.1 2000/01/14 02:26:20 sjboddie
55 Rodgers new C++ mg
56
57 */
58
59
// WordNumList holds word numbers: CreateStemDict appends the index of each
// dictionary word (its position in the word dictionary) to one of these.
60typedef vector<unsigned long> WordNumList;
// StemMapDict maps a stemmed word to the list of word numbers whose stem it
// is. Keys are ordered by the DictLTUCArray comparator; WriteStemDict walks
// the map in that order when writing the blocked stem index.
61typedef map<UCArray, WordNumList, DictLTUCArray> StemMapDict;
62
63
64void CreateStemDict (char *filename,
65 StemMapDict &stemDict,
66 int stemMethod,
67 int stemmerNum) {
68 stemDict.erase (stemDict.begin(), stemDict.end());
69
70 // open the dictionary
71 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
72 MAGIC_STEM_BUILD, MG_ABORT);
73 invf_dict_header idh;
74 idh.Read (dictFile);
75
76 fseek (dictFile, idh.word_dict_start, SEEK_SET);
77
78 unsigned long wordNum;
79 u_char mgWord[MAXSTEMLEN + 1];
80 word_dict_el wordEl;
81 UCArray stemEl;
82 wordEl.SetNumLevels (idh.num_levels);
83 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
84 // read in the next word
85 wordEl.Read (dictFile, idh.num_levels);
86
87 // convert the word to an "mg word"
88 mgWord[0] = wordEl.el.size();
89 bcopy ((char *)wordEl.el.begin(), (char *)&mgWord[1], wordEl.el.size());
90
91 // stem the word
92 stemmer (stemMethod, stemmerNum, mgWord);
93
94 // convert the result back to a UCArray
95 stemEl.erase (stemEl.begin(), stemEl.end());
96 stemEl.insert (stemEl.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
97
98// cout << "\"" << stemEl << "\" -> \"" << wordEl.el << "\"\n";
99
100 // add this word number to the list of word numbers for this word
101 stemDict[stemEl].push_back (wordNum);
102 }
103
104 fclose (dictFile);
105}
106
107
108void WriteStemDict (char *filename,
109 StemMapDict &stemDict,
110 int stemMethod,
111 int stemmerNum,
112 unsigned long entriesPerBlock) {
113 // Create appropriate stem index file
[858]114 FILE *stemDictFile = NULL;
[856]115 if (stemMethod == 1) {
116 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX,
117 "wb", MAGIC_STEM_1, MG_ABORT);
118 } else if (stemMethod == 2) {
119 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX,
120 "wb", MAGIC_STEM_2, MG_ABORT);
121 } else if (stemMethod == 3) {
122 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX,
123 "wb", MAGIC_STEM_3, MG_ABORT);
124 } else {
125 FatalError (1, "Unknown stem method %d", stemMethod);
126 }
127
128 stem_idx_header sih;
129 sih.lookback = 0;
130 sih.dict_size = stemDict.size();
131 sih.entries_per_block = entriesPerBlock;
132 sih.max_block_size = 0;
133
134 sih.stemmer_num = stemmerNum;
135 sih.stem_method = stemMethod;
136
137 // write out a place-holder version of the header
138 sih.Write (stemDictFile);
139
140 sih.blocks_start = ftell (stemDictFile);
141
142 block_idx stemIdx;
143 unsigned long stemNum = 0;
144 stem_block_dict_el stemEl;
145 UCArray lastEl;
146
147 StemMapDict::const_iterator here = stemDict.begin();
148 StemMapDict::const_iterator end = stemDict.end();
149 while (here != end) {
150 // remember this stem (and position) if this is the start
151 // of a new block
152 if (stemNum % entriesPerBlock == 0) {
153 block_idx_info elIdx;
154 elIdx.el = (*here).first;
155 elIdx.block_ptr = ftell (stemDictFile);
156
157 // see if this block is the longest so far
158 if (stemIdx.size() > 0) {
159 unsigned long blockLen = elIdx.block_ptr -
160 (*(stemIdx.end()-1)).block_ptr;
161 if (blockLen > sih.max_block_size) sih.max_block_size = blockLen;
162 }
163
164 stemIdx.push_back (elIdx);
165 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
166 }
167
168 // copy the information for this stem
169 stemEl.el = (*here).first;
170 stemEl.equivWords = (*here).second;
171
172 // write out the stem
173 stemEl.Write (stemDictFile, &lastEl);
174
175 here++; stemNum++;
176 }
177
178
179 // write out the element indexes
180 sih.num_blocks = stemIdx.size();
181 sih.block_idx_start = ftell (stemDictFile);
182 WriteBlockIdx (stemDictFile, stemIdx);
183
184 // write out the stem dictionary header
185 fseek (stemDictFile, sizeof(unsigned long), SEEK_SET);
186 sih.Write (stemDictFile);
187
188
189 // close open files
190 fclose (stemDictFile);
191
192 // print out information
[860]193#ifndef SILENT
[856]194 Message ("Num word stems = %d\n", sih.dict_size);
195 Message ("Max stem block size = %d\n", sih.max_block_size);
196 Message ("Number of stem blocks written = %d\n", sih.num_blocks);
[860]197#endif
[856]198}
199
200
201int main (int argc, char **argv) {
202 unsigned long entriesPerBlock = 16;
203 char *filename = "";
204 int ch;
205 int stemMethod = 0; // illegal value (no translation)
206 int stemmerNum = 0; // English stemmer
207 msg_prefix = argv[0];
208 opterr = 0;
209
210 while ((ch = getopt (argc, argv, "f:d:b:s:h")) != -1) {
211 switch (ch) {
212 case 'f': // input file
213 filename = optarg;
214 break;
215 case 'd':
216 set_basepath (optarg);
217 break;
218 case 'b':
219 entriesPerBlock = atoi (optarg);
220 break;
221 case 's':
222 stemMethod = atoi (optarg);
223 break;
224 case 'a':
225 stemmerNum = stemmernumber ((unsigned char *) optarg);
226 break;
227 case 'h':
228 case '?':
229 fprintf (stderr, "usage: %s [-d directory] "
230 "[-b entries-per-block] [-h] -s 1|2|3 "
231 "[-a stemmer-method] -f name\n", argv[0]);
232 exit (1);
233 }
234 }
235
236 if (stemMethod < 1 || stemMethod > 3)
237 FatalError (1, "Stem method must be 1, 2 or 3");
238
239 // read in the dictionary and create the in memory dictionary
240 StemMapDict stemDict;
241 CreateStemDict (filename, stemDict, stemMethod, stemmerNum);
242
243 // write out the dictionary as a blocked file
244 WriteStemDict (filename, stemDict, stemMethod, stemmerNum, entriesPerBlock);
245
246 return 0;
247}
Note: See TracBrowser for help on using the repository browser.