source: trunk/gsdl/src/mgpp/text/mg_stem_idx.cpp@ 858

Last change on this file since 858 was 858, checked in by sjboddie, 24 years ago

fixed compiler warning

  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
1/**************************************************************************
2 *
3 * mg_stem_idx.cpp -- stem index builder
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22
23#include "sysfuncs.h"
24#include "messages.h"
25
26#include "mg_files.h"
27#include "invf.h"
28#include "UCArray.h"
29#include "words.h"
30
31#include "stemmer.h"
32
33
34#if defined(GSDL_USE_OBJECTSPACE)
35# include <ospace\std\map>
36# include <ospace\std\vector>
37#elif defined(GSDL_USE_STL_H)
38# include <map.h>
39# include <vector.h>
40#else
41# include <map>
42# include <vector>
43#endif
44
45
46/*
47 $Log$
48 Revision 1.2 2000/01/14 02:45:51 sjboddie
49 fixed compiler warning
50
51 Revision 1.1 2000/01/14 02:26:20 sjboddie
52 Rodgers new C++ mg
53
54 */
55
56
57typedef vector<unsigned long> WordNumList; // the word numbers that share one stem
58typedef map<UCArray, WordNumList, DictLTUCArray> StemMapDict; // stem -> word numbers, keys ordered by DictLTUCArray
59
60
61void CreateStemDict (char *filename,
62 StemMapDict &stemDict,
63 int stemMethod,
64 int stemmerNum) {
65 stemDict.erase (stemDict.begin(), stemDict.end());
66
67 // open the dictionary
68 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
69 MAGIC_STEM_BUILD, MG_ABORT);
70 invf_dict_header idh;
71 idh.Read (dictFile);
72
73 fseek (dictFile, idh.word_dict_start, SEEK_SET);
74
75 unsigned long wordNum;
76 u_char mgWord[MAXSTEMLEN + 1];
77 word_dict_el wordEl;
78 UCArray stemEl;
79 wordEl.SetNumLevels (idh.num_levels);
80 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
81 // read in the next word
82 wordEl.Read (dictFile, idh.num_levels);
83
84 // convert the word to an "mg word"
85 mgWord[0] = wordEl.el.size();
86 bcopy ((char *)wordEl.el.begin(), (char *)&mgWord[1], wordEl.el.size());
87
88 // stem the word
89 stemmer (stemMethod, stemmerNum, mgWord);
90
91 // convert the result back to a UCArray
92 stemEl.erase (stemEl.begin(), stemEl.end());
93 stemEl.insert (stemEl.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
94
95// cout << "\"" << stemEl << "\" -> \"" << wordEl.el << "\"\n";
96
97 // add this word number to the list of word numbers for this word
98 stemDict[stemEl].push_back (wordNum);
99 }
100
101 fclose (dictFile);
102}
103
104
105void WriteStemDict (char *filename,
106 StemMapDict &stemDict,
107 int stemMethod,
108 int stemmerNum,
109 unsigned long entriesPerBlock) {
110 // Create appropriate stem index file
111 FILE *stemDictFile = NULL;
112 if (stemMethod == 1) {
113 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX,
114 "wb", MAGIC_STEM_1, MG_ABORT);
115 } else if (stemMethod == 2) {
116 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX,
117 "wb", MAGIC_STEM_2, MG_ABORT);
118 } else if (stemMethod == 3) {
119 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX,
120 "wb", MAGIC_STEM_3, MG_ABORT);
121 } else {
122 FatalError (1, "Unknown stem method %d", stemMethod);
123 }
124
125 stem_idx_header sih;
126 sih.lookback = 0;
127 sih.dict_size = stemDict.size();
128 sih.entries_per_block = entriesPerBlock;
129 sih.max_block_size = 0;
130
131 sih.stemmer_num = stemmerNum;
132 sih.stem_method = stemMethod;
133
134 // write out a place-holder version of the header
135 sih.Write (stemDictFile);
136
137 sih.blocks_start = ftell (stemDictFile);
138
139 block_idx stemIdx;
140 unsigned long stemNum = 0;
141 stem_block_dict_el stemEl;
142 UCArray lastEl;
143
144 StemMapDict::const_iterator here = stemDict.begin();
145 StemMapDict::const_iterator end = stemDict.end();
146 while (here != end) {
147 // remember this stem (and position) if this is the start
148 // of a new block
149 if (stemNum % entriesPerBlock == 0) {
150 block_idx_info elIdx;
151 elIdx.el = (*here).first;
152 elIdx.block_ptr = ftell (stemDictFile);
153
154 // see if this block is the longest so far
155 if (stemIdx.size() > 0) {
156 unsigned long blockLen = elIdx.block_ptr -
157 (*(stemIdx.end()-1)).block_ptr;
158 if (blockLen > sih.max_block_size) sih.max_block_size = blockLen;
159 }
160
161 stemIdx.push_back (elIdx);
162 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
163 }
164
165 // copy the information for this stem
166 stemEl.el = (*here).first;
167 stemEl.equivWords = (*here).second;
168
169 // write out the stem
170 stemEl.Write (stemDictFile, &lastEl);
171
172 here++; stemNum++;
173 }
174
175
176 // write out the element indexes
177 sih.num_blocks = stemIdx.size();
178 sih.block_idx_start = ftell (stemDictFile);
179 WriteBlockIdx (stemDictFile, stemIdx);
180
181 // write out the stem dictionary header
182 fseek (stemDictFile, sizeof(unsigned long), SEEK_SET);
183 sih.Write (stemDictFile);
184
185
186 // close open files
187 fclose (stemDictFile);
188
189 // print out information
190 Message ("Num word stems = %d\n", sih.dict_size);
191 Message ("Max stem block size = %d\n", sih.max_block_size);
192 Message ("Number of stem blocks written = %d\n", sih.num_blocks);
193}
194
195
196int main (int argc, char **argv) {
197 unsigned long entriesPerBlock = 16;
198 char *filename = "";
199 int ch;
200 int stemMethod = 0; // illegal value (no translation)
201 int stemmerNum = 0; // English stemmer
202 msg_prefix = argv[0];
203 opterr = 0;
204
205 while ((ch = getopt (argc, argv, "f:d:b:s:h")) != -1) {
206 switch (ch) {
207 case 'f': // input file
208 filename = optarg;
209 break;
210 case 'd':
211 set_basepath (optarg);
212 break;
213 case 'b':
214 entriesPerBlock = atoi (optarg);
215 break;
216 case 's':
217 stemMethod = atoi (optarg);
218 break;
219 case 'a':
220 stemmerNum = stemmernumber ((unsigned char *) optarg);
221 break;
222 case 'h':
223 case '?':
224 fprintf (stderr, "usage: %s [-d directory] "
225 "[-b entries-per-block] [-h] -s 1|2|3 "
226 "[-a stemmer-method] -f name\n", argv[0]);
227 exit (1);
228 }
229 }
230
231 if (stemMethod < 1 || stemMethod > 3)
232 FatalError (1, "Stem method must be 1, 2 or 3");
233
234 // read in the dictionary and create the in memory dictionary
235 StemMapDict stemDict;
236 CreateStemDict (filename, stemDict, stemMethod, stemmerNum);
237
238 // write out the dictionary as a blocked file
239 WriteStemDict (filename, stemDict, stemMethod, stemmerNum, entriesPerBlock);
240
241 return 0;
242}
Note: See TracBrowser for help on using the repository browser.