source: trunk/gsdl/src/mgpp/text/mg_stem_idx.cpp@ 2442

Last change on this file since 2442 was 2442, checked in by jrm21, 23 years ago

portability changes, use getopt from unistd.h (all POSIX systems)

  • Property svn:keywords set to Author Date Id Revision
File size: 6.9 KB
Line 
1/**************************************************************************
2 *
3 * mg_stem_idx.cpp -- stem index builder
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24#include <unistd.h>
25
26#include "sysfuncs.h"
27#include "messages.h"
28
29#include "mg_files.h"
30#include "invf.h"
31#include "UCArray.h"
32#include "words.h"
33
34#include "stemmer.h"
35
36
37#if defined(GSDL_USE_OBJECTSPACE)
38# include <ospace\std\map>
39# include <ospace\std\vector>
40#elif defined(GSDL_USE_STL_H)
41# include <map.h>
42# include <vector.h>
43#else
44# include <map>
45# include <vector>
46#endif
47
48
49/*
50 $Log$
51 Revision 1.5 2001/05/17 04:38:16 jrm21
52 portability changes, use getopt from unistd.h (all POSIX systems)
53
54 Revision 1.4 2001/05/07 05:01:47 jrm21
55 replaced bcopy with memcpy
56
57 Revision 1.3 2000/01/18 03:53:24 rjmcnab
58 Fixed a couple of bugs and made building silent if needed.
59
60 Revision 1.2 2000/01/14 02:45:51 sjboddie
61 fixed compiler warning
62
63 Revision 1.1 2000/01/14 02:26:20 sjboddie
64 Rodgers new C++ mg
65
66 */
67
68
69typedef vector<unsigned long> WordNumList;
70typedef map<UCArray, WordNumList, DictLTUCArray> StemMapDict;
71
72
73void CreateStemDict (char *filename,
74 StemMapDict &stemDict,
75 int stemMethod,
76 int stemmerNum) {
77 stemDict.erase (stemDict.begin(), stemDict.end());
78
79 // open the dictionary
80 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
81 MAGIC_STEM_BUILD, MG_ABORT);
82 invf_dict_header idh;
83 idh.Read (dictFile);
84
85 fseek (dictFile, idh.word_dict_start, SEEK_SET);
86
87 unsigned long wordNum;
88 u_char mgWord[MAXSTEMLEN + 1];
89 word_dict_el wordEl;
90 UCArray stemEl;
91 wordEl.SetNumLevels (idh.num_levels);
92 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
93 // read in the next word
94 wordEl.Read (dictFile, idh.num_levels);
95
96 // convert the word to an "mg word"
97 mgWord[0] = wordEl.el.size();
98 memcpy((char *)&mgWord[1], (const char *)wordEl.el.begin(), wordEl.el.size());
99
100 // stem the word
101 stemmer (stemMethod, stemmerNum, mgWord);
102
103 // convert the result back to a UCArray
104 stemEl.erase (stemEl.begin(), stemEl.end());
105 stemEl.insert (stemEl.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
106
107// cout << "\"" << stemEl << "\" -> \"" << wordEl.el << "\"\n";
108
109 // add this word number to the list of word numbers for this word
110 stemDict[stemEl].push_back (wordNum);
111 }
112
113 fclose (dictFile);
114}
115
116
117void WriteStemDict (char *filename,
118 StemMapDict &stemDict,
119 int stemMethod,
120 int stemmerNum,
121 unsigned long entriesPerBlock) {
122 // Create appropriate stem index file
123 FILE *stemDictFile = NULL;
124 if (stemMethod == 1) {
125 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX,
126 "wb", MAGIC_STEM_1, MG_ABORT);
127 } else if (stemMethod == 2) {
128 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX,
129 "wb", MAGIC_STEM_2, MG_ABORT);
130 } else if (stemMethod == 3) {
131 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX,
132 "wb", MAGIC_STEM_3, MG_ABORT);
133 } else {
134 FatalError (1, "Unknown stem method %d", stemMethod);
135 }
136
137 stem_idx_header sih;
138 sih.lookback = 0;
139 sih.dict_size = stemDict.size();
140 sih.entries_per_block = entriesPerBlock;
141 sih.max_block_size = 0;
142
143 sih.stemmer_num = stemmerNum;
144 sih.stem_method = stemMethod;
145
146 // write out a place-holder version of the header
147 sih.Write (stemDictFile);
148
149 sih.blocks_start = ftell (stemDictFile);
150
151 block_idx stemIdx;
152 unsigned long stemNum = 0;
153 stem_block_dict_el stemEl;
154 UCArray lastEl;
155
156 StemMapDict::const_iterator here = stemDict.begin();
157 StemMapDict::const_iterator end = stemDict.end();
158 while (here != end) {
159 // remember this stem (and position) if this is the start
160 // of a new block
161 if (stemNum % entriesPerBlock == 0) {
162 block_idx_info elIdx;
163 elIdx.el = (*here).first;
164 elIdx.block_ptr = ftell (stemDictFile);
165
166 // see if this block is the longest so far
167 if (stemIdx.size() > 0) {
168 unsigned long blockLen = elIdx.block_ptr -
169 (*(stemIdx.end()-1)).block_ptr;
170 if (blockLen > sih.max_block_size) sih.max_block_size = blockLen;
171 }
172
173 stemIdx.push_back (elIdx);
174 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
175 }
176
177 // copy the information for this stem
178 stemEl.el = (*here).first;
179 stemEl.equivWords = (*here).second;
180
181 // write out the stem
182 stemEl.Write (stemDictFile, &lastEl);
183
184 here++; stemNum++;
185 }
186
187
188 // write out the element indexes
189 sih.num_blocks = stemIdx.size();
190 sih.block_idx_start = ftell (stemDictFile);
191 WriteBlockIdx (stemDictFile, stemIdx);
192
193 // write out the stem dictionary header
194 fseek (stemDictFile, sizeof(unsigned long), SEEK_SET);
195 sih.Write (stemDictFile);
196
197
198 // close open files
199 fclose (stemDictFile);
200
201 // print out information
202#ifndef SILENT
203 Message ("Num word stems = %d\n", sih.dict_size);
204 Message ("Max stem block size = %d\n", sih.max_block_size);
205 Message ("Number of stem blocks written = %d\n", sih.num_blocks);
206#endif
207}
208
209
210int main (int argc, char **argv) {
211 unsigned long entriesPerBlock = 16;
212 char *filename = "";
213 int ch;
214 int stemMethod = 0; // illegal value (no translation)
215 int stemmerNum = 0; // English stemmer
216 msg_prefix = argv[0];
217 opterr = 0;
218
219 while ((ch = getopt (argc, argv, "f:d:b:s:h")) != -1) {
220 switch (ch) {
221 case 'f': // input file
222 filename = optarg;
223 break;
224 case 'd':
225 set_basepath (optarg);
226 break;
227 case 'b':
228 entriesPerBlock = atoi (optarg);
229 break;
230 case 's':
231 stemMethod = atoi (optarg);
232 break;
233 case 'a':
234 stemmerNum = stemmernumber ((unsigned char *) optarg);
235 break;
236 case 'h':
237 case '?':
238 fprintf (stderr, "usage: %s [-d directory] "
239 "[-b entries-per-block] [-h] -s 1|2|3 "
240 "[-a stemmer-method] -f name\n", argv[0]);
241 exit (1);
242 }
243 }
244
245 if (stemMethod < 1 || stemMethod > 3)
246 FatalError (1, "Stem method must be 1, 2 or 3");
247
248 // read in the dictionary and create the in memory dictionary
249 StemMapDict stemDict;
250 CreateStemDict (filename, stemDict, stemMethod, stemmerNum);
251
252 // write out the dictionary as a blocked file
253 WriteStemDict (filename, stemDict, stemMethod, stemmerNum, entriesPerBlock);
254
255 return 0;
256}
Note: See TracBrowser for help on using the repository browser.