source: trunk/gsdl/src/mgpp/text/mg_stem_idx.cpp@ 2468

Last change on this file since 2468 was 2468, checked in by sjboddie, 23 years ago

Fiddled about with mgpp to get it compiling on Windows under VC++ 6.0. I
still can't get it to compile under VC++ 4.2 because of some weird
behaviour in STLport.

Also tidied up a little and removed some of the old log information
that was scattered about in some of the files.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
1/**************************************************************************
2 *
3 * mg_stem_idx.cpp -- stem index builder
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24
25#if defined (__WIN32__)
26# include "getopt.h"
27#else
28# include <unistd.h>
29#endif
30
31#include "UCArray.h"
32#include "sysfuncs.h"
33#include "messages.h"
34#include "mg_files.h"
35#include "invf.h"
36#include "words.h"
37#include "stemmer.h"
38
39#if defined(GSDL_USE_OBJECTSPACE)
40# include <ospace\std\map>
41# include <ospace\std\vector>
42#elif defined(GSDL_USE_STL_H)
43# include <map.h>
44# include <vector.h>
45#else
46# include <map>
47# include <vector>
48#endif
49
// WordNumList: the word numbers (positions in the word dictionary, as
// counted by CreateStemDict) of all words that share one stemmed form.
50typedef vector<unsigned long> WordNumList;
// StemMapDict: maps a stemmed word to the list of word numbers that
// stem to it; keys are kept in the order defined by DictLTUCArray.
51typedef map<UCArray, WordNumList, DictLTUCArray> StemMapDict;
52
53void CreateStemDict (char *filename,
54 StemMapDict &stemDict,
55 int stemMethod,
56 int stemmerNum) {
57 stemDict.erase (stemDict.begin(), stemDict.end());
58
59 // open the dictionary
60 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
61 MAGIC_STEM_BUILD, MG_ABORT);
62 invf_dict_header idh;
63 idh.Read (dictFile);
64
65 fseek (dictFile, idh.word_dict_start, SEEK_SET);
66
67 unsigned long wordNum;
68 u_char mgWord[MAXSTEMLEN + 1];
69 word_dict_el wordEl;
70 UCArray stemEl;
71 wordEl.SetNumLevels (idh.num_levels);
72 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
73 // read in the next word
74 wordEl.Read (dictFile, idh.num_levels);
75
76 // convert the word to an "mg word"
77 mgWord[0] = wordEl.el.size();
78 memcpy((char *)&mgWord[1], (const char *)wordEl.el.begin(), wordEl.el.size());
79
80 // stem the word
81 stemmer (stemMethod, stemmerNum, mgWord);
82
83 // convert the result back to a UCArray
84 stemEl.erase (stemEl.begin(), stemEl.end());
85 stemEl.insert (stemEl.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
86
87// cout << "\"" << stemEl << "\" -> \"" << wordEl.el << "\"\n";
88
89 // add this word number to the list of word numbers for this word
90 stemDict[stemEl].push_back (wordNum);
91 }
92
93 fclose (dictFile);
94}
95
96
97void WriteStemDict (char *filename,
98 StemMapDict &stemDict,
99 int stemMethod,
100 int stemmerNum,
101 unsigned long entriesPerBlock) {
102 // Create appropriate stem index file
103 FILE *stemDictFile = NULL;
104 if (stemMethod == 1) {
105 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX,
106 "wb", MAGIC_STEM_1, MG_ABORT);
107 } else if (stemMethod == 2) {
108 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX,
109 "wb", MAGIC_STEM_2, MG_ABORT);
110 } else if (stemMethod == 3) {
111 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX,
112 "wb", MAGIC_STEM_3, MG_ABORT);
113 } else {
114 FatalError (1, "Unknown stem method %d", stemMethod);
115 }
116
117 stem_idx_header sih;
118 sih.lookback = 0;
119 sih.dict_size = stemDict.size();
120 sih.entries_per_block = entriesPerBlock;
121 sih.max_block_size = 0;
122
123 sih.stemmer_num = stemmerNum;
124 sih.stem_method = stemMethod;
125
126 // write out a place-holder version of the header
127 sih.Write (stemDictFile);
128
129 sih.blocks_start = ftell (stemDictFile);
130
131 block_idx stemIdx;
132 unsigned long stemNum = 0;
133 stem_block_dict_el stemEl;
134 UCArray lastEl;
135
136 StemMapDict::const_iterator here = stemDict.begin();
137 StemMapDict::const_iterator end = stemDict.end();
138 while (here != end) {
139 // remember this stem (and position) if this is the start
140 // of a new block
141 if (stemNum % entriesPerBlock == 0) {
142 block_idx_info elIdx;
143 elIdx.el = (*here).first;
144 elIdx.block_ptr = ftell (stemDictFile);
145
146 // see if this block is the longest so far
147 if (stemIdx.size() > 0) {
148 unsigned long blockLen = elIdx.block_ptr -
149 (*(stemIdx.end()-1)).block_ptr;
150 if (blockLen > sih.max_block_size) sih.max_block_size = blockLen;
151 }
152
153 stemIdx.push_back (elIdx);
154 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
155 }
156
157 // copy the information for this stem
158 stemEl.el = (*here).first;
159 stemEl.equivWords = (*here).second;
160
161 // write out the stem
162 stemEl.Write (stemDictFile, &lastEl);
163
164 here++; stemNum++;
165 }
166
167
168 // write out the element indexes
169 sih.num_blocks = stemIdx.size();
170 sih.block_idx_start = ftell (stemDictFile);
171 WriteBlockIdx (stemDictFile, stemIdx);
172
173 // write out the stem dictionary header
174 fseek (stemDictFile, sizeof(unsigned long), SEEK_SET);
175 sih.Write (stemDictFile);
176
177
178 // close open files
179 fclose (stemDictFile);
180
181 // print out information
182#ifndef SILENT
183 Message ("Num word stems = %d\n", sih.dict_size);
184 Message ("Max stem block size = %d\n", sih.max_block_size);
185 Message ("Number of stem blocks written = %d\n", sih.num_blocks);
186#endif
187}
188
189
190int main (int argc, char **argv) {
191 unsigned long entriesPerBlock = 16;
192 char *filename = "";
193 int ch;
194 int stemMethod = 0; // illegal value (no translation)
195 int stemmerNum = 0; // English stemmer
196 msg_prefix = argv[0];
197 opterr = 0;
198
199 while ((ch = getopt (argc, argv, "f:d:b:s:h")) != -1) {
200 switch (ch) {
201 case 'f': // input file
202 filename = optarg;
203 break;
204 case 'd':
205 set_basepath (optarg);
206 break;
207 case 'b':
208 entriesPerBlock = atoi (optarg);
209 break;
210 case 's':
211 stemMethod = atoi (optarg);
212 break;
213 case 'a':
214 stemmerNum = stemmernumber ((unsigned char *) optarg);
215 break;
216 case 'h':
217 case '?':
218 fprintf (stderr, "usage: %s [-d directory] "
219 "[-b entries-per-block] [-h] -s 1|2|3 "
220 "[-a stemmer-method] -f name\n", argv[0]);
221 exit (1);
222 }
223 }
224
225 if (stemMethod < 1 || stemMethod > 3)
226 FatalError (1, "Stem method must be 1, 2 or 3");
227
228 // read in the dictionary and create the in memory dictionary
229 StemMapDict stemDict;
230 CreateStemDict (filename, stemDict, stemMethod, stemmerNum);
231
232 // write out the dictionary as a blocked file
233 WriteStemDict (filename, stemDict, stemMethod, stemmerNum, entriesPerBlock);
234
235 return 0;
236}
Note: See TracBrowser for help on using the repository browser.