/**************************************************************************
 *
 * mg_invf_dict.c -- Program to build the blocked stemmed dictionary
 * Copyright (C) 1994  Neil Sharman
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: mg_invf_dict.c 3745 2003-02-20 21:20:24Z mdewsnip $
 *
 **************************************************************************/
23 |
|
---|
#include "sysfuncs.h"

#include <stdlib.h>

#include "memlib.h"
#include "messages.h"
#include "netorder.h"		/* [RPAP - Jan 97: Endian Ordering] */

#include "mg_files.h"
#include "invf.h"
#include "locallib.h"
#include "words.h"
#include "mg.h"
35 |
|
---|
/*
   $Log$
   Revision 1.1  2003/02/20 21:18:24  mdewsnip
   Addition of MG package for search and retrieval

   Revision 1.1  1999/08/10 21:18:08  sjboddie
   renamed mg-1.3d directory mg

   Revision 1.2  1998/11/25 07:55:45  rjmcnab

   Modified mg to that you can specify the stemmer you want
   to use via a command line option. You specify it to
   mg_passes during the build process. The number of the
   stemmer that you used is stored within the inverted
   dictionary header and the stemmed dictionary header so
   the correct stemmer is used in later stages of building
   and querying.

   Revision 1.1  1998/11/17 09:35:03  rjmcnab
   *** empty log message ***

   * Revision 1.4  1994/11/29  00:32:00  tes
   * Committing the new merged files and changes.
   *
   * Revision 1.3  1994/10/20  03:56:56  tes
   * I have rewritten the boolean query optimiser and abstracted out the
   * components of the boolean query.
   *
   * Revision 1.2  1994/09/20  04:41:49  tes
   * For version 1.1
   *
 */
68 |
|
---|
/* RCS identification string for this source file. */
static char *RCSID = "$Id: mg_invf_dict.c 3745 2003-02-20 21:20:24Z mdewsnip $";

/* Size in bytes of one dictionary block; overridden by the -b option. */
int block_size = 1024 * 4;

/*
 * Set to 1 by the -F option.  When non-zero, every block (and the start
 * of the block area) is zero-padded up to a multiple of block_size.
 */
int force = 0;

static void process_files (char *filename);
76 |
|
---|
77 | int main (int argc, char **argv)
|
---|
78 | {
|
---|
79 | char *file_name = "";
|
---|
80 | int ch;
|
---|
81 | msg_prefix = argv[0];
|
---|
82 | opterr = 0;
|
---|
83 | msg_prefix = argv[0];
|
---|
84 | while ((ch = getopt (argc, argv, "f:d:b:Fh")) != -1)
|
---|
85 | switch (ch)
|
---|
86 | {
|
---|
87 | case 'f': /* input file */
|
---|
88 | file_name = optarg;
|
---|
89 | break;
|
---|
90 | case 'd':
|
---|
91 | set_basepath (optarg);
|
---|
92 | break;
|
---|
93 | case 'b':
|
---|
94 | block_size = atoi (optarg);
|
---|
95 | break;
|
---|
96 | case 'F':
|
---|
97 | force = 1;
|
---|
98 | break;
|
---|
99 | case 'h':
|
---|
100 | case '?':
|
---|
101 | fprintf (stderr, "usage: %s [-f input_file]"
|
---|
102 | "[-d data directory] [-b num] [-F] [-h]\n", argv[0]);
|
---|
103 | exit (1);
|
---|
104 | }
|
---|
105 |
|
---|
106 | process_files (file_name);
|
---|
107 | return 0;
|
---|
108 | }
|
---|
109 |
|
---|
110 |
|
---|
111 |
|
---|
112 |
|
---|
113 | static void
|
---|
114 | process_files (char *filename)
|
---|
115 | {
|
---|
116 | FILE *id, *idb, *tmp, *ii;
|
---|
117 | unsigned long i, pos, num, First_word, invf_ptr, invf_len;
|
---|
118 | unsigned long last_ptr = 0;
|
---|
119 | char *FName;
|
---|
120 | struct invf_dict_header idh;
|
---|
121 | struct stem_dict_header sdh;
|
---|
122 | u_char prev[MAXSTEMLEN + 1];
|
---|
123 | u_char *buffer;
|
---|
124 | unsigned short *pointers;
|
---|
125 | int buf_in_use;
|
---|
126 | unsigned short ptrs_in_use, word_num;
|
---|
127 |
|
---|
128 | id = open_file (filename, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
129 |
|
---|
130 | ii = open_file (filename, INVF_IDX_SUFFIX, "rb", MAGIC_INVI, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
131 |
|
---|
132 | idb = create_file (filename, INVF_DICT_BLOCKED_SUFFIX, "w+b", MAGIC_STEM,
|
---|
133 | MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
134 |
|
---|
135 | FName = make_name (filename, ".tmp", NULL);
|
---|
136 | if (!(tmp = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
137 | FatalError (1, "Unable to open \"%s\".\n", FName);
|
---|
138 |
|
---|
139 | /* Delete the file now */
|
---|
140 | unlink (FName);
|
---|
141 |
|
---|
142 | fread (&idh, sizeof (idh), 1, id);
|
---|
143 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
144 | NTOHUL(idh.lookback);
|
---|
145 | NTOHUL(idh.dict_size);
|
---|
146 | NTOHUL(idh.total_bytes);
|
---|
147 | NTOHUL(idh.index_string_bytes);
|
---|
148 | NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
|
---|
149 | NTOHUL(idh.num_of_docs);
|
---|
150 | NTOHUL(idh.static_num_of_docs);
|
---|
151 | NTOHUL(idh.num_of_words);
|
---|
152 | NTOHUL(idh.stemmer_num);
|
---|
153 | NTOHUL(idh.stem_method);
|
---|
154 |
|
---|
155 | sdh.lookback = idh.lookback;
|
---|
156 | sdh.block_size = block_size;
|
---|
157 | sdh.num_blocks = 0;
|
---|
158 | sdh.index_chars = 0;
|
---|
159 | sdh.blocks_start = 0;
|
---|
160 | sdh.num_of_docs = idh.num_of_docs;
|
---|
161 | sdh.static_num_of_docs = idh.static_num_of_docs;
|
---|
162 | sdh.num_of_words = idh.num_of_words;
|
---|
163 | sdh.stemmer_num = idh.stemmer_num;
|
---|
164 | sdh.stem_method = idh.stem_method;
|
---|
165 | sdh.indexed = 0; /* [RPAP - Jan 97: Stem Index Change] */
|
---|
166 |
|
---|
167 | fwrite (&sdh, sizeof (sdh), 1, idb);
|
---|
168 |
|
---|
169 | if (!(buffer = Xmalloc (block_size + 512)))
|
---|
170 | FatalError (1, "Unable to allocate memory for \"buffer\"\n");
|
---|
171 | if (!(pointers = Xmalloc (block_size + 512)))
|
---|
172 | FatalError (1, "Unable to allocate memory for \"buffer\"\n");
|
---|
173 |
|
---|
174 | buf_in_use = 0;
|
---|
175 | pos = 0;
|
---|
176 | word_num = 0;
|
---|
177 | ptrs_in_use = 0;
|
---|
178 | First_word = 0;
|
---|
179 | for (i = 0; i < idh.dict_size; i++)
|
---|
180 | {
|
---|
181 | register unsigned long extra, copy, suff;
|
---|
182 | unsigned long wcnt, fcnt;
|
---|
183 |
|
---|
184 | /* build a new word on top of prev */
|
---|
185 | copy = getc (id);
|
---|
186 | suff = getc (id);
|
---|
187 | *prev = copy + suff;
|
---|
188 | fread (prev + copy + 1, sizeof (u_char), suff, id);
|
---|
189 |
|
---|
190 | /* read other data, but no need to store it */
|
---|
191 | fread (&fcnt, sizeof (fcnt), 1, id);
|
---|
192 | fread (&wcnt, sizeof (wcnt), 1, id);
|
---|
193 |
|
---|
194 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
195 | NTOHUL(fcnt);
|
---|
196 | NTOHUL(wcnt);
|
---|
197 |
|
---|
198 | /* read in the inverted file position */
|
---|
199 | fread (&invf_ptr, sizeof (invf_ptr), 1, ii);
|
---|
200 | NTOHUL(invf_ptr); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
201 | if (word_num % idh.lookback == 0)
|
---|
202 | extra = copy + sizeof (*pointers);
|
---|
203 | else
|
---|
204 | extra = 0;
|
---|
205 | if ((ptrs_in_use + 1) * sizeof (*pointers) + sizeof (ptrs_in_use) + extra +
|
---|
206 | buf_in_use + suff + 1 + sizeof (fcnt) + sizeof (wcnt) +
|
---|
207 | sizeof (First_word) + sizeof (invf_ptr) + sizeof (invf_len) > block_size)
|
---|
208 | {
|
---|
209 | int chunk;
|
---|
210 | invf_len = invf_ptr - last_ptr;
|
---|
211 |
|
---|
212 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
213 | HTONUL(First_word);
|
---|
214 | HTONUL(invf_len);
|
---|
215 | HTONUS(word_num);
|
---|
216 |
|
---|
217 | fwrite (&First_word, sizeof (First_word), 1, tmp);
|
---|
218 | fwrite (&invf_len, sizeof (invf_len), 1, tmp);
|
---|
219 | fwrite (&word_num, sizeof (word_num), 1, tmp);
|
---|
220 | fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
|
---|
221 | fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
|
---|
222 | bzero ((char *) buffer, block_size);
|
---|
223 | chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
|
---|
224 | sizeof (ptrs_in_use) + sizeof (First_word) + sizeof (invf_len);
|
---|
225 | if (force && chunk < block_size)
|
---|
226 | {
|
---|
227 | fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
|
---|
228 | chunk = block_size;
|
---|
229 | }
|
---|
230 |
|
---|
231 | pos += chunk;
|
---|
232 |
|
---|
233 | buf_in_use = 0;
|
---|
234 | word_num = 0;
|
---|
235 | ptrs_in_use = 0;
|
---|
236 | sdh.num_blocks++;
|
---|
237 | }
|
---|
238 |
|
---|
239 | if (word_num % idh.lookback == 0)
|
---|
240 | {
|
---|
241 | HTONUS2(buf_in_use, pointers[ptrs_in_use++]); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
242 | suff += copy;
|
---|
243 | copy = 0;
|
---|
244 | }
|
---|
245 | buffer[buf_in_use++] = copy;
|
---|
246 | buffer[buf_in_use++] = suff;
|
---|
247 | bcopy ((char *) (prev + copy + 1), (char *) (buffer + buf_in_use), suff);
|
---|
248 | buf_in_use += suff;
|
---|
249 | HTONUL(fcnt); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
250 | bcopy ((char *) &fcnt, (char *) (buffer + buf_in_use), sizeof (fcnt));
|
---|
251 | buf_in_use += sizeof (fcnt);
|
---|
252 | HTONUL(wcnt); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
253 | bcopy ((char *) &wcnt, (char *) (buffer + buf_in_use), sizeof (wcnt));
|
---|
254 | buf_in_use += sizeof (wcnt);
|
---|
255 | last_ptr = invf_ptr;
|
---|
256 | HTONUL(invf_ptr); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
257 | bcopy ((char *) &invf_ptr, (char *) (buffer + buf_in_use), sizeof (invf_ptr));
|
---|
258 | NTOHUL(invf_ptr); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
259 | buf_in_use += sizeof (invf_ptr);
|
---|
260 | if (buf_in_use + ptrs_in_use * sizeof (*pointers) +
|
---|
261 | sizeof (ptrs_in_use) > block_size)
|
---|
262 | FatalError (1, "Fatal Internal Error # 34876234\n");
|
---|
263 | if (word_num == 0)
|
---|
264 | {
|
---|
265 | fwrite (prev, sizeof (u_char), *prev + 1, idb);
|
---|
266 | HTONUL(pos); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
267 | fwrite (&pos, sizeof (pos), 1, idb);
|
---|
268 | NTOHUL(pos); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
269 | sdh.index_chars += *prev + 1;
|
---|
270 | First_word = i;
|
---|
271 | }
|
---|
272 | word_num++;
|
---|
273 | }
|
---|
274 | if (buf_in_use)
|
---|
275 | {
|
---|
276 | int chunk;
|
---|
277 | fread (&invf_ptr, sizeof (invf_ptr), 1, ii);
|
---|
278 | NTOHUL(invf_ptr); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
279 | invf_len = invf_ptr - last_ptr;
|
---|
280 |
|
---|
281 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
282 | HTONUL(First_word);
|
---|
283 | HTONUL(invf_len);
|
---|
284 | HTONUS(word_num);
|
---|
285 |
|
---|
286 | fwrite (&First_word, sizeof (First_word), 1, tmp);
|
---|
287 | fwrite (&invf_len, sizeof (invf_len), 1, tmp);
|
---|
288 | fwrite (&word_num, sizeof (word_num), 1, tmp);
|
---|
289 | fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
|
---|
290 | fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
|
---|
291 | bzero ((char *) buffer, block_size);
|
---|
292 | chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
|
---|
293 | sizeof (ptrs_in_use) + sizeof (First_word) + sizeof (invf_len);
|
---|
294 | if (force && chunk < block_size)
|
---|
295 | {
|
---|
296 | fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
|
---|
297 | chunk = block_size;
|
---|
298 | }
|
---|
299 |
|
---|
300 | sdh.num_blocks++;
|
---|
301 | }
|
---|
302 | fclose (id);
|
---|
303 | fclose (ii);
|
---|
304 |
|
---|
305 | rewind (tmp);
|
---|
306 | sdh.blocks_start = sdh.index_chars + sizeof (u_long) + sizeof (sdh) +
|
---|
307 | sdh.num_blocks * sizeof (pos);
|
---|
308 | if (force)
|
---|
309 | {
|
---|
310 | int amount;
|
---|
311 | amount = sdh.blocks_start % block_size;
|
---|
312 | if (amount != 0)
|
---|
313 | {
|
---|
314 | bzero ((char *) buffer, block_size);
|
---|
315 | fwrite (buffer, sizeof (u_char), block_size - amount, idb);
|
---|
316 | sdh.blocks_start += block_size - amount;
|
---|
317 | }
|
---|
318 | }
|
---|
319 |
|
---|
320 | while ((num = fread (buffer, sizeof (u_char), block_size, tmp)) != 0)
|
---|
321 | fwrite (buffer, sizeof (u_char), num, idb);
|
---|
322 | fclose (tmp);
|
---|
323 |
|
---|
324 | /* skip over the magic number */
|
---|
325 | fseek (idb, sizeof (u_long), 0);
|
---|
326 |
|
---|
327 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
328 | HTONUL(sdh.lookback);
|
---|
329 | HTONUL(sdh.block_size);
|
---|
330 | HTONUL(sdh.num_blocks);
|
---|
331 | HTONUL(sdh.blocks_start);
|
---|
332 | HTONUL(sdh.index_chars);
|
---|
333 | HTONUL(sdh.num_of_docs);
|
---|
334 | HTONUL(sdh.static_num_of_docs);
|
---|
335 | HTONUL(sdh.num_of_words);
|
---|
336 | HTONUL(sdh.stemmer_num);
|
---|
337 | HTONUL(sdh.stem_method);
|
---|
338 | HTONUL(sdh.indexed);
|
---|
339 |
|
---|
340 | fwrite (&sdh, sizeof (sdh), 1, idb);
|
---|
341 | fclose (idb);
|
---|
342 |
|
---|
343 |
|
---|
344 | Message ("Block size = %d\n", block_size);
|
---|
345 | Message ("Number of blocks written = %d\n", NTOHUL(sdh.num_blocks));
|
---|
346 |
|
---|
347 | }
|
---|