source: main/tags/2.80/indexers/mg/src/text/mg_invf_dict.c@ 24541

Last change on this file since 24541 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.1 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_dict.c -- Program to build the blocked stemmed dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_dict.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
29
30#include "mg_files.h"
31#include "invf.h"
32#include "locallib.h"
33#include "words.h"
34#include "mg.h"
35
36/*
37 $Log$
38 Revision 1.1 2003/02/20 21:18:24 mdewsnip
39 Addition of MG package for search and retrieval
40
41 Revision 1.1 1999/08/10 21:18:08 sjboddie
42 renamed mg-1.3d directory mg
43
44 Revision 1.2 1998/11/25 07:55:45 rjmcnab
45
46 Modified mg to that you can specify the stemmer you want
47 to use via a command line option. You specify it to
48 mg_passes during the build process. The number of the
49 stemmer that you used is stored within the inverted
50 dictionary header and the stemmed dictionary header so
51 the correct stemmer is used in later stages of building
52 and querying.
53
54 Revision 1.1 1998/11/17 09:35:03 rjmcnab
55 *** empty log message ***
56
57 * Revision 1.4 1994/11/29 00:32:00 tes
58 * Committing the new merged files and changes.
59 *
60 * Revision 1.3 1994/10/20 03:56:56 tes
61 * I have rewritten the boolean query optimiser and abstracted out the
62 * components of the boolean query.
63 *
64 * Revision 1.2 1994/09/20 04:41:49 tes
65 * For version 1.1
66 *
67 */
68
/* RCS identification string; const because it points at a string literal. */
static const char *RCSID = "$Id: mg_invf_dict.c 3745 2003-02-20 21:20:24Z mdewsnip $";

/* Size in bytes of each dictionary block; overridden by the -b option. */
int block_size = 1024 * 4;

/* Non-zero (set by -F) pads every block, and the start of the block area,
   out to a multiple of block_size. */
int force = 0;

static void process_files (char *filename);
76
77int main (int argc, char **argv)
78{
79 char *file_name = "";
80 int ch;
81 msg_prefix = argv[0];
82 opterr = 0;
83 msg_prefix = argv[0];
84 while ((ch = getopt (argc, argv, "f:d:b:Fh")) != -1)
85 switch (ch)
86 {
87 case 'f': /* input file */
88 file_name = optarg;
89 break;
90 case 'd':
91 set_basepath (optarg);
92 break;
93 case 'b':
94 block_size = atoi (optarg);
95 break;
96 case 'F':
97 force = 1;
98 break;
99 case 'h':
100 case '?':
101 fprintf (stderr, "usage: %s [-f input_file]"
102 "[-d data directory] [-b num] [-F] [-h]\n", argv[0]);
103 exit (1);
104 }
105
106 process_files (file_name);
107 return 0;
108}
109
110
111
112
113static void
114process_files (char *filename)
115{
116 FILE *id, *idb, *tmp, *ii;
117 unsigned long i, pos, num, First_word, invf_ptr, invf_len;
118 unsigned long last_ptr = 0;
119 char *FName;
120 struct invf_dict_header idh;
121 struct stem_dict_header sdh;
122 u_char prev[MAXSTEMLEN + 1];
123 u_char *buffer;
124 unsigned short *pointers;
125 int buf_in_use;
126 unsigned short ptrs_in_use, word_num;
127
128 id = open_file (filename, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
129
130 ii = open_file (filename, INVF_IDX_SUFFIX, "rb", MAGIC_INVI, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
131
132 idb = create_file (filename, INVF_DICT_BLOCKED_SUFFIX, "w+b", MAGIC_STEM,
133 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
134
135 FName = make_name (filename, ".tmp", NULL);
136 if (!(tmp = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
137 FatalError (1, "Unable to open \"%s\".\n", FName);
138
139 /* Delete the file now */
140 unlink (FName);
141
142 fread (&idh, sizeof (idh), 1, id);
143 /* [RPAP - Jan 97: Endian Ordering] */
144 NTOHUL(idh.lookback);
145 NTOHUL(idh.dict_size);
146 NTOHUL(idh.total_bytes);
147 NTOHUL(idh.index_string_bytes);
148 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
149 NTOHUL(idh.num_of_docs);
150 NTOHUL(idh.static_num_of_docs);
151 NTOHUL(idh.num_of_words);
152 NTOHUL(idh.stemmer_num);
153 NTOHUL(idh.stem_method);
154
155 sdh.lookback = idh.lookback;
156 sdh.block_size = block_size;
157 sdh.num_blocks = 0;
158 sdh.index_chars = 0;
159 sdh.blocks_start = 0;
160 sdh.num_of_docs = idh.num_of_docs;
161 sdh.static_num_of_docs = idh.static_num_of_docs;
162 sdh.num_of_words = idh.num_of_words;
163 sdh.stemmer_num = idh.stemmer_num;
164 sdh.stem_method = idh.stem_method;
165 sdh.indexed = 0; /* [RPAP - Jan 97: Stem Index Change] */
166
167 fwrite (&sdh, sizeof (sdh), 1, idb);
168
169 if (!(buffer = Xmalloc (block_size + 512)))
170 FatalError (1, "Unable to allocate memory for \"buffer\"\n");
171 if (!(pointers = Xmalloc (block_size + 512)))
172 FatalError (1, "Unable to allocate memory for \"buffer\"\n");
173
174 buf_in_use = 0;
175 pos = 0;
176 word_num = 0;
177 ptrs_in_use = 0;
178 First_word = 0;
179 for (i = 0; i < idh.dict_size; i++)
180 {
181 register unsigned long extra, copy, suff;
182 unsigned long wcnt, fcnt;
183
184 /* build a new word on top of prev */
185 copy = getc (id);
186 suff = getc (id);
187 *prev = copy + suff;
188 fread (prev + copy + 1, sizeof (u_char), suff, id);
189
190 /* read other data, but no need to store it */
191 fread (&fcnt, sizeof (fcnt), 1, id);
192 fread (&wcnt, sizeof (wcnt), 1, id);
193
194 /* [RPAP - Jan 97: Endian Ordering] */
195 NTOHUL(fcnt);
196 NTOHUL(wcnt);
197
198 /* read in the inverted file position */
199 fread (&invf_ptr, sizeof (invf_ptr), 1, ii);
200 NTOHUL(invf_ptr); /* [RPAP - Jan 97: Endian Ordering] */
201 if (word_num % idh.lookback == 0)
202 extra = copy + sizeof (*pointers);
203 else
204 extra = 0;
205 if ((ptrs_in_use + 1) * sizeof (*pointers) + sizeof (ptrs_in_use) + extra +
206 buf_in_use + suff + 1 + sizeof (fcnt) + sizeof (wcnt) +
207 sizeof (First_word) + sizeof (invf_ptr) + sizeof (invf_len) > block_size)
208 {
209 int chunk;
210 invf_len = invf_ptr - last_ptr;
211
212 /* [RPAP - Jan 97: Endian Ordering] */
213 HTONUL(First_word);
214 HTONUL(invf_len);
215 HTONUS(word_num);
216
217 fwrite (&First_word, sizeof (First_word), 1, tmp);
218 fwrite (&invf_len, sizeof (invf_len), 1, tmp);
219 fwrite (&word_num, sizeof (word_num), 1, tmp);
220 fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
221 fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
222 bzero ((char *) buffer, block_size);
223 chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
224 sizeof (ptrs_in_use) + sizeof (First_word) + sizeof (invf_len);
225 if (force && chunk < block_size)
226 {
227 fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
228 chunk = block_size;
229 }
230
231 pos += chunk;
232
233 buf_in_use = 0;
234 word_num = 0;
235 ptrs_in_use = 0;
236 sdh.num_blocks++;
237 }
238
239 if (word_num % idh.lookback == 0)
240 {
241 HTONUS2(buf_in_use, pointers[ptrs_in_use++]); /* [RPAP - Jan 97: Endian Ordering] */
242 suff += copy;
243 copy = 0;
244 }
245 buffer[buf_in_use++] = copy;
246 buffer[buf_in_use++] = suff;
247 bcopy ((char *) (prev + copy + 1), (char *) (buffer + buf_in_use), suff);
248 buf_in_use += suff;
249 HTONUL(fcnt); /* [RPAP - Jan 97: Endian Ordering] */
250 bcopy ((char *) &fcnt, (char *) (buffer + buf_in_use), sizeof (fcnt));
251 buf_in_use += sizeof (fcnt);
252 HTONUL(wcnt); /* [RPAP - Jan 97: Endian Ordering] */
253 bcopy ((char *) &wcnt, (char *) (buffer + buf_in_use), sizeof (wcnt));
254 buf_in_use += sizeof (wcnt);
255 last_ptr = invf_ptr;
256 HTONUL(invf_ptr); /* [RPAP - Jan 97: Endian Ordering] */
257 bcopy ((char *) &invf_ptr, (char *) (buffer + buf_in_use), sizeof (invf_ptr));
258 NTOHUL(invf_ptr); /* [RPAP - Jan 97: Endian Ordering] */
259 buf_in_use += sizeof (invf_ptr);
260 if (buf_in_use + ptrs_in_use * sizeof (*pointers) +
261 sizeof (ptrs_in_use) > block_size)
262 FatalError (1, "Fatal Internal Error # 34876234\n");
263 if (word_num == 0)
264 {
265 fwrite (prev, sizeof (u_char), *prev + 1, idb);
266 HTONUL(pos); /* [RPAP - Jan 97: Endian Ordering] */
267 fwrite (&pos, sizeof (pos), 1, idb);
268 NTOHUL(pos); /* [RPAP - Jan 97: Endian Ordering] */
269 sdh.index_chars += *prev + 1;
270 First_word = i;
271 }
272 word_num++;
273 }
274 if (buf_in_use)
275 {
276 int chunk;
277 fread (&invf_ptr, sizeof (invf_ptr), 1, ii);
278 NTOHUL(invf_ptr); /* [RPAP - Jan 97: Endian Ordering] */
279 invf_len = invf_ptr - last_ptr;
280
281 /* [RPAP - Jan 97: Endian Ordering] */
282 HTONUL(First_word);
283 HTONUL(invf_len);
284 HTONUS(word_num);
285
286 fwrite (&First_word, sizeof (First_word), 1, tmp);
287 fwrite (&invf_len, sizeof (invf_len), 1, tmp);
288 fwrite (&word_num, sizeof (word_num), 1, tmp);
289 fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
290 fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
291 bzero ((char *) buffer, block_size);
292 chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
293 sizeof (ptrs_in_use) + sizeof (First_word) + sizeof (invf_len);
294 if (force && chunk < block_size)
295 {
296 fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
297 chunk = block_size;
298 }
299
300 sdh.num_blocks++;
301 }
302 fclose (id);
303 fclose (ii);
304
305 rewind (tmp);
306 sdh.blocks_start = sdh.index_chars + sizeof (u_long) + sizeof (sdh) +
307 sdh.num_blocks * sizeof (pos);
308 if (force)
309 {
310 int amount;
311 amount = sdh.blocks_start % block_size;
312 if (amount != 0)
313 {
314 bzero ((char *) buffer, block_size);
315 fwrite (buffer, sizeof (u_char), block_size - amount, idb);
316 sdh.blocks_start += block_size - amount;
317 }
318 }
319
320 while ((num = fread (buffer, sizeof (u_char), block_size, tmp)) != 0)
321 fwrite (buffer, sizeof (u_char), num, idb);
322 fclose (tmp);
323
324 /* skip over the magic number */
325 fseek (idb, sizeof (u_long), 0);
326
327 /* [RPAP - Jan 97: Endian Ordering] */
328 HTONUL(sdh.lookback);
329 HTONUL(sdh.block_size);
330 HTONUL(sdh.num_blocks);
331 HTONUL(sdh.blocks_start);
332 HTONUL(sdh.index_chars);
333 HTONUL(sdh.num_of_docs);
334 HTONUL(sdh.static_num_of_docs);
335 HTONUL(sdh.num_of_words);
336 HTONUL(sdh.stemmer_num);
337 HTONUL(sdh.stem_method);
338 HTONUL(sdh.indexed);
339
340 fwrite (&sdh, sizeof (sdh), 1, idb);
341 fclose (idb);
342
343
344 Message ("Block size = %d\n", block_size);
345 Message ("Number of blocks written = %d\n", NTOHUL(sdh.num_blocks));
346
347}
Note: See TracBrowser for help on using the repository browser.