source: trunk/gsdl/packages/mg/src/text/mg_invf_dict.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.0 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_dict.c -- Program to build the blocked stemmed dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_dict.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
29
30#include "mg_files.h"
31#include "invf.h"
32#include "locallib.h"
33#include "words.h"
34#include "mg.h"
35
36/*
37 $Log$
38 Revision 1.1 1999/08/10 21:18:08 sjboddie
39 renamed mg-1.3d directory mg
40
41 Revision 1.2 1998/11/25 07:55:45 rjmcnab
42
43 Modified mg so that you can specify the stemmer you want
44 to use via a command line option. You specify it to
45 mg_passes during the build process. The number of the
46 stemmer that you used is stored within the inverted
47 dictionary header and the stemmed dictionary header so
48 the correct stemmer is used in later stages of building
49 and querying.
50
51 Revision 1.1 1998/11/17 09:35:03 rjmcnab
52 *** empty log message ***
53
54 * Revision 1.4 1994/11/29 00:32:00 tes
55 * Committing the new merged files and changes.
56 *
57 * Revision 1.3 1994/10/20 03:56:56 tes
58 * I have rewritten the boolean query optimiser and abstracted out the
59 * components of the boolean query.
60 *
61 * Revision 1.2 1994/09/20 04:41:49 tes
62 * For version 1.1
63 *
64 */
65
66static char *RCSID = "$Id: mg_invf_dict.c 439 1999-08-10 21:23:37Z sjboddie $"; /* version-control identification string (expanded $Id$ keyword) */
67
68int block_size = 1024 * 4; /* size in bytes of one dictionary block; default 4096, replaced by the -b option in main() */
69
70int force = 0; /* set to 1 by the -F option; when non-zero process_files() zero-pads every block out to block_size */
71
72static void process_files (char *filename); /* builds the blocked dictionary for the files named after `filename'; defined below main() */
73
74int main (int argc, char **argv)
75{
76 char *file_name = "";
77 int ch;
78 msg_prefix = argv[0];
79 opterr = 0;
80 msg_prefix = argv[0];
81 while ((ch = getopt (argc, argv, "f:d:b:Fh")) != -1)
82 switch (ch)
83 {
84 case 'f': /* input file */
85 file_name = optarg;
86 break;
87 case 'd':
88 set_basepath (optarg);
89 break;
90 case 'b':
91 block_size = atoi (optarg);
92 break;
93 case 'F':
94 force = 1;
95 break;
96 case 'h':
97 case '?':
98 fprintf (stderr, "usage: %s [-f input_file]"
99 "[-d data directory] [-b num] [-F] [-h]\n", argv[0]);
100 exit (1);
101 }
102
103 process_files (file_name);
104 return 0;
105}
106
107
108
109
110static void
111process_files (char *filename)
112{
113 FILE *id, *idb, *tmp, *ii;
114 unsigned long i, pos, num, First_word, invf_ptr, invf_len;
115 unsigned long last_ptr = 0;
116 char *FName;
117 struct invf_dict_header idh;
118 struct stem_dict_header sdh;
119 u_char prev[MAXSTEMLEN + 1];
120 u_char *buffer;
121 unsigned short *pointers;
122 int buf_in_use;
123 unsigned short ptrs_in_use, word_num;
124
125 id = open_file (filename, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
126
127 ii = open_file (filename, INVF_IDX_SUFFIX, "rb", MAGIC_INVI, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
128
129 idb = create_file (filename, INVF_DICT_BLOCKED_SUFFIX, "w+b", MAGIC_STEM,
130 MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
131
132 FName = make_name (filename, ".tmp", NULL);
133 if (!(tmp = fopen (FName, "w+b"))) /* [RPAP - Feb 97: WIN32 Port] */
134 FatalError (1, "Unable to open \"%s\".\n", FName);
135
136 /* Delete the file now */
137 unlink (FName);
138
139 fread (&idh, sizeof (idh), 1, id);
140 /* [RPAP - Jan 97: Endian Ordering] */
141 NTOHUL(idh.lookback);
142 NTOHUL(idh.dict_size);
143 NTOHUL(idh.total_bytes);
144 NTOHUL(idh.index_string_bytes);
145 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
146 NTOHUL(idh.num_of_docs);
147 NTOHUL(idh.static_num_of_docs);
148 NTOHUL(idh.num_of_words);
149 NTOHUL(idh.stemmer_num);
150 NTOHUL(idh.stem_method);
151
152 sdh.lookback = idh.lookback;
153 sdh.block_size = block_size;
154 sdh.num_blocks = 0;
155 sdh.index_chars = 0;
156 sdh.blocks_start = 0;
157 sdh.num_of_docs = idh.num_of_docs;
158 sdh.static_num_of_docs = idh.static_num_of_docs;
159 sdh.num_of_words = idh.num_of_words;
160 sdh.stemmer_num = idh.stemmer_num;
161 sdh.stem_method = idh.stem_method;
162 sdh.indexed = 0; /* [RPAP - Jan 97: Stem Index Change] */
163
164 fwrite (&sdh, sizeof (sdh), 1, idb);
165
166 if (!(buffer = Xmalloc (block_size + 512)))
167 FatalError (1, "Unable to allocate memory for \"buffer\"\n");
168 if (!(pointers = Xmalloc (block_size + 512)))
169 FatalError (1, "Unable to allocate memory for \"buffer\"\n");
170
171 buf_in_use = 0;
172 pos = 0;
173 word_num = 0;
174 ptrs_in_use = 0;
175 First_word = 0;
176 for (i = 0; i < idh.dict_size; i++)
177 {
178 register unsigned long extra, copy, suff;
179 unsigned long wcnt, fcnt;
180
181 /* build a new word on top of prev */
182 copy = getc (id);
183 suff = getc (id);
184 *prev = copy + suff;
185 fread (prev + copy + 1, sizeof (u_char), suff, id);
186
187 /* read other data, but no need to store it */
188 fread (&fcnt, sizeof (fcnt), 1, id);
189 fread (&wcnt, sizeof (wcnt), 1, id);
190
191 /* [RPAP - Jan 97: Endian Ordering] */
192 NTOHUL(fcnt);
193 NTOHUL(wcnt);
194
195 /* read in the inverted file position */
196 fread (&invf_ptr, sizeof (invf_ptr), 1, ii);
197 NTOHUL(invf_ptr); /* [RPAP - Jan 97: Endian Ordering] */
198 if (word_num % idh.lookback == 0)
199 extra = copy + sizeof (*pointers);
200 else
201 extra = 0;
202 if ((ptrs_in_use + 1) * sizeof (*pointers) + sizeof (ptrs_in_use) + extra +
203 buf_in_use + suff + 1 + sizeof (fcnt) + sizeof (wcnt) +
204 sizeof (First_word) + sizeof (invf_ptr) + sizeof (invf_len) > block_size)
205 {
206 int chunk;
207 invf_len = invf_ptr - last_ptr;
208
209 /* [RPAP - Jan 97: Endian Ordering] */
210 HTONUL(First_word);
211 HTONUL(invf_len);
212 HTONUS(word_num);
213
214 fwrite (&First_word, sizeof (First_word), 1, tmp);
215 fwrite (&invf_len, sizeof (invf_len), 1, tmp);
216 fwrite (&word_num, sizeof (word_num), 1, tmp);
217 fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
218 fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
219 bzero ((char *) buffer, block_size);
220 chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
221 sizeof (ptrs_in_use) + sizeof (First_word) + sizeof (invf_len);
222 if (force && chunk < block_size)
223 {
224 fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
225 chunk = block_size;
226 }
227
228 pos += chunk;
229
230 buf_in_use = 0;
231 word_num = 0;
232 ptrs_in_use = 0;
233 sdh.num_blocks++;
234 }
235
236 if (word_num % idh.lookback == 0)
237 {
238 HTONUS2(buf_in_use, pointers[ptrs_in_use++]); /* [RPAP - Jan 97: Endian Ordering] */
239 suff += copy;
240 copy = 0;
241 }
242 buffer[buf_in_use++] = copy;
243 buffer[buf_in_use++] = suff;
244 bcopy ((char *) (prev + copy + 1), (char *) (buffer + buf_in_use), suff);
245 buf_in_use += suff;
246 HTONUL(fcnt); /* [RPAP - Jan 97: Endian Ordering] */
247 bcopy ((char *) &fcnt, (char *) (buffer + buf_in_use), sizeof (fcnt));
248 buf_in_use += sizeof (fcnt);
249 HTONUL(wcnt); /* [RPAP - Jan 97: Endian Ordering] */
250 bcopy ((char *) &wcnt, (char *) (buffer + buf_in_use), sizeof (wcnt));
251 buf_in_use += sizeof (wcnt);
252 last_ptr = invf_ptr;
253 HTONUL(invf_ptr); /* [RPAP - Jan 97: Endian Ordering] */
254 bcopy ((char *) &invf_ptr, (char *) (buffer + buf_in_use), sizeof (invf_ptr));
255 NTOHUL(invf_ptr); /* [RPAP - Jan 97: Endian Ordering] */
256 buf_in_use += sizeof (invf_ptr);
257 if (buf_in_use + ptrs_in_use * sizeof (*pointers) +
258 sizeof (ptrs_in_use) > block_size)
259 FatalError (1, "Fatal Internal Error # 34876234\n");
260 if (word_num == 0)
261 {
262 fwrite (prev, sizeof (u_char), *prev + 1, idb);
263 HTONUL(pos); /* [RPAP - Jan 97: Endian Ordering] */
264 fwrite (&pos, sizeof (pos), 1, idb);
265 NTOHUL(pos); /* [RPAP - Jan 97: Endian Ordering] */
266 sdh.index_chars += *prev + 1;
267 First_word = i;
268 }
269 word_num++;
270 }
271 if (buf_in_use)
272 {
273 int chunk;
274 fread (&invf_ptr, sizeof (invf_ptr), 1, ii);
275 NTOHUL(invf_ptr); /* [RPAP - Jan 97: Endian Ordering] */
276 invf_len = invf_ptr - last_ptr;
277
278 /* [RPAP - Jan 97: Endian Ordering] */
279 HTONUL(First_word);
280 HTONUL(invf_len);
281 HTONUS(word_num);
282
283 fwrite (&First_word, sizeof (First_word), 1, tmp);
284 fwrite (&invf_len, sizeof (invf_len), 1, tmp);
285 fwrite (&word_num, sizeof (word_num), 1, tmp);
286 fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
287 fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
288 bzero ((char *) buffer, block_size);
289 chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
290 sizeof (ptrs_in_use) + sizeof (First_word) + sizeof (invf_len);
291 if (force && chunk < block_size)
292 {
293 fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
294 chunk = block_size;
295 }
296
297 sdh.num_blocks++;
298 }
299 fclose (id);
300 fclose (ii);
301
302 rewind (tmp);
303 sdh.blocks_start = sdh.index_chars + sizeof (u_long) + sizeof (sdh) +
304 sdh.num_blocks * sizeof (pos);
305 if (force)
306 {
307 int amount;
308 amount = sdh.blocks_start % block_size;
309 if (amount != 0)
310 {
311 bzero ((char *) buffer, block_size);
312 fwrite (buffer, sizeof (u_char), block_size - amount, idb);
313 sdh.blocks_start += block_size - amount;
314 }
315 }
316
317 while ((num = fread (buffer, sizeof (u_char), block_size, tmp)) != 0)
318 fwrite (buffer, sizeof (u_char), num, idb);
319 fclose (tmp);
320
321 /* skip over the magic number */
322 fseek (idb, sizeof (u_long), 0);
323
324 /* [RPAP - Jan 97: Endian Ordering] */
325 HTONUL(sdh.lookback);
326 HTONUL(sdh.block_size);
327 HTONUL(sdh.num_blocks);
328 HTONUL(sdh.blocks_start);
329 HTONUL(sdh.index_chars);
330 HTONUL(sdh.num_of_docs);
331 HTONUL(sdh.static_num_of_docs);
332 HTONUL(sdh.num_of_words);
333 HTONUL(sdh.stemmer_num);
334 HTONUL(sdh.stem_method);
335 HTONUL(sdh.indexed);
336
337 fwrite (&sdh, sizeof (sdh), 1, idb);
338 fclose (idb);
339
340
341 Message ("Block size = %d\n", block_size);
342 Message ("Number of blocks written = %d\n", NTOHUL(sdh.num_blocks));
343
344}
Note: See TracBrowser for help on using the repository browser.