source: gsdl/trunk/trunk/mg/src/text/mg_perf_hash_build.c@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change committed in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.8 KB
Line 
1/**************************************************************************
2 *
3 * mg_perf_hash_build.c -- Program to build a perfect hash function
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_perf_hash_build.c 16583 2008-07-29 10:20:36Z davidb $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "memlib.h"
26#include "messages.h"
27#include "timing.h"
28#include "local_strings.h"
29#include "perf_hash.h"
30#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
31
32#include "mg_files.h"
33#include "invf.h"
34#include "locallib.h"
35#include "words.h"
36#include "mg.h"
37
38/*
39 $Log$
40 Revision 1.1 2003/02/20 21:18:24 mdewsnip
41 Addition of MG package for search and retrieval
42
43 Revision 1.1 1999/08/10 21:18:13 sjboddie
44 renamed mg-1.3d directory mg
45
46 Revision 1.2 1998/11/25 07:55:47 rjmcnab
47
48 Modified mg so that you can specify the stemmer you want
49 to use via a command line option. You specify it to
50 mg_passes during the build process. The number of the
51 stemmer that you used is stored within the inverted
52 dictionary header and the stemmed dictionary header so
53 the correct stemmer is used in later stages of building
54 and querying.
55
56 Revision 1.1 1998/11/17 09:35:15 rjmcnab
57 *** empty log message ***
58
59 * Revision 1.3 1994/10/20 03:56:58 tes
60 * I have rewritten the boolean query optimiser and abstracted out the
61 * components of the boolean query.
62 *
63 * Revision 1.2 1994/09/20 04:41:53 tes
64 * For version 1.1
65 *
66 */
67
/* RCS identification string for this source file. */
static char *RCSID = "$Id: mg_perf_hash_build.c 16583 2008-07-29 10:20:36Z davidb $";

/*
 * Size in bytes of each memory chunk that process_files () allocates to
 * hold the expanded dictionary words.  The expansion is parenthesised so
 * the macro keeps its value when used inside a larger expression
 * (e.g. `x / POOL_SIZE').
 */
#define POOL_SIZE (1024*1024)

static void process_files (char *filename);

/*
 * Seed handed to gen_hash_func ().  Set from the -r command line option;
 * stays at -1 when the option is not given.
 */
int r = -1;
77
78int main (int argc, char **argv)
79{
80 ProgTime start;
81 char *file_name = "";
82 int ch;
83 msg_prefix = argv[0];
84 opterr = 0;
85 while ((ch = getopt (argc, argv, "f:d:r:h")) != -1)
86 switch (ch)
87 {
88 case 'f': /* input file */
89 file_name = optarg;
90 break;
91 case 'd':
92 set_basepath (optarg);
93 break;
94 case 'r':
95 r = atoi (optarg);
96 break;
97 case 'h':
98 case '?':
99 fprintf (stderr, "usage: %s [-f input_file]"
100 "[-d data directory] [-r random seed] [-h]\n", argv[0]);
101 exit (1);
102 }
103
104 GetTime (&start);
105 process_files (file_name);
106 Message ("%s\n", ElapsedTime (&start, NULL));
107 return 0;
108}
109
110
111
112
113
114static void
115process_files (char *filename)
116{
117 FILE *dict, *hash;
118 unsigned long i;
119 u_char prev[MAXSTEMLEN + 1];
120 struct invf_dict_header idh;
121 perf_hash_data *phd;
122 u_char *pool;
123 int pool_left;
124 u_char **starts;
125
126
127 dict = open_file (filename, INVF_DICT_SUFFIX, "rb",
128 MAGIC_STEM_BUILD, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
129
130 fread ((char *) &idh, sizeof (idh), 1, dict);
131
132 /* [RPAP - Jan 97: Endian Ordering] */
133 NTOHUL(idh.lookback);
134 NTOHUL(idh.dict_size);
135 NTOHUL(idh.total_bytes);
136 NTOHUL(idh.index_string_bytes);
137 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
138 NTOHUL(idh.num_of_docs);
139 NTOHUL(idh.static_num_of_docs);
140 NTOHUL(idh.num_of_words);
141 NTOHUL(idh.stemmer_num);
142 NTOHUL(idh.stem_method);
143
144 hash = create_file (filename, INVF_DICT_HASH_SUFFIX, "wb",
145 MAGIC_HASH, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
146
147 if (!(pool = Xmalloc (POOL_SIZE)))
148 FatalError (1, "Out of memory");
149 pool_left = POOL_SIZE;
150
151 if (!(starts = Xmalloc (sizeof (u_char *) * idh.dict_size)))
152 FatalError (1, "Out of memory");
153
154 for (i = 0; i < idh.dict_size; i++)
155 {
156 register unsigned long copy, suff, l;
157 unsigned long wcnt, fcnt;
158
159 /* build a new word on top of prev */
160 copy = getc (dict);
161 suff = getc (dict);
162 *prev = copy + suff;
163 fread (prev + copy + 1, sizeof (u_char), suff, dict);
164
165 /* read other data, but no need to store it */
166 fread (&fcnt, sizeof (fcnt), 1, dict);
167 fread (&wcnt, sizeof (wcnt), 1, dict);
168
169 l = *prev + 1;
170 if (pool_left < l)
171 {
172 pool = Xmalloc (POOL_SIZE);
173 pool_left = POOL_SIZE;
174 }
175 starts[i] = pool;
176 bcopy ((char *) prev, (char *) pool, l);
177 pool += l;
178 pool_left -= l;
179 }
180 if (!(phd = gen_hash_func (idh.dict_size, starts, r)))
181 FatalError (1, "Unable to generate hash function");
182 if (write_perf_hash_data (hash, phd) == -1)
183 FatalError (1, "Unable to write hash function");
184 fclose (dict);
185 fclose (hash);
186}
Note: See TracBrowser for help on using the repository browser.