source: trunk/gsdl/src/mgpp/text/mg_perf_hash_build.cpp@ 2442

Last change on this file since 2442 was 2442, checked in by jrm21, 23 years ago

portability changes, use getopt from unistd.h (all POSIX systems)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.3 KB
Line 
1/**************************************************************************
2 *
3 * mg_perf_hash_build.cpp -- Program to build a perfect hash function
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_perf_hash_build.cpp 2442 2001-05-17 04:38:16Z jrm21 $
21 *
22 **************************************************************************/
23
24#define _XOPEN_SOURCE 1
25#define _XOPEN_SOURCE_EXTENDED 1
26#include <unistd.h>
27
28#include "sysfuncs.h"
29#include "memlib.h"
30#include "messages.h"
31#include "local_strings.h"
32#include "perf_hash.h"
33#include "netorder.h"
34
35#include "mg_files.h"
36#include "invf.h"
37#include "locallib.h"
38#include "words.h"
39#include "mg.h"
40
41/*
42 $Log$
43 Revision 1.4 2001/05/17 04:38:16 jrm21
44 portability changes, use getopt from unistd.h (all POSIX systems)
45
46 Revision 1.3 2001/05/07 05:01:47 jrm21
47 replaced bcopy with memcpy
48
49 Revision 1.2 2000/02/15 22:45:22 kjm18
50 added feature to retrieve doc nums at a different level than the level
51 queried at. eg query at Document level, but retrieve section level docnums
52 bug in mg_perf_hash_build.cpp fixed
53
54 Revision 1.1 2000/01/14 02:26:19 sjboddie
55 Rodgers new C++ mg
56
57 Revision 1.2 1999/10/17 23:43:27 cs025
58 Changes to eradicate Xmalloc
59
60 Revision 1.1 1999/10/11 02:58:01 cs025
61 Base install of MG-PP
62
63 Revision 1.1 1999/08/10 21:18:13 sjboddie
64 renamed mg-1.3d directory mg
65
66 Revision 1.2 1998/11/25 07:55:47 rjmcnab
67
68 Modified mg so that you can specify the stemmer you want
69 to use via a command line option. You specify it to
70 mg_passes during the build process. The number of the
71 stemmer that you used is stored within the inverted
72 dictionary header and the stemmed dictionary header so
73 the correct stemmer is used in later stages of building
74 and querying.
75
76 Revision 1.1 1998/11/17 09:35:15 rjmcnab
77 *** empty log message ***
78
79 * Revision 1.3 1994/10/20 03:56:58 tes
80 * I have rewritten the boolean query optimiser and abstracted out the
81 * components of the boolean query.
82 *
83 * Revision 1.2 1994/09/20 04:41:53 tes
84 * For version 1.1
85 *
86 */
87
88
89
/* Size in bytes of each string pool chunk allocated by ProcessFiles.
 * Parenthesized so the macro expands safely inside larger expressions
 * (e.g. `x / POOL_SIZE`). */
#define POOL_SIZE (1024*1024)
91
92
93static void ProcessFiles (char *filename, int r) {
94 FILE *dictFile, *hashFile;
95 unsigned long i;
96 invf_dict_header idh;
97 perf_hash_data *phd;
98 u_char *pool;
99 unsigned long pool_left;
100 u_char **starts;
101
102 // read in the dictionary
103 dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
104 MAGIC_STEM_BUILD, MG_ABORT);
105 if (dictFile==NULL) {
106 FatalError(1, "unable to open file");
107 }
108 idh.Read (dictFile);
109
110 //cerr << idh.lookback<<" "<<idh.word_dict_start<<endl;
111 // go to the start of the word dictionary
112 fseek (dictFile, idh.word_dict_start, SEEK_SET);
113
114 if (!(pool = (u_char *) Xmalloc (POOL_SIZE)))
115 FatalError (1, "Out of memory");
116 pool_left = POOL_SIZE;
117
118 if (!(starts = (u_char **) Xmalloc (sizeof (u_char *) * idh.word_dict_size)))
119 FatalError (1, "Out of memory");
120 //cerr << "size= "<< idh.word_dict_size<<endl;
121 word_dict_el wordEl;
122 wordEl.SetNumLevels (idh.num_levels);
123 for (i = 0; i < idh.word_dict_size; i++) {
124 // read the next word and associated information
125 wordEl.Read (dictFile, idh.num_levels);
126
127 // push string onto pool data
128 register unsigned long l;
129 l = wordEl.el.size() + 1;
130 if (pool_left < l) {
131 pool = (u_char *) Xmalloc (POOL_SIZE);
132 pool_left = POOL_SIZE;
133 }
134 starts[i] = pool;
135
136 *pool++ = wordEl.el.size();
137 memcpy ((char *) pool, (const char *) wordEl.el.begin(), wordEl.el.size());
138 //cerr << pool<<" " <<starts[i]<<endl;
139 pool += wordEl.el.size();
140 pool_left -= l;
141
142 }
143 fclose (dictFile);
144 //cerr << pool<<" " <<starts[i-1]<<endl;
145 //cerr<<"starts "<<starts[113529]<<endl;
146 //cerr << starts[17][1] << " "<<starts[25][4]<<endl;
147 // create perfect hash file
148 hashFile = create_file (filename, INVF_DICT_HASH_SUFFIX, "wb",
149 MAGIC_HASH, MG_ABORT);
150 if (!(phd = gen_hash_func (idh.word_dict_size, starts, r)))
151 FatalError (1, "Unable to generate hash function");
152 if (write_perf_hash_data (hashFile, phd) == -1)
153 FatalError (1, "Unable to write hash function");
154 fclose (hashFile);
155}
156
157
158
159int main (int argc, char **argv) {
160 int r = -1;
161 char *filename = "";
162 int ch;
163 msg_prefix = argv[0];
164 opterr = 0;
165
166 while ((ch = getopt (argc, argv, "f:d:r:h")) != -1) {
167 switch (ch) {
168 case 'f': // input file
169 filename = optarg;
170 break;
171 case 'd':
172 set_basepath (optarg);
173 break;
174 case 'r':
175 r = atoi (optarg);
176 break;
177 case 'h':
178 case '?':
179 fprintf (stderr, "usage: %s [-f input_file]"
180 "[-d data directory] [-r random seed] [-h]\n", argv[0]);
181 exit (1);
182 }
183 }
184
185 ProcessFiles (filename, r);
186 return 0;
187}
Note: See TracBrowser for help on using the repository browser.