source: gsdl/tags/gsdl-2_30-distribution/gsdl/src/mgpp/text/mg_perf_hash_build.cpp@ 14121

Last change on this file since 14121 was 927, checked in by kjm18, 24 years ago

added feature to retrieve doc nums at a different level than the level
queried at. eg query at Document level, but retrieve section level docnums
bug in mg_perf_hash_build.cpp fixed

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
Line 
1/**************************************************************************
2 *
3 * mg_perf_hash_build.cpp -- Program to build a perfect hash function
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_perf_hash_build.cpp 927 2000-02-15 22:45:22Z kjm18 $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "memlib.h"
26#include "messages.h"
27#include "local_strings.h"
28#include "perf_hash.h"
29#include "netorder.h"
30
31#include "mg_files.h"
32#include "invf.h"
33#include "locallib.h"
34#include "words.h"
35#include "mg.h"
36
37/*
38 $Log$
39 Revision 1.2 2000/02/15 22:45:22 kjm18
40 added feature to retrieve doc nums at a different level than the level
41 queried at. eg query at Document level, but retrieve section level docnums
42 bug in mg_perf_hash_build.cpp fixed
43
44 Revision 1.1 2000/01/14 02:26:19 sjboddie
45 Rodgers new C++ mg
46
47 Revision 1.2 1999/10/17 23:43:27 cs025
48 Changes to eradicate Xmalloc
49
50 Revision 1.1 1999/10/11 02:58:01 cs025
51 Base install of MG-PP
52
53 Revision 1.1 1999/08/10 21:18:13 sjboddie
54 renamed mg-1.3d directory mg
55
56 Revision 1.2 1998/11/25 07:55:47 rjmcnab
57
58 Modified mg so that you can specify the stemmer you want
59 to use via a command line option. You specify it to
60 mg_passes during the build process. The number of the
61 stemmer that you used is stored within the inverted
62 dictionary header and the stemmed dictionary header so
63 the correct stemmer is used in later stages of building
64 and querying.
65
66 Revision 1.1 1998/11/17 09:35:15 rjmcnab
67 *** empty log message ***
68
69 * Revision 1.3 1994/10/20 03:56:58 tes
70 * I have rewritten the boolean query optimiser and abstracted out the
71 * components of the boolean query.
72 *
73 * Revision 1.2 1994/09/20 04:41:53 tes
74 * For version 1.1
75 *
76 */
77
78
79
/* Size in bytes of each string pool allocated by ProcessFiles.
 * Parenthesized so the expansion stays a single operand in any expression. */
#define POOL_SIZE (1024*1024)
81
82
83static void ProcessFiles (char *filename, int r) {
84 FILE *dictFile, *hashFile;
85 unsigned long i;
86 invf_dict_header idh;
87 perf_hash_data *phd;
88 u_char *pool;
89 unsigned long pool_left;
90 u_char **starts;
91
92 // read in the dictionary
93 dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
94 MAGIC_STEM_BUILD, MG_ABORT);
95 if (dictFile==NULL) {
96 FatalError(1, "unable to open file");
97 }
98 idh.Read (dictFile);
99
100 //cerr << idh.lookback<<" "<<idh.word_dict_start<<endl;
101 // go to the start of the word dictionary
102 fseek (dictFile, idh.word_dict_start, SEEK_SET);
103
104 if (!(pool = (u_char *) Xmalloc (POOL_SIZE)))
105 FatalError (1, "Out of memory");
106 pool_left = POOL_SIZE;
107
108 if (!(starts = (u_char **) Xmalloc (sizeof (u_char *) * idh.word_dict_size)))
109 FatalError (1, "Out of memory");
110 //cerr << "size= "<< idh.word_dict_size<<endl;
111 word_dict_el wordEl;
112 wordEl.SetNumLevels (idh.num_levels);
113 for (i = 0; i < idh.word_dict_size; i++) {
114 // read the next word and associated information
115 wordEl.Read (dictFile, idh.num_levels);
116
117 // push string onto pool data
118 register unsigned long l;
119 l = wordEl.el.size() + 1;
120 if (pool_left < l) {
121 pool = (u_char *) Xmalloc (POOL_SIZE);
122 pool_left = POOL_SIZE;
123 }
124 starts[i] = pool;
125
126 *pool++ = wordEl.el.size();
127 bcopy ((char *) wordEl.el.begin(), (char *) pool, wordEl.el.size());
128 //cerr << pool<<" " <<starts[i]<<endl;
129 pool += wordEl.el.size();
130 pool_left -= l;
131
132 }
133 fclose (dictFile);
134 //cerr << pool<<" " <<starts[i-1]<<endl;
135 //cerr<<"starts "<<starts[113529]<<endl;
136 //cerr << starts[17][1] << " "<<starts[25][4]<<endl;
137 // create perfect hash file
138 hashFile = create_file (filename, INVF_DICT_HASH_SUFFIX, "wb",
139 MAGIC_HASH, MG_ABORT);
140 if (!(phd = gen_hash_func (idh.word_dict_size, starts, r)))
141 FatalError (1, "Unable to generate hash function");
142 if (write_perf_hash_data (hashFile, phd) == -1)
143 FatalError (1, "Unable to write hash function");
144 fclose (hashFile);
145}
146
147
148
149int main (int argc, char **argv) {
150 int r = -1;
151 char *filename = "";
152 int ch;
153 msg_prefix = argv[0];
154 opterr = 0;
155
156 while ((ch = getopt (argc, argv, "f:d:r:h")) != -1) {
157 switch (ch) {
158 case 'f': // input file
159 filename = optarg;
160 break;
161 case 'd':
162 set_basepath (optarg);
163 break;
164 case 'r':
165 r = atoi (optarg);
166 break;
167 case 'h':
168 case '?':
169 fprintf (stderr, "usage: %s [-f input_file]"
170 "[-d data directory] [-r random seed] [-h]\n", argv[0]);
171 exit (1);
172 }
173 }
174
175 ProcessFiles (filename, r);
176 return 0;
177}
Note: See TracBrowser for help on using the repository browser.