source: trunk/gsdl/src/mgpp/text/mg_invf_dict.cpp@ 2468

Last change on this file since 2468 was 2468, checked in by sjboddie, 23 years ago

Fiddled about with mgpp to get it compiling on Windows under VC++ 6.0. I
still can't get it to compile under VC++ 4.2 because of some weird
behaviour in STLport.

Also tidied up a little and removed some of the old log information
that was scattered about in some of the files.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_dict.cpp -- Program to build the blocked stemmed dictionary
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24
25// need this to avoid bizarre compiler problems under VC++ 6.0
26#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
27# include <iostream>
28#endif
29
30#if defined (__WIN32__)
31# include "getopt.h"
32#else
33# include <unistd.h>
34#endif
35
36#include "sysfuncs.h"
37#include "messages.h"
38#include "mg_files.h"
39#include "invf.h"
40
41static void process_files (char *filename, unsigned long entriesPerBlock) {
42 // open the dictionary
43 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
44 MAGIC_STEM_BUILD, MG_ABORT);
45 invf_dict_header idh;
46 idh.Read (dictFile);
47
48 // open the inverted index file
49 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
50 MAGIC_INVI, MG_ABORT);
51
52 // create the blocked dictionary
53 FILE *blockDictFile = create_file (filename, INVF_DICT_BLOCKED_SUFFIX, "wb",
54 MAGIC_STEM, MG_ABORT);
55 block_dict_header bdh;
56 bdh.lookback = idh.lookback;
57 bdh.word_dict_start = idh.word_dict_start;
58 bdh.word_dict_size = idh.word_dict_size;
59 bdh.tag_dict_start = idh.tag_dict_start;
60 bdh.tag_dict_size = idh.tag_dict_size;
61 bdh.num_docs = idh.num_docs;
62 bdh.num_frags = idh.num_frags;
63 bdh.num_words = idh.num_words;
64 bdh.total_bytes = idh.total_bytes;
65 bdh.index_string_bytes = idh.index_string_bytes;
66 bdh.num_levels = idh.num_levels;
67 bdh.Write (blockDictFile);
68
69
70 // write out the word part of the dictionary
71
72 bdh.entries_per_wblk = entriesPerBlock;
73 bdh.max_wblk_size = 0;
74 bdh.wblk_start = ftell (blockDictFile);
75
76 fseek (dictFile, idh.word_dict_start, SEEK_SET);
77
78 block_idx wordIdx;
79 word_block_dict_el wordBlockEl;
80 wordBlockEl.SetNumLevels (idh.num_levels);
81
82
83 unsigned long wordNum;
84 unsigned long wordInvfPtr;
85 UCArray lastEl;
86 word_dict_el wordEl;
87 wordEl.SetNumLevels (idh.num_levels);
88 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
89 // read in the next word and inverted file pointer
90 wordEl.Read (dictFile, idh.num_levels);
91 ReadUL (invfIdxFile, wordInvfPtr);
92
93 // remember this word (and position) if this is the start
94 // of a new block
95 if (wordNum % entriesPerBlock == 0) {
96 block_idx_info elIdx;
97 elIdx.el = wordEl.el;
98 elIdx.block_ptr = ftell (blockDictFile);
99
100 // see if this block is the longest so far
101 if (wordIdx.size() > 0) {
102 unsigned long blockLen = elIdx.block_ptr -
103 (*(wordIdx.end()-1)).block_ptr;
104 if (blockLen > bdh.max_wblk_size) bdh.max_wblk_size = blockLen;
105 }
106
107 wordIdx.push_back (elIdx);
108 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
109 }
110
111 // copy the information for this word
112 wordBlockEl.el = wordEl.el;
113 wordBlockEl.frag_occur = wordEl.frag_occur;
114 wordBlockEl.freq = wordEl.freq;
115 wordBlockEl.invf_ptr = wordInvfPtr;
116 unsigned long tempI;
117 for (tempI=0; tempI<idh.num_levels; tempI++)
118 wordBlockEl.levelFreqs[tempI] = wordEl.levelFreqs[tempI];
119
120 // write out the word
121 wordBlockEl.Write (blockDictFile, &lastEl, idh.num_levels);
122
123 lastEl = wordBlockEl.el;
124 }
125
126
127 // write out the tag part of the dictionary
128
129 bdh.entries_per_tblk = entriesPerBlock;
130 bdh.max_tblk_size = 0;
131 bdh.tblk_start = ftell (blockDictFile);
132
133 fseek (dictFile, idh.tag_dict_start, SEEK_SET);
134
135 block_idx tagIdx;
136 block_dict_el tagBlockEl;
137
138 unsigned long tagNum;
139 unsigned long tagInvfPtr;
140 dict_el tagEl;
141 lastEl.erase (lastEl.begin(), lastEl.end());
142 for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
143 // read in the next tag and inverted file pointer
144 tagEl.Read (dictFile);
145 ReadUL (invfIdxFile, tagInvfPtr);
146
147 // remember this tag (and position) if this is the start
148 // of a new block
149 if (tagNum % entriesPerBlock == 0) {
150 block_idx_info elIdx;
151 elIdx.el = tagEl.el;
152 elIdx.block_ptr = ftell (blockDictFile);
153
154 // see if this block is the longest so far
155 if (tagIdx.size() > 0) {
156 unsigned long blockLen = elIdx.block_ptr -
157 (*(tagIdx.end()-1)).block_ptr;
158 if (blockLen > bdh.max_tblk_size) bdh.max_tblk_size = blockLen;
159 }
160
161 tagIdx.push_back (elIdx);
162 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
163 }
164
165 // copy the information for this tag
166 tagBlockEl.el = tagEl.el;
167 tagBlockEl.frag_occur = tagEl.frag_occur;
168 tagBlockEl.freq = tagEl.freq;
169 tagBlockEl.invf_ptr = tagInvfPtr;
170
171 // write out the tag
172 tagBlockEl.Write (blockDictFile, &lastEl);
173
174 lastEl = tagBlockEl.el;
175 }
176
177
178 // write out the element indexes
179 bdh.num_wblks = wordIdx.size();
180 bdh.wblk_idx_start = ftell (blockDictFile);
181 WriteBlockIdx (blockDictFile, wordIdx);
182
183 bdh.num_tblks = tagIdx.size();
184 bdh.tblk_idx_start = ftell (blockDictFile);
185 WriteBlockIdx (blockDictFile, tagIdx);
186
187 // write out the blocked dictionary header
188 fseek (blockDictFile, sizeof(unsigned long), SEEK_SET);
189 bdh.Write (blockDictFile);
190
191
192 // close open files
193 fclose (blockDictFile);
194 fclose (invfIdxFile);
195 fclose (dictFile);
196
197 // print out information
198#ifndef SILENT
199 Message ("Max word block size = %d\n", bdh.max_wblk_size);
200 Message ("Max tag block size = %d\n", bdh.max_tblk_size);
201 Message ("Number of word blocks written = %d\n", bdh.num_wblks);
202 Message ("Number of tag blocks written = %d\n", bdh.num_tblks);
203#endif
204}
205
206
207int main (int argc, char **argv) {
208 unsigned long entriesPerBlock = 16;
209 char *filename = "";
210 int ch;
211 msg_prefix = argv[0];
212 opterr = 0;
213
214 while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
215 switch (ch) {
216 case 'f': // input file
217 filename = optarg;
218 break;
219 case 'd':
220 set_basepath (optarg);
221 break;
222 case 'b':
223 entriesPerBlock = atoi (optarg);
224 break;
225 case 'h':
226 case '?':
227 fprintf (stderr, "usage: %s [-f input_file] "
228 "[-d data directory] [-b entries-per-block] "
229 "[-h]\n", argv[0]);
230 exit (1);
231 }
232 }
233
234 process_files (filename, entriesPerBlock);
235 return 0;
236}
Note: See TracBrowser for help on using the repository browser.