source: trunk/gsdl/src/mgpp/text/mg_invf_dict.cpp@ 2442

Last change on this file since 2442 was 2442, checked in by jrm21, 23 years ago

portability changes, use getopt from unistd.h (all POSIX systems)

  • Property svn:keywords set to Author Date Id Revision
File size: 6.9 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_dict.cpp -- Program to build the blocked stemmed dictionary
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_dict.cpp 2442 2001-05-17 04:38:16Z jrm21 $
21 *
22 **************************************************************************/
23#define _XOPEN_SOURCE 1
24#define _XOPEN_SOURCE_EXTENDED 1
25#include <unistd.h>
26
27#include "sysfuncs.h"
28#include "messages.h"
29
30#include "mg_files.h"
31#include "invf.h"
32
33/*
34 $Log$
35 Revision 1.3 2001/05/17 04:38:15 jrm21
36 portability changes, use getopt from unistd.h (all POSIX systems)
37
38 Revision 1.2 2000/01/18 03:53:24 rjmcnab
39 Fixed a couple of bugs and made building silent if needed.
40
41 Revision 1.1 2000/01/14 02:26:16 sjboddie
42 Rodgers new C++ mg
43
44 */
45
46
47static void process_files (char *filename, unsigned long entriesPerBlock) {
48 // open the dictionary
49 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
50 MAGIC_STEM_BUILD, MG_ABORT);
51 invf_dict_header idh;
52 idh.Read (dictFile);
53
54 // open the inverted index file
55 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
56 MAGIC_INVI, MG_ABORT);
57
58 // create the blocked dictionary
59 FILE *blockDictFile = create_file (filename, INVF_DICT_BLOCKED_SUFFIX, "wb",
60 MAGIC_STEM, MG_ABORT);
61 block_dict_header bdh;
62 bdh.lookback = idh.lookback;
63 bdh.word_dict_start = idh.word_dict_start;
64 bdh.word_dict_size = idh.word_dict_size;
65 bdh.tag_dict_start = idh.tag_dict_start;
66 bdh.tag_dict_size = idh.tag_dict_size;
67 bdh.num_docs = idh.num_docs;
68 bdh.num_frags = idh.num_frags;
69 bdh.num_words = idh.num_words;
70 bdh.total_bytes = idh.total_bytes;
71 bdh.index_string_bytes = idh.index_string_bytes;
72 bdh.num_levels = idh.num_levels;
73 bdh.Write (blockDictFile);
74
75
76 // write out the word part of the dictionary
77
78 bdh.entries_per_wblk = entriesPerBlock;
79 bdh.max_wblk_size = 0;
80 bdh.wblk_start = ftell (blockDictFile);
81
82 fseek (dictFile, idh.word_dict_start, SEEK_SET);
83
84 block_idx wordIdx;
85 word_block_dict_el wordBlockEl;
86 wordBlockEl.SetNumLevels (idh.num_levels);
87
88
89 unsigned long wordNum;
90 unsigned long wordInvfPtr;
91 UCArray lastEl;
92 word_dict_el wordEl;
93 wordEl.SetNumLevels (idh.num_levels);
94 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
95 // read in the next word and inverted file pointer
96 wordEl.Read (dictFile, idh.num_levels);
97 ReadUL (invfIdxFile, wordInvfPtr);
98
99 // remember this word (and position) if this is the start
100 // of a new block
101 if (wordNum % entriesPerBlock == 0) {
102 block_idx_info elIdx;
103 elIdx.el = wordEl.el;
104 elIdx.block_ptr = ftell (blockDictFile);
105
106 // see if this block is the longest so far
107 if (wordIdx.size() > 0) {
108 unsigned long blockLen = elIdx.block_ptr -
109 (*(wordIdx.end()-1)).block_ptr;
110 if (blockLen > bdh.max_wblk_size) bdh.max_wblk_size = blockLen;
111 }
112
113 wordIdx.push_back (elIdx);
114 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
115 }
116
117 // copy the information for this word
118 wordBlockEl.el = wordEl.el;
119 wordBlockEl.frag_occur = wordEl.frag_occur;
120 wordBlockEl.freq = wordEl.freq;
121 wordBlockEl.invf_ptr = wordInvfPtr;
122 unsigned long tempI;
123 for (tempI=0; tempI<idh.num_levels; tempI++)
124 wordBlockEl.levelFreqs[tempI] = wordEl.levelFreqs[tempI];
125
126 // write out the word
127 wordBlockEl.Write (blockDictFile, &lastEl, idh.num_levels);
128
129 lastEl = wordBlockEl.el;
130 }
131
132
133 // write out the tag part of the dictionary
134
135 bdh.entries_per_tblk = entriesPerBlock;
136 bdh.max_tblk_size = 0;
137 bdh.tblk_start = ftell (blockDictFile);
138
139 fseek (dictFile, idh.tag_dict_start, SEEK_SET);
140
141 block_idx tagIdx;
142 block_dict_el tagBlockEl;
143
144 unsigned long tagNum;
145 unsigned long tagInvfPtr;
146 dict_el tagEl;
147 lastEl.erase (lastEl.begin(), lastEl.end());
148 for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
149 // read in the next tag and inverted file pointer
150 tagEl.Read (dictFile);
151 ReadUL (invfIdxFile, tagInvfPtr);
152
153 // remember this tag (and position) if this is the start
154 // of a new block
155 if (tagNum % entriesPerBlock == 0) {
156 block_idx_info elIdx;
157 elIdx.el = tagEl.el;
158 elIdx.block_ptr = ftell (blockDictFile);
159
160 // see if this block is the longest so far
161 if (tagIdx.size() > 0) {
162 unsigned long blockLen = elIdx.block_ptr -
163 (*(tagIdx.end()-1)).block_ptr;
164 if (blockLen > bdh.max_tblk_size) bdh.max_tblk_size = blockLen;
165 }
166
167 tagIdx.push_back (elIdx);
168 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
169 }
170
171 // copy the information for this tag
172 tagBlockEl.el = tagEl.el;
173 tagBlockEl.frag_occur = tagEl.frag_occur;
174 tagBlockEl.freq = tagEl.freq;
175 tagBlockEl.invf_ptr = tagInvfPtr;
176
177 // write out the tag
178 tagBlockEl.Write (blockDictFile, &lastEl);
179
180 lastEl = tagBlockEl.el;
181 }
182
183
184 // write out the element indexes
185 bdh.num_wblks = wordIdx.size();
186 bdh.wblk_idx_start = ftell (blockDictFile);
187 WriteBlockIdx (blockDictFile, wordIdx);
188
189 bdh.num_tblks = tagIdx.size();
190 bdh.tblk_idx_start = ftell (blockDictFile);
191 WriteBlockIdx (blockDictFile, tagIdx);
192
193 // write out the blocked dictionary header
194 fseek (blockDictFile, sizeof(unsigned long), SEEK_SET);
195 bdh.Write (blockDictFile);
196
197
198 // close open files
199 fclose (blockDictFile);
200 fclose (invfIdxFile);
201 fclose (dictFile);
202
203 // print out information
204#ifndef SILENT
205 Message ("Max word block size = %d\n", bdh.max_wblk_size);
206 Message ("Max tag block size = %d\n", bdh.max_tblk_size);
207 Message ("Number of word blocks written = %d\n", bdh.num_wblks);
208 Message ("Number of tag blocks written = %d\n", bdh.num_tblks);
209#endif
210}
211
212
213int main (int argc, char **argv) {
214 unsigned long entriesPerBlock = 16;
215 char *filename = "";
216 int ch;
217 msg_prefix = argv[0];
218 opterr = 0;
219
220 while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
221 switch (ch) {
222 case 'f': // input file
223 filename = optarg;
224 break;
225 case 'd':
226 set_basepath (optarg);
227 break;
228 case 'b':
229 entriesPerBlock = atoi (optarg);
230 break;
231 case 'h':
232 case '?':
233 fprintf (stderr, "usage: %s [-f input_file] "
234 "[-d data directory] [-b entries-per-block] "
235 "[-h]\n", argv[0]);
236 exit (1);
237 }
238 }
239
240 process_files (filename, entriesPerBlock);
241 return 0;
242}
Note: See TracBrowser for help on using the repository browser.