source: trunk/gsdl/src/mgpp/text/mg_invf_dict.cpp@ 856

Last change on this file since 856 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_dict.cpp -- Program to build the blocked stemmed dictionary
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_dict.cpp 856 2000-01-14 02:26:25Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "messages.h"
26
27#include "mg_files.h"
28#include "invf.h"
29
30/*
31 $Log$
32 Revision 1.1 2000/01/14 02:26:16 sjboddie
33 Rodgers new C++ mg
34
35 */
36
37
38static void process_files (char *filename, unsigned long entriesPerBlock) {
39 // open the dictionary
40 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
41 MAGIC_STEM_BUILD, MG_ABORT);
42 invf_dict_header idh;
43 idh.Read (dictFile);
44
45 // open the inverted index file
46 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
47 MAGIC_INVI, MG_ABORT);
48
49 // create the blocked dictionary
50 FILE *blockDictFile = create_file (filename, INVF_DICT_BLOCKED_SUFFIX, "wb",
51 MAGIC_STEM, MG_ABORT);
52 block_dict_header bdh;
53 bdh.lookback = idh.lookback;
54 bdh.word_dict_start = idh.word_dict_start;
55 bdh.word_dict_size = idh.word_dict_size;
56 bdh.tag_dict_start = idh.tag_dict_start;
57 bdh.tag_dict_size = idh.tag_dict_size;
58 bdh.num_docs = idh.num_docs;
59 bdh.num_frags = idh.num_frags;
60 bdh.num_words = idh.num_words;
61 bdh.total_bytes = idh.total_bytes;
62 bdh.index_string_bytes = idh.index_string_bytes;
63 bdh.num_levels = idh.num_levels;
64 bdh.Write (blockDictFile);
65
66
67 // write out the word part of the dictionary
68
69 bdh.entries_per_wblk = entriesPerBlock;
70 bdh.max_wblk_size = 0;
71 bdh.wblk_start = ftell (blockDictFile);
72
73 fseek (dictFile, idh.word_dict_start, SEEK_SET);
74
75 block_idx wordIdx;
76 word_block_dict_el wordBlockEl;
77 wordBlockEl.SetNumLevels (idh.num_levels);
78
79
80 unsigned long wordNum;
81 unsigned long wordInvfPtr;
82 UCArray lastEl;
83 word_dict_el wordEl;
84 wordEl.SetNumLevels (idh.num_levels);
85 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
86 // read in the next word and inverted file pointer
87 wordEl.Read (dictFile, idh.num_levels);
88 ReadUL (invfIdxFile, wordInvfPtr);
89
90 // remember this word (and position) if this is the start
91 // of a new block
92 if (wordNum % entriesPerBlock == 0) {
93 block_idx_info elIdx;
94 elIdx.el = wordEl.el;
95 elIdx.block_ptr = ftell (blockDictFile);
96
97 // see if this block is the longest so far
98 if (wordIdx.size() > 0) {
99 unsigned long blockLen = elIdx.block_ptr -
100 (*(wordIdx.end()-1)).block_ptr;
101 if (blockLen > bdh.max_wblk_size) bdh.max_wblk_size = blockLen;
102 }
103
104 wordIdx.push_back (elIdx);
105 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
106 }
107
108 // copy the information for this word
109 wordBlockEl.el = wordEl.el;
110 wordBlockEl.frag_occur = wordEl.frag_occur;
111 wordBlockEl.freq = wordEl.freq;
112 wordBlockEl.invf_ptr = wordInvfPtr;
113 unsigned long tempI;
114 for (tempI=0; tempI<idh.num_levels; tempI++)
115 wordBlockEl.levelFreqs[tempI] = wordEl.levelFreqs[tempI];
116
117 // write out the word
118 wordBlockEl.Write (blockDictFile, &lastEl, idh.num_levels);
119
120 lastEl = wordBlockEl.el;
121 }
122
123
124 // write out the tag part of the dictionary
125
126 bdh.entries_per_tblk = entriesPerBlock;
127 bdh.max_tblk_size = 0;
128 bdh.tblk_start = ftell (blockDictFile);
129
130 fseek (dictFile, idh.tag_dict_start, SEEK_SET);
131
132 block_idx tagIdx;
133 block_dict_el tagBlockEl;
134
135 unsigned long tagNum;
136 unsigned long tagInvfPtr;
137 dict_el tagEl;
138 lastEl.erase (lastEl.begin(), lastEl.end());
139 for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
140 // read in the next tag and inverted file pointer
141 tagEl.Read (dictFile);
142 ReadUL (invfIdxFile, tagInvfPtr);
143
144 // remember this tag (and position) if this is the start
145 // of a new block
146 if (tagNum % entriesPerBlock == 0) {
147 block_idx_info elIdx;
148 elIdx.el = tagEl.el;
149 elIdx.block_ptr = ftell (blockDictFile);
150
151 // see if this block is the longest so far
152 if (tagIdx.size() > 0) {
153 unsigned long blockLen = elIdx.block_ptr -
154 (*(tagIdx.end()-1)).block_ptr;
155 if (blockLen > bdh.max_tblk_size) bdh.max_tblk_size = blockLen;
156 }
157
158 tagIdx.push_back (elIdx);
159 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
160 }
161
162 // copy the information for this tag
163 tagBlockEl.el = tagEl.el;
164 tagBlockEl.frag_occur = tagEl.frag_occur;
165 tagBlockEl.freq = tagEl.freq;
166 tagBlockEl.invf_ptr = tagInvfPtr;
167
168 // write out the tag
169 tagBlockEl.Write (blockDictFile, &lastEl);
170
171 lastEl = tagBlockEl.el;
172 }
173
174
175 // write out the element indexes
176 bdh.num_wblks = wordIdx.size();
177 bdh.wblk_idx_start = ftell (blockDictFile);
178 WriteBlockIdx (blockDictFile, wordIdx);
179
180 bdh.num_tblks = tagIdx.size();
181 bdh.tblk_idx_start = ftell (blockDictFile);
182 WriteBlockIdx (blockDictFile, tagIdx);
183
184 // write out the blocked dictionary header
185 fseek (blockDictFile, sizeof(unsigned long), SEEK_SET);
186 bdh.Write (blockDictFile);
187
188
189 // close open files
190 fclose (blockDictFile);
191 fclose (invfIdxFile);
192 fclose (dictFile);
193
194 // print out information
195 Message ("Max word block size = %d\n", bdh.max_wblk_size);
196 Message ("Max tag block size = %d\n", bdh.max_tblk_size);
197 Message ("Number of word blocks written = %d\n", bdh.num_wblks);
198 Message ("Number of tag blocks written = %d\n", bdh.num_tblks);
199}
200
201
202int main (int argc, char **argv) {
203 unsigned long entriesPerBlock = 16;
204 char *filename = "";
205 int ch;
206 msg_prefix = argv[0];
207 opterr = 0;
208
209 while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
210 switch (ch) {
211 case 'f': // input file
212 filename = optarg;
213 break;
214 case 'd':
215 set_basepath (optarg);
216 break;
217 case 'b':
218 entriesPerBlock = atoi (optarg);
219 break;
220 case 'h':
221 case '?':
222 fprintf (stderr, "usage: %s [-f input_file] "
223 "[-d data directory] [-b entries-per-block] "
224 "[-h]\n", argv[0]);
225 exit (1);
226 }
227 }
228
229 process_files (filename, entriesPerBlock);
230 return 0;
231}
Note: See TracBrowser for help on using the repository browser.