source: gsdl/trunk/trunk/mgpp/text/mgpp_invf_dict.cpp@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change committed in r16582

  • Property svn:keywords set to Author Date Id Revision
File size: 6.9 KB
Line 
1/**************************************************************************
2 *
3 * mgpp_invf_dict.cpp -- Program to build the blocked stemmed dictionary
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24
25// need this to avoid bizarre compiler problems under VC++ 6.0
26#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
27# include <iostream>
28#endif
29
30/* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */
31#if defined (__WIN32__) || defined (__CYGWIN__)
32# include "getopt_old.h"
33#else
34# include <unistd.h>
35#endif
36
37#include "sysfuncs.h"
38#include "messages.h"
39#include "mg_files.h"
40#include "invf.h"
41
42static void process_files (char *filename, unsigned long entriesPerBlock) {
43 // open the dictionary
44 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
45 MAGIC_STEM_BUILD, MG_ABORT);
46 invf_dict_header idh;
47 idh.Read (dictFile);
48
49 // open the inverted index file
50 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
51 MAGIC_INVI, MG_ABORT);
52
53 // create the blocked dictionary
54 FILE *blockDictFile = create_file (filename, INVF_DICT_BLOCKED_SUFFIX, "wb",
55 MAGIC_STEM, MG_ABORT);
56 block_dict_header bdh;
57 bdh.lookback = idh.lookback;
58 bdh.word_dict_start = idh.word_dict_start;
59 bdh.word_dict_size = idh.word_dict_size;
60 bdh.tag_dict_start = idh.tag_dict_start;
61 bdh.tag_dict_size = idh.tag_dict_size;
62 bdh.num_docs = idh.num_docs;
63 bdh.num_frags = idh.num_frags;
64 bdh.num_words = idh.num_words;
65 bdh.total_bytes = idh.total_bytes;
66 bdh.index_string_bytes = idh.index_string_bytes;
67 bdh.num_levels = idh.num_levels;
68 bdh.Write (blockDictFile);
69
70
71 // write out the word part of the dictionary
72
73 bdh.entries_per_wblk = entriesPerBlock;
74 bdh.max_wblk_size = 0;
75 bdh.wblk_start = ftell (blockDictFile);
76
77 fseek (dictFile, idh.word_dict_start, SEEK_SET);
78
79 block_idx wordIdx;
80 word_block_dict_el wordBlockEl;
81 wordBlockEl.SetNumLevels (idh.num_levels);
82
83
84 unsigned long wordNum;
85 unsigned long wordInvfPtr;
86 UCArray lastEl;
87 word_dict_el wordEl;
88 wordEl.SetNumLevels (idh.num_levels);
89 for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
90 // read in the next word and inverted file pointer
91 wordEl.Read (dictFile, idh.num_levels);
92 ReadUL (invfIdxFile, wordInvfPtr);
93
94 // remember this word (and position) if this is the start
95 // of a new block
96 if (wordNum % entriesPerBlock == 0) {
97 block_idx_info elIdx;
98 elIdx.el = wordEl.el;
99 elIdx.block_ptr = ftell (blockDictFile);
100
101 // see if this block is the longest so far
102 if (wordIdx.size() > 0) {
103 unsigned long blockLen = elIdx.block_ptr -
104 (*(wordIdx.end()-1)).block_ptr;
105 if (blockLen > bdh.max_wblk_size) bdh.max_wblk_size = blockLen;
106 }
107
108 wordIdx.push_back (elIdx);
109 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
110 }
111
112 // copy the information for this word
113 wordBlockEl.el = wordEl.el;
114 wordBlockEl.frag_occur = wordEl.frag_occur;
115 wordBlockEl.freq = wordEl.freq;
116 wordBlockEl.invf_ptr = wordInvfPtr;
117 unsigned long tempI;
118 for (tempI=0; tempI<idh.num_levels; ++tempI)
119 wordBlockEl.levelFreqs[tempI] = wordEl.levelFreqs[tempI];
120
121 // write out the word
122 wordBlockEl.Write (blockDictFile, &lastEl, idh.num_levels);
123
124 lastEl = wordBlockEl.el;
125 }
126
127
128 // write out the tag part of the dictionary
129
130 bdh.entries_per_tblk = entriesPerBlock;
131 bdh.max_tblk_size = 0;
132 bdh.tblk_start = ftell (blockDictFile);
133
134 fseek (dictFile, idh.tag_dict_start, SEEK_SET);
135
136 block_idx tagIdx;
137 block_dict_el tagBlockEl;
138
139 unsigned long tagNum;
140 unsigned long tagInvfPtr;
141 dict_el tagEl;
142 lastEl.erase (lastEl.begin(), lastEl.end());
143 for (tagNum=0; tagNum<idh.tag_dict_size; ++tagNum) {
144 // read in the next tag and inverted file pointer
145 tagEl.Read (dictFile);
146 ReadUL (invfIdxFile, tagInvfPtr);
147
148 // remember this tag (and position) if this is the start
149 // of a new block
150 if (tagNum % entriesPerBlock == 0) {
151 block_idx_info elIdx;
152 elIdx.el = tagEl.el;
153 elIdx.block_ptr = ftell (blockDictFile);
154
155 // see if this block is the longest so far
156 if (tagIdx.size() > 0) {
157 unsigned long blockLen = elIdx.block_ptr -
158 (*(tagIdx.end()-1)).block_ptr;
159 if (blockLen > bdh.max_tblk_size) bdh.max_tblk_size = blockLen;
160 }
161
162 tagIdx.push_back (elIdx);
163 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
164 }
165
166 // copy the information for this tag
167 tagBlockEl.el = tagEl.el;
168 tagBlockEl.frag_occur = tagEl.frag_occur;
169 tagBlockEl.freq = tagEl.freq;
170 tagBlockEl.invf_ptr = tagInvfPtr;
171
172 // write out the tag
173 tagBlockEl.Write (blockDictFile, &lastEl);
174
175 lastEl = tagBlockEl.el;
176 }
177
178
179 // write out the element indexes
180 bdh.num_wblks = wordIdx.size();
181 bdh.wblk_idx_start = ftell (blockDictFile);
182 WriteBlockIdx (blockDictFile, wordIdx);
183
184 bdh.num_tblks = tagIdx.size();
185 bdh.tblk_idx_start = ftell (blockDictFile);
186 WriteBlockIdx (blockDictFile, tagIdx);
187
188 // write out the blocked dictionary header
189 fseek (blockDictFile, sizeof(unsigned long), SEEK_SET);
190 bdh.Write (blockDictFile);
191
192
193 // close open files
194 fclose (blockDictFile);
195 fclose (invfIdxFile);
196 fclose (dictFile);
197
198 // print out information
199#ifndef SILENT
200 Message ("Max word block size = %d\n", bdh.max_wblk_size);
201 Message ("Max tag block size = %d\n", bdh.max_tblk_size);
202 Message ("Number of word blocks written = %d\n", bdh.num_wblks);
203 Message ("Number of tag blocks written = %d\n", bdh.num_tblks);
204#endif
205}
206
207
208int main (int argc, char **argv) {
209 unsigned long entriesPerBlock = 16;
210 char *filename = "";
211 int ch;
212 msg_prefix = argv[0];
213 opterr = 0;
214
215 while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
216 switch (ch) {
217 case 'f': // input file
218 filename = optarg;
219 break;
220 case 'd':
221 set_basepath (optarg);
222 break;
223 case 'b':
224 entriesPerBlock = atoi (optarg);
225 break;
226 case 'h':
227 case '?':
228 fprintf (stderr, "usage: %s [-f input_file] "
229 "[-d data directory] [-b entries-per-block] "
230 "[-h]\n", argv[0]);
231 exit (1);
232 }
233 }
234
235 process_files (filename, entriesPerBlock);
236 return 0;
237}
Note: See TracBrowser for help on using the repository browser.