source: indexers/trunk/mgpp/text/mgpp_invf_dict.cpp@ 19822

Last change on this file since 19822 was 19822, checked in by mdewsnip, 15 years ago

Commented out all occurrences of

#define _XOPEN_SOURCE_EXTENDED 1

This was allegedly added for compilation on Solaris, but it just causes errors for me (on the NLNZ Solaris machines).

  • Property svn:keywords set to Author Date Id Revision
File size: 7.0 KB
Line 
1/**************************************************************************
2 *
3 * mgpp_invf_dict.cpp -- Program to build the blocked stemmed dictionary
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23// This was added for Solaris, but it makes things worse on Solaris for me...
24// #define _XOPEN_SOURCE_EXTENDED 1
25
26// need this to avoid bizarre compiler problems under VC++ 6.0
27#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
28# include <iostream>
29#endif
30
31/* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */
32#if defined (__WIN32__) || defined (__CYGWIN__)
33# include "getopt_old.h"
34#else
35# include <unistd.h>
36#endif
37
38#include "sysfuncs.h"
39#include "messages.h"
40#include "mg_files.h"
41#include "invf.h"
42
43static void process_files (char *filename, unsigned long entriesPerBlock) {
44 // open the dictionary
45 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
46 MAGIC_STEM_BUILD, MG_ABORT);
47 invf_dict_header idh;
48 idh.Read (dictFile);
49
50 // open the inverted index file
51 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
52 MAGIC_INVI, MG_ABORT);
53
54 // create the blocked dictionary
55 FILE *blockDictFile = create_file (filename, INVF_DICT_BLOCKED_SUFFIX, "wb",
56 MAGIC_STEM, MG_ABORT);
57 block_dict_header bdh;
58 bdh.lookback = idh.lookback;
59 bdh.word_dict_start = idh.word_dict_start;
60 bdh.word_dict_size = idh.word_dict_size;
61 bdh.tag_dict_start = idh.tag_dict_start;
62 bdh.tag_dict_size = idh.tag_dict_size;
63 bdh.num_docs = idh.num_docs;
64 bdh.num_frags = idh.num_frags;
65 bdh.num_words = idh.num_words;
66 bdh.total_bytes = idh.total_bytes;
67 bdh.index_string_bytes = idh.index_string_bytes;
68 bdh.num_levels = idh.num_levels;
69 bdh.Write (blockDictFile);
70
71
72 // write out the word part of the dictionary
73
74 bdh.entries_per_wblk = entriesPerBlock;
75 bdh.max_wblk_size = 0;
76 bdh.wblk_start = ftell (blockDictFile);
77
78 fseek (dictFile, idh.word_dict_start, SEEK_SET);
79
80 block_idx wordIdx;
81 word_block_dict_el wordBlockEl;
82 wordBlockEl.SetNumLevels (idh.num_levels);
83
84
85 unsigned long wordNum;
86 unsigned long wordInvfPtr;
87 UCArray lastEl;
88 word_dict_el wordEl;
89 wordEl.SetNumLevels (idh.num_levels);
90 for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
91 // read in the next word and inverted file pointer
92 wordEl.Read (dictFile, idh.num_levels);
93 ReadUL (invfIdxFile, wordInvfPtr);
94
95 // remember this word (and position) if this is the start
96 // of a new block
97 if (wordNum % entriesPerBlock == 0) {
98 block_idx_info elIdx;
99 elIdx.el = wordEl.el;
100 elIdx.block_ptr = ftell (blockDictFile);
101
102 // see if this block is the longest so far
103 if (wordIdx.size() > 0) {
104 unsigned long blockLen = elIdx.block_ptr -
105 (*(wordIdx.end()-1)).block_ptr;
106 if (blockLen > bdh.max_wblk_size) bdh.max_wblk_size = blockLen;
107 }
108
109 wordIdx.push_back (elIdx);
110 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
111 }
112
113 // copy the information for this word
114 wordBlockEl.el = wordEl.el;
115 wordBlockEl.frag_occur = wordEl.frag_occur;
116 wordBlockEl.freq = wordEl.freq;
117 wordBlockEl.invf_ptr = wordInvfPtr;
118 unsigned long tempI;
119 for (tempI=0; tempI<idh.num_levels; ++tempI)
120 wordBlockEl.levelFreqs[tempI] = wordEl.levelFreqs[tempI];
121
122 // write out the word
123 wordBlockEl.Write (blockDictFile, &lastEl, idh.num_levels);
124
125 lastEl = wordBlockEl.el;
126 }
127
128
129 // write out the tag part of the dictionary
130
131 bdh.entries_per_tblk = entriesPerBlock;
132 bdh.max_tblk_size = 0;
133 bdh.tblk_start = ftell (blockDictFile);
134
135 fseek (dictFile, idh.tag_dict_start, SEEK_SET);
136
137 block_idx tagIdx;
138 block_dict_el tagBlockEl;
139
140 unsigned long tagNum;
141 unsigned long tagInvfPtr;
142 dict_el tagEl;
143 lastEl.erase (lastEl.begin(), lastEl.end());
144 for (tagNum=0; tagNum<idh.tag_dict_size; ++tagNum) {
145 // read in the next tag and inverted file pointer
146 tagEl.Read (dictFile);
147 ReadUL (invfIdxFile, tagInvfPtr);
148
149 // remember this tag (and position) if this is the start
150 // of a new block
151 if (tagNum % entriesPerBlock == 0) {
152 block_idx_info elIdx;
153 elIdx.el = tagEl.el;
154 elIdx.block_ptr = ftell (blockDictFile);
155
156 // see if this block is the longest so far
157 if (tagIdx.size() > 0) {
158 unsigned long blockLen = elIdx.block_ptr -
159 (*(tagIdx.end()-1)).block_ptr;
160 if (blockLen > bdh.max_tblk_size) bdh.max_tblk_size = blockLen;
161 }
162
163 tagIdx.push_back (elIdx);
164 lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
165 }
166
167 // copy the information for this tag
168 tagBlockEl.el = tagEl.el;
169 tagBlockEl.frag_occur = tagEl.frag_occur;
170 tagBlockEl.freq = tagEl.freq;
171 tagBlockEl.invf_ptr = tagInvfPtr;
172
173 // write out the tag
174 tagBlockEl.Write (blockDictFile, &lastEl);
175
176 lastEl = tagBlockEl.el;
177 }
178
179
180 // write out the element indexes
181 bdh.num_wblks = wordIdx.size();
182 bdh.wblk_idx_start = ftell (blockDictFile);
183 WriteBlockIdx (blockDictFile, wordIdx);
184
185 bdh.num_tblks = tagIdx.size();
186 bdh.tblk_idx_start = ftell (blockDictFile);
187 WriteBlockIdx (blockDictFile, tagIdx);
188
189 // write out the blocked dictionary header
190 fseek (blockDictFile, sizeof(unsigned long), SEEK_SET);
191 bdh.Write (blockDictFile);
192
193
194 // close open files
195 fclose (blockDictFile);
196 fclose (invfIdxFile);
197 fclose (dictFile);
198
199 // print out information
200#ifndef SILENT
201 Message ("Max word block size = %d\n", bdh.max_wblk_size);
202 Message ("Max tag block size = %d\n", bdh.max_tblk_size);
203 Message ("Number of word blocks written = %d\n", bdh.num_wblks);
204 Message ("Number of tag blocks written = %d\n", bdh.num_tblks);
205#endif
206}
207
208
209int main (int argc, char **argv) {
210 unsigned long entriesPerBlock = 16;
211 char *filename = "";
212 int ch;
213 msg_prefix = argv[0];
214 opterr = 0;
215
216 while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
217 switch (ch) {
218 case 'f': // input file
219 filename = optarg;
220 break;
221 case 'd':
222 set_basepath (optarg);
223 break;
224 case 'b':
225 entriesPerBlock = atoi (optarg);
226 break;
227 case 'h':
228 case '?':
229 fprintf (stderr, "usage: %s [-f input_file] "
230 "[-d data directory] [-b entries-per-block] "
231 "[-h]\n", argv[0]);
232 exit (1);
233 }
234 }
235
236 process_files (filename, entriesPerBlock);
237 return 0;
238}
Note: See TracBrowser for help on using the repository browser.