Context Navigation

source: indexers/trunk/mgpp/text/mgpp_invf_dict.cpp@ 19822

Last change on this file since 19822 was 19822, checked in by mdewsnip, 15 years ago

Commented out all occurrences of

#define _XOPEN_SOURCE_EXTENDED 1

This was allegedly added for compilation on Solaris, but it just causes errors for me (on the NLNZ Solaris machines).

Property svn:keywords set to Author Date Id Revision

File size: 7.0 KB

Line
1	/**************************************************************************
2	*
3	* mgpp_invf_dict.cpp -- Program to build the blocked stemmed dictionary
4	* Copyright (C) 1999 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#define _XOPEN_SOURCE 1
23	// This was added for Solaris, but it makes things worse on Solaris for me...
24	// #define _XOPEN_SOURCE_EXTENDED 1
25
26	// need this to avoid bizarre compiler problems under VC++ 6.0
27	#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
28	# include <iostream>
29	#endif
30
/* getopt is in POSIX.2, so Cygwin should provide it in <unistd.h>, but it
 * does not; use the bundled declaration on Windows and Cygwin instead. */
#if defined (__WIN32__) || defined (__CYGWIN__)
# include "getopt_old.h"
#else
# include <unistd.h>
#endif
37
38	#include "sysfuncs.h"
39	#include "messages.h"
40	#include "mg_files.h"
41	#include "invf.h"
42
43	static void process_files (char *filename, unsigned long entriesPerBlock) {
44	// open the dictionary
45	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
46	MAGIC_STEM_BUILD, MG_ABORT);
47	invf_dict_header idh;
48	idh.Read (dictFile);
49
50	// open the inverted index file
51	FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
52	MAGIC_INVI, MG_ABORT);
53
54	// create the blocked dictionary
55	FILE *blockDictFile = create_file (filename, INVF_DICT_BLOCKED_SUFFIX, "wb",
56	MAGIC_STEM, MG_ABORT);
57	block_dict_header bdh;
58	bdh.lookback = idh.lookback;
59	bdh.word_dict_start = idh.word_dict_start;
60	bdh.word_dict_size = idh.word_dict_size;
61	bdh.tag_dict_start = idh.tag_dict_start;
62	bdh.tag_dict_size = idh.tag_dict_size;
63	bdh.num_docs = idh.num_docs;
64	bdh.num_frags = idh.num_frags;
65	bdh.num_words = idh.num_words;
66	bdh.total_bytes = idh.total_bytes;
67	bdh.index_string_bytes = idh.index_string_bytes;
68	bdh.num_levels = idh.num_levels;
69	bdh.Write (blockDictFile);
70
71
72	// write out the word part of the dictionary
73
74	bdh.entries_per_wblk = entriesPerBlock;
75	bdh.max_wblk_size = 0;
76	bdh.wblk_start = ftell (blockDictFile);
77
78	fseek (dictFile, idh.word_dict_start, SEEK_SET);
79
80	block_idx wordIdx;
81	word_block_dict_el wordBlockEl;
82	wordBlockEl.SetNumLevels (idh.num_levels);
83
84
85	unsigned long wordNum;
86	unsigned long wordInvfPtr;
87	UCArray lastEl;
88	word_dict_el wordEl;
89	wordEl.SetNumLevels (idh.num_levels);
90	for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
91	// read in the next word and inverted file pointer
92	wordEl.Read (dictFile, idh.num_levels);
93	ReadUL (invfIdxFile, wordInvfPtr);
94
95	// remember this word (and position) if this is the start
96	// of a new block
97	if (wordNum % entriesPerBlock == 0) {
98	block_idx_info elIdx;
99	elIdx.el = wordEl.el;
100	elIdx.block_ptr = ftell (blockDictFile);
101
102	// see if this block is the longest so far
103	if (wordIdx.size() > 0) {
104	unsigned long blockLen = elIdx.block_ptr -
105	(*(wordIdx.end()-1)).block_ptr;
106	if (blockLen > bdh.max_wblk_size) bdh.max_wblk_size = blockLen;
107	}
108
109	wordIdx.push_back (elIdx);
110	lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
111	}
112
113	// copy the information for this word
114	wordBlockEl.el = wordEl.el;
115	wordBlockEl.frag_occur = wordEl.frag_occur;
116	wordBlockEl.freq = wordEl.freq;
117	wordBlockEl.invf_ptr = wordInvfPtr;
118	unsigned long tempI;
119	for (tempI=0; tempI<idh.num_levels; ++tempI)
120	wordBlockEl.levelFreqs[tempI] = wordEl.levelFreqs[tempI];
121
122	// write out the word
123	wordBlockEl.Write (blockDictFile, &lastEl, idh.num_levels);
124
125	lastEl = wordBlockEl.el;
126	}
127
128
129	// write out the tag part of the dictionary
130
131	bdh.entries_per_tblk = entriesPerBlock;
132	bdh.max_tblk_size = 0;
133	bdh.tblk_start = ftell (blockDictFile);
134
135	fseek (dictFile, idh.tag_dict_start, SEEK_SET);
136
137	block_idx tagIdx;
138	block_dict_el tagBlockEl;
139
140	unsigned long tagNum;
141	unsigned long tagInvfPtr;
142	dict_el tagEl;
143	lastEl.erase (lastEl.begin(), lastEl.end());
144	for (tagNum=0; tagNum<idh.tag_dict_size; ++tagNum) {
145	// read in the next tag and inverted file pointer
146	tagEl.Read (dictFile);
147	ReadUL (invfIdxFile, tagInvfPtr);
148
149	// remember this tag (and position) if this is the start
150	// of a new block
151	if (tagNum % entriesPerBlock == 0) {
152	block_idx_info elIdx;
153	elIdx.el = tagEl.el;
154	elIdx.block_ptr = ftell (blockDictFile);
155
156	// see if this block is the longest so far
157	if (tagIdx.size() > 0) {
158	unsigned long blockLen = elIdx.block_ptr -
159	(*(tagIdx.end()-1)).block_ptr;
160	if (blockLen > bdh.max_tblk_size) bdh.max_tblk_size = blockLen;
161	}
162
163	tagIdx.push_back (elIdx);
164	lastEl.erase (lastEl.begin(), lastEl.end()); // output full word
165	}
166
167	// copy the information for this tag
168	tagBlockEl.el = tagEl.el;
169	tagBlockEl.frag_occur = tagEl.frag_occur;
170	tagBlockEl.freq = tagEl.freq;
171	tagBlockEl.invf_ptr = tagInvfPtr;
172
173	// write out the tag
174	tagBlockEl.Write (blockDictFile, &lastEl);
175
176	lastEl = tagBlockEl.el;
177	}
178
179
180	// write out the element indexes
181	bdh.num_wblks = wordIdx.size();
182	bdh.wblk_idx_start = ftell (blockDictFile);
183	WriteBlockIdx (blockDictFile, wordIdx);
184
185	bdh.num_tblks = tagIdx.size();
186	bdh.tblk_idx_start = ftell (blockDictFile);
187	WriteBlockIdx (blockDictFile, tagIdx);
188
189	// write out the blocked dictionary header
190	fseek (blockDictFile, sizeof(unsigned long), SEEK_SET);
191	bdh.Write (blockDictFile);
192
193
194	// close open files
195	fclose (blockDictFile);
196	fclose (invfIdxFile);
197	fclose (dictFile);
198
199	// print out information
200	#ifndef SILENT
201	Message ("Max word block size = %d\n", bdh.max_wblk_size);
202	Message ("Max tag block size = %d\n", bdh.max_tblk_size);
203	Message ("Number of word blocks written = %d\n", bdh.num_wblks);
204	Message ("Number of tag blocks written = %d\n", bdh.num_tblks);
205	#endif
206	}
207
208
209	int main (int argc, char **argv) {
210	unsigned long entriesPerBlock = 16;
211	char *filename = "";
212	int ch;
213	msg_prefix = argv[0];
214	opterr = 0;
215
216	while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
217	switch (ch) {
218	case 'f': // input file
219	filename = optarg;
220	break;
221	case 'd':
222	set_basepath (optarg);
223	break;
224	case 'b':
225	entriesPerBlock = atoi (optarg);
226	break;
227	case 'h':
228	case '?':
229	fprintf (stderr, "usage: %s [-f input_file] "
230	"[-d data directory] [-b entries-per-block] "
231	"[-h]\n", argv[0]);
232	exit (1);
233	}
234	}
235
236	process_files (filename, entriesPerBlock);
237	return 0;
238	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: