source: trunk/gsdl/perllib/mgbuilder.pm@ 81

Last change on this file since 81 was 81, checked in by sjboddie, 25 years ago

fixed minor bug in create_index_mapping()

  • Property svn:keywords set to Author Date Id Revision
File size: 14.1 KB
Line 
1# MGBuilder object
2#
3
4package mgbuilder;
5
6use cfgread;
7use colcfg;
8use plugin;
9use util;
10
# Maximum document size handed to mg_passes through its -b option
# (the comments next to the mg_passes calls describe this as 12 meg).
$maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built.
# build_index() deletes every file in the index directory whose suffix is
# not listed here.
%wanted_index_files = map { ($_ => 1) }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
23
24
# Constructor for the builder object.
#
# Arguments:
#   $class      - class to bless the new object into
#   $collection - name of the collection; its configuration is read from
#                 $GSDLHOME/collect/$collection/collect.cfg
#   $source_dir - directory later handed to plugin::read as the source
#   $build_dir  - directory the build is written to
#   $verbosity  - progress is reported on STDERR when this is >= 1
#
# Returns the blessed builder. Dies if collect.cfg cannot be found, if no
# plugins could be loaded, or if the document processor cannot be loaded
# or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity}, $class;

    # read in the collection configuration file
    my $configfile = "$ENV{'GSDLHOME'}/collect/$collection/collect.cfg";
    if (!-e $configfile) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($configfile);

    # sort out subcollection indexes: every configured index is expanded
    # into one "index:subcollection" entry per subcollection
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # get the list of plugins for this collection
    my @plugins = (); # some good choice of plugins .... ????
    if (defined $self->{'collect_cfg'}->{'plugins'}) {
        @plugins = @{$self->{'collect_cfg'}->{'plugins'}};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($collection, \@plugins);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLHOME'}/collect/$collection/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLHOME'}/collect/$collection/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the package name held in
    # $buildproctype. This avoids compiling code from a string and the
    # indirect-object form "new CLASS(...)", which is ambiguous in a
    # package that defines its own new(). Any exception propagates as is.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity);

    return $self;
}
89
# Prepares the build directory for a new build: whatever is already there
# is removed, the directory is created again, and a "text" subdirectory is
# made inside it.
sub init {
    my $self = shift;
    my $builddir = $self->{'build_dir'};

    # get rid of any old build before starting afresh
    &util::rm_r ($builddir);
    &util::mk_all_dir ($builddir);

    # the text directory lives directly under the build directory
    &util::mk_all_dir ("$builddir/text");
}
101
# Creates the compressed text for the collection under <build_dir>/text.
#
# The documents are piped through mg_passes twice (-T1 to collect text
# statistics, -T2 to do the compression) with mg_compression_dict run in
# between to build the compression dictionary. Dies if one of the
# programs is missing or the pipe to mg_passes cannot be opened.
sub compress_text {
    my ($self) = @_;

    my $bindir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exesuffix = &util::get_os_exe ();
    my $mg_passes = "$bindir/mg_passes$exesuffix";
    my $mg_dict = "$bindir/mg_compression_dict$exesuffix";

    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, "text"));
    my $textprefix = &util::filename_cat ($self->{'build_dir'},
                                          "text/$self->{'collection'}");

    # windows wants backslashes in the path; everything else is given
    # an extra "-d /" argument instead
    my $osextra;
    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $osextra = " -d /";
    } else {
        $osextra = "";
        $textprefix =~ s/\//\\/g;
    }

    if ($self->{'verbosity'} >= 1) {
        print STDERR "\n*** creating the compressed text\n";
    }

    # set up the document processor
    my $processor = $self->{'buildproc'};
    $processor->set_output_handle ('mgbuilder::PIPEOUT');
    $processor->set_mode ('text');
    $processor->set_index ('section:text');

    # one pass of every document through mg_passes; $passflag picks the pass
    # -b $maxdocsize sets the maximum document size to be 12 meg
    my $run_pass = sub {
        my ($passflag) = @_;
        unless (-e $mg_passes && open (PIPEOUT,
            "| $mg_passes -f $textprefix -b $maxdocsize $passflag $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes\n";
        }
        $self->{'buildproc'}->reset();
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, $self->{'buildproc'});
        close (PIPEOUT);
    };

    # first pass: collect the statistics for the text
    if ($self->{'verbosity'} >= 1) {
        print STDERR "\n collecting text statistics\n";
    }
    $run_pass->('-T1');

    # create the compression dictionary
    # the statistics are treated as coming from a seed dictionary (-S),
    # novel words are spelled out (-H), and the dictionary has to stay
    # below 5 meg with the most frequent words going in first (-2 -k 5120)
    if ($self->{'verbosity'} >= 1) {
        print STDERR "\n creating the compression dictionary\n";
    }
    unless (-e $mg_dict) {
        die "mgbuilder::compress_text - couldn't run $mg_dict\n";
    }
    system ("$mg_dict -f $textprefix -S -H -2 -k 5120 $osextra");

    # second pass: compress the text
    if ($self->{'verbosity'} >= 1) {
        print STDERR "\n compressing the text\n";
    }
    $run_pass->('-T2');
}
160
# Builds every index listed in the collection configuration.
#
# The mapping from index description to directory name is worked out
# first and kept in $self->{'index_mapping'}; build_index() is then
# called once per index, in configuration order.
sub build_indexes {
    my ($self) = @_;
    my $indexlist = $self->{'collect_cfg'}->{'indexes'};

    # work out which directory each index description will live in
    $self->{'index_mapping'} = $self->create_index_mapping ($indexlist);

    # now build the indexes one after the other
    foreach my $indexspec (@$indexlist) {
        if ($self->{'verbosity'} >= 1) {
            print STDERR "\n*** building index $indexspec in subdirectory " .
                "$self->{'index_mapping'}->{$indexspec}\n";
        }
        $self->build_index ($indexspec);
    }
}
176
# Works out a short, unique directory name for each index description.
#
# $indexes is a reference to a list of descriptions of the form
# "level:field[,field...][:subcollection[,subcollection...]]".
# Returns a reference to a hash mapping each description to its
# (lowercase) directory name.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %dirfor = ();

    # names that are already taken, used to detect collisions; it starts
    # out holding the mandatory directory names
    my %taken = ('text'=>'text',
                 'extra'=>'extra');

    foreach my $spec (@$indexes) {
        my ($level, $fieldlist, $subcollection) = split (/:/, $spec);

        # the name starts with the first character of the index level
        my ($name) = ($level =~ /^(.)/);

        # then comes an abbreviation of the first two fields: the first
        # character followed by the next consonant (a field with no
        # consonant after its first character is used unchanged)
        my @abbrev = split (/,/, $fieldlist);
        $#abbrev = 1 if (scalar (@abbrev) > 2);
        foreach my $field (@abbrev) {
            $field =~ s/^(.).*?([bcdfghjklmnpqrstvwxyz]).*$/$1$2/i;
        }
        $name .= join ("", @abbrev);

        # then an abbreviation of the subcollection, if there is one
        if (defined ($subcollection) && $subcollection =~ /\w/) {
            my @parts = split (/,/, $subcollection);
            if (scalar (@parts) < 2) {
                # a single subcollection: its first character, plus the
                # second character when that one is a consonant
                $subcollection =~ s/^(.).*?([bcdfghjklmnpqrstvwxyz]?).*$/$1$2/i;
                $name .= $subcollection;
            } else {
                # several subcollections: the first character of each of
                # the first two
                my @firsttwo = @parts[0, 1];
                foreach my $part (@firsttwo) {
                    $part =~ s/^(.).*$/$1/i;
                }
                $name .= join ("", @firsttwo);
            }
        }

        # directory names are always lowercase
        $name = lc ($name);

        # when the name is taken already, append the first free number
        if (defined $taken{$name}) {
            my $suffix = 1;
            $suffix++ while (defined $taken{"$name$suffix"});
            $name .= $suffix;
        }

        $dirfor{$spec} = $name;
        $taken{$name} = $spec;
    }

    return \%dirfor;
}
237
238
# Builds a single index.
#
# $index is an index description ("level:fields[:subcollections]") that
# must already have an entry in $self->{'index_mapping'}; the index is
# written to that directory underneath the build directory.
#
# The documents are piped through mg_passes twice (-N1 and -N2), with
# mg_perf_hash_build run in between and mg_weights_build, mg_invf_dict
# and mg_stem_idx run afterwards. Files whose suffix is not listed in
# %wanted_index_files are then removed from the index directory.
#
# Dies if one of the programs is missing, if a pipe to mg_passes cannot
# be opened, or if the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
        # the text prefix is passed on a command line as well (to
        # mg_weights_build), so it needs the same conversion that
        # compress_text applies to it
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($level, $fields, $subcollection) = split (":", $index);
    # an index without a subcollection part leaves $subcollection undefined
    my @subcollections = ();
    @subcollections = split (/,/, $subcollection) if defined ($subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);


    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$exedir/mg_passes$exe" || !open (PIPEOUT,
        "| $exedir/mg_passes$exe -f $fullindexprefix -b $maxdocsize " .
        "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
        die "mgbuilder::build_index - couldn't run $exedir/mg_passes$exe\n";
    }
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'});
    close (PIPEOUT);

    # create the perfect hash function
    if (!-e "$exedir/mg_perf_hash_build$exe") {
        die "mgbuilder::build_index - couldn't run $exedir/mg_perf_hash_build$exe\n";
    }
    system ("$exedir/mg_perf_hash_build$exe -f $fullindexprefix $osextra");

    # invert the text
    print STDERR "\n inverting the text\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$exedir/mg_passes$exe" || !open (PIPEOUT,
        "| $exedir/mg_passes$exe -f $fullindexprefix -b $maxdocsize " .
        "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
        die "mgbuilder::build_index - couldn't run $exedir/mg_passes$exe\n";
    }
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'});
    close (PIPEOUT);

    # create the weights file
    print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$exedir/mg_weights_build$exe") {
        die "mgbuilder::build_index - couldn't run $exedir/mg_weights_build$exe\n";
    }
    system ("$exedir/mg_weights_build$exe -f $fullindexprefix -t $fulltextprefix $osextra");

    # create 'on-disk' stemmed dictionary
    print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$exedir/mg_invf_dict$exe") {
        die "mgbuilder::build_index - couldn't run $exedir/mg_invf_dict$exe\n";
    }
    system ("$exedir/mg_invf_dict$exe -f $fullindexprefix $osextra");


    # creates stem index files for the various stemming methods
    print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$exedir/mg_stem_idx$exe") {
        die "mgbuilder::build_index - couldn't run $exedir/mg_stem_idx$exe\n";
    }
    system ("$exedir/mg_stem_idx$exe -b 4096 -s1 -f $fullindexprefix $osextra");
    system ("$exedir/mg_stem_idx$exe -b 4096 -s2 -f $fullindexprefix $osextra");
    system ("$exedir/mg_stem_idx$exe -b 4096 -s3 -f $fullindexprefix $osextra");


    # remove unwanted files: anything in the index directory that has a
    # suffix which is not in %wanted_index_files (dot files and files
    # without a suffix are left alone)
    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
    opendir (DIR, $tmpdir) || die
        "mgbuilder::build_index - couldn't read directory $tmpdir\n";
    foreach my $file (readdir(DIR)) {
        next if $file =~ /^\./;
        my ($suffix) = $file =~ /\.([^\.]+)$/;
        if (defined $suffix && !defined $wanted_index_files{$suffix}) {
            # delete it!
#           print STDERR "deleting $file\n";
            &util::rm (&util::filename_cat ($tmpdir, $file));
        }
    }
    closedir (DIR);
}
355
# Creates the info database <build_dir>/text/<collection>.ldb.
#
# The document processor is switched to 'infodb' mode and every document
# is piped into txt2db. Dies if txt2db is missing or the pipe to it
# cannot be opened.
sub make_infodatabase {
    my ($self) = @_;

    my $textdir = &util::filename_cat ($self->{'build_dir'}, "text");
    &util::mk_all_dir ($textdir);

    # assume little-endian for now :-)
    my $dbfile = &util::filename_cat ($textdir, "$self->{'collection'}.ldb");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        # windows wants backslashes in the path
        $dbfile =~ s/\//\\/g;
    }

    my $bindir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $txt2db = "$bindir/txt2db" . &util::get_os_exe ();

    if ($self->{'verbosity'} >= 1) {
        print STDERR "\n*** creating the info database\n";
    }

    # set up the document processor
    my $processor = $self->{'buildproc'};
    $processor->set_output_handle ('mgbuilder::PIPEOUT');
    $processor->set_mode ('infodb');

    # pipe the output of the document processor into txt2db
    unless (-e $txt2db && open (PIPEOUT, "| $txt2db $dbfile")) {
        die "mgbuilder::make_infodatabase - couldn't run $txt2db\n";
    }
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'});
    close (PIPEOUT);
}
384
# Writes <build_dir>/build.cfg, which records the build date, the number
# of documents and bytes reported by the document processor, and the
# mapping from each index description to its directory name (one
# "description->directory" entry per configured index).
sub make_auxiliary_files {
    my $self = shift (@_);

    # The configuration is collected in a fresh hash for every call. It
    # used to be written through an undeclared package variable, which
    # handed the same hash to write_cfg_file on every call.
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        push (@indexmap, $index . "->" . $self->{'index_mapping'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes)$', '^(indexmap)$');

}
414
# Counterpart of init(). There is nothing to clean up at present, so the
# builder argument is taken off the argument list and nothing else is done.
sub deinit {
    my $builder = shift;
}
418
419
4201;
421
422
Note: See TracBrowser for help on using the repository browser.