source: main/trunk/greenstone2/perllib/mgbuilder.pm@ 22352

Last change on this file since 22352 was 22352, checked in by kjdon, 14 years ago

remove ex. when generating index lists. Don't want any ex. in build.cfg. This fixes the problem where index list had eg ex.Photographer and collectionmeta in config file had .Photographer and then they didn't match up.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.9 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use basebuilder;
29use plugin;
30use strict; no strict 'refs';
31use util;
32
33
# mgbuilder is a specialisation of basebuilder; the inheritance chain is
# set up at compile time.
BEGIN {
    @mgbuilder::ISA = ('basebuilder');
}

# Suffixes of the files inside an index directory that are kept by the
# clean-up pass at the end of build_index(); files with any other suffix
# are deleted there.
my %wanted_index_files =
    map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);

# Value passed to mg_passes through its -b option (maximum document
# size), taken from basebuilder.
my $maxdocsize = $basebuilder::maxdocsize;
51
52
# Constructor.
#
# All arguments after the class name are handed unchanged to the
# basebuilder constructor; the resulting object is re-blessed into
# $class and tagged with the "mg" build type.
#
# Returns the new builder object.
sub new {
    my $class = shift(@_);

    # Explicit class-method call. The indirect-object form
    # ("new basebuilder (...)") is parsed heuristically by perl and is
    # fragile inside a package that defines its own new().
    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "mg";
    return $self;
}
62
# Returns the name of the build processor used when none is configured.
sub default_buildproc {
    my ($self) = @_;
    return "mgbuildproc";
}
68
# Normalises the list of indexes held in $self->{'collect_cfg'}->{'indexes'}.
#
#  - a missing list becomes an empty one;
#  - an empty list gets the single placeholder index "dummy:text";
#  - every "ex." that directly follows a ':' or ',' is removed, so the
#    names written out carry no "ex." prefix.
#
# The list is replaced by a freshly built array.
sub generate_index_list {
    my ($self) = @_;

    # Reference to the config slot itself, so it can be read and
    # re-assigned without repeating the full hash path.
    my $slot = \$self->{'collect_cfg'}->{'indexes'};

    $$slot = [] unless defined $$slot;

    # no indexes have been specified so we'll build a "dummy:text" index
    push (@{$$slot}, "dummy:text") unless scalar(@{$$slot});

    # remove any ex. (working on copies, the old array is left untouched)
    my @stripped = map {
        (my $name = $_) =~ s/([:,])ex\./$1/g;
        $name;
    } @{$$slot};

    $$slot = \@stripped;
    return;
}
87
# Works out the casefold / stem / accentfold flags for this build and
# records the combined value in $self->{'stemindexes'}.
#
# Runs the parent class's generate_index_options first. With no
# 'indexoptions' entry in collect_cfg both casefolding and stemming are
# switched on; otherwise each listed option is matched against "stem"
# and then "casefold" (first match wins for that option).
#
# stemindexes is 1 for casefold, 2 for stem, 3 for both, 0 for neither.
sub generate_index_options {
    my ($self) = @_;
    $self->SUPER::generate_index_options();

    my $options = $self->{'collect_cfg'}->{'indexoptions'};

    my ($casefold, $stem) = (0, 0);
    if (defined($options)) {
        foreach my $option (@{$options}) {
            if ($option =~ /stem/) {
                $stem = 1;
            }
            elsif ($option =~ /casefold/) {
                $casefold = 1;
            }
        }
    }
    else {
        # just use default options
        ($casefold, $stem) = (1, 1);
    }

    $self->{'casefold'}   = $casefold;
    $self->{'stem'}       = $stem;
    $self->{'accentfold'} = 0; #not yet implemented for mg

    # now we record this for the build cfg
    $self->{'stemindexes'} = ($casefold ? 1 : 0) + ($stem ? 2 : 0);
}
122
# Builds the compressed text for the collection.
#
# Parameter:
#   $textindex - index description handed to the build processor's
#                set_index() for both text passes.
#
# Steps:
#   1. pipe all documents through mg_passes (-T1) to collect text statistics;
#   2. run mg_compression_dict to create the compression dictionary;
#   3. pipe all documents through mg_passes again (-T2) to compress the text.
#
# When $self->{'debug'} is set nothing is executed: the build processor's
# output goes to STDOUT instead and step 2 is skipped.
#
# Progress is printed to $self->{'outhandle'} (verbosity >= 1), and
# <Stage>/<Phase>/<FatalError> markers are printed to STDERR when
# $self->{'gli'} is set. Dies if a required executable is missing or
# mg_passes cannot be started.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &util::filename_cat("text",$collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    # Windows gets backslash separators in the prefix; every other OS gets
    # the extra " -d /" argument on each command line instead.
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size
    print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        # The existence check uses the full path, but the command itself is
        # started by bare name.
        if (!-e "$mg_passes_exe" ||
            !open($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
            # Self-closed like every other FatalError marker in this file;
            # an unclosed tag makes the marker stream malformed.
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);

    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
228
229
# Creates directory names for each of the index descriptions.
#
# Parameter:
#   $indexes - array ref of index descriptions of the form
#              "level:fields[:subcollection[:languages]]".
#
# Returns a hash ref that maps
#   - each full index description to its directory name;
#   - each "level:fields", subcollection and languages part to its
#     processed short name;
#   - 'indexmap' / 'subcollectionmap' / 'languagemap' to hashes of the
#     same part => short-name pairs, with 'indexmaporder' /
#     'subcollectionmaporder' / 'languagemaporder' recording first-seen order.
#
# A description that occurs more than once in $indexes is processed only
# once. Dies if no unique directory name can be produced for a description.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});
    my %seen_index = ();
    foreach my $index (@$indexes) {
        # A repeated description already has its mapping. Processing it
        # again would collide with its own directory name, and make_unique
        # cannot resolve that because every component matches.
        next if $seen_index{$index}++;

        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            my $candidate = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
            if ($candidate eq $dirname) {
                # Nothing was changed, so asking again would give the same
                # answer for ever; fail loudly instead of hanging.
                die "mgbuilder::create_index_mapping - couldn't create a unique directory name for index $index\n";
            }
            $dirname = $candidate;
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember which description owns each name, for later collision checks
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$level:$gran";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
300
301
# Tries to resolve a directory-name collision for one index description.
#
# Parameters:
#   $namehash - hash ref with 'index', 'subcollection' and 'languages'
#               sub-hashes mapping short names to the part that owns them;
#   $index    - the full index description
#               ("level:fields[:subcollection[:languages]]");
#   $indexref, $subref, $langref - refs to the three short names that make
#               up the directory name.
#
# The first short name (in index, subcollection, languages order) that is
# not registered to this description's own part is passed to
# get_next_version(); at most one short name is changed per call.
#
# Returns the three short names concatenated.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # pick the first component whose short name belongs to something else
    my $clashing;
    if ($namehash->{'index'}->{$$indexref} ne "$level:$gran") {
        $clashing = $indexref;
    }
    elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $clashing = $subref;
    }
    elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $clashing = $langref;
    }

    $self->get_next_version ($clashing) if defined $clashing;

    return join ('', $$indexref, $$subref, $$langref);
}
316
# Builds one MG index.
#
# Parameter:
#   $index - index description of the form
#            "level:fields[:subcollections[:languages]]", where
#            subcollections and languages are comma separated lists.
#
# Steps (all documents are fed through the build processor for each
# mg_passes run):
#   1. mg_passes -N1            : create the index dictionary;
#   2. mg_perf_hash_build       : create the perfect hash function;
#   3. mg_passes -N2            : invert the text;
#   4. mg_weights_build         : create the weights file;
#   5. mg_invf_dict             : create the 'on-disk' stemmed dictionary;
#   6. mg_stem_idx -s1/-s2/-s3  : create the stem indexes;
#   7. delete every file in the index directory whose suffix is not in
#      %wanted_index_files.
#
# If the index dictionary (.id file) does not exist after step 1 the index
# is recorded in $self->{'notbuilt'} and the method returns early. In debug
# mode no external program is run and the build processor writes to STDOUT.
#
# Dies if a required executable is missing, mg_passes cannot be started,
# or the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # there may be subcollection info, and language info.
    # (the level and fields parts are not needed separately here)
    my (undef, undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections;
    # subcollections without a configured expression are skipped
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    # Each entry is passed on exactly as written, including any leading
    # "!" negation marker.
    my $langarr = [];
    @$langarr = split /,/, $language if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mg_passes_exe" ||
            !open($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                  "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
        $self->{'notbuilt'}->{$index}=1;
        # NOTE(review): unlike every other exit from this method, this path
        # prints no </Stage> marker for gli - confirm whether that is intended.
        return;
    }
    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mg_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateStemmedDic'/>\n" if $self->{'gli'};
        if (!-e "$mg_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mg_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        # currently mg won't work if we don't generate all the stem indexes
        # so we generate them whatever, but don't advertise the fact
        #if ($self->{'casefold'}) {
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        #}
        #if ($self->{'stem'}) {
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        #}
        #if ($self->{'casefold'} && $self->{'stem'}) {
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
        #}

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        # lexical directory handle, so no package-global DIR glob is used
        opendir (my $dirhandle, $tmpdir)
            or die "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
525
# Adds statistics reported by mgstat to the build configuration.
#
# Parameter:
#   $build_cfg - hash ref; 'numwords' and 'numsections' are filled in from
#                the "Words in collection [dict]" and "Documents" lines of
#                the mgstat output.
#
# If mgstat is missing or cannot be started, a warning is printed to
# $self->{'outhandle'} and $build_cfg is left untouched.
sub build_cfg_extra {
    my $self = shift(@_);
    my ($build_cfg) = @_;

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $input_file = &util::filename_cat ("text", $collect_tail);

    # lexical handle, so no package-global PIPEIN glob is used
    my $pipein;
    if (!-e "$mgstat_exe" || !open ($pipein, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        while (defined (my $line = <$pipein>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close ($pipein);
    }
}
552
5531;
554
555
556
Note: See TracBrowser for help on using the repository browser.