source: trunk/gsdl/perllib/mgbuilder.pm@ 5608

Last change on this file since 5608 was 5225, checked in by sjboddie, 21 years ago

Fixed a couple of bugs in recent changes for building/displaying
collections that have no searchable indexes.

  • Property svn:keywords set to Author Date Id Revision
File size: 29.3 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# While this module is loaded, STDOUT and STDERR are switched to autoflush
# so that output from mg does not get out of sync with output from plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Restore normal buffering when the program shuts down.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# Value handed to mg_passes through its -b option (maximum document size).
our $maxdocsize = 12000;

# File suffixes that are kept in an index directory once the index has been
# built; build_index deletes any file whose suffix is not listed here.
our %wanted_index_files =
    map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Constructor.
#
# Arguments (positional): class, collection name, source (import) directory,
# build directory, verbosity, maxdocs, debug flag, keepold flag,
# allclassifications flag, output handle, no_text flag, fail handle.
# The output and fail handles default to STDERR and no_text defaults to 0.
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and language, loads the plugins and classifiers named in
# the configuration and creates the document processor (buildproc).
#
# Dies if collect.cfg cannot be found, if no plugins were loaded, or if the
# buildproc cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $failhandle) = @_;

    # The handle defaults are the plain handle *names*: build_index compares
    # the output handle against the string "STDERR".
    $outhandle  = 'STDERR' unless defined $outhandle;
    $no_text    = 0        unless defined $no_text;
    $failhandle = 'STDERR' unless defined $failhandle;

    # create an mgbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'no_text'            => $no_text,
        'failhandle'         => $failhandle,
        'notbuilt'           => [],    # indexes not built
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    my $cfg = &colcfg::read_collect_cfg ($colcfgname);
    $cfg = {} unless defined $cfg;
    $self->{'collect_cfg'} = $cfg;

    $cfg->{'indexes'} = [] unless defined $cfg->{'indexes'};

    # sort out subcollection indexes: every index is repeated once per
    # subcollection as "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my @base = @{$cfg->{'indexes'}};
        my @expanded = ();
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            push (@expanded, map { "$_:$subcollection" } @base);
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # sort out language subindexes: every index is repeated once per
    # language.  When there are no subcollections an empty subcollection
    # field is added so that the language is always the fourth field.
    if (defined $cfg->{'languages'}) {
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        my @base = @{$cfg->{'indexes'}};
        my @expanded = ();
        foreach my $language (@{$cfg->{'languages'}}) {
            push (@expanded, map { $_ . $separator . $language } @base);
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];

    if (scalar (@{$cfg->{'indexes'}}) == 0) {
        # no indexes have been specified so we'll build a "dummy:text" index
        push (@{$cfg->{'indexes'}}, "dummy:text");
    }

    # get the list of plugins for this collection and load them all
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle);
    if (scalar (@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them all
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my $buildprocdir  = "$ENV{'GSDLHOME'}/perllib";
    my $buildproctype = "mgbuildproc";
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # The class name is only known at run time.  Call the constructor as a
    # class method on that name instead of pasting the name into a string
    # eval; the block eval keeps the original re-throw of "$@".
    eval {
        $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                    $build_dir, $verbosity,
                                                    $outhandle);
    };
    die "$@" if $@;

    return $self;
}
193
# Prepares the build directory.  Unless the builder is in debug mode or was
# asked to keep old builds, the build directory is removed and recreated,
# together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    if (!($self->{'debug'} || $self->{'keepold'})) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r ($build_dir);
        &util::mk_all_dir ($build_dir);

        # make the text directory
        &util::mk_all_dir ("$build_dir/text");
    }
}
207
# Creates the compressed text for the collection.
#
# $textindex is passed to the buildproc as the index to process.
#
# The documents are read twice.  The first read pipes them into
# "mg_passes -T1" (text statistics), then mg_compression_dict builds the
# compression dictionary, and the second read pipes the documents into
# "mg_passes -T2".  In debug mode no external program is run and the
# buildproc output goes to STDOUT instead.
#
# Dies if one of the required executables does not exist or a pipe cannot
# be opened.
sub compress_text {
    my ($self, $textindex) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $verbose   = ($self->{'verbosity'} >= 1);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");

    # maxnumeric from collect.cfg is only honoured when it is a plain
    # non-negative integer; otherwise 4 is used
    my $maxnumeric = 4;
    my $cfg_maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    if (defined ($cfg_maxnumeric) && $cfg_maxnumeric =~ /^\d+$/) {
        $maxnumeric = $cfg_maxnumeric;
    }

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    # os specific stuff: backslashes in the prefix on windows, an extra
    # "-d /" argument everywhere else
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $osextra = " -d /";
    } else {
        $fulltextprefix =~ s@/@\\@g;
    }

    print $outhandle "\n*** creating the compressed text\n" if $verbose;

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics\n" if $verbose;

    # The handle is passed around by name; the pipe name is package
    # qualified because it is also handed to the buildproc object.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        unless (-e "$mg_passes_exe" &&
                open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->reset();

    # first read of the documents
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    unless ($self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if $verbose;
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        unless (-e "$mg_passes_exe" &&
                open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $buildproc->reset();

    # compress the text (second read of the documents)
    print $outhandle "\n compressing the text\n" if $verbose;
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
299
# Decides whether an index should be built.
#
# Each entry of the "dontbuild" list in collect.cfg is used as a regular
# expression that has to match the whole index description.  On the first
# match the directory name mapped to the index is appended to the
# "notbuilt" list and 0 is returned.  Returns 1 when nothing matches or
# when there is no "dontbuild" list.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $pattern (@$patterns) {
        next unless $index =~ /^$pattern$/;

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
315
# Builds the indexes of the collection.
#
# When $indexname contains a word character only that index is built,
# otherwise every index listed in collect.cfg is.  The index-to-directory
# mapping is (re)created first and stored in $self->{'index_mapping'};
# indexes rejected by want_built are reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [$indexname];
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        } else {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            $self->build_index($index);
        }
    }
}
343
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "level:fields[:subcollection[:languages]]".
#
# Returns a reference to a hash holding
#   - one entry per index description, giving its directory name,
#   - 'indexmap', 'subcollectionmap' and 'languagemap', which map the
#     "level:fields", subcollection and language parts to their short forms,
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder', which
#     list the keys of those maps in the order they were first seen,
#   - plain entries for each "level:fields", subcollection and language
#     part (used when looking up collectionmeta names).
#
# Dies if no unique directory name can be produced for an index.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # pnames remembers, separately for each kind of field, which original
    # field owns each short name.  The values must be real hash references:
    # with empty strings every lookup became a symbolic reference to one
    # shared package hash, mixing the three kinds together and keeping
    # entries alive between calls.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            my $candidate = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
            if ($candidate eq $dirname) {
                # make_unique could not change any component; stop here
                # rather than loop forever
                die "mgbuilder::create_index_mapping - couldn't create a unique " .
                    "directory name for index $index\n";
            }
            $dirname = $candidate;
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used so far
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
415
# Returns a short (at most two character) form of a field.
#
# An undefined field, or one without any word character, gives "".
# A field with two or more comma separated components gives the first
# character of each of its first two components.  A field with a single
# component gives its first character followed by the next consonant; if
# there is no such consonant the first two characters are used instead.
sub process_field {
    my ($self, $field) = @_;

    if (!defined ($field) || $field !~ /\w/) {
        return "";
    }

    my @components = split /,/, $field;

    if (scalar @components < 2) {
        # single component: first character plus the next consonant ...
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        # ... or simply the first two characters
        unless (defined $first && defined $second) {
            ($first, $second) = $field =~ /^(.)(.)/;
        }
        return "$first$second";
    }

    # several components: initials of the first two
    my $initials = "";
    foreach my $component (@components[0, 1]) {
        my $initial = $component;
        $initial =~ s/^(.).*$/$1/;
        $initials .= $initial;
    }
    return $initials;
}
438
# Changes one component of a directory name that collides with an existing
# one and returns the new name.
#
# $namehash records, per kind of field ('index', 'subcollection',
# 'languages'), which original field owns each short name.  $indexref,
# $subref and $langref are references to the three short names that make
# up the directory name for $index.  The first short name (in that order)
# that is owned by a different field is advanced to its next version; the
# others are left alone.  If none is owned by a different field the name
# is returned unchanged.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # kind of field, reference to its short name, field that should own it
    my @checks = (
        ['index',         $indexref, "$level:$gran"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $check (@checks) {
        my ($kind, $nameref, $owner) = @$check;
        if ($namehash->{$kind}->{$$nameref} ne $owner) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
453
# Advances the name referenced by $nameref to its next version, in place.
#
# A name that ends in digits has that number incremented ("dt0" -> "dt1",
# "dt9" -> "dt10", "dt99" -> "dt100").  Any other name has its last
# character replaced by "0" ("dtx" -> "dt0"); an empty name becomes "0".
#
# Earlier, a name ending in a single "9" was never changed (the
# substitution looked for two digits), which left the caller looping.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d+)$/) {
        # numeric increment of the whole trailing number
        my $next = $1 + 1;
        $$nameref =~ s/\d+$/$next/;
    } elsif (defined ($$nameref) && $$nameref ne "") {
        $$nameref =~ s/.$/0/;
    } else {
        # nothing to replace, so start the numbering
        $$nameref = "0";
    }
}
469
# Builds a single index.
#
# $index is an index description of the form
# "level:fields[:subcollection[:languages]]"; its directory name is looked
# up in $self->{'index_mapping'}.
#
# The documents are read twice and piped into mg_passes (-N1, then -N2),
# with mg_perf_hash_build run in between.  Afterwards mg_weights_build,
# mg_invf_dict and mg_stem_idx are run and every file in the index
# directory whose suffix is not in %wanted_index_files is deleted.  In
# debug mode none of the external programs is run and the buildproc output
# goes to STDOUT.
#
# Dies if a required executable does not exist, if a pipe cannot be opened
# or if the index directory cannot be read.
sub build_index {
    my ($self, $index) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $debug     = $self->{'debug'};
    my $verbose   = ($self->{'verbosity'} >= 1);

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # locations of the executables, used for the existence checks
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe = &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe = &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe = &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe = &util::filename_cat ($exedir, "mg_stem_idx$exe");

    # maxnumeric from collect.cfg is only honoured when it is a plain
    # non-negative integer; otherwise 4 is used
    my $maxnumeric = 4;
    my $cfg_maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    if (defined ($cfg_maxnumeric) && $cfg_maxnumeric =~ /^\d+$/) {
        $maxnumeric = $cfg_maxnumeric;
    }

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $osextra = " -d /";
        # so mg_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null" if ($outhandle ne "STDERR");
    } else {
        $fullindexprefix =~ s@/@\\@g;
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = ($index =~ /^paragraph/i) ? 3 : 2;

    # there may be subcollection info, and language info.
    my ($level, $fields, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index.  A leading "!" negates the expression.
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $wanted (@languages) {
        my $negate = ($wanted =~ s/^\!//) ? 1 : 0;
        foreach my $known (@{$self->{'collect_cfg'}->{'languages'}}) {
            next unless $known eq $wanted;
            if ($negate) {
                push (@$indexexparr, "!Language/$wanted/");
            } else {
                push (@$indexexparr, "Language/$wanted/");
            }
            last;
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if $verbose;

    # The handle is passed around by name; the pipe name is package
    # qualified because it is also handed to the buildproc object.
    my $handle;
    if ($debug) {
        $handle = 'STDOUT';
    } else {
        unless (-e "$mg_passes_exe" &&
                open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                      "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);

    # first read of the documents
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debug;

    $self->print_stats();

    unless ($debug) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        unless (-e "$mg_passes_exe" &&
                open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                      "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text (second read of the documents)
    print $outhandle "\n inverting the text\n" if $verbose;

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$debug) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if $verbose;
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if $verbose;
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if $verbose;
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        foreach my $method (1, 2, 3) {
            system ("mg_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        }

        # remove unwanted files: anything with a suffix that is not listed
        # in %wanted_index_files (dot files and files without a suffix stay)
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            next unless defined $suffix;
            next if defined $wanted_index_files{$suffix};

            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            &util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir (DIR);
    }
}
652
# Creates the info database and processes associated files.
#
# Everything is written as text to a pipe into txt2db (to STDOUT in debug
# mode): first a "[collection]" record built from the collectionmeta
# entries of collect.cfg, then whatever the buildproc emits in 'infodb'
# mode while the documents are read, then the classification information
# and finally a "[browselist]" record that lists all documents.
#
# Dies if txt2db does not exist or the pipe cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name; the extension depends on the byte order of the machine
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # The handle is passed around by name; the pipe name is package
    # qualified because it is also handed to the buildproc object.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        unless (-e "$txt2db_exe" && open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->reset();

    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    if (defined $collectionmeta) {

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%$collectionmeta)) {
            # A name starting with "." refers to an index, subcollection or
            # language; it is written under the mapped short name.  Any
            # other name is written as it is.
            my $cmetamap;
            if (substr ($cmeta, 0, 1) eq ".") {
                my $stripped = substr ($cmeta, 1);
                $cmetamap = $self->{'index_mapping'}->{$stripped};
                if (!defined $cmetamap) {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$stripped' - ignored\n";
                    next; #ignore this one
                }
            } else {
                $cmetamap = $cmeta; # just using the same name
            }

            my $metadata_entry = "";
            my $default = "";
            my $defaultfound = 0;
            my $first = 1;

            #iterate through the languages
            foreach my $lang (keys (%{$collectionmeta->{$cmeta}})) {
                if ($first) {
                    #set the default default to the first entry
                    $first = 0;
                    $default = $collectionmeta->{$cmeta}->{$lang};
                }
                if ($lang =~ /default/) {
                    #the default entry goes first
                    $defaultfound = 1;
                    $metadata_entry = "<$cmetamap>" .
                        $collectionmeta->{$cmeta}->{'default'} . "\n" . $metadata_entry;
                } else {
                    # language specific entries have keys like "[l=xx]"
                    my ($langcode) = $lang =~ /^\[l=(\w*)\]$/;
                    if ($langcode) {
                        $metadata_entry .= "<$cmetamap:$langcode>" .
                            $collectionmeta->{$cmeta}->{$lang} . "\n";
                    }
                }
            }

            #if we haven't found a default, put one in
            $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry
                unless $defaultfound;

            #write the entry to the file
            print $handle $metadata_entry;
        }

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    #output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";", @doclist);
    foreach my $line ("[browselist]\n",
                      "<hastxt>0\n",
                      "<childtype>VList\n",
                      "<numleafdocs>" . scalar (@doclist) . "\n",
                      "<thistype>Invisible\n",
                      "<contains>$docs",
                      "\n" . ('-' x 70) . "\n") {
        print $handle $line;
    }

    close ($handle) if !$self->{'debug'};
}
784
# Hook for collection specific processing; does nothing in this class.
sub collect_specific {
    my ($self) = @_;
    return $self;
}
788
# Writes build.cfg into the build directory.
#
# The file records the build date, the number of documents and bytes
# reported by the buildproc, the word and section counts parsed from the
# output of mgstat (when it can be run), the index, subcollection and
# language maps, the list of indexes that were not built and maxnumeric.
#
# The configuration is collected in a hash that is local to each call.
# Previously it was written through the undeclared package variable
# $build_cfg (the declared %build_cfg was never used), so optional entries
# from an earlier call could show up in a later build.cfg.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only written when not empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # maxnumeric from collect.cfg is only honoured when it is a plain
    # non-negative integer; otherwise 4 is used
    $build_cfg->{'maxnumeric'} = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $build_cfg->{'maxnumeric'} = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    # write out the build information; the two patterns name the string
    # valued and the list valued entries
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections|maxnumeric)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
861
# Counterpart of init; there is nothing to clean up in this class.
sub deinit {
    my ($self) = @_;
    return $self;
}
865
# Prints the byte counts of the last pass to the output handle.
#
# The figures and the index name are taken from the buildproc.  When fewer
# than 50 bytes were processed a warning is added, unless the pass was a
# text compression pass and the builder was created with no_text set.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $headline = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $headline;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # text is expected when indexing, and when compressing unless no_text
    my $text_expected = ($indexing_text || !$self->{'no_text'});

    if ($num_processed_bytes < 50 && $text_expected) {
        print $outhandle "***************\n";
        if ($indexing_text) {
            print $outhandle "WARNING: There is very little or no text to process for $index\n";
        } else {
            print $outhandle "WARNING: There is very little or no text to compress\n";
        }
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }
}
894
8951;
Note: See TracBrowser for help on using the repository browser.