source: trunk/gsdl/perllib/mgppbuilder.pm@ 3805

Last change on this file since 3805 was 3805, checked in by sjboddie, 21 years ago

Bug fix so mgpp works properly for windows installations installed to
paths containing spaces

  • Property svn:keywords set to Author Date Id Revision
File size: 31.0 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# While this module is loaded, keep STDOUT and STDERR unbuffered so that
# output written by mg does not get out of sync with output from plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Turn autoflush back off when the interpreter shuts down.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# Package-level setting; not referenced by any sub in this file.
$maxdocsize = 12000;


# File suffixes that build_index treats as "wanted" when it scans a
# freshly built index directory.
%wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# Default mapping from long metadata field names to two-letter codes.
# Each code is also present as a key (with value 1) so that a code can be
# recognised as already being in short form.
# TODO: change this so a user can add their own ones in via a file or cfg
%static_indexfield_map = map { ($_->[0] => $_->[1], $_->[1] => 1) } (
    ['Title'        => 'TI'],
    ['Subject'      => 'SU'],
    ['Creator'      => 'CR'],
    ['Organization' => 'OR'],
    ['Source'       => 'SO'],
    ['Howto'        => 'HT'],
    ['ItemTitle'    => 'IT'],
    ['ProgNumber'   => 'PN'],
    ['People'       => 'PE'],
    ['TextOnly'     => 'TX'],
);
85
# Constructor.
#
# Arguments (positional): $collection, $source_dir, $build_dir, $verbosity,
# $maxdocs, $debug, $keepold, $allclassifications, $outhandle (defaults to
# STDERR), $no_text (defaults to 0).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and language, loads the plugins and classifiers, and creates
# the document processor (a collection-specific "<collection>buildproc" if
# one exists under $GSDLCOLLECTDIR/perllib, otherwise mgppbuildproc).
#
# Dies if collect.cfg is missing, if no plugins could be loaded, or if the
# document processor cannot be loaded/constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    # The default handle is stored by *name*: build_index compares
    # $outhandle against the string "STDERR" to detect redirected output.
    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>[],       # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    $self->{'collect_cfg'} = {} unless defined $self->{'collect_cfg'};
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes: every index is repeated once per
    # configured subcollection as "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index is repeated once per
    # language.  When there are no subcollections an empty subcollection
    # field is inserted so the language is always the third component.
    if (defined $cfg->{'languages'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$indexes) {
                push (@{$cfg->{'indexes'}}, $index . $separator . $language);
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (first occurrence wins, order is preserved)
    my %seen = ();
    my @configured = @{$cfg->{'indexes'}};
    $cfg->{'indexes'} = [];
    foreach my $i (@configured) {
        next if defined ($seen{$i});
        push (@{$cfg->{'indexes'}}, $i);
        $seen{$i} = 1;
    }

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    if (defined $cfg->{'levels'}) {
        foreach my $level (@{$cfg->{'levels'}}) {
            $self->{'levels'}->{$level} = 1;
        }
    }

    # get the list of plugins for this collection and load them all
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them all
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # A class-method call on the class name held in a string; this needs
    # neither a string eval nor indirect-object syntax, and any exception
    # from the constructor simply propagates to our caller.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
214
# Prepares the build directory for a fresh build.  Nothing is touched when
# either the 'debug' or the 'keepold' option is set; otherwise any old
# build is removed and the build directory and its "text" subdirectory are
# (re)created.
sub init {
    my ($self) = @_;

    return if ($self->{'debug'} || $self->{'keepold'});

    my $build_dir = $self->{'build_dir'};

    # remove any old builds
    &util::rm_r($build_dir);
    &util::mk_all_dir($build_dir);

    # make the text directory
    &util::mk_all_dir("$build_dir/text");
}
228
# Records the strip_html setting on this builder and passes it on to the
# document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_strip_html($strip);
}
236
# Builds the compressed text for the collection in two passes over the
# source documents:
#   1. mgpp_passes -T1 collects text statistics, then
#      mgpp_compression_dict builds the compression dictionary;
#   2. mgpp_passes -T2 compresses the text.
#
# $textindex is handed to the document processor via set_index().
#
# In debug mode no external program is run; the document processor writes
# to STDOUT instead.  Dies if a required executable is missing from
# $GSDLHOME/bin/$GSDLOS or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    # The executables are checked for under $exedir but invoked by bare
    # name (i.e. found through PATH), with the -f argument quoted.
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    if ($self->{'levels'}->{'Section'}) {
        $mgpp_passes_sections .= "-K Section ";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # ---- pass 1: collect the statistics for the text ----
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);

    # The handle is passed around by name so that the document processor
    # (which lives in another package) and the re-open for pass 2 both
    # refer to the same PIPEOUT.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # Close the pass-1 pipe exactly once (never close STDOUT in debug mode).
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # ---- compression dictionary, then open the pass-2 pipe ----
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    # ---- pass 2: compress the text ----
    $buildproc->reset();
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
339
# Decides whether an index should be built.
#
# Each entry of the collection's 'dontbuild' configuration is used as an
# anchored regular expression against $index.  On the first match the
# index's directory name (from 'index_mapping') is recorded in 'notbuilt'
# and 0 is returned; otherwise 1 is returned.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    # lexical loop variable: nothing leaks into the package namespace
    foreach my $checkstr (@$dontbuild) {
        if ($index =~ /^$checkstr$/) {
            push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
            return 0;
        }
    }

    return 1;
}
355
# Builds either the single index named by $indexname (when it contains at
# least one word character) or every index listed in the collection
# configuration.  The index-name -> directory mapping is (re)computed and
# stored in $self->{'index_mapping'} first; indexes rejected by
# want_built() are skipped.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes (lexical loop variable, so the name of the
    # index being built is not exposed as a package global)
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            $self->build_index($index);
        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }
}
383
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "fields[:subcollection[:languages]]".  Returns a reference to a hash
# containing:
#   - one entry per full index description, giving its directory name;
#   - 'indexmap', 'subcollectionmap' and 'languagemap': the processed short
#     name for each distinct fields / subcollection / languages component;
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder': the
#     same components in first-seen order;
#   - the individual components themselves as top-level keys (these are
#     used for collectionmeta later on).
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # Short name -> original component, one table per kind.  Each table
    # must be its own hash: seeding them with '' would turn every lookup
    # into a symbolic reference to one shared global hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name is built from lower-cased processed versions
        # of the index fields, then the subcollection (if there is one),
        # then the language (if there is one)
        my $pindex = lc ($self->process_field ($fields));
        my $psub   = lc ($self->process_field ($subcollection));
        my $plang  = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping
        # thing - (the full index name is not used on the query page) -
        # these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the languages component of *this* index
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
454
# Returns a processed (abbreviated) version of a field.
# If the field has only one comma-separated component, the result is its
# first character followed by the next consonant of that component (or,
# failing that, simply its first two characters).  Otherwise the result is
# the first character of each of the first two components.
# An undefined field, or one without any word character, gives "".
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;

    if (scalar (@components) < 2) {
        # single component: first character plus the next consonant
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        unless (defined $first && defined $second) {
            # no consonant found - fall back to the first two characters
            ($first, $second) = $field =~ /^(.)(.)/;
        }
        return "$first$second";
    }

    # several components: initial character of each of the first two
    my @initials = ();
    foreach my $component (@components[0, 1]) {
        (my $initial = $component) =~ s/^(.).*$/$1/;
        push (@initials, $initial);
    }
    return join ("", @initials);
}
477
# Resolves a directory-name collision for $index.
#
# $namehash maps, per kind ('index', 'subcollection', 'languages'), each
# short name already handed out to the component it was derived from.
# $indexref, $subref and $langref refer to the three short names making up
# the colliding directory name.  The first short name (in that order) that
# is registered for a *different* component than the one in $index is
# advanced with get_next_version(); at most one name is changed per call.
# Returns the concatenation of the three (possibly updated) short names.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($want_fields, $want_sub, $want_lang) = split (":", $index);

    my @candidates = (
        ['index',         $indexref, "$want_fields"],
        ['subcollection', $subref,   $want_sub],
        ['languages',     $langref,  $want_lang],
    );

    foreach my $candidate (@candidates) {
        my ($kind, $nameref, $wanted) = @$candidate;
        if ($namehash->{$kind}->{$$nameref} ne $wanted) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
492
# Advances the short name referred to by $nameref to its next version,
# modifying it in place:
#   - a name ending in two digits has that two-digit number incremented;
#   - a name ending in one digit has that digit incremented ("9" becomes
#     "10");
#   - any other name has its last character replaced by "0" (an empty name
#     becomes "0").
# Every call changes the name, so a caller looping until the name is
# unique always makes progress.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # Only one trailing digit exists here, so replace exactly that
        # digit; for 9 the replacement is the two-character "10".
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } elsif ($$nameref !~ s/.$/0/) {
        # nothing to replace (empty name): start the numbering instead
        $$nameref .= "0";
    }
}
508
# Builds one index.
#
# $index is an index description "fields[:subcollections[:languages]]";
# its directory name is looked up in $self->{'index_mapping'}, so
# create_index_mapping() must have been run first (build_indexes() does).
#
# The source documents are read twice through the document processor,
# piped into "mgpp_passes -I1" (index dictionary) and then
# "mgpp_passes -I2" (inversion), with mgpp_perf_hash_build run in between
# and mgpp_weights_build, mgpp_invf_dict and mgpp_stem_idx run afterwards.
#
# In debug mode no external program is run; the document processor writes
# to STDOUT instead.  Dies if a required executable is missing from
# $GSDLHOME/bin/$GSDLOS, a pipe cannot be opened, or the index directory
# cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff.  The executables are checked for under
    # $exedir but invoked by bare name (i.e. found through PATH).
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "";
    foreach my $level (keys (%{$self->{'levels'}})) {
        if ($level eq "Section" || $level eq "Paragraph") {
            $mgpp_passes_sections .= "-K $level ";
        }
    }

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        # the default handle is stored as the name "STDERR" (see new)
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index.  A leading "!" negates the language.
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $wanted (@languages) {
        my $not = 0;
        if ($wanted =~ s/^\!//) {
            $not = 1;
        }
        foreach my $lang (@{$self->{'collect_cfg'}->{'languages'}}) {
            if ($lang eq $wanted) {
                push (@$indexexparr, ($not ? "!" : "") . "Language/$wanted/");
                last;
            }
        }
    }

    # ---- pass 1: build index dictionary. Uses verbatim stem method ----
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);

    # The handle is passed around by name so that the document processor
    # (another package) and the re-open for pass 2 share the same PIPEOUT.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # ---- pass 2: invert the text ----
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    # everything below needs the external tools, so debug mode stops here
    return if $self->{'debug'};

    close ($handle);

    # create the weights file
    print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mgpp_weights_build_exe") {
        die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
    }
    system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

    # create 'on-disk' stemmed dictionary
    print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mgpp_invf_dict_exe") {
        die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
    }
    system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

    # creates stem index files for the various stemming methods
    print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mgpp_stem_idx_exe") {
        die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
    }
    foreach my $stem_method (1, 2, 3) {
        system ("mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
    }

    # Report unwanted files.  NOTE: the actual removal is disabled (the
    # util::rm call is commented out), so at verbosity > 2 files are
    # reported as "deleting" but are left in place.
    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
    opendir (DIR, $tmpdir) || die
        "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
    foreach my $file (readdir(DIR)) {
        next if $file =~ /^\./;
        my ($suffix) = $file =~ /\.([^\.]+)$/;
        if (defined $suffix && !defined $wanted_index_files{$suffix}) {
            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            #&util::rm (&util::filename_cat ($tmpdir, $file));
        }
    }
    closedir (DIR);
}
690
# Creates the collection's info database by piping text records into
# txt2db (or printing them to STDOUT in debug mode).
#
# The records written are, in order: a [collection] record holding the
# collectionmeta entries and the index-field macros (only when the
# configuration has collectionmeta), whatever the document processor emits
# in 'infodb' mode for each document, the classification information, and
# a [browselist] record listing every document.
#
# Dies if txt2db is missing from $GSDLHOME/bin/$GSDLOS or cannot be run.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    # txt2db is checked for under $exedir but invoked by bare name
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb
    # called separately from build_index)
    if (scalar(keys %{$buildproc->{'indexfieldmap'}}) == 0) {
        # check build.cfg to see if indexfields have been filled in
        my $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building/build.cfg");
        if (-e $buildconfigfile) {
            my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);
            if (defined $buildcfg->{'indexfields'}) {
                foreach my $field (@{$buildcfg->{'indexfields'}}) {
                    $buildproc->{'indexfields'}->{$field} = 1;
                }
            }
            if (defined $buildcfg->{'indexfieldmap'}) {
                foreach my $field (@{$buildcfg->{'indexfieldmap'}}) {
                    # entries have the form "LongName->XX"
                    my ($longname, $shortname) = $field =~ /^(.*)\-\>(.*)$/;
                    next unless defined $longname;   # skip malformed entries
                    $buildproc->{'indexfieldmap'}->{$longname} = $shortname;
                }
            }
        }
    }

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor.  The handle is passed around by name
    # because the document processor lives in another package.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});

    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%$collectionmeta)) {
            my $defaultfound = 0;
            my $first = 1;
            my $metadata_entry = "";
            my $default = "";
            my $cmetamap = "";

            # A name starting with "." refers to an index, subcollection or
            # language; it is written under that item's short name.
            if ($cmeta =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$cmeta}) {
                    $cmetamap = $self->{'index_mapping'}->{$cmeta};
                    $cmeta = ".$cmeta";
                }
                else {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                    next; #ignore this one
                }
            }
            else {
                $cmetamap = $cmeta; # just using the same name
            }

            # iterate through the languages
            foreach my $lang (keys (%{$collectionmeta->{$cmeta}})) {
                if ($first) {
                    $first = 0;
                    # set the default default to the first entry
                    $default = $collectionmeta->{$cmeta}->{$lang};
                }
                if ($lang =~ /default/) {
                    $defaultfound = 1;
                    # the default entry goes first
                    $metadata_entry = "<$cmetamap>" .
                        $collectionmeta->{$cmeta}->{'default'} . "\n" . $metadata_entry;
                }
                else {
                    # language-specific keys look like "[l=xx]"
                    my ($langcode) = $lang =~ /^\[l=(\w*)\]$/;
                    if ($langcode) {
                        $metadata_entry .= "<$cmetamap:$langcode>" .
                            $collectionmeta->{$cmeta}->{$lang} . "\n";
                    }
                }
            }
            # if we haven't found a default, put one in
            if (!$defaultfound) {
                $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
            }
            # write the entry to the file
            print $handle $metadata_entry;
        }

        # add the indexfieldmap macros to [collection]
        # eg <TI>Title
        #    <SU>Subject
        # these may be overidden for other langs if add to macro files
        my $field_entry = "";
        foreach my $longfield (keys %{$buildproc->{'indexfieldmap'}}) {
            my $shortfield = $buildproc->{'indexfieldmap'}->{$longfield};
            # a value of 1 marks a key that is itself a short code
            next if $shortfield eq 1;
            $field_entry .= "<$shortfield>$longfield\n";
        }
        print $handle $field_entry;

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    # output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";", @doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar(@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close ($handle) if !$self->{'debug'};
}
853
# Intentionally does nothing beyond unpacking its invocant; kept as an
# (apparently) overridable step of the build.
sub collect_specific {
    my $self = shift;
}
857
# Writes $build_dir/build.cfg describing the finished build: build date
# and type, document and byte counts (from the document processor), the
# index / subcollection / language name mappings (from
# $self->{'index_mapping'}), the indexes that were not built, and the
# indexed fields with their short codes.
sub make_auxiliary_files {
    my $self = shift (@_);

    # A fresh structure for every call: entries must not carry over from a
    # previous build performed in the same process.
    my $build_cfg = {};

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $index_mapping = $self->{'index_mapping'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp";

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $buildproc->get_num_docs();
    $build_cfg->{'numbytes'} = $buildproc->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$index_mapping->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" . $index_mapping->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # subcollection and language maps are only written when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$index_mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $index_mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$index_mapping->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $index_mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # store the indexfieldmap information
    # add all fields bit - sort based on keys. text only is put at front
    my @indexfieldmap = ();
    if (defined $buildproc->{'indexfields'}->{'TextOnly'}) {
        push (@indexfieldmap, "TextOnly->TX");
    }
    my @sorted_fields = sort keys %{$buildproc->{'indexfields'}};
    foreach my $field (@sorted_fields) {
        next if $field eq "TextOnly";
        push (@indexfieldmap, $field . "->" . $buildproc->{'indexfieldmap'}->{$field});
    }
    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;

    # store the indexed field information
    foreach my $field (@sorted_fields) {
        push (@{$build_cfg->{'indexfields'}}, $field);
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');
}
924
# Counterpart of init; performs no work here beyond unpacking its
# invocant.
sub deinit {
    my $self = shift;
}
928
# Prints statistics for the pass just completed to the builder's output
# handle: which index was being created (or compressed from), the total
# bytes in the collection and the bytes processed for this index.  When
# fewer than 50 bytes were processed, and the pass was either indexing or
# storing text, a warning banner is printed as well.
sub print_stats {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # the warning only applies to (nearly) empty passes that were meant to
    # process some text
    return unless ($num_processed_bytes < 50);
    return unless ($indexing_text || !$self->{'no_text'});

    my $warning = $indexing_text
        ? "WARNING: There is very little or no text to process for $index\n"
        : "WARNING: There is very little or no text to compress\n";

    print $outhandle "***************\n";
    print $outhandle $warning;
    print $outhandle " Was this your intention?\n";
    print $outhandle "***************\n";
}
958
9591;
960
961
Note: See TracBrowser for help on using the repository browser.