source: trunk/protemix/perllib/ptmxbuilder.pm@ 3170

Last change on this file since 3170 was 3170, checked in by sjboddie, 22 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 31.1 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# Turn autoflush on for both standard streams as soon as the module is
# compiled, so that output from mg does not get out of sync with output
# from the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off again when the program is shutting down.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# Package-level setting; not referenced by any code visible in this file.
our $maxdocsize = 12000;

# Suffixes of the index files that build_index treats as wanted when it
# scans an index directory for leftover files.
our %wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# Maps a metadata field name to its two-letter index field code. Every
# two-letter code is also present as a key (with value 1) so that a code
# can be recognised as already taken.
# TODO: change this so a user can add their own ones in via a file or cfg
our %static_indexfield_map = ();
{
    my @field_codes = (
        'Title'        => 'TI',
        'Subject'      => 'SU',
        'Creator'      => 'CR',
        'Organization' => 'OR',
        'Source'       => 'SO',
        'Howto'        => 'HT',
        'ItemTitle'    => 'IT',
        'ProgNumber'   => 'PN',
        'People'       => 'PE',
        'Class1'       => 'CL',
        'Class2'       => 'CA',
        'Class3'       => 'CS',
        'TextOnly'     => 'TX',
    );
    while (my ($longname, $code) = splice (@field_codes, 0, 2)) {
        $static_indexfield_map{$longname} = $code;
        $static_indexfield_map{$code}     = 1;
    }
}
91
# Constructor for an mgppbuilder object.
#
# Arguments (positional): class, collection name, source directory, build
# directory, verbosity, maxdocs, debug flag, keepold flag,
# allclassifications flag, output handle (defaults to STDERR) and no_text
# flag (defaults to 0).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and language, loads the plugins and classifiers and creates
# the build document processor. Dies if collect.cfg is missing, if no
# plugins could be loaded, or if the document processor cannot be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    # the bareword default is kept on purpose: build_index compares the
    # handle against the string "STDERR"
    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'no_text'            => $no_text,
        'notbuilt'           => [],    # indexes not built
        'indexfieldmap'      => \%static_indexfield_map
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n"
        unless -e $colcfgname;
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # shorthand for the configuration (read_collect_cfg is expected to
    # return a hash reference)
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes: every index is repeated once per
    # subcollection as "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my $base_indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$base_indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index is repeated once per
    # language; when there are no subcollections an empty subcollection
    # field is inserted so the language always ends up in third position
    if (defined $cfg->{'languages'}) {
        my $base_indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$base_indexes) {
                push (@{$cfg->{'indexes'}}, $index . $separator . $language);
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins and the original order is kept)
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    if (defined $cfg->{'levels'}) {
        $self->{'levels'}->{$_} = 1 foreach @{$cfg->{'levels'}};
    }

    # get the list of plugins for this collection and load them all
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them all
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        $self->{'dontgdbm'}->{$_} = 1 foreach @{$cfg->{'dontgdbm'}};
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype) =
        (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm")
        ? ("$ENV{'GSDLCOLLECTDIR'}/perllib", "${collection}buildproc")
        : ("$ENV{'GSDLHOME'}/perllib", "mgppbuildproc");
    require "$buildprocdir/$buildproctype.pm";

    # the class name is only known at run time, hence the string eval
    eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
         "\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
    die "$@" if $@;

    return $self;
}
220
# Prepares the build directory for a fresh build: removes any previous
# build, recreates the directory and creates its "text" subdirectory.
# Nothing is touched when either the debug or the keepold flag is set.
sub init {
    my ($self) = @_;

    return if $self->{'debug'} || $self->{'keepold'};

    # remove any old builds
    my $build_dir = $self->{'build_dir'};
    &util::rm_r($build_dir);
    &util::mk_all_dir($build_dir);

    # make the text directory
    &util::mk_all_dir("$self->{'build_dir'}/text");
}
234
# Records the strip_html setting on this builder and passes the same value
# on to the document processor. Returns whatever the processor's
# set_strip_html returns.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
242
# Creates the compressed text for the collection.
#
# The documents are run through the plugins twice, each time piping the
# document processor's output into mgpp_passes: first with -T1 (collect
# text statistics), then - after mgpp_compression_dict has been run - with
# -T2 (compress the text). In debug mode no external program is started
# and the processor writes to STDOUT instead.
#
# $textindex is handed to the document processor via set_index.
# Dies when a required executable is missing or a pipe cannot be opened.
sub compress_text {
    my ($self, $textindex) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $debug     = $self->{'debug'};
    my $verbose   = ($self->{'verbosity'} >= 1);

    # full paths are only used for the existence checks; the commands
    # themselves are started by bare name
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'},
                                              "text/$self->{'collection'}");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = $self->{'levels'}->{'Section'} ? "-K Section " : "";

    # both passes share everything except the -T1 / -T2 switch
    my $passes_pipe  = "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\"";
    my $passes_error = "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";

    print $outhandle "\n*** creating the compressed text\n" if $verbose;

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if $verbose;

    my $handle = STDOUT;
    unless ($debug) {
        die $passes_error
            unless (-e "$mgpp_passes_exe" && open (PIPEOUT, "$passes_pipe -T1 $osextra"));
        $handle = mgppbuilder::PIPEOUT;
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});
    close (PIPEOUT);

    close ($handle) unless $debug;

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    unless ($debug) {
        print $outhandle "\n creating the compression dictionary\n" if $verbose;
        die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n"
            unless -e "$mgpp_compression_dict_exe";
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second pass; $handle still holds the
        # handle name the document processor was given above
        die $passes_error
            unless (-e "$mgpp_passes_exe" && open ($handle, "$passes_pipe -T2 $osextra"));
    }

    $buildproc->reset();

    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if $verbose;
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debug;

    $self->print_stats();
}
345
# Decides whether an index should be built.
#
# Each entry of the collect.cfg "dontbuild" list is used as a regular
# expression that has to match the whole index description. On the first
# match the index's directory name (from index_mapping) is appended to the
# "notbuilt" list and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $exclusions = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($exclusions);

    foreach my $pattern (@$exclusions) {
        next unless $index =~ /^$pattern$/;
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
361
# Builds either the single index named by $indexname (when it contains at
# least one word character) or every index listed in collect.cfg.
#
# The mapping from index descriptions to directory names is (re)created
# first and stored in $self->{'index_mapping'}; indexes rejected by
# want_built are reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        my $verbose = ($self->{'verbosity'} >= 1);

        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if $verbose;
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if $verbose;
        $self->build_index($index);
    }
}
389
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "fields", "fields:subcollection" or "fields:subcollection:languages".
#
# Returns a reference to a hash that contains
#   - one entry per index description, giving its directory name;
#   - 'indexmap', 'subcollectionmap' and 'languagemap': the processed
#     (short) name of each distinct fields / subcollection / languages
#     part, plus a top-level entry for each of those parts;
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder':
#     the same parts in order of first appearance.
#
# Dies if no unique directory name can be produced for an index (for
# instance when the same description is listed twice); without this check
# the uniqueness loop would never terminate.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # Processed name -> original name, kept separately for each part of
    # the index description. These have to be real (and distinct) hash
    # references: initialising them to '' makes all three resolve to one
    # and the same symbolic hash, which also survives between calls.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my ($pindex) = $self->process_field($fields);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            my $candidate = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
            # make_unique works only from the values passed in, so an
            # unchanged name means it will never change
            die "mgppbuilder::create_index_mapping - couldn't create a unique " .
                "directory name for index $index\n" if $candidate eq $dirname;
            $dirname = $candidate;
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the languages part of this index ($languages), not an
            # unrelated variable
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
460
# Returns a processed (abbreviated) version of a field.
# If the field is a comma separated list with two or more components the
# result is the first character of each of the first two components.
# If it has only one component the result is the first character of the
# field followed by the next consonant, or - when there is no later
# consonant - simply the first two characters.
# An undefined field, or one without any word character, gives "".
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @parts = split /,/, $field;

    if (scalar @parts < 2) {
        my ($lead, $next) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        ($lead, $next) = $field =~ /^(.)(.)/ unless defined $lead && defined $next;
        return "$lead$next";
    }

    # keep the first two components and reduce each to its first character
    $#parts = 1;
    foreach my $part (@parts) {
        $part =~ s/^(.).*$/$1/;
    }
    return join("", @parts);
}
483
# Tries to resolve a directory name collision for $index.
#
# $namehash maps, for each of 'index', 'subcollection' and 'languages',
# a processed name to the original name it was created from. $indexref,
# $subref and $langref refer to the processed names of the current index.
# The first of these whose processed name is recorded for a different
# original name is advanced with get_next_version (at most one per call).
# Returns the three processed names joined together.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        [ 'index',         $indexref, "$fields" ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages ],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $original) = @$part;
        next unless $namehash->{$kind}->{$$nameref} ne $original;
        $self->get_next_version ($nameref);
        last;
    }

    return join ("", $$indexref, $$subref, $$langref);
}
498
# Advances the name referred to by $nameref to its next version, in place.
#
# A name that ends in one or two digits has that trailing number
# incremented ("dt3" -> "dt4", "dt9" -> "dt10", "dt10" -> "dt11");
# any other name has its last character replaced by "0" ("dt" -> "d0").
#
# The single digit 9 used to be rewritten with a two-digit pattern that
# could never match, leaving the name unchanged and the caller looping
# forever; both cases are now handled by the same substitution.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d{1,2})$/) {
        # string increment on purpose: it preserves a leading zero
        # ("01" -> "02") exactly as the two-digit case always did
        my $num = $1;
        $num++;
        $$nameref =~ s/\d{1,2}$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
514
# Builds one index.
#
# $index is an index description ("fields[:subcollections[:languages]]");
# its directory name is looked up in $self->{'index_mapping'}.
#
# The documents are run through the plugins twice, piping the document
# processor's output into mgpp_passes: -I1 creates the index dictionary,
# -I2 inverts the text. mgpp_perf_hash_build is run between the two
# passes; mgpp_weights_build, mgpp_invf_dict and mgpp_stem_idx are run
# afterwards. In debug mode no external program is started and the
# processor writes to STDOUT.
#
# Dies when a required executable is missing, a pipe cannot be opened or
# the index directory cannot be read.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $debug     = $self->{'debug'};
    my $verbose   = ($self->{'verbosity'} >= 1);

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});
    # computed as before; nothing below refers to it
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff; the full paths are only used for the
    # existence checks, the programs themselves are started by bare name
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my %tool_path = ();
    foreach my $tool (qw(mgpp_passes mgpp_perf_hash_build mgpp_weights_build
                         mgpp_invf_dict mgpp_stem_idx)) {
        $tool_path{$tool} = &util::filename_cat ($exedir, "$tool$exe");
    }

    # define the section names for mgpasses
    my $mgpp_passes_sections = join ("",
        map  { "-K $_ " }
        grep { $_ eq "Section" || $_ eq "Paragraph" }
        keys (%{$self->{'levels'}}));

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        # so mgpp_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null" if $outhandle ne "STDERR";
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    my @subcollections = defined $subcollection ? split (/,/, $subcollection) : ();
    foreach my $subname (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subname};
        push (@$indexexparr, $expression) if defined ($expression);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = defined $language ? split (/,/, $language) : ();
    foreach my $wanted (@languages) {
        # a leading "!" asks for everything except this language
        my $negated = ($wanted =~ s/^\!//) ? 1 : 0;
        foreach my $known (@{$self->{'collect_cfg'}->{'languages'}}) {
            next unless $known eq $wanted;
            push (@$indexexparr, ($negated ? "!" : "") . "Language/$wanted/");
            last;
        }
    }

    # both passes share everything except the -I1 / -I2 switch
    my $passes_pipe  = "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\"";
    my $passes_error = "mgppbuilder::build_index - couldn't run $tool_path{'mgpp_passes'}\n";

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if $verbose;
    my $handle = STDOUT;
    unless ($debug) {
        die $passes_error
            unless (-e "$tool_path{'mgpp_passes'}" && open (PIPEOUT, "$passes_pipe -I1 $osextra"));
        $handle = mgppbuilder::PIPEOUT;
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debug;

    $self->print_stats();

    unless ($debug) {
        # create the perfect hash function
        die "mgppbuilder::build_index - couldn't run $tool_path{'mgpp_perf_hash_build'}\n"
            unless -e "$tool_path{'mgpp_perf_hash_build'}";
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the pipe for the second pass; $handle still holds the
        # handle name the document processor was given above
        die $passes_error
            unless (-e "$tool_path{'mgpp_passes'}" && open ($handle, "$passes_pipe -I2 $osextra"));
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if $verbose;

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    # everything that follows needs the external tools
    return if $debug;

    close ($handle);

    # create the weights file
    print $outhandle "\n create the weights file\n" if $verbose;
    die "mgppbuilder::build_index - couldn't run $tool_path{'mgpp_weights_build'}\n"
        unless -e "$tool_path{'mgpp_weights_build'}";
    system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

    # create 'on-disk' stemmed dictionary
    print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if $verbose;
    die "mgppbuilder::build_index - couldn't run $tool_path{'mgpp_invf_dict'}\n"
        unless -e "$tool_path{'mgpp_invf_dict'}";
    system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

    # creates stem index files for the various stemming methods
    print $outhandle "\n creating stem indexes\n" if $verbose;
    die "mgppbuilder::build_index - couldn't run $tool_path{'mgpp_stem_idx'}\n"
        unless -e "$tool_path{'mgpp_stem_idx'}";
    foreach my $stem_method (1 .. 3) {
        system ("mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
    }

    # remove unwanted files
    # (the actual removal is disabled; unwanted files are only reported,
    # and only at verbosity above 2)
    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
    opendir (DIR, $tmpdir) || die
        "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
    foreach my $file (readdir(DIR)) {
        next if $file =~ /^\./;
        my ($suffix) = $file =~ /\.([^\.]+)$/;
        next unless defined $suffix;
        next if defined $wanted_index_files{$suffix};
        # delete it!
        print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
        #&util::rm (&util::filename_cat ($tmpdir, $file));
    }
    closedir (DIR);
}
696
# Creates the collection's info database and processes associated files.
#
# The document processor is switched to 'infodb' mode and its output is
# piped into txt2db (or sent to STDOUT in debug mode). The output consists
# of a [collection] record with the collectionmeta entries and the index
# field names (only when collect.cfg defines collectionmeta), the records
# produced while the plugins read the documents, the classification
# information and finally a [browselist] record listing all documents.
#
# Dies if txt2db is missing or the pipe to it cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb
    # called separately from build_index)
    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # check build.cfg to see if indexfields have been filled in
        my $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building/build.cfg");
        if (-e $buildconfigfile) {
            my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);
            if (defined $buildcfg->{'indexfields'}) {
                foreach my $field (@{$buildcfg->{'indexfields'}}) {
                    $buildproc->{'indexfields'}->{$field} = 1;
                }
            }
            if (defined $buildcfg->{'indexfieldmap'}) {
                # entries have the form "LongName->CODE"
                foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
                    my ($longname, $code) = $entry =~ /^(.*)\-\>(.*)$/;
                    $buildproc->{'indexfieldmap'}->{$longname} = $code;
                }
            }
        }
    }

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my $handle = STDOUT;
    unless ($self->{'debug'}) {
        die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n"
            unless (-e "$txt2db_exe" && open (PIPEOUT, "| $txt2db_exe $fulldbname"));
        $handle = mgppbuilder::PIPEOUT;
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});

    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%$collectionmeta)) {
            # names starting with "." refer to an index, subcollection or
            # language and are written under their mapped (short) name;
            # everything else is written under its own name
            my $cmetamap = $cmeta;
            if (substr ($cmeta, 0, 1) eq ".") {
                my $barename = substr ($cmeta, 1);
                if (!defined $self->{'index_mapping'}->{$barename}) {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$barename' - ignored\n";
                    next; #ignore this one
                }
                $cmetamap = $self->{'index_mapping'}->{$barename};
            }

            # iterate through the languages
            my $values = $collectionmeta->{$cmeta};
            my @langs = keys (%$values);

            # the default default is the first entry
            my $default = scalar(@langs) ? $values->{$langs[0]} : "";
            my $defaultfound = 0;
            my $metadata_entry = "";

            foreach my $lang (@langs) {
                if ($lang =~ /default/) {
                    $defaultfound = 1;
                    # the default entry goes first
                    $metadata_entry = "<$cmetamap>" . $values->{'default'} . "\n" . $metadata_entry;
                    next;
                }
                my ($langcode) = $lang =~ /^\[l=(\w*)\]$/;
                next unless $langcode;
                $metadata_entry .= "<$cmetamap:$langcode>" . $values->{$lang} . "\n";
            }

            # if we haven't found a default, put one in
            $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry unless $defaultfound;

            # write the entry to the file
            print $handle $metadata_entry;
        }

        # add the indexfieldmap macros to [collection]
        # eg <TI>Title
        #    <SU>Subject
        # these may be overridden for other langs if add to macro files
        my $fieldmap = $buildproc->{'indexfieldmap'};
        my $field_entry = "";
        foreach my $longfield (keys %$fieldmap) {
            my $shortfield = $fieldmap->{$longfield};
            # entries with the value 1 are the codes themselves
            next if $shortfield eq 1;
            $field_entry .= "<$shortfield>$longfield\n";
        }
        print $handle $field_entry;

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    # output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";", @doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar(@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close ($handle) if !$self->{'debug'};
}
859
# Hook for collection specific processing; this implementation does
# nothing beyond taking its invocant off the argument list.
sub collect_specific {
    my $self = shift;
}
863
# Writes <build_dir>/build.cfg, which records the build date and type,
# the number of documents and bytes, the index / subcollection / language
# name mappings, the indexes that were not built and the indexed fields
# together with their short codes.
#
# The configuration is collected in a hash that is local to each call.
# (It used to be collected in a package-global hash reference - the
# lexical %build_cfg that was declared was never used - so a second call
# in the same process appended to 'indexfields' again and could carry
# over a stale 'subcollectionmap' or 'languagemap'.)
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp";

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only stored when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # store the indexfieldmap information
    # add all fields bit - sort based on keys. text only is put at front
    my @indexfieldmap = ();
    if (defined $self->{'buildproc'}->{'indexfields'}->{'TextOnly'}) {
        push (@indexfieldmap, "TextOnly->TX");
    }
    foreach my $field (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
        next if $field eq "TextOnly";
        push (@indexfieldmap, $field . "->" . $self->{'buildproc'}->{'indexfieldmap'}->{$field});
    }
    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;

    # store the indexed field information
    foreach my $field (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
        push (@{$build_cfg->{'indexfields'}}, $field);
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');
}
930
# Counterpart of init; this implementation has nothing to clean up and
# only takes its invocant off the argument list.
sub deinit {
    my $self = shift;
}
934
# Prints the document processor's statistics for the pass that has just
# finished to the output handle: the pass type, the total number of bytes
# in the collection and the number of bytes processed for the current
# index. A warning is added when fewer than 50 bytes were processed,
# unless this was a text compression pass with no_text set.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # enough text, or nothing was expected: no warning
    return unless $num_processed_bytes < 50;
    return unless $indexing_text || !$self->{'no_text'};

    my $warning = $indexing_text
        ? "WARNING: There is very little or no text to process for $index\n"
        : "WARNING: There is very little or no text to compress\n";

    print $outhandle "***************\n";
    print $outhandle $warning;
    print $outhandle " Was this your intention?\n";
    print $outhandle "***************\n";
}
964
9651;
966
967
Note: See TracBrowser for help on using the repository browser.