source: trunk/gsdl/perllib/mgppbuilder.pm@ 2478

Last change on this file since 2478 was 2478, checked in by kjm18, 23 years ago

brought it in line with changes to buildcol.pl, mgbuilder.pm
now uses the new mgpp executable names (mgpp_passes instead of mg_passes)

  • Property svn:keywords set to Author Date Id Revision
File size: 28.9 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# Switch autoflush on for STDOUT and STDERR while the module is loaded so
# that mg output doesn't get out of sync with plugin output, and switch it
# back off again when the interpreter shuts down.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# Maximum document size setting.
# NOTE(review): no code visible in this file reads $maxdocsize; it is only
# mentioned in a comment inside compress_text.
$maxdocsize = 12000;

# File suffixes that are kept in an index directory. build_index deletes
# every file whose suffix is not a key of this table.
%wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 i il w wa);
63
# Built-in index field table: each metadata name maps to its two-letter
# short name, and each short name is itself registered with the value 1.
# TODO (from the original author): change this so a user can add their own
# ones in via a file or cfg
%static_indexfield_map = ();
foreach my $pair (['Title',        'TI'],
                  ['Subject',      'SU'],
                  ['Creator',      'CR'],
                  ['Organization', 'OR'],
                  ['Source',       'SO'],
                  ['Howto',        'HT'],
                  ['ItemTitle',    'IT'],
                  ['ProgNumber',   'PN'],
                  ['People',       'PE'],
                  ['TextOnly',     'TX']) {
    my ($longname, $shortname) = @$pair;
    $static_indexfield_map{$longname}  = $shortname;
    $static_indexfield_map{$shortname} = 1;
}
85
# Constructor.
#
# Arguments (positional): class, collection name, source (import) directory,
# build directory, verbosity, maxdocs, debug flag, keepold flag,
# allclassifications flag, output handle (defaults to STDERR) and no_text
# flag (defaults to 0).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and language, loads the plugins and classifiers, and
# creates the document processor ("buildproc").
#
# Dies if collect.cfg is missing, if no plugins could be loaded, or if the
# buildproc module cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>[],    # indexes not built
                      # note: a reference to the shared package-level table
                      'indexfieldmap'=>\%static_indexfield_map
                     }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes: every index is repeated once per
    # subcollection as "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index is repeated once per
    # language as a further ":language" component
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (first occurrence wins, order is kept)
    my %seen = ();
    my @configured = @{$self->{'collect_cfg'}->{'indexes'}};
    $self->{'collect_cfg'}->{'indexes'} = [];
    foreach my $index (@configured) {
        if (!defined ($seen{$index})) {
            push (@{$self->{'collect_cfg'}->{'indexes'}}, $index);
            $seen{$index} = 1;
        }
    }

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
            $self->{'levels'}->{$level} = 1;
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor directly on the class name held in
    # $buildproctype. This replaces a string eval of "new <class>(...)",
    # which compiled the collection name as Perl source.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
214
# Prepare the build directory before a build.
# Unless the builder is in debug or keepold mode, the old build directory
# is removed and recreated together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
228
# Record the strip_html setting on the builder and pass it on to the
# document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
236
# Create the compressed text for the collection.
#
# Runs two mgpp_passes passes over the documents (-T1 collects statistics,
# -T2 compresses) with mgpp_compression_dict run in between. $textindex is
# handed to the document processor via set_index.
#
# In debug mode no external program is started; the document processor
# writes to STDOUT instead.
#
# Dies if the mgpp_passes or mgpp_compression_dict executable is not found
# under $GSDLHOME/bin/$GSDLOS, or if a pipe cannot be opened. Note that the
# existence check uses that full path while the programs themselves are
# started by bare name.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $outhandle = $self->{'outhandle'};
    my $debug = $self->{'debug'};
    my $verbose = ($self->{'verbosity'} >= 1);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $builddir = $self->{'build_dir'};
    my $basefilename = "text/$self->{'collection'}";

    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $basefilename =~ s/\//\\/g;
        $builddir =~ s/\//\\/g;
    }

    # define the section names for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    if ($self->{'levels'}->{'Section'}) {
        $mgpp_passes_sections .= "-K Section ";
    }

    print $outhandle "\n*** creating the compressed text\n" if $verbose;

    # collect the statistics for the text
    # (no -b maximum-document-size option is passed on this command line)
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if $verbose;

    my ($handle);
    if ($debug) {
        $handle = STDOUT;
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -d $builddir -f $basefilename -T1")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        # the handle is passed around by name
        $handle = mgppbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # close the pass-1 pipe exactly once (it is never opened in debug mode)
    close ($handle) unless $debug;

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$debug) {
        print $outhandle "\n creating the compression dictionary\n" if $verbose;
        if (!-e "$mgpp_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -d $builddir -f $basefilename -S -H -2 -k 5120");

        # reopen the same named handle for the second pass; the document
        # processor still holds that name as its output handle
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f $basefilename -d $builddir -T2")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if $verbose;
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $debug;

    $self->print_stats();
}
338
# Decide whether an index should be built.
# Each entry of the collection's "dontbuild" list is used as a regular
# expression anchored at both ends. On the first match the index's mapped
# name is appended to the builder's "notbuilt" list and 0 is returned;
# otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined $dontbuild;

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
354
# Build either the single named index (when $indexname contains a word
# character) or every index listed in the collection configuration.
# The index-to-directory mapping is (re)created first and stored on the
# builder; indexes rejected by want_built are reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }
        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
382
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of "fields[:subcollection[:languages]]"
# strings. Returns a reference to a hash holding
#   - each full index description => its directory name,
#   - 'indexmap', 'subcollectionmap', 'languagemap': component => short name,
#   - 'indexmaporder', 'subcollectionmaporder', 'languagemaporder': the
#     components in the order they were first seen,
#   - each individual component => its short name (first one seen wins for
#     the fields component).
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # Which original name currently owns each short name, per component
    # kind. Each kind needs its own hash: initialising the slots with ''
    # would make all three resolve to one shared symbolic hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my ($pindex) = $self->process_field($fields);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record this index's language component (not an unrelated global)
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
453
# Returns a processed (shortened) version of a field.
# If the field has only one component the processed version contains the
# first character and the next consonant of that component (or simply the
# first two characters when there is no later consonant) - otherwise it
# contains the first character of each of the first two comma-separated
# components.
# Returns "" when the field is undefined, contains no word character, or
# is too short to yield two characters.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # reduce each of the first two components to its first character
        my @initials = @components[0, 1];
        foreach my $initial (@initials) {
            $initial =~ s/^(.).*$/$1/;
        }
        return join("", @initials);
    }

    # single component: first character plus the next consonant ...
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    # ... falling back to the first two characters
    ($first, $second) = $field =~ /^(.)(.)/
        unless defined $first && defined $second;

    # neither pattern matched (e.g. a one-character field)
    return "" unless defined $first && defined $second;
    return "$first$second";
}
476
# Resolve a directory-name collision for $index.
# The index, subcollection and language short names (passed by reference)
# are checked in that order against $namehash, which records the original
# name that owns each short name. The first short name owned by a different
# original name is advanced with get_next_version. The concatenation of the
# three short names is returned.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @candidates = (['index',         $indexref, "$fields"],
                      ['subcollection', $subref,   $subcollection],
                      ['languages',     $langref,  $languages]);

    foreach my $candidate (@candidates) {
        my ($kind, $shortref, $original) = @$candidate;
        next unless $namehash->{$kind}->{$$shortref} ne $original;
        $self->get_next_version ($shortref);
        last;
    }

    return "$$indexref$$subref$$langref";
}
491
# Advance the short name referenced by $nameref to its next version,
# modifying it in place:
#   - a name ending in two digits has that number incremented,
#   - a name ending in one digit has it incremented (9 becomes 10),
#   - any other name has its last character replaced by 0
#     (an empty name becomes "0").
# The name always changes, which is what lets the caller's collision loop
# terminate.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        if ($num == 9) {
            # only one trailing digit exists in this branch, so that is
            # the one to replace
            $$nameref =~ s/\d$/10/;
        } else {
            $num ++;
            $$nameref =~ s/\d$/$num/;
        }
    } else {
        # nothing to replace in an empty name - start it at 0 instead
        $$nameref =~ s/.$/0/ or $$nameref .= "0";
    }
}
507
# Build one index.
#
# $index is a "fields[:subcollection[:languages]]" description whose
# directory name is looked up in the builder's index mapping. Two
# mgpp_passes passes are run over the documents (-I1 creates the index
# dictionary, -I2 inverts the text) with mgpp_perf_hash_build in between,
# followed by mgpp_weights_build, mgpp_invf_dict and three mgpp_stem_idx
# runs. Finally every file in the index directory whose suffix is not in
# %wanted_index_files is deleted.
#
# In debug mode no external program is started and the document processor
# writes to STDOUT. Dies when a required executable is missing under
# $GSDLHOME/bin/$GSDLOS, when a pipe cannot be opened, or when the index
# directory cannot be read.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $builddir = $self->{'build_dir'};

    my $basefilename = &util::filename_cat ($indexdir, $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections =
        join ("", map { "-K $_ " }
                  grep { $_ eq "Section" || $_ eq "Paragraph" }
                  keys (%{$self->{'levels'}}));

    my $mgpp_perf_hash_build_exe = &util::filename_cat ($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe   = &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe       = &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe        = &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $builddir =~ tr{/}{\\};
        $basefilename =~ tr{/}{\\};
    }

    # options shared by every command line below
    my $common_args = "-d $builddir -f $basefilename";

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    my @subcollections = defined $subcollection ? split (/,/, $subcollection) : ();
    foreach my $subname (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subname};
        push (@$indexexparr, $expression) if defined ($expression);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = defined $language ? split (/,/, $language) : ();
    foreach my $wanted (@languages) {
        # a leading "!" negates the language expression
        my $negated = ($wanted =~ s/^\!//) ? 1 : 0;
        foreach my $configured (@{$self->{'collect_cfg'}->{'languages'}}) {
            next unless $configured eq $wanted;
            push (@$indexexparr, ($negated ? "!" : "") . "Language/$wanted/");
            last;
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections $common_args -I1")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        # the handle is passed around by name
        $handle = mgppbuilder::PIPEOUT;
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe $common_args");

        # reopen the same named handle for the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections $common_args -I2")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_weights_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe $common_args");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe $common_args");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_stem_idx_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        foreach my $method (1 .. 3) {
            system ("mgpp_stem_idx$exe -b 4096 -s$method $common_args");
        }

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            next if !defined $suffix || defined $wanted_index_files{$suffix};

            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            &util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir (DIR);
    }
}
682
# Create the collection information database and process associated files.
#
# Document data is piped into txt2db, which writes
# <build_dir>/text/<collection>.ldb (little-endian machines) or .bdb.
# The stream starts with a [collection] record holding the collection
# metadata and the index field short names (only when the configuration
# has collectionmeta), followed by the documents and finally the
# classification information.
#
# In debug mode txt2db is not started and everything is written to STDOUT.
# Dies if txt2db is missing or the pipe cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # check build.cfg to see if indexfields have been filled in
        my $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building/build.cfg");
        if (-e $buildconfigfile) {
            my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);
            if (defined $buildcfg->{'indexfields'}) {
                foreach my $field (@{$buildcfg->{'indexfields'}}) {
                    $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
                }
            }
            if (defined $buildcfg->{'indexfieldmap'}) {
                foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
                    # entries have the form "name->shortname"
                    my ($name, $short) = $entry =~ /^(.*)\-\>(.*)$/;
                    # skip entries that do not have that form rather than
                    # recording an undefined key and value
                    next unless defined $name;
                    $self->{'buildproc'}->{'indexfieldmap'}->{$name} = $short;
                }
            }
        }
    }

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        # the handle is passed around by name
        $handle = mgppbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});

    $self->{'buildproc'}->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            # a leading "." marks metadata that belongs to an index,
            # subcollection or language; it is written under the mapped
            # short name
            if ($cmeta =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$cmeta}) {
                    print $handle "<$self->{'index_mapping'}->{$cmeta}>" .
                        $self->{'collect_cfg'}->{'collectionmeta'}->{".$cmeta"} . "\n";
                    print $outhandle "have .section entry in collect file\n";
                } else {
                    print $outhandle "mgppbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                }
            } else {
                print $handle "<$cmeta>$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}\n";
            }
        }
        # print out the indexfield mapping
        foreach my $field (keys %{$self->{'buildproc'}->{'indexfields'}}) {
            my $shortname = $self->{'buildproc'}->{'indexfieldmap'}->{$field};
            print $handle "<$shortname>$field\n" if defined $shortname;
        }
        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
791
# Hook with no work to do in this builder. As in the original one-line
# body, the value handed back is the builder itself.
sub collect_specific {
    my ($self) = @_;
    return $self;
}
795
# Write <build_dir>/build.cfg describing the finished build: build date and
# type, document and byte counts, the index / subcollection / language
# maps, the indexes that were not built, and the indexed fields with their
# short names.
#
# The configuration is collected in a fresh hash on every call, so nothing
# from an earlier call leaks into the file.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp";

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" .
              $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only written when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    foreach my $field (keys %{$self->{'buildproc'}->{'indexfields'}}) {
        push (@indexfieldmap, $field . "->" .
              $self->{'buildproc'}->{'indexfieldmap'}->{$field});
    }
    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;

    # store the indexed field information (key only exists when there is
    # at least one indexed field)
    foreach my $field (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
        push (@{$build_cfg->{'indexfields'}}, $field);
    }

    # write out the build information; the first pattern names the single
    # valued entries, the second the list valued ones
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');

}
858
# Counterpart of init with no work to do in this builder. As in the
# original one-line body, the value handed back is the builder itself.
sub deinit {
    my ($self) = @_;
    return $self;
}
862
# Report the document processor's byte counts for the pass that has just
# finished on the builder's output handle, followed by a warning block when
# fewer than 50 bytes were processed (unless text is deliberately not being
# stored during a compression pass).
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    my $text_expected = ($indexing_text || !$self->{'no_text'});
    if ($num_processed_bytes < 50 && $text_expected) {
        # when not indexing, $text_expected implies that no_text is off
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";

        print $outhandle "***************\n";
        print $outhandle $warning;
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }

}
892
8931;
894
895
Note: See TracBrowser for help on using the repository browser.