source: trunk/gsdl/perllib/mgppbuilder.pm@ 2562

Last change on this file since 2562 was 2525, checked in by kjm18, 23 years ago

removed unneeded output

  • Property svn:keywords set to Author Date Id Revision
File size: 28.9 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
BEGIN {
    # Unbuffer both standard streams while the module is in use, so that
    # output from the mg programs and from the plugins stays in order.
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    # Restore normal buffering on the way out.
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# Not referenced by the code in this file.
$maxdocsize = 12000;

# Suffixes of the index files that build_index keeps; any file in an index
# directory whose suffix is not listed here is deleted after the build.
%wanted_index_files =
    map { $_ => 1 } qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# Maps each metadata field name to its two-letter index field code; every
# code is also present as a key of its own (with the value 1).
# TODO: let a user add their own entries via a file or the collection cfg.
%static_indexfield_map =
    map { ($_->[0] => $_->[1], $_->[1] => 1) } (
        ['Title',        'TI'],
        ['Subject',      'SU'],
        ['Creator',      'CR'],
        ['Organization', 'OR'],
        ['Source',       'SO'],
        ['Howto',        'HT'],
        ['ItemTitle',    'IT'],
        ['ProgNumber',   'PN'],
        ['People',       'PE'],
        ['TextOnly',     'TX'],
    );
85
# Creates an mgppbuilder object.
#
# Arguments (positional): class name, collection name, source (import)
# directory, build directory, verbosity, maxdocs, debug flag, keepold flag,
# allclassifications flag, output handle (defaults to STDERR) and the
# no_text flag (defaults to 0).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and language, loads the plugins and classifiers, and
# instantiates the document processor (a collection-specific
# "<collection>buildproc" if one exists, otherwise mgppbuildproc).
# Dies if collect.cfg is missing or if no plugins could be loaded.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>[],       # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes: every index is repeated once per
    # subcollection as "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index is repeated once per
    # language, the language always being the third ":"-separated part
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %seen = ();
    my @configured = @{$self->{'collect_cfg'}->{'indexes'}};
    $self->{'collect_cfg'}->{'indexes'} = [];
    foreach my $i (@configured) {
        next if defined $seen{$i};
        push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
        $seen{$i} = 1;
    }

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
            $self->{'levels'}->{$level} = 1;
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # A class name held in a string can be used directly as the invocant,
    # so no string eval is needed; any exception from the constructor
    # propagates to the caller as before.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
214
# Prepares the build directory. Unless the builder is in debug or keepold
# mode, the existing build directory is removed and recreated together
# with its "text" subdirectory.
sub init {
    my ($self) = @_;

    if (!($self->{'debug'} || $self->{'keepold'})) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
228
# Records the strip_html setting on the builder and passes the same value
# on to the document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
236
# Builds the compressed text for the collection in <build_dir>/text.
#
# The documents are read twice through the plugins: the first pass pipes
# them to "mgpp_passes -T1" to collect text statistics, after which
# mgpp_compression_dict builds the dictionary; the second pass pipes them
# to "mgpp_passes -T2" to do the compression. In debug mode no external
# program is run and the document processor writes to STDOUT instead.
# $textindex is handed to the document processor as the index to process.
sub compress_text {
    my ($self, $textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $passes_path = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $dict_path = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $builddir = $self->{'build_dir'};
    my $basefilename = "text/$self->{'collection'}";

    # the external programs expect backslashes on windows
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $basefilename =~ tr{/}{\\};
        $builddir =~ tr{/}{\\};
    }

    # define the section names for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $sections = $self->{'levels'}->{'Section'} ? "-K Section " : "";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # --- first pass: collect the statistics for the text
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);

    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    }
    else {
        unless (-e "$passes_path" &&
                open (PIPEOUT, "| mgpp_passes$exe $sections -d $builddir -f $basefilename -T1")) {
            die "mgppbuilder::compress_text - couldn't run $passes_path\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});
    close (PIPEOUT);

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # --- create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    unless ($self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        die "mgppbuilder::compress_text - couldn't run $dict_path\n"
            unless -e "$dict_path";
        system ("mgpp_compression_dict$exe -d $builddir -f $basefilename -S -H -2 -k 5120");

        # reopen the pipe for the second pass
        unless (-e "$passes_path" &&
                open ($handle, "| mgpp_passes$exe $sections -f $basefilename -d $builddir -T2")) {
            die "mgppbuilder::compress_text - couldn't run $passes_path\n";
        }
    }

    # --- second pass: compress the text
    $buildproc->reset();
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
338
# Decides whether an index should be built. Each "dontbuild" entry of the
# collection configuration is used as a regular expression that must match
# the whole index name. On a match, the directory name mapped to the index
# is appended to the 'notbuilt' list and 0 is returned; otherwise 1.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $pattern (@$patterns) {
        next unless $index =~ /^$pattern$/;
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
354
# Builds either the single index named by $indexname (when it contains a
# word character) or every index listed in the collection configuration.
# The index-to-directory mapping is recomputed for the chosen indexes, and
# indexes rejected by want_built are reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
382
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index specifications of the form
# "fields[:subcollection[:languages]]". Returns a reference to a hash that
# holds
#   - full index specification => directory name,
#   - 'indexmap', 'subcollectionmap', 'languagemap': the processed name of
#     each individual part,
#   - 'indexmaporder', 'subcollectionmaporder', 'languagemaporder': the
#     parts in the order they were first seen,
#   - each part on its own => its processed name (used for collectionmeta).
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # Processed name => original value, kept separately per part. Each part
    # needs a hash of its own: an empty-string value here would be used as
    # a symbolic reference, making all three parts share one global hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my ($pindex) = $self->process_field($fields);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the language specification of this index
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
453
# Returns a processed (abbreviated) version of a field.
#
# - undefined fields, or fields without any word character, give "";
# - if the field has two or more comma-separated components, the result
#   is the first character of each of the first two components;
# - otherwise the result is the first character followed by the next
#   consonant of the field, or failing that its first two characters;
# - a field too short to supply a second character gives its first
#   character only.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        my @initials = @components[0, 1];
        foreach my $component (@initials) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @initials);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $second) = $field =~ /^(.)(.)/
        unless defined $first && defined $second;

    # neither pattern matched: keep the first character rather than
    # returning an empty name
    unless (defined $first && defined $second) {
        ($first) = $field =~ /^(.)/;
        $first = "" unless defined $first;
        $second = "";
    }

    return "$first$second";
}
476
# Resolves a directory-name collision for $index. The processed index,
# subcollection and language names (passed by reference) are checked in
# that order; the first one whose recorded original value in $namehash
# differs from the corresponding part of $index is advanced to its next
# version. Returns the concatenation of the three processed names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        ['index',         "$fields",      $indexref],
        ['subcollection', $subcollection, $subref],
        ['languages',     $languages,     $langref],
    );

    foreach my $part (@parts) {
        my ($kind, $original, $nameref) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $original) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
491
# Advances the name referenced by $nameref to its next version, in place:
#   - a name ending in two digits has that number incremented,
#   - a name ending in a single digit has the digit incremented, with 9
#     rolling over to 10,
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # only one trailing digit here, so replace just that digit;
        # 9 becomes 10
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
507
# Builds one index in its own subdirectory of the build directory.
#
# The steps are: work out the subcollection/language filter expressions
# for the index, pipe the documents through "mgpp_passes -I1" (index
# dictionary), run mgpp_perf_hash_build, pipe the documents through
# "mgpp_passes -I2" (inversion), then run mgpp_weights_build,
# mgpp_invf_dict and mgpp_stem_idx and finally delete the files whose
# suffix is not in %wanted_index_files. In debug mode no external program
# is run and the document processor writes to STDOUT.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $builddir = $self->{'build_dir'};

    my $basefilename = &util::filename_cat ($indexdir, $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $passes_path = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $sections = "";
    foreach my $level (keys (%{$self->{'levels'}})) {
        next unless ($level eq "Section" || $level eq "Paragraph");
        $sections .= "-K $level ";
    }

    my $perf_hash_path = &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $weights_path = &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $invf_dict_path = &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $stem_idx_path = &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    # the external programs expect backslashes on windows
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $builddir =~ tr{/}{\\};
        $basefilename =~ tr{/}{\\};
    }

    # checks that a program exists, then runs the given command line(s)
    my $run_tool = sub {
        my ($path, @commands) = @_;
        if (!-e "$path") {
            die "mgppbuilder::build_index - couldn't run $path\n";
        }
        foreach my $command (@commands) {
            system ($command);
        }
    };

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $wanted (@languages) {
        # a leading "!" negates the language
        my $prefix = ($wanted =~ s/^\!//) ? "!" : "";
        foreach my $known (@{$self->{'collect_cfg'}->{'languages'}}) {
            if ($known eq $wanted) {
                push (@$indexexparr, $prefix . "Language/$wanted/");
                last;
            }
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    }
    else {
        unless (-e "$passes_path" &&
                open (PIPEOUT, "| mgpp_passes$exe $sections -d $builddir -f $basefilename -I1")) {
            die "mgppbuilder::build_index - couldn't run $passes_path\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    unless ($self->{'debug'}) {
        # create the perfect hash function
        $run_tool->($perf_hash_path,
                    "mgpp_perf_hash_build$exe -d $builddir -f $basefilename");

        unless (-e "$passes_path" &&
                open ($handle, "| mgpp_passes$exe $sections -d $builddir -f $basefilename -I2")) {
            die "mgppbuilder::build_index - couldn't run $passes_path\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    unless ($self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        $run_tool->($weights_path,
                    "mgpp_weights_build$exe -d $builddir -f $basefilename");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        $run_tool->($invf_dict_path,
                    "mgpp_invf_dict$exe -d $builddir -f $basefilename");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        $run_tool->($stem_idx_path,
                    "mgpp_stem_idx$exe -b 4096 -s1 -d $builddir -f $basefilename",
                    "mgpp_stem_idx$exe -b 4096 -s2 -d $builddir -f $basefilename",
                    "mgpp_stem_idx$exe -b 4096 -s3 -d $builddir -f $basefilename");

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            next unless (defined $suffix && !defined $wanted_index_files{$suffix});

            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            &util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir (DIR);
    }
}
682
# Creates the collection information database and processes associated
# files.
#
# The documents are piped through the plugins into txt2db, which writes
# <build_dir>/text/<collection>.ldb (or .bdb on big-endian machines);
# associated files go to <build_dir>/assoc. Before the documents, a
# "[collection]" record with the collection metadata and the index field
# names is written (only when collectionmeta is configured), and after
# them the classification information. In debug mode everything is
# written to STDOUT and txt2db is not run.
#
# If the document processor has no index field map yet (the info database
# is being built separately from the indexes), the map is restored from
# the "indexfields" / "indexfieldmap" entries of building/build.cfg.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # check build.cfg to see if indexfields have been filled in
        my $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building/build.cfg");
        if (-e $buildconfigfile) {
            my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);
            if (defined $buildcfg->{'indexfields'}) {
                foreach my $field (@{$buildcfg->{'indexfields'}}) {
                    $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
                }
            }
            if (defined $buildcfg->{'indexfieldmap'}) {
                foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
                    # entries have the form "field->shortname"; anything
                    # else is skipped instead of being stored under an
                    # empty key
                    my ($fieldname, $shortname) = $entry =~ /^(.*)\-\>(.*)$/;
                    next unless defined $fieldname;
                    $self->{'buildproc'}->{'indexfieldmap'}->{$fieldname} = $shortname;
                }
            }
        }
    }

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});

    $self->{'buildproc'}->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            # names starting with "." refer to an index, subcollection or
            # language and are written under their mapped name
            if ($cmeta =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$cmeta}) {
                    print $handle "<$self->{'index_mapping'}->{$cmeta}>" .
                        $self->{'collect_cfg'}->{'collectionmeta'}->{".$cmeta"} . "\n";
                } else {
                    print $outhandle "mgppbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                }
            } else {
                print $handle "<$cmeta>$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}\n";
            }
        }
        # print out the indexfield mapping
        foreach my $field (keys %{$self->{'buildproc'}->{'indexfields'}}) {
            my $shortname = $self->{'buildproc'}->{'indexfieldmap'}->{$field};
            print $handle "<$shortname>$field\n" if defined $shortname;
        }
        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
790
# Does nothing in this class beyond taking the invocant off the argument
# list.
sub collect_specific {
    my $self = shift;
}
794
# Writes <build_dir>/build.cfg, which records the build date and type, the
# number of documents and bytes, the index / subcollection / language
# directory mappings, the indexes that were not built, and the indexed
# fields with their short names.
sub make_auxiliary_files {
    my $self = shift (@_);

    # a fresh configuration for every call, so that nothing from an
    # earlier call can leak into this build.cfg
    my $build_cfg = {};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp";

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    foreach my $field (keys %{$self->{'buildproc'}->{'indexfields'}}) {
        push (@indexfieldmap, "$field\-\>$self->{'buildproc'}->{'indexfieldmap'}->{$field}");
    }
    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;

    # store the indexed field information (left out when nothing was indexed)
    my @indexfields = sort keys %{$self->{'buildproc'}->{'indexfields'}};
    $build_cfg->{'indexfields'} = \@indexfields if scalar (@indexfields);

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');

}
857
# Does nothing in this class beyond taking the invocant off the argument
# list.
sub deinit {
    my $self = shift;
}
861
# Prints the statistics of the pass that has just finished to the output
# handle: the index being processed, the total bytes in the collection and
# the bytes processed for this index. A warning block is added when fewer
# than 50 bytes were processed, unless the pass was a text compression
# pass with no_text set.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my @report = (
        $indexing_text ? "Stats (Creating index $index)\n"
                       : "Stats (Compressing text from $index)\n",
        "Total bytes in collection: $num_bytes\n",
        "Total bytes in $index: $num_processed_bytes\n",
    );

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        push (@report,
              "***************\n",
              $indexing_text
                  ? "WARNING: There is very little or no text to process for $index\n"
                  : "WARNING: There is very little or no text to compress\n",
              " Was this your intention?\n",
              "***************\n");
    }

    print $outhandle join ("", @report);
}
891
8921;
893
894
Note: See TracBrowser for help on using the repository browser.