source: trunk/gsdl/perllib/mgppbuilder.pm@ 2811

Last change on this file since 2811 was 2772, checked in by kjm18, 23 years ago

Changes to enable language-specific collectionmeta in collect.cfg.
collectionmeta is now specified as, e.g.:
collectionmeta collectionname [l=en] "greenstone demo"
Any entries without a language parameter are used as the default.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.3 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# Turn autoflush on for STDOUT and STDERR as soon as this module is
# compiled, so that mg doesn't get out of sync with the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off on both streams when the interpreter exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# Maximum document size setting.
# NOTE(review): presumably in kilobytes (a comment in compress_text calls
# it "12 meg") - confirm before relying on the unit.
$maxdocsize = 12000;

# File suffixes of the index files that are worth keeping after a build.
%wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# change this so a user can add their own ones in via a file or cfg
#
# Maps each known metadata name to its two-letter index field code, and
# also marks every code itself as known (code => 1).
%static_indexfield_map = map { ($_->[0] => $_->[1], $_->[1] => 1) } (
    ['Title',        'TI'],
    ['Subject',      'SU'],
    ['Creator',      'CR'],
    ['Organization', 'OR'],
    ['Source',       'SO'],
    ['Howto',        'HT'],
    ['ItemTitle',    'IT'],
    ['ProgNumber',   'PN'],
    ['People',       'PE'],
    ['TextOnly',     'TX'],
);
85
# Constructor.
#
# Reads the collection's collect.cfg, expands the configured indexes by
# subcollection and by language, removes duplicate index specifications,
# loads the plugins and classifiers, and creates the document processor
# (buildproc) used by the build passes.
#
# Positional arguments:
#   $class               class to bless the new object into
#   $collection          collection name; a collection-specific
#                        "<collection>buildproc.pm" is used when present
#   $source_dir, $build_dir, $verbosity, $maxdocs, $debug, $keepold,
#   $allclassifications  stored on the object under the same names
#   $outhandle           handle for progress messages (default STDERR)
#   $no_text             stored as 'no_text' (default 0)
#
# Dies when collect.cfg cannot be found, when no plugins could be loaded,
# or when the buildproc module cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>[],    # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map
                     }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    $self->{'collect_cfg'} = {} unless defined $self->{'collect_cfg'};
    my $cfg = $self->{'collect_cfg'};

    # a configuration without an "indexes" entry simply has no indexes;
    # normalise it here so nothing below dereferences undef
    $cfg->{'indexes'} = [] unless defined $cfg->{'indexes'};

    # sort out subcollection indexes: every index is built once per
    # subcollection, named "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my @expanded = ();
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@{$cfg->{'indexes'}}) {
                push (@expanded, "$index:$subcollection");
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # sort out language subindexes: the language is always the third
    # colon-separated part, so when there is no subcollection part an
    # empty one is inserted ("index::language")
    if (defined $cfg->{'languages'}) {
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        my @expanded = ();
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@{$cfg->{'indexes'}}) {
                push (@expanded, "$index$separator$language");
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence is kept, order is preserved)
    my %seen = ();
    $cfg->{'indexes'} = [grep { !$seen{$_}++ } @{$cfg->{'indexes'}}];

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    if (defined $cfg->{'levels'}) {
        foreach my $level (@{$cfg->{'levels'}}) {
            $self->{'levels'}->{$level} = 1;
        }
    }

    # get the list of plugins for this collection and load them all
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them all
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # call the constructor through the class name held in a string; this
    # needs no string eval, and any error raised by the constructor
    # propagates to our caller
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
214
# Prepares the build directory for a fresh build.  Nothing is touched when
# the object is in debug mode or was asked to keep the old build.
sub init {
    my ($self) = @_;

    unless ($self->{'debug'} || $self->{'keepold'}) {
        my $builddir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($builddir);
        &util::mk_all_dir($builddir);

        # make the text directory
        &util::mk_all_dir("$builddir/text");
    }
}
228
# Records the strip_html setting on this object and passes the same value
# on to the document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
236
# Creates the compressed text for the collection in <build_dir>/text.
#
# Runs the documents through the plugins twice: a first pass piped into
# "mgpp_passes -T1" to collect text statistics, then (after building the
# compression dictionary with mgpp_compression_dict) a second pass piped
# into "mgpp_passes -T2" to compress the text.
#
# In debug mode no external program is run; the document processor writes
# to STDOUT instead.
#
# Arguments:
#   $textindex   index specification handed to the document processor
#
# Dies when a required executable is missing or a pipe cannot be opened.
# Each executable is checked for under $GSDLHOME/bin/$GSDLOS, but the
# commands themselves are run by name, so they are resolved via the PATH.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    if ($self->{'levels'}->{'Section'}) {
        $mgpp_passes_sections .= "-K Section ";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # close the pipe exactly once, and only if one was opened; STDOUT is
    # left alone in debug mode
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second (compression) pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    $buildproc->reset();
    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
339
# Decides whether an index should be built.
#
# Each "dontbuild" entry of collect.cfg is used as a regular expression
# that must match the whole index name.  On the first match the index's
# mapped directory name is appended to the 'notbuilt' list and 0 is
# returned; without a match (or without a dontbuild list) 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
355
# Builds either the single index named by $indexname (when it contains a
# word character) or every index listed in collect.cfg.  The mapping from
# index description to directory name is (re)created first; indexes that
# want_built() rejects are reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }
        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
383
# creates directory names for each of the index descriptions
#
# Arguments:
#   $indexes   reference to a list of index descriptions of the form
#              "fields[:subcollection[:languages]]"
#
# Returns a reference to a hash containing
#   <index description>       => directory name for that index
#   'indexmap'                => { fields        => processed name }
#   'subcollectionmap'        => { subcollection => processed name }
#   'languagemap'             => { languages     => processed name }
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                             => the keys of the maps above, in the order
#                                they were first seen
# and, additionally, each fields / subcollection / languages string mapped
# directly to its processed name.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # processed name => original name, kept separately for each component.
    # Each entry must be its own hash: initialising them with '' made all
    # three resolve (through a symbolic reference) to one shared hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my $pindex = lc ($self->process_field ($fields));

        # next comes a processed version of the subcollection if there is one.
        my $psub = lc ($self->process_field ($subcollection));

        # next comes a processed version of the language if there is one.
        my $plang = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the language string itself, so that the order list
            # holds exactly the keys of 'languagemap'
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
454
# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component - otherwise it will contain the first
# character of the first two components
#
# Components are separated by commas.  A single component without a later
# consonant yields its first two characters, and a field of only one
# character yields that character.  An undefined field, or one without
# any word character, yields the empty string.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # first character of each of the first two components
        return join ("", map { substr ($_, 0, 1) } @components[0, 1]);
    }

    # first character plus the next consonant ...
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;

    # ... or simply the first two characters ...
    ($first, $second) = $field =~ /^(.)(.)/
        unless defined $first && defined $second;

    # ... or, for a one character field, just that character, so that the
    # result is never empty for a usable field
    ($first, $second) = (substr ($field, 0, 1), "")
        unless defined $first && defined $second;

    return "$first$second";
}
477
# Called when the directory name built from the three processed name
# components collides with one already in use.  Changes one component
# (through its reference) to its next version and returns the new
# concatenated directory name.
#
# Arguments:
#   $namehash   hash ref with keys 'index', 'subcollection' and
#               'languages', each mapping a processed name to the
#               original name it was last assigned to
#   $index      index description "fields[:subcollection[:languages]]"
#   $indexref, $subref, $langref
#               references to the processed names of the three components
#
# The first component whose processed name is registered for a different
# original name is the one that gets changed.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    if ($namehash->{'index'}->{$$indexref} ne "$fields") {
        $self->get_next_version ($indexref);
    } elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $self->get_next_version ($subref);
    } elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $self->get_next_version ($langref);
    } else {
        # every component is already registered for exactly this index.
        # The caller keeps calling until the name is unique, so something
        # has to change here or it would loop forever.
        $self->get_next_version ($indexref);
    }
    return "$$indexref$$subref$$langref";
}
492
# Advances the name referred to by $nameref to its next version, in place:
#   - a name ending in two digits has that number incremented
#   - a name ending in one digit has the digit incremented (9 becomes 10)
#   - any other name has its last character replaced by 0
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # only one trailing digit here, so it is that single digit that
        # must be replaced - also when 9 rolls over to 10
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
508
# Builds one index in its own subdirectory of the build directory.
#
# Arguments:
#   $index   index description "fields[:subcollections[:languages]]";
#            its directory name is looked up in $self->{'index_mapping'}
#
# The documents are run through the plugins twice, piped into
# "mgpp_passes -I1" (index dictionary) and "mgpp_passes -I2" (inversion).
# Afterwards the perfect hash, weights file, stemmed dictionary and stem
# indexes are created with the other mgpp tools.  In debug mode no
# external program is run and the document processor writes to STDOUT.
#
# Dies when a required executable is missing, a pipe cannot be opened or
# the index directory cannot be read.  Each executable is checked for
# under $GSDLHOME/bin/$GSDLOS, but the commands are run by name, so they
# are resolved via the PATH.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "";
    foreach my $level (keys (%{$self->{'levels'}})) {
        if ($level eq "Section" || $level eq "Paragraph") {
            $mgpp_passes_sections .= "-K $level ";
        }
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcol (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcol};
        push (@$indexexparr, $expression) if (defined ($expression));
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    my @known_languages = @{$self->{'collect_cfg'}->{'languages'} || []};
    foreach my $langspec (@languages) {
        # a leading "!" negates the language expression
        my $lang = $langspec;
        my $not = ($lang =~ s/^\!//) ? "!" : "";
        if (grep { $_ eq $lang } @known_languages) {
            push (@$indexexparr, $not . "Language/$lang/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    # nothing further to do in debug mode: no index files were written
    return if $self->{'debug'};

    close ($handle);

    # create the weights file
    print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mgpp_weights_build_exe") {
        die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
    }
    system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

    # create 'on-disk' stemmed dictionary
    print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mgpp_invf_dict_exe") {
        die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
    }
    system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

    # creates stem index files for the various stemming methods
    print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mgpp_stem_idx_exe") {
        die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
    }
    foreach my $stem_method (1, 2, 3) {
        system ("mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
    }

    # remove unwanted files
    # NOTE(review): the actual removal is commented out below, so files
    # with unwanted suffixes are only reported (at verbosity > 2).
    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
    opendir (my $dirhandle, $tmpdir) || die
        "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
    foreach my $file (readdir($dirhandle)) {
        next if $file =~ /^\./;
        my ($suffix) = $file =~ /\.([^\.]+)$/;
        if (defined $suffix && !defined $wanted_index_files{$suffix}) {
            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            #&util::rm (&util::filename_cat ($tmpdir, $file));
        }
    }
    closedir ($dirhandle);
}
686
# Creates the collection information database and processes associated
# files.
#
# Everything is written as text records to a pipe into txt2db (or to
# STDOUT in debug mode), in this order:
#   1. a "[collection]" record with the collectionmeta entries of
#      collect.cfg (only when collectionmeta is configured)
#   2. whatever the plugins / document processor emit in 'infodb' mode
#   3. the classification information
#   4. a "[browselist]" record listing all documents
#
# Dies when txt2db is missing or the pipe to it cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
    if (scalar(keys %{$buildproc->{'indexfieldmap'}}) == 0) {
        # check build.cfg to see if indexfields have been filled in
        my $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building/build.cfg");
        if (-e $buildconfigfile) {
            my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);
            if (defined $buildcfg->{'indexfields'}) {
                foreach my $field (@{$buildcfg->{'indexfields'}}) {
                    $buildproc->{'indexfields'}->{$field} = 1;
                }
            }
            if (defined $buildcfg->{'indexfieldmap'}) {
                foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
                    # entries have the form "name->shortname"
                    my ($name, $shortname) = $entry =~ /^(.*)\-\>(.*)$/;
                    next unless defined $name;    # skip malformed entries
                    $buildproc->{'indexfieldmap'}->{$name} = $shortname;
                }
            }
        }
    }

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        # the database name is quoted like the file arguments of the other
        # external commands, so that paths containing spaces survive
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe \"$fulldbname\"")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});

    $buildproc->reset();

    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    if (defined $collectionmeta) {

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        # keys are visited in sorted order so that the output (and the
        # fallback default chosen below) does not depend on hash order
        foreach my $cmeta (sort keys (%$collectionmeta)) {
            # names starting with "." refer to an index, subcollection or
            # language and are written under their mapped (directory) name
            my $cmetamap = $cmeta;    # otherwise just use the same name
            if ($cmeta =~ /^\.(.*)$/s) {
                my $mapped = $self->{'index_mapping'}->{$1};
                if (!defined $mapped) {
                    print $outhandle "mgppbuilder: warning bad collectionmeta option '$1' - ignored\n";
                    next;    # ignore this one
                }
                $cmetamap = $mapped;
            }

            my $values = $collectionmeta->{$cmeta};
            my $defaultfound = 0;
            my $first = 1;
            my $metadata_entry = "";
            my $default = "";

            # iterate through the languages
            foreach my $lang (sort keys (%$values)) {
                if ($first) {
                    $first = 0;
                    # set the default default to the first entry
                    $default = $values->{$lang};
                }
                if ($lang =~ /default/) {
                    $defaultfound = 1;
                    # the default entry goes first
                    $metadata_entry = "<$cmetamap>" .
                        $values->{'default'} . "\n" . $metadata_entry;
                }
                else {
                    # language specific entries have keys like "[l=en]"
                    my ($langcode) = $lang =~ /^\[l=(\w*)\]$/;
                    if ($langcode) {
                        $metadata_entry .= "<$cmetamap:$langcode>" .
                            $values->{$lang} . "\n";
                    }
                }
            }
            # if we haven't found a default, put one in
            if (!$defaultfound) {
                $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
            }
            # write the entry to the file
            print $handle $metadata_entry;
        }

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    # output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close ($handle) if !$self->{'debug'};
}
837
# Placeholder with no collection-specific work of its own; it just hands
# back the object it was called on.
sub collect_specific {
    my ($self) = @_;
    return $self;
}
841
# Writes <build_dir>/build.cfg, which records the build date and type,
# document and byte counts, the index / subcollection / language name
# mappings, the indexes that were not built and the indexed fields.
#
# The build information is collected in a hash that is private to each
# call, so nothing is carried over from an earlier call.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    my $buildproc = $self->{'buildproc'};
    my $index_mapping = $self->{'index_mapping'};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp";

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $buildproc->get_num_docs();
    $build_cfg->{'numbytes'} = $buildproc->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$index_mapping->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" . $index_mapping->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$index_mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $index_mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$index_mapping->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $index_mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    foreach my $field (keys %{$buildproc->{'indexfields'}}) {
        push (@indexfieldmap, $field . "->" . $buildproc->{'indexfieldmap'}->{$field});
    }
    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;

    # store the indexed field information
    foreach my $field (sort keys %{$buildproc->{'indexfields'}}) {
        push (@{$build_cfg->{'indexfields'}}, $field);
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');
}
904
# Placeholder with no clean-up work of its own; it just hands back the
# object it was called on.
sub deinit {
    my ($self) = @_;
    return $self;
}
908
# Reports the byte counts of the pass that has just finished on the output
# handle, and adds a warning when fewer than 50 bytes were processed
# (unless text is deliberately not being stored during compression).
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # nothing more to say unless suspiciously little text was processed
    # in a pass that is expected to produce text
    my $text_expected = ($indexing_text || !$self->{'no_text'});
    if ($num_processed_bytes < 50 && $text_expected) {
        # when not indexing, $text_expected can only hold because text is
        # being stored, so the second message covers that case
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";
        print $outhandle "***************\n";
        print $outhandle $warning;
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }
}
938
9391;
940
941
Note: See TracBrowser for help on using the repository browser.