source: main/tags/2.41-fiji/gsdl/perllib/mgppbuilder.pm@ 32083

Last change on this file since 32083 was 5768, checked in by kjdon, 21 years ago

Added a check for failed indexing: just test whether the .id file exists, and if it does not, don't continue building that index. Indexes that haven't been built are no longer included in the indexmap entry in the build config file, and therefore won't appear in the list of indexes.

  • Property svn:keywords set to Author Date Id Revision
File size: 37.8 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# Switch autoflush on for STDOUT and STDERR while the module is loaded so
# that mgpp doesn't get out of sync with the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off for both streams when the interpreter exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# Maximum document size setting. NOTE(review): a comment in compress_text
# refers to "-b $maxdocsize", but no code visible in this part of the file
# reads this variable - confirm before removing.
$maxdocsize = 12000;

# Two lookups share this table:
#   collect.cfg level name -> level tag handed to mgpp_passes (-J / -K)
#   level tag              -> default display macro written to the info db
%level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',

    'Doc'  => '_textdocument_',
    'Sec'  => '_textsection_',
    'Para' => '_textparagraph_',
);

#$doc_level = "Doc";
#$sec_level = "Sec";
#$para_level = "Para";

# suffixes of the index files that are wanted after an index has been built
%wanted_index_files =
    map { ($_ => 1) } qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# change this so a user can add their own ones in via a file or cfg
# Each metadata name maps to its short field name, and every short field name
# is itself entered (with value 1) so that it cannot be handed out again.
# AND, OR, NOT, NEAR and the level names (Doc, Sec, Para) are entered the same
# way because they cannot be used as field names.
%static_indexfield_map = (
    (map { ($_->[0] => $_->[1], $_->[1] => 1) }
         ['Title'        => 'TI'],
         ['Subject'      => 'SU'],
         ['Creator'      => 'CR'],
         ['Organization' => 'ORG'],
         ['Source'       => 'SO'],
         ['Howto'        => 'HT'],
         ['ItemTitle'    => 'IT'],
         ['ProgNumber'   => 'PN'],
         ['People'       => 'PE'],
         ['Coverage'     => 'CO'],
         ['allfields'    => 'ZZ'],
         ['text'         => 'TX']),
    (map { ($_ => 1) } qw(AND OR NOT NEAR Doc Sec Para)),
);
108
# Create an mgppbuilder object for one collection.
#
# Positional arguments:
#   $class              - class to bless the new object into
#   $collection         - collection name; also used to look for a
#                         collection specific "<collection>buildproc.pm"
#   $source_dir         - source directory handed to the plugins
#   $build_dir          - directory the build is written to
#   $verbosity          - progress messages are printed when this is >= 1
#   $maxdocs            - passed through to the plugins
#   $debug              - if true, output goes to STDOUT rather than to the
#                         mgpp programs
#   $keepold            - if true, init() leaves an existing build alone
#   $allclassifications - passed through to classify::output_classify_info
#   $outhandle          - where messages are printed (defaults to STDERR)
#   $no_text            - if true, compress_text tells the buildproc not to
#                         store the text (defaults to 0)
#
# Dies if collect.cfg cannot be found, if neither the document nor the
# section level is configured, if no plugins could be loaded, or if the
# buildproc constructor fails.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    # This is deliberately the handle *name* and not a glob: build_index
    # compares $outhandle against the string "STDERR".
    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>{},    # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    my $cfg = &colcfg::read_collect_cfg ($colcfgname);
    $self->{'collect_cfg'} = $cfg;

    # sort out the indexes
    # indexes are specified with spaces, but we put them into one index
    my @indexes = (join (',', @{$cfg->{'indexes'} || []}));

    # sort out subcollection indexes: one entry per subcollection
    if (defined $cfg->{'indexsubcollections'}) {
        my @expanded = ();
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            push (@expanded, map { "${_}:$subcollection" } @indexes);
        }
        @indexes = @expanded;
    }

    # sort out language subindexes: one entry per language. When there are
    # no subcollections an empty subcollection field is added ("::").
    if (defined $cfg->{'languages'}) {
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        my @expanded = ();
        foreach my $language (@{$cfg->{'languages'}}) {
            push (@expanded, map { $_ . $separator . $language } @indexes);
        }
        @indexes = @expanded;
    }

    # make sure that the same index isn't specified more than once
    my %seen = ();
    @indexes = grep { !$seen{$_}++ } @indexes;
    $cfg->{'indexes'} = \@indexes;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = [];    # was "= ()", which stored undef
    if (defined $cfg->{'levels'}) {
        # The loop variable aliases the configuration entries, so the level
        # names stored in collect_cfg are lower-cased as well (as before).
        foreach my $level (@{$cfg->{'levels'}}) {
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    # the document level is "document" if configured, otherwise "section"
    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }
    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # An ordinary class method call replaces the former string eval with
    # indirect object syntax; a failing constructor still dies here.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
259
# Get the build directory ready for a new build. Nothing is touched when
# either the 'debug' or the 'keepold' flag is set.
sub init {
    my ($self) = @_;

    unless ($self->{'debug'} || $self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r ($build_dir);
        &util::mk_all_dir ($build_dir);

        # make the text directory
        &util::mk_all_dir ("$build_dir/text");
    }
}
273
# Record the strip_html setting on this object and pass it on to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html ($strip);
}
281
# Create the compressed text for the collection: run mgpp_passes -T1 to
# collect text statistics, build the compression dictionary, then run
# mgpp_passes -T2 to compress the text.
#
#   $textindex - handed to the buildproc through set_index
#
# In debug mode nothing is piped to the mgpp programs: the buildproc writes
# to STDOUT and the compression dictionary is not created.
# Dies if a required executable does not exist or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $doc_level = $self->{'doc_level'};
    # plain hash lookups: the old "%level_map->{...}" form is a fatal error
    # from perl 5.22 onwards
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        next if ($level eq $doc_level || $level eq "paragraph");
        $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    # NOTE(review): an older comment here described "-b $maxdocsize", but no
    # -b option is passed on the command lines below.
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);

    # The handle is passed around by name; the pipe name is package
    # qualified so that it also resolves inside the buildproc.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        # the existence test uses the full path, the command relies on PATH
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # close the -T1 pipe exactly once (it was never opened in debug mode)
    close (PIPEOUT) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the same named handle for the second pass, so the handle
        # name already given to the buildproc stays valid
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    $buildproc->reset();
    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close (PIPEOUT) unless $self->{'debug'};

    $self->print_stats();
}
390
# Decide whether $index should be built. Every 'dontbuild' entry of the
# collection configuration is used as a pattern that must match the whole
# index name; on the first match the index is recorded in 'notbuilt' and 0
# is returned. Returns 1 otherwise.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless ($index =~ /^$pattern$/);
        #push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
407
# Build the indexes. If $indexname contains a word character only that
# index is considered, otherwise all the indexes from the collection
# configuration. Indexes rejected by want_built are reported and skipped.
# The final field lists are made once all indexes have been handled.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built ($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }
        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index ($index);
    }

    #define the final field lists
    $self->make_final_field_list ();
}
439
# creates directory names for each of the index descriptions
#
#   $indexes - reference to a list of index descriptions of the form
#              "fields", "fields:subcollection" or
#              "fields:subcollection:languages"
#
# Returns a reference to a hash holding
#   - index description => directory name
#   - 'indexmap', 'subcollectionmap' and 'languagemap': the processed name
#     for each fields / subcollection / languages part
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder': the
#     parts in the order they were first seen
#   - each fields / subcollection / languages part => its processed name
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    # processed name => original part, one hash per kind of part. (These
    # used to start out as '', which made all three entries symbolic
    # references to one and the same package hash.)
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        #my ($pindex) = $self->process_field($fields);
        #$pindex = lc ($pindex);
        # now we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = lc ($self->process_field ($subcollection));

        # next comes a processed version of the language if there is one.
        my $plang = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the languages part itself; the undeclared $language
            # was pushed here before, which stored undef
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
511
# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component - otherwise it will contain the first
# character of the first two components
#
# An undefined field, or one without any word character, gives "". A single
# component without a following consonant gives its first two characters,
# or just the one character if that is all there is.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        return join ("", map { substr ($_, 0, 1) } @components[0, 1]);
    }

    # first character plus the next consonant
    if ($field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i) {
        return "$1$2";
    }

    # no consonant to pick: fall back to the leading characters. A one
    # character field used to end up as "" here.
    return substr ($field, 0, 2);
}
534
# Called when a directory name is already taken. The three parts of $index
# (fields, subcollection, languages) are compared in that order with what
# $namehash has recorded for the current processed names; the first
# processed name recorded for something else is moved on to its next
# version. Returns the three processed names joined together.
#
#   $namehash - { 'index' | 'subcollection' | 'languages' => { processed
#               name => original part } }
#   $index    - index description "fields:subcollection:languages"
#   $indexref, $subref, $langref - references to the processed names; the
#               one that gets bumped is changed in place
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @checks = (
        ['index',         $indexref, "$fields"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $check (@checks) {
        my ($kind, $nameref, $wanted) = @$check;
        if ($namehash->{$kind}->{$$nameref} ne $wanted) {
            # only the first clashing part is changed on each call
            $self->get_next_version ($nameref);
            last;
        }
    }

    return join ("", $$indexref, $$subref, $$langref);
}
549
# Move the name referenced by $nameref on to its next version, in place:
#   - a name ending in two digits has that number incremented
#   - a name ending in one digit has that digit incremented ("9" -> "10")
#   - otherwise the last character is replaced by "0"
#   - an empty name becomes "0"
# The name is always changed, so a caller that loops until the name is
# unused makes progress on every call.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1;
        $num ++;    # increment of the captured string, e.g. "10" -> "11"
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        $num ++;
        # Replace the single trailing digit. The old "9" case substituted
        # on /\d\d$/, which cannot match here, so the name never changed.
        $$nameref =~ s/\d$/$num/;
    } elsif ($$nameref eq "") {
        # there is no character to replace
        $$nameref = "0";
    } else {
        $$nameref =~ s/.$/0/;
    }
}
565
# Build one index: create the index dictionary (mgpp_passes -I1), the
# perfect hash function, the inverted file (mgpp_passes -I2), the weights
# file, the on-disk stemmed dictionary and the stem indexes.
#
#   $index - index description ("fields[:subcollection[:languages]]"); its
#            directory is looked up in $self->{'index_mapping'}
#
# If the index dictionary (.id file) does not exist after the first pass the
# index is recorded in $self->{'notbuilt'} and the sub returns early.
# In debug mode the buildproc writes to STDOUT and no mgpp program is run.
# Dies if a required executable does not exist, a pipe cannot be opened or
# the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names and possibly the doc name for mgpasses
    my $doc_level = $self->{'doc_level'};
    # plain hash lookups: the old "%level_map->{...}" form is a fatal error
    # from perl 5.22 onwards
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        next if ($level eq $doc_level);
        $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
        push (@$indexexparr, $expression) if (defined ($expression));
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $wanted (@languages) {
        # a leading "!" negates the language
        my $negated = ($wanted =~ s/^\!//) ? 1 : 0;
        foreach my $known (@{$self->{'collect_cfg'}->{'languages'}}) {
            next unless ($known eq $wanted);
            push (@$indexexparr, ($negated ? "!" : "") . "Language/$wanted/");
            last;
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    # The handle is passed around by name; the pipe name is package
    # qualified so that it also resolves inside the buildproc.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        # the existence test uses the full path, the command relies on PATH
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close (PIPEOUT) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    # NOTE(review): this check also runs in debug mode, where no mgpp
    # program has been run - confirm that this is intended.
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the same named handle for the second pass, so the handle
        # name already given to the buildproc stays valid
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close (PIPEOUT);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_weights_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_stem_idx_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        foreach my $stem_method (1 .. 3) {
            system ("mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
        }

        # remove unwanted files
        # NOTE(review): the removal itself is commented out below, so files
        # with an unwanted suffix are only reported, not deleted.
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir ($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
}
759
# Create the info database and process associated files. The records are
# piped to txt2db (or printed to STDOUT in debug mode) in this order:
#   1. the [collection] entry: collection metadata, the index field names
#      and the level names
#   2. whatever the buildproc writes while the plugins read the source
#      directory in 'infodb' mode
#   3. the classification information
#   4. a [browselist] entry listing all the documents
# Dies if txt2db does not exist or the pipe cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb
    # called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }
    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    # The handle is passed around by name; the pipe name is package
    # qualified so that it also resolves in the other packages it goes to.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        # the existence test uses the full path, the command relies on PATH
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});

    $buildproc->reset();

    # do the collection info
    print $handle "[collection]\n";

    # first do the collection meta stuff - everything that does not start
    # with a dot
    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    my $collmetadefined = (defined $collectionmeta) ? 1 : 0;
    if ($collmetadefined) {
        foreach my $cmeta (keys (%$collectionmeta)) {
            next if ($cmeta =~ /^\./); # for now, ignore ones with dots
            my $metadata_entry = $self->create_language_db_map ($cmeta, $cmeta);
            #write the entry to the file
            print $handle $metadata_entry;
        }
    }

    #add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the
    # metadata name
    my $field_entry = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $buildproc->{'indexfieldmap'}->{$longfield};
        # a value of 1 marks a reserved name rather than a real mapping
        next if $shortfield eq 1;

        # we need to check if some coll meta has been defined
        my $collmeta = ".$longfield";
        if ($collmetadefined && defined $collectionmeta->{$collmeta}) {
            $field_entry .= $self->create_language_db_map ($collmeta, $shortfield);
        } elsif ($longfield eq "allfields") {
            # use the text macros for allfields and textonly
            $field_entry .= "<$shortfield>_query:textallfields_\n";
        } elsif ($longfield eq "text") {
            $field_entry .= "<$shortfield>_query:texttextonly_\n";
        } else {
            # otherwise use the metadata name
            $field_entry .= "<$shortfield>$longfield\n";
        }
    }
    print $handle $field_entry;

    # now add the level names
    my $level_entry = "";
    # The loop variable aliases the configuration entries, so lower-casing
    # it below also changes the names stored in collect_cfg (as before).
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $collmeta = ".$level"; # based on the specification as it stands now
        $level =~ tr/A-Z/a-z/; # make it lower case
        # find the actual value we used in the index. Plain hash lookups:
        # the old "%level_map->{...}" form is fatal from perl 5.22 onwards
        my $levelid = $level_map{$level};
        if ($collmetadefined && defined $collectionmeta->{$collmeta}) {
            $level_entry .= $self->create_language_db_map ($collmeta, $levelid);
        } else {
            # use the default macro
            $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
        }
    }
    print $handle $level_entry;
    #end the collection entry
    print $handle "\n" . ('-' x 70) . "\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    #output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar (@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close (PIPEOUT) if !$self->{'debug'};
}
893
# Build the info database lines for one piece of collection metadata.
#
#   $metaname - key into $self->{'collect_cfg'}->{'collectionmeta'}; the
#               value is a hash of language specifier => text, where the
#               specifier is "default" or "[l=xx]"
#   $mapname  - tag name to use in the output
#
# Returns "<mapname>text\n" for the default value followed by one
# "<mapname:xx>text\n" line per "[l=xx]" entry. When there is no default
# entry, the value of the first specifier (in sorted order) is used as the
# default. Specifiers are handled in sorted order so that the result does
# not depend on hash ordering.
sub create_language_db_map {
    my $self = shift (@_);
    my ($metaname, $mapname) = @_;

    my $values = $self->{'collect_cfg'}->{'collectionmeta'}->{$metaname} || {};
    my @specifiers = sort keys (%$values);

    # set the default default to the first entry
    my $default = @specifiers ? $values->{$specifiers[0]} : "";
    my $defaultfound = 0;
    my $metadata_entry = "";

    #iterate through the languages
    foreach my $lang (@specifiers) {
        if ($lang =~ /default/) {
            $defaultfound = 1;
            #the default entry goes first
            $metadata_entry = "<$mapname>" . $values->{'default'} . "\n" . $metadata_entry;
        } else {
            my ($code) = $lang =~ /^\[l=(\w*)\]$/;
            if ($code) {
                $metadata_entry .= "<${mapname}:$code>" . $values->{$lang} . "\n";
            }
        }
    }

    #if we haven't found a default, put one in
    if (!$defaultfound) {
        $metadata_entry = "<$mapname>$default\n" . $metadata_entry;
    }
    return $metadata_entry;
}
# Empty hook: does no work of its own and simply hands back the object it
# was called on.
sub collect_specific {
    my $self = shift;
    return $self;
}
933
# At the end of building, we have an indexfieldmap with all the mappings, plus
# some extras, and an indexmap with any indexes in it that weren't specified in the index definition.
# We want to make an ordered list of the fields that are indexed, and a list of the mappings that are used. These will be used for the build.cfg file, and for the collection meta definition.
# We store these in a build.cfg bit.
# Work out the ordered list of indexed fields and their short-name mappings
# and store them in $self->{'build_cfg'} (which is reset to a fresh hash):
#   'indexfields'   - array ref of field names, in index-definition order
#   'indexfieldmap' - array ref of "field->shortname" strings, same order
#
# Reads the index definitions from $self->{'collect_cfg'}->{'indexes'} and
# the per-field information from $self->{'buildproc'}->{'indexfields'} and
# $self->{'buildproc'}->{'indexfieldmap'}.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # Go through the index definition and note each field the first time it
    # is seen, so that we can easily check whether a field was explicitly
    # specified.  When expanding 'metadata' below we list all the individual
    # fields, but must not repeat any that the definition already names.
    my %specified = ();
    my @specified_order = ();
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff (work on a copy - $index aliases the config)
        my $parts = $index;
        $parts =~ s/:.*$//;
        foreach my $name (split (',', $parts)) {
            next if defined $specified{$name};
            $specified{$name} = 1;
            push (@specified_order, "$name");
        }
    }

    my @indexfieldmap = ();
    my @indexfields = ();
    foreach my $field (@specified_order) {
        if ($field eq "metadata") {
            # expand to every field that was not named explicitly; sorted so
            # that the resulting order does not depend on hash ordering
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $specified{$newfield};
                push (@indexfieldmap, $newfield . "->" .
                      $self->{'buildproc'}->{'indexfieldmap'}->{$newfield});
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text->TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields->ZZ");
            push (@indexfields, "allfields");
        } else {
            push (@indexfieldmap, $field . "->" .
                  $self->{'buildproc'}->{'indexfieldmap'}->{$field});
            push (@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
989
990
# Recreate the field list from the build.cfg file: look first in building, then in index, to find it. If there is no build.cfg, we can't do the field list (there is unlikely to be any index anyway).
# Rebuild the field information from an existing build.cfg file.
#
# $self->{'build_cfg'} is reset to a fresh hash.  The file is looked for in
# $self->{'build_dir'} first and then in "index" under $ENV{'GSDLCOLLECTDIR'}.
# If neither exists the sub returns early, leaving 'build_cfg' empty.
# Otherwise 'indexfields' and 'indexfieldmap' are copied from the file into
# $self->{'build_cfg'}, and every "field->shortname" mapping is also recorded
# in $self->{'buildproc'}->{'indexfieldmap'}.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        # we can't find a config file - just ignore the field list
        return if (!-e $buildconfigfile);
    }

    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$entry");
            # only record entries that really are "field->shortname";
            # anything else would end up stored under an empty key
            if ($entry =~ /^(.*)\-\>(.*)$/) {
                my ($name, $shortname) = ($1, $2);
                $self->{'buildproc'}->{'indexfieldmap'}->{$name} = $shortname;
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
# Gather the build information and write it to build.cfg in
# $self->{'build_dir'} (creating the directory first).
#
# Starts from $self->{'build_cfg'} when it is defined (it then already
# holds indexfieldmap and indexfields) and adds: builddate, buildtype,
# indexlevels, textlevel, numdocs, numbytes, indexmap and, when non-empty,
# subcollectionmap, languagemap and notbuilt.  Indexes recorded in
# $self->{'notbuilt'} are left out of indexmap.
sub make_auxiliary_files {
    my $self = shift (@_);

    my $build_cfg = {};
    # this already includes indexfieldmap and indexfields
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp"; #do we need this??

    # store the level info.  %level_map is indexed as an ordinary hash: the
    # old "%level_map->{...}" form is rejected by current perls.
    my @indexlevels = ();
    foreach my $level (@{$self->{'levelorder'}}) {
        push (@indexlevels, $level_map{$level});
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;

    if ($self->{'levels'}->{'section'}) {
        $build_cfg->{'textlevel'} = $level_map{'section'};
    } else {
        $build_cfg->{'textlevel'} = $level_map{'document'};
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names,
    # leaving out any index that failed to build
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" .
                  $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # list the indexes that were not built; sorted so that the file content
    # does not depend on hash ordering
    my @notbuilt = sort keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    # write out the build information
    &cfgread::write_cfg_file(&util::filename_cat($self->{'build_dir'}, "build.cfg"),
                             $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes|textlevel)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels)$');
}
1104
# Empty hook: does no work of its own and simply hands back the object it
# was called on.
sub deinit {
    my $self = shift;
    return $self;
}
1108
# Print a short statistics report for the pass that has just run to
# $self->{'outhandle'}.  All figures come from $self->{'buildproc'}.
# A warning banner is added when fewer than 50 bytes were processed, unless
# this was a text-compression pass and $self->{'no_text'} is set.
sub print_stats {
    my $self = shift (@_);

    my $out = $self->{'outhandle'};
    my $is_indexing = $self->{'buildproc'}->get_indexing_text();
    my $index_name = $self->{'buildproc'}->get_index();
    my $total_bytes = $self->{'buildproc'}->get_num_bytes();
    my $processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();

    my $activity = $is_indexing
        ? "Creating index $index_name"
        : "Compressing text from $index_name";
    print $out "Stats ($activity)\n";
    print $out "Total bytes in collection: $total_bytes\n";
    print $out "Total bytes in $index_name: $processed_bytes\n";

    if ($processed_bytes < 50 && ($is_indexing || !$self->{'no_text'})) {
        # when we are not indexing, getting here already implies that
        # no_text is unset, so the two cases can be told apart by
        # $is_indexing alone
        my $warning = $is_indexing
            ? "WARNING: There is very little or no text to process for $index_name\n"
            : "WARNING: There is very little or no text to compress\n";
        print $out "***************\n";
        print $out $warning;
        print $out " Was this your intention?\n";
        print $out "***************\n";
    }

}
1138
11391;
1140
1141
Note: See TracBrowser for help on using the repository browser.