source: main/tags/2.41/gsdl/perllib/mgppbuilder.pm@ 25339

Last change on this file since 25339 was 5935, checked in by kjdon, 20 years ago

now handles subcollection collmeta properly

  • Property svn:keywords set to Author Date Id Revision
File size: 38.3 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# Turn on autoflush for both standard streams as soon as this module is
# compiled, so that output written by mgpp does not get out of sync with
# output written by the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off on both standard streams at interpreter exit.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# Maximum document size. Not referenced by any code visible in this part of
# the file.
our $maxdocsize = 12000;

# Two lookups share this table:
#   level name as written in collect.cfg -> level tag handed to mgpp_passes
#   level tag                            -> default display macro for the level
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',

    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

# File suffixes that are considered part of a finished index; build_index
# reports every other suffix it finds in an index directory.
our %wanted_index_files =
    map { ($_ => 1) } qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# Metadata field name -> short index field code.
# TODO (from the original author): change this so a user can add their own
# entries via a file or the cfg.
my %short_code_for = (
    'Title'        => 'TI',
    'Subject'      => 'SU',
    'Creator'      => 'CR',
    'Organization' => 'ORG',
    'Source'       => 'SO',
    'Howto'        => 'HT',
    'ItemTitle'    => 'IT',
    'ProgNumber'   => 'PN',
    'People'       => 'PE',
    'Coverage'     => 'CO',
    'allfields'    => 'ZZ',
    'text'         => 'TX',
);

# The field map proper: every long name maps to its short code, and every
# name that may not be used as a field name maps to 1. The reserved names are
# the short codes themselves, the query operators (AND, OR, NOT, NEAR) and
# the level tags (Doc, Sec, Para).
our %static_indexfield_map = (
    %short_code_for,
    map { ($_ => 1) } (values %short_code_for,
                       qw(AND OR NOT NEAR),
                       qw(Doc Sec Para)),
);
108
# Constructor.
#
# Arguments (positional): collection name, source directory, build
# directory, verbosity, maxdocs, debug flag, keepold flag,
# allclassifications flag, output handle (defaults to STDERR) and no_text
# flag (defaults to 0).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the index list with any
# subcollections and languages, works out the indexing levels, then loads
# the plugins, the classifiers and the build processor.
#
# Dies if collect.cfg is missing, if neither the document nor the section
# level is configured, if no plugins load, or if the build processor cannot
# be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    # The default is the handle *name*, not a glob: build_index compares
    # the output handle against the string "STDERR" to decide whether the
    # output has been redirected.
    $outhandle = "STDERR" unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>{},    # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    my $cfg = $self->{'collect_cfg'};

    # sort out the indexes: they are specified separated by spaces, but
    # they all go into a single comma-separated index
    my $indexes = $cfg->{'indexes'};
    $cfg->{'indexes'} = [ join(',', @$indexes) ];

    # sort out subcollection indexes: one entry per (subcollection, index)
    if (defined $cfg->{'indexsubcollections'}) {
        my @expanded = ();
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@{$cfg->{'indexes'}}) {
                push (@expanded, "$index:$subcollection");
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # sort out language subindexes: one entry per (language, index). When
    # there are no subcollections an empty subcollection field is added, so
    # the language is always the third colon-separated part.
    if (defined $cfg->{'languages'}) {
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        my @expanded = ();
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@{$cfg->{'indexes'}}) {
                push (@expanded, $index . $separator . $language);
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %seen = ();
    my @unique = ();
    foreach my $index (@{$cfg->{'indexes'}}) {
        next if defined $seen{$index};
        $seen{$index} = 1;
        push (@unique, $index);
    }
    $cfg->{'indexes'} = \@unique;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = [];
    if (defined $cfg->{'levels'}) {
        # $level aliases the configuration entry, so the level names stored
        # in collect_cfg are lower-cased in place, as they always were
        foreach my $level (@{$cfg->{'levels'}}) {
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    # the document level is "document" if configured, otherwise "section"
    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }
    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";

    # get the list of plugins for this collection and load them all
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them all
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the package name. This makes
    # the same call as the former string eval of "new $buildproctype(...)",
    # but never compiles the collection-derived name as code. Any exception
    # propagates to the caller, as it did before.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
259
# Prepares the build directory for a fresh build. Does nothing in debug mode
# or when old builds are being kept.
sub init {
    my ($self) = @_;

    if (!($self->{'debug'} || $self->{'keepold'})) {
        my $build_dir = $self->{'build_dir'};

        # throw away any old build and start from an empty directory
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # the text directory lives directly under the build directory
        &util::mk_all_dir("$build_dir/text");
    }
}
273
# Records the strip_html setting on this builder and passes it on to the
# build processor. Returns whatever the build processor's set_strip_html
# returns.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
281
# Creates the compressed text for the collection in <build_dir>/text.
#
# The documents are run through the plugins twice: once piped into
# "mgpp_passes -T1" to collect text statistics, and, after
# mgpp_compression_dict has built the compression dictionary, once piped
# into "mgpp_passes -T2" to compress the text. In debug mode nothing is
# executed and the build processor writes to STDOUT instead.
#
# $textindex is passed on to the build processor's set_index.
# Dies if a required executable is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    # (%level_map is indexed directly: the old "%level_map->{...}" form is
    # a fatal error from Perl 5.22 onwards)
    my $doc_level = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);

    # The handle is passed around by name. The build processor is given the
    # name once, and the second pass reopens the same named handle, so the
    # build processor keeps writing to the right pipe.
    my $handle;
    if ($self->{'debug'}) {
        $handle = "STDOUT";
    } else {
        $handle = "mgppbuilder::PIPEOUT";
        #print $outhandle "trying to run (compress 1) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # close the pipe exactly once, and only if one was opened
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        #print $outhandle "trying to run (compress 2) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    $buildproc->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
390
# Decides whether the given index should be built.
#
# Every entry of the collect.cfg "dontbuild" list is used as a pattern that
# must match the whole index name. On the first match the index is recorded
# in $self->{'notbuilt'} and 0 is returned; otherwise 1 is returned.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined $dontbuild;

    # the loop variable is lexical so that no package global is touched
    foreach my $checkstr (@$dontbuild) {
        if ($index =~ /^$checkstr$/) {
            $self->{'notbuilt'}->{$index} = 1;
            return 0;
        }
    }

    return 1;
}
407
# Builds the indexes of the collection.
#
# If $indexname contains a word character only that index is built;
# otherwise every index listed in collect_cfg is built. Indexes rejected by
# want_built are skipped. Afterwards the final field lists are produced with
# make_final_field_list.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbose = ($self->{'verbosity'} >= 1);

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes; the loop variable is lexical so that the
    # methods called from inside the loop cannot see or disturb it
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if $verbose;
            $self->build_index($index);
        } else {
            print $outhandle "\n*** ignoring index $index\n" if $verbose;
        }
    }

    # define the final field lists
    $self->make_final_field_list();
}
439
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "fields[:subcollection[:languages]]". Returns a reference to a hash that
# holds:
#   <index description>      => directory name
#   'indexmap'               => { fields => processed name }
#   'subcollectionmap'       => { subcollection => processed name }
#   'languagemap'            => { languages => processed name }
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                            => the keys of those maps in first-seen order
# and, at the top level, fields / subcollection / languages => processed
# name (these are used for the collection metadata later on).
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # processed name -> original value, one hash per component. These must
    # be three separate hashes: when the slots held empty strings,
    # dereferencing them as hashes (without strict refs) made all three
    # refer to one shared symbolic hash, so a subcollection and a language
    # with the same processed name overwrote each other.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # there is only ever one index, and it is called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = lc ($self->process_field ($subcollection));

        # next comes a processed version of the language if there is one.
        my $plang = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) - these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the language specification itself; this used to push an
            # unrelated, undefined variable
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
511
# Returns a short processed version of a field.
# A field with a single comma-separated component yields its first character
# followed by the next consonant of the field, or simply its first two
# characters when no such consonant exists. A field with several components
# yields the first character of each of the first two components.
# An undefined field, or one without any word character, yields "".
sub process_field {
    my ($self, $field) = @_;

    if (!defined ($field) || $field !~ /\w/) {
        return "";
    }

    my @components = split /,/, $field;

    if (scalar @components < 2) {
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        if (!(defined $first && defined $second)) {
            ($first, $second) = $field =~ /^(.)(.)/;
        }
        return "$first$second";
    }

    # only the first two components contribute
    my @initials = @components[0, 1];
    foreach my $initial (@initials) {
        $initial =~ s/^(.).*$/$1/;
    }
    return join("", @initials);
}
534
# Tries to make a colliding directory name unique.
#
# $namehash maps, for each of 'index', 'subcollection' and 'languages', a
# processed name to the original value it was made from. $index is the full
# index description; the three references point at the processed names that
# are currently combined into the directory name.
#
# The first processed name (in the order index, subcollection, languages)
# that is registered for a different original value than the one in $index
# is advanced with get_next_version; at most one name is changed per call.
# Returns the concatenation of the three processed names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @candidates = (['index',         $indexref, "$fields"],
                      ['subcollection', $subref,   $subcollection],
                      ['languages',     $langref,  $languages]);

    foreach my $candidate (@candidates) {
        my ($kind, $nameref, $wanted) = @$candidate;
        if ($namehash->{$kind}->{$$nameref} ne $wanted) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return join ("", $$indexref, $$subref, $$langref);
}
549
# Advances the name behind $nameref to its next version, in place:
#   - a name ending in one or two digits has that number incremented
#     (ab3 -> ab4, ab9 -> ab10, ab10 -> ab11)
#   - any other name has its last character replaced by 0 (abc -> ab0)
#
# The former special case for a trailing 9 tried to replace two digits in a
# name that ended in only one, so the name never changed and the caller's
# uniqueness loop could not terminate.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d{1,2})$/) {
        # ++ on the captured string keeps a leading zero (05 -> 06) and
        # carries correctly (9 -> 10, 99 -> 100)
        my $num = $1;
        $num++;
        $$nameref =~ s/\d{1,2}$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
565
# Builds one index.
#
# $index is an index description "fields[:subcollections[:languages]]" that
# must already have a directory assigned in $self->{'index_mapping'}.
#
# The documents are run through the plugins twice, piped into
# "mgpp_passes -I1" (index dictionary) and "mgpp_passes -I2" (inversion).
# Then the weights file, the on-disk stemmed dictionary and the three stem
# indexes are created. In debug mode no external program is run and the
# build processor writes to STDOUT.
#
# If the index dictionary (.id file) does not exist after the first pass,
# the index is recorded in $self->{'notbuilt'} and the method returns early.
# Dies if a required executable is missing, a pipe cannot be opened or the
# index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names and possibly the doc name for mgpasses
    # (%level_map is indexed directly: the old "%level_map->{...}" form is
    # a fatal error from Perl 5.22 onwards)
    my $doc_level = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level) {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        # the output handle is held by name, so this is a string comparison
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my (undef, $subcollection_spec, $language_spec) = split (":", $index);

    my @subcollections = ();
    @subcollections = split /,/, $subcollection_spec if (defined $subcollection_spec);
    foreach my $subcollection (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    @languages = split /,/, $language_spec if (defined $language_spec);
    foreach my $language (@languages) {
        # a leading "!" negates the language expression
        my $not = 0;
        if ($language =~ s/^\!//) {
            $not = 1;
        }
        foreach my $lang (@{$self->{'collect_cfg'}->{'languages'}}) {
            if ($lang eq $language) {
                if ($not) {
                    push (@$indexexparr, "!Language/$language/");
                } else {
                    push (@$indexexparr, "Language/$language/");
                }
                last;
            }
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);

    # The handle is passed around by name. The build processor is given the
    # name once, and the second pass reopens the same named handle.
    my $handle;
    if ($self->{'debug'}) {
        $handle = "STDOUT";
    } else {
        $handle = "mgppbuilder::PIPEOUT";
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_weights_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_stem_idx_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        foreach my $stem_method (1, 2, 3) {
            system ("mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
        }

        # report unwanted files. The actual removal is commented out, so
        # nothing is deleted here despite the wording of the message.
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
759
# Creates the collection information database and processes the associated
# files.
#
# Everything is written as text to a pipe into txt2db (or to STDOUT in debug
# mode): first a [collection] entry holding the collection metadata, the
# index field names, the level names and the subcollection names, then the
# documents (via the plugins), then the classification information and
# finally a [browselist] entry listing all documents.
#
# Dies if txt2db is missing or the pipe cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name; the extension depends on util::is_little_endian()
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb
    # called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }
    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor; the handle is passed around by name
    my $handle;
    if ($self->{'debug'}) {
        $handle = "STDOUT";
    } else {
        $handle = "mgppbuilder::PIPEOUT";
        if (!-e "$txt2db_exe" || !open ($handle, "| txt2db$exe \"$fulldbname\"")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});

    $buildproc->reset();

    # do the collection info
    print $handle "[collection]\n";

    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

    # first do the collection meta stuff - everything without a dot
    my $collmetadefined = 0;
    if (defined $collectionmeta) {
        $collmetadefined = 1;
        foreach my $cmeta (keys (%$collectionmeta)) {
            next if ($cmeta =~ /^\./); # for now, ignore ones with dots
            # write the entry to the file
            print $handle $self->create_language_db_map($cmeta, $cmeta);
        }
    }

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the
    # metadata name
    my $field_entry = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $buildproc->{'indexfieldmap'}->{$longfield};
        # a value of 1 marks a reserved name, not a long field name
        next if $shortfield eq 1;

        # we need to check if some coll meta has been defined
        my $collmeta = ".$longfield";
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            $field_entry .= $self->create_language_db_map($collmeta, $shortfield);
        } elsif ($longfield eq "allfields") {
            # use the text macros for allfields and textonly
            $field_entry .= "<$shortfield>_query:textallfields_\n";
        } elsif ($longfield eq "text") {
            $field_entry .= "<$shortfield>_query:texttextonly_\n";
        } else {
            # otherwise use the metadata name
            $field_entry .= "<$shortfield>$longfield\n";
        }
    }
    print $handle $field_entry;

    # now add the level names
    # (%level_map is indexed directly: the old "%level_map->{...}" form is
    # a fatal error from Perl 5.22 onwards)
    my $level_entry = "";
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $collmeta = ".$level"; # based on the specification as stored
        $level =~ tr/A-Z/a-z/;    # make it lower case (in place, as before)
        my $levelid = $level_map{$level}; # the value we used in the index
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            $level_entry .= $self->create_language_db_map($collmeta, $levelid);
        } else {
            # use the default macro
            $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
        }
    }
    print $handle $level_entry;

    # now add subcoll meta
    my $subcoll_entry = "";
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        # looked up before the test so that the fallback branch has it too;
        # it used to be visible only in the first branch, which made the
        # fallback write an empty tag
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
            $subcoll_entry .= $self->create_language_db_map(".$subcoll", $shortname);
        } else {
            $subcoll_entry .= "<$shortname>$subcoll\n";
        }
    }
    print $handle $subcoll_entry;

    # end the collection entry
    print $handle "\n" . ('-' x 70) . "\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    # output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close ($handle) if !$self->{'debug'};
}
906
# Builds the database lines for one piece of collection metadata.
#
# $metaname is the key into collect_cfg's collectionmeta, whose value is a
# hash keyed by 'default' and/or language specifiers of the form "[l=xx]".
# $mapname is the tag name to write.
#
# Returns a string with one "<mapname>value" line for the default followed
# by one "<mapname:xx>value" line per language. When there is no default
# entry, the value of the first language (in sorted key order) is used as
# the default. Keys are visited in sorted order so that the output is the
# same on every run instead of depending on hash iteration order.
sub create_language_db_map {
    my $self = shift (@_);
    my ($metaname, $mapname) = @_;

    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

    my $defaultfound = 0;
    my $first = 1;
    my $metadata_entry = "";
    my $default = "";

    # iterate through the languages
    foreach my $lang (sort keys (%{$collectionmeta->{$metaname}})) {
        if ($first) {
            $first = 0;
            # set the default default to the first entry
            $default = $collectionmeta->{$metaname}->{$lang};
        }
        if ($lang =~ /default/) {
            $defaultfound = 1;
            # the default entry goes first
            $metadata_entry = "<$mapname>" .
                $collectionmeta->{$metaname}->{'default'} . "\n" . $metadata_entry;
        }
        else {
            my ($langcode) = $lang =~ /^\[l=(\w*)\]$/;
            if ($langcode) {
                $metadata_entry .= "<${mapname}:$langcode>" .
                    $collectionmeta->{$metaname}->{$lang} . "\n";
            }
        }
    }

    # if we haven't found a default, put one in
    if (!$defaultfound) {
        $metadata_entry = "<$mapname>$default\n" . $metadata_entry;
    }
    return $metadata_entry;
}
# collect_specific -- placeholder method.
# Does nothing beyond taking the invocant off the argument list.
# NOTE(review): presumably kept so that callers can invoke it on any
# builder object -- confirm against the calling scripts.
sub collect_specific {
    my $self = shift;
}
946
# make_final_field_list -- work out the final list of indexed fields.
#
# At the end of building, the buildproc holds an 'indexfieldmap' with all
# the field mappings (plus some extras) and an 'indexfields' hash that also
# contains any fields that were not named in the index definition.  From
# these we make an ordered list of the fields that are indexed and a list
# of the mappings that are used.  They are used for the build.cfg file and
# for the collection metadata definition, and are stored in
# $self->{'build_cfg'} as:
#   'indexfields'   - reference to the ordered list of field names
#   'indexfieldmap' - reference to the matching list of "field->shortname"
#
# Any previous content of $self->{'build_cfg'} is discarded.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    my @indexfieldmap = ();
    my @indexfields = ();

    # Go through the index definition and remember each field, so we can
    # easily check whether it is already specified: when expanding
    # "metadata" we list all the individual fields, but some may already be
    # named in the index definition and must not be added again.
    my %specifiedfields = ();
    my @specifiedfieldorder = ();
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $index;
        $parts =~ s/:.*$//;
        foreach my $f (split (',', $parts)) {
            next if defined $specifiedfields{$f};
            $specifiedfields{$f} = 1;
            push (@specifiedfieldorder, $f);
        }
    }

    my $buildproc = $self->{'buildproc'};

    # add all fields bit
    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # sorted so that build.cfg comes out the same on every build;
            # plain hash order is not guaranteed
            foreach my $newfield (sort keys %{$buildproc->{'indexfields'}}) {
                next if defined $specifiedfields{$newfield};
                push (@indexfieldmap, $newfield . '->' . $buildproc->{'indexfieldmap'}->{$newfield});
                push (@indexfields, $newfield);
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, 'text->TX');
            push (@indexfields, 'text');
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, 'allfields->ZZ');
            push (@indexfields, 'allfields');
        } else {
            push (@indexfieldmap, $field . '->' . $buildproc->{'indexfieldmap'}->{$field});
            push (@indexfields, $field);
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
1002
1003
# read_final_field_list -- recreate the field list from a build.cfg file.
#
# The file is looked for first in $self->{'build_dir'} and then in the
# "index" directory under $ENV{'GSDLCOLLECTDIR'}.  If there is no build.cfg
# we can't do the field list (there is unlikely to be any index anyway), so
# $self->{'build_cfg'} is left as an empty hash.
#
# Otherwise $self->{'build_cfg'} receives:
#   'indexfields'   - reference to the list read from the file
#   'indexfieldmap' - reference to the list of "field->shortname" entries
# and every "field->shortname" entry is also stored in the buildproc's
# 'indexfieldmap' hash.  Entries that do not contain "->" are kept in the
# list but are not stored in the buildproc's hash.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar (keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = &util::filename_cat ($self->{'build_dir'}, "build.cfg");
    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        # we can't find a config file - just ignore the field list
        return if (!-e $buildconfigfile);
    }

    my $buildcfg = &colcfg::read_build_cfg ($buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$entry");
            # only well-formed entries update the buildproc's map; a failed
            # match used to store a value under an empty key
            if ($entry =~ /^(.*)\-\>(.*)$/) {
                $self->{'buildproc'}->{'indexfieldmap'}->{$1} = $2;
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
# make_auxiliary_files -- collect the build information and write it to
# build.cfg in $self->{'build_dir'}.
#
# Starts from $self->{'build_cfg'} when it is defined (it then already
# holds indexfieldmap and indexfields) and adds the build date and type,
# the index levels, the text level, the document and byte counts reported
# by the buildproc, and the index, subcollection and language maps.  The
# subcollection map, language map and 'notbuilt' list are only added when
# they are not empty.
sub make_auxiliary_files {
    my $self = shift (@_);

    my $build_cfg = {};
    # this already includes indexfieldmap and indexfields
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # create the build directory (via util::mk_all_dir)
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp"; #do we need this??

    # store the level info
    # (%level_map is the file-level hash; it must be indexed as
    # $level_map{...} -- "%level_map->{...}" uses a hash as a reference,
    # which is a fatal error from perl 5.22 on)
    my @indexlevels = ();
    foreach my $level (@{$self->{'levelorder'}}) {
        push (@indexlevels, $level_map{$level});
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;

    if ($self->{'levels'}->{'section'}) {
        $build_cfg->{'textlevel'} = $level_map{'section'};
    } else {
        $build_cfg->{'textlevel'} = $level_map{'document'};
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    my $index_mapping = $self->{'index_mapping'};

    # store the mapping between the index names and the directory names,
    # leaving out any index recorded in 'notbuilt'
    my @indexmap = ();
    foreach my $index (@{$index_mapping->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index});
        push (@indexmap, $index . '->' . $index_mapping->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$index_mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . '->' .
              $index_mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$index_mapping->{'languagemaporder'}}) {
        push (@languagemap, $language . '->' .
              $index_mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # sorted so that build.cfg comes out the same on every build
    my @notbuilt = sort keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    # write out the build information
    # NOTE(review): the first pattern lists the keys set to single values
    # above and the second the keys set to lists; presumably that is how
    # cfgread::write_cfg_file tells them apart -- confirm against cfgread.
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes|textlevel)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels)$');
}
1117
# deinit -- placeholder method.
# Does nothing beyond taking the invocant off the argument list; this
# builder has no clean-up code here.
sub deinit {
    my $self = shift;
}
1121
# print_stats -- report the byte counts of the current pass.
#
# Writes to $self->{'outhandle'} a heading naming the index reported by the
# buildproc (worded differently depending on whether the buildproc says it
# is indexing text), then the total and processed byte counts.  When fewer
# than 50 bytes were processed, and the pass was either indexing or
# $self->{'no_text'} is not set, a warning block is printed as well.
sub print_stats {
    my $self = shift (@_);

    my $out = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $is_indexing = $buildproc->get_indexing_text();
    my $index_name = $buildproc->get_index();
    my $total_bytes = $buildproc->get_num_bytes();
    my $processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $is_indexing
        ? "Stats (Creating index $index_name)\n"
        : "Stats (Compressing text from $index_name)\n";
    print $out $heading;
    print $out "Total bytes in collection: $total_bytes\n";
    print $out "Total bytes in $index_name: $processed_bytes\n";

    if ($processed_bytes < 50 && ($is_indexing || !$self->{'no_text'})) {
        # when we are not indexing, getting here already means that
        # 'no_text' is unset, so the choice below is two-way
        my $warning = $is_indexing
            ? "WARNING: There is very little or no text to process for $index_name\n"
            : "WARNING: There is very little or no text to compress\n";
        print $out "***************\n";
        print $out $warning;
        print $out " Was this your intention?\n";
        print $out "***************\n";
    }
}
1151
11521;
1153
1154
Note: See TracBrowser for help on using the repository browser.