source: trunk/gsdl/perllib/mgppbuilder.pm@ 5608

Last change on this file since 5608 was 5077, checked in by kjdon, 21 years ago

fixed a bug

  • Property svn:keywords set to Author Date Id Revision
File size: 37.0 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
BEGIN {
    # Switch both standard streams to autoflush so that output from
    # mgpp doesn't get out of sync with output from the plugins.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}
42
END {
    # Turn autoflush back off on both standard streams at shutdown.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# Maximum document size setting.
# NOTE(review): no code visible in this part of the file reads this value.
our $maxdocsize = 12000;

# Two lookups share this table:
#   configured level name (lower case) -> short level tag handed to mgpp
#   short level tag                    -> default display macro for that level
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',

    'Doc'  => '_textdocument_',
    'Sec'  => '_textsection_',
    'Para' => '_textparagraph_',
);
56
57#$doc_level = "Doc";
58#$sec_level = "Sec";
59#$para_level = "Para";
60
# File suffixes that belong to a finished index.  build_index consults
# this set when it scans the index directory for left-over files.
our %wanted_index_files =
    map { $_ => 1 } qw(td t tl ti idb ib1 ib2 ib3 i il w wa);
73
# Static index-field table.
#   long metadata name -> short field code
#   short field code   -> 1  (marks the code as taken)
#   reserved word      -> 1  (AND, OR, NOT, NEAR and the level tags
#                             Doc, Sec, Para cannot be used as field names)
# TODO: let a user add their own entries via a file or the cfg.
our %static_indexfield_map = do {
    my @code_for = (
        'Title'        => 'TI',
        'Subject'      => 'SU',
        'Creator'      => 'CR',
        'Organization' => 'ORG',
        'Source'       => 'SO',
        'Howto'        => 'HT',
        'ItemTitle'    => 'IT',
        'ProgNumber'   => 'PN',
        'People'       => 'PE',
        'allfields'    => 'ZZ',
        'text'         => 'TX',
    );
    my %long_to_short = @code_for;
    my @reserved = qw(AND OR NOT NEAR Doc Sec Para);

    (
        %long_to_short,
        (map { $_ => 1 } values %long_to_short),
        (map { $_ => 1 } @reserved),
    );
};
106
# Constructor.
#
# Arguments (positional): class, collection name, source (archives)
# directory, build directory, verbosity, maxdocs, debug flag, keepold
# flag, allclassifications flag, output handle (defaults to STDERR) and
# no_text flag (defaults to 0).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the index list by
# subcollection and language, works out the indexing levels, loads the
# plugins and classifiers, and creates the build processor.  Dies if the
# configuration file is missing, if neither a document nor a section
# level is configured, or if no plugins could be loaded.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    # The handle is deliberately kept as the *name* of the stream:
    # build_index compares it against the string "STDERR".
    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>[],       # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    # mirror the autovivification the nested look-ups below would perform
    $self->{'collect_cfg'} = {} unless defined $self->{'collect_cfg'};
    my $cfg = $self->{'collect_cfg'};

    # sort out the indexes
    # indexes are specified with spaces, but we put them into one index
    my $indexes = $cfg->{'indexes'};
    $cfg->{'indexes'} = [ join(',', @$indexes) ];

    # sort out subcollection indexes: one "index:subcollection" per pair
    if (defined $cfg->{'indexsubcollections'}) {
        my @base = @{$cfg->{'indexes'}};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@base) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $cfg->{'languages'}) {
        my @base = @{$cfg->{'indexes'}};
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@base) {
                if (defined ($cfg->{'indexsubcollections'})) {
                    push (@{$cfg->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$cfg->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (first occurrence wins, order is kept)
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = [];
    if (defined $cfg->{'levels'}) {
        # $level aliases the configured entry, so the configuration's
        # own level names are lower-cased in place here.
        foreach my $level ( @{$cfg->{'levels'}} ) {
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    # the top (retrieval) level is document if configured, else section
    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }
    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Direct class-method call: no string eval and no indirect-object
    # syntax.  Any exception raised by the constructor propagates as is.
    $self->{'buildproc'} = $buildproctype->new($collection, $source_dir,
                                               $build_dir, $verbosity,
                                               $outhandle);

    return $self;
}
257
# Prepares the build directory.  Unless running in debug mode or asked
# to keep the old build, the existing build directory is removed and
# recreated together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    my $start_afresh = !$self->{'debug'} && !$self->{'keepold'};
    if ($start_afresh) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        &util::mk_all_dir("$self->{'build_dir'}/text");
    }
}
271
# Records the strip_html setting on the builder and forwards it to the
# build processor.  Returns whatever the build processor's setter returns.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
279
# Creates the compressed text for the collection.
#
# Runs the documents through the build processor twice: first piped into
# "mgpp_passes -T1" to collect text statistics, then (after
# mgpp_compression_dict has built the dictionary) piped into
# "mgpp_passes -T2" to compress the text.  In debug mode nothing is
# executed and the build processor writes to STDOUT instead.
#
# $textindex is passed straight to the build processor's set_index.
# Dies if one of the mgpp executables is missing or cannot be started.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    # (plain hash element access; %level_map->{...} no longer compiles
    # on Perl 5.22 and later)
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);

    # The handle is passed around by name (a string), exactly as the
    # rest of this file does, so the second open below reuses PIPEOUT.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        # existence is checked on the full path, but the program itself
        # is started by bare name, i.e. looked up on the PATH
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # one close is enough: $handle names PIPEOUT whenever a pipe was opened
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # start the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
388
# Decides whether $index should be built.
#
# Every entry of the collection's 'dontbuild' list is treated as a
# pattern that must match the whole index name.  On the first match the
# index's mapped directory name is appended to $self->{'notbuilt'} and 0
# is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined $dontbuild;

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
404
# Builds either the single index named by $indexname (when it contains
# at least one word character) or every index listed in the collection
# configuration.  Indexes rejected by want_built are reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [$indexname];
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
432
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of "fields:subcollection:language"
# descriptions.  Returns a reference to a hash that contains
#   - description => directory name, for every description,
#   - 'indexmap', 'subcollectionmap', 'languagemap': part => processed name,
#   - 'indexmaporder', 'subcollectionmaporder', 'languagemaporder': the
#     parts in the order they were first seen,
#   - each individual part => its processed name.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # processed name => original part, one table per kind of part.
    # These have to be real (separate) hashes: starting them off as ''
    # only worked through symbolic references, with all three slots
    # resolving to the same name.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index now, and it is called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) - these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the language part itself (this used to push an
            # unrelated, undeclared $language variable)
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
504
# Returns a processed (abbreviated) version of a field.
#
# - undefined field, or one without any word character: ""
# - two or more comma-separated components: the first character of each
#   of the first two components
# - a single component: its first character followed by the next
#   consonant; if there is no later consonant, its first two characters;
#   if even that is impossible (e.g. a one-character field), its first
#   word character, so that such a field no longer collapses to "".
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        splice (@components, 2);
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @components);
    }

    # (named $first/$second rather than $a/$b, which belong to sort)
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $second) = $field =~ /^(.)(.)/
        unless defined $first && defined $second;
    return "$first$second" if defined $first && defined $second;

    # the guard at the top guarantees there is a word character
    my ($only) = $field =~ /(\w)/;
    return $only;
}
527
# Resolves a directory-name collision for $index.
#
# $namehash holds, per kind of part ('index', 'subcollection',
# 'languages'), which original part each processed name was given to.
# The first processed name (checked in that order) that is recorded for
# a different original than this index's is advanced with
# get_next_version.  Returns the three processed names concatenated.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        ['index',         $indexref, $fields],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $original) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $original) {
            $self->get_next_version ($nameref);
            last;    # only ever bump one name per call
        }
    }

    return join ("", $$indexref, $$subref, $$langref);
}
542
# Advances the name behind $nameref to its next "version", in place.
#
# - a name ending in one or two digits has that number incremented
#   (so "s9" becomes "s10" and "s10" becomes "s11");
# - any other name has its last character replaced by "0".
#
# The single-digit case used to try to rewrite "9" with a two-digit
# pattern that could not match there, which left the name unchanged and
# so could never resolve the collision the caller was looping on.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d{1,2})$/) {
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d{1,2}$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
558
# Builds one index.
#
# $index is a "fields:subcollection:language" description; its directory
# comes from $self->{'index_mapping'}.  The documents are piped through
# "mgpp_passes -I1" (index dictionary) and "mgpp_passes -I2" (inversion),
# with mgpp_perf_hash_build in between and mgpp_weights_build,
# mgpp_invf_dict and mgpp_stem_idx afterwards.  In debug mode no program
# is run and the build processor writes to STDOUT.
# Dies if a required executable is missing or a pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names and possibly the doc name for mgpasses
    # (plain hash element access; %level_map->{...} no longer compiles
    # on Perl 5.22 and later)
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} ." ";

    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level) {
            $mgpp_passes_sections .= "-K " . $level_map{$level}. " ";
        }
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $wanted (@languages) {
        # a leading "!" negates the language expression
        my $not=0;
        if ($wanted =~ s/^\!//) {
            $not = 1;
        }
        foreach my $lang (@{$self->{'collect_cfg'}->{'languages'}}) {
            if ($lang eq $wanted) {
                if ($not) {
                    push (@$indexexparr, "!Language/$wanted/");
                } else {
                    push (@$indexexparr, "Language/$wanted/");
                }
                last;
            }
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    # The handle is passed around by name (a string), exactly as the
    # rest of this file does, so the second open below reuses PIPEOUT.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        # existence is checked on the full path, but the program itself
        # is started by bare name, i.e. looked up on the PATH
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_weights_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_stem_idx_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # define the final field lists
        $self->make_final_field_list();

        # report unwanted files (the actual removal is disabled, so
        # this only prints what would be deleted)
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
746
# Creates the collection's info database and processes associated files.
#
# Writes, through a pipe to txt2db (or to STDOUT in debug mode):
#   1. the [collection] entry: collection metadata, the index field
#      names and the level names,
#   2. one pass of all documents through the build processor in
#      'infodb' mode,
#   3. the classification information,
#   4. a [browselist] entry listing every document.
# Dies if txt2db is missing or cannot be started.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so
    # (ie if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }
    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    # (the handle is passed around by name, as elsewhere in this file)
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);

    $self->{'buildproc'}->reset();

    # do the collection info
    print $handle "[collection]\n";

    # first do the collection meta stuff - everything without a dot
    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            next if ($cmeta =~ /^\./); # for now, ignore ones with dots
            my ($metadata_entry) = $self->create_language_db_map($cmeta, $cmeta);
            # write the entry to the file
            print $handle $metadata_entry;
        }
    }

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined,
    # uses the metadata name
    my $field_entry = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # a value of 1 marks a short code or reserved word, not a field
        next if $shortfield eq 1;

        # we need to check if some coll meta has been defined
        my $collmeta = ".$longfield";
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            $field_entry .= $self->create_language_db_map($collmeta, $shortfield);
        } else {
            # use the metadata names, or the text macros for
            # allfields and textonly
            if ($longfield eq "allfields") {
                $field_entry .= "<$shortfield>_query:textallfields_\n";
            } elsif ($longfield eq "text") {
                $field_entry .= "<$shortfield>_query:texttextonly_\n";
            } else {
                $field_entry .= "<$shortfield>$longfield\n";
            }
        }
    }
    print $handle $field_entry;

    # now add the level names
    # (plain hash element access; %level_map->{...} no longer compiles
    # on Perl 5.22 and later)
    my $level_entry = "";
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $collmeta = ".$level"; # based on the level name as stored in the config
        $level =~ tr/A-Z/a-z/;    # make it lower case (in place)
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            $level_entry .= $self->create_language_db_map($collmeta, $levelid);
        } else {
            # use the default macro
            $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
        }
    }
    print $handle $level_entry;

    # end the collection entry
    print $handle "\n" . ('-' x 70) . "\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    # output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close ($handle) if !$self->{'debug'};
}
880
# Formats the collection metadata item $metaname as info-database lines
# tagged with $mapname.
#
# The result starts with the default line "<mapname>value": the value
# stored under the key 'default' when a key containing "default" exists,
# otherwise the value of the first key iterated.  It is followed by one
# "<mapname:xx>value" line per key of the form "[l=xx]".
sub create_language_db_map {
    my ($self, $metaname, $mapname) = @_;

    my @langs = keys (%{$self->{'collect_cfg'}->{'collectionmeta'}->{$metaname}});
    my $value_for = $self->{'collect_cfg'}->{'collectionmeta'}->{$metaname};

    # fall-back default: whichever entry the hash yields first
    my $fallback = @langs ? $value_for->{$langs[0]} : "";

    my $default_lines  = "";
    my $language_lines = "";
    foreach my $lang (@langs) {
        if ($lang =~ /default/) {
            # the default entry goes first
            $default_lines .= "<$mapname>" . $value_for->{'default'} . "\n";
            next;
        }

        my ($code) = $lang =~ /^\[l=(\w*)\]$/;
        next unless $code;
        $language_lines .= "<$mapname:$code>" . $value_for->{$lang} . "\n";
    }

    # if we haven't found a default, put one in
    $default_lines = "<$mapname>$fallback\n" if $default_lines eq "";

    return $default_lines . $language_lines;
}
# Hook with no behaviour of its own in this class; it simply hands back
# the object it was called on.
sub collect_specific {
    my $self = shift;
    return $self;
}
920
# At the end of building, we have an indexfieldmap with all the mappings,
# plus some extras, and the build processor's indexfields with any fields
# that weren't specified in the index definition.  From these we make an
# ordered list of the fields that are indexed and a list of the mappings
# that are used; both are needed for the build.cfg file and for the
# collection meta definition.  They are stored in $self->{'build_cfg'}
# under 'indexfields' and 'indexfieldmap' ("long->short" strings).
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    my @indexfieldmap = ();
    my @indexfields = ();
    my $specifiedfields = {};
    my @specifiedfieldorder = ();

    # go through the index definition and note each field, so we can
    # easily check whether it is already specified - when doing the
    # metadata we add all the individual fields, but some may already be
    # in the index definition and must not be added again.
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # an index spec is "fields:subcollection:language"; only the
        # first part is a field list
        my ($fieldlist) = split (':', $index);
        next unless defined $fieldlist;

        foreach my $f (split (',', $fieldlist)) {
            # the same field list recurs once per subcollection and
            # language, so only record each field the first time
            next if defined $specifiedfields->{$f};
            $specifiedfields->{$f} = 1;
            push (@specifiedfieldorder, "$f");
        }
    }

    # add all fields bit
    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # expand to every field the build processor saw that is not
            # already named explicitly
            foreach my $newfield (keys %{$self->{'buildproc'}->{'indexfields'}}) {
                if (!defined $specifiedfields->{$newfield}) {
                    push (@indexfieldmap, "$newfield\-\>$self->{'buildproc'}->{'indexfieldmap'}->{$newfield}");
                    push (@indexfields, "$newfield");
                }
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text\-\>TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields\-\>ZZ");
            push (@indexfields, "allfields");
        } else {
            push (@indexfieldmap, "$field\-\>$self->{'buildproc'}->{'indexfieldmap'}->{$field}");
            push (@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
971
972
# Recreate the field list from an existing build.cfg file.  We look first
# in the build directory, then in the collection's index directory.  If
# there is no build.cfg we can't do the field list (there is unlikely to be
# any index anyway), and $self->{'build_cfg'} is left as an empty hash.
#
# On success $self->{'build_cfg'} gets 'indexfields' and 'indexfieldmap'
# array references copied from the file, and every "FIELD->CODE" entry of
# the field map is also stored in $self->{'buildproc'}->{'indexfieldmap'}.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        push (@indexfields, @{$buildcfg->{'indexfields'}});
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, $mapping);
            # only well-formed "FIELD->CODE" entries update the map; a
            # malformed entry must not create an empty key in it
            if ($mapping =~ /^(.*)\-\>(.*)$/) {
                $self->{'buildproc'}->{'indexfieldmap'}->{$1} = $2;
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
# Assemble the build information and write it to build.cfg in the build
# directory.
#
# Starts from $self->{'build_cfg'} when it is defined (which already holds
# indexfieldmap and indexfields) and adds: builddate, buildtype,
# indexlevels, textlevel, numdocs, numbytes, indexmap, subcollectionmap and
# languagemap (the last two only when non-empty) and notbuilt.  The result
# is handed to cfgread::write_cfg_file.
sub make_auxiliary_files {
    my $self = shift (@_);

    my $build_cfg = {};
    # this already includes indexfieldmap and indexfields
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp"; #do we need this??

    # store the level info.  %level_map is a plain package hash, so it is
    # subscripted directly: the old "%level_map->{...}" form (a hash used
    # as a reference) is a compile-time error on perl 5.22 and later.
    my @indexlevels = ();
    foreach my $level (@{$self->{'levelorder'}}) {
        push (@indexlevels, $level_map{$level});
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;

    if ($self->{'levels'}->{'section'}) {
        $build_cfg->{'textlevel'} = $level_map{'section'};
    } else {
        $build_cfg->{'textlevel'} = $level_map{'document'};
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    my $mapping = $self->{'index_mapping'};

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$mapping->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" . $mapping->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap,
              $subcollection . "->" . $mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$mapping->{'languagemaporder'}}) {
        push (@languagemap,
              $language . "->" . $mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # write out the build information; the first pattern is passed for the
    # single-valued keys and the second for the list-valued ones
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes|textlevel)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels)$');
}
1079
# Does no work in this class; nothing is released or closed here.
# Evaluates to the invocant.
sub deinit {
    my $self = shift;
    return $self;
}
1083
# Print a short statistics report for the current pass to
# $self->{'outhandle'}: a heading naming the index, the total number of
# bytes in the collection and the number of bytes processed for the index.
# A warning block is added when fewer than 50 bytes were processed, unless
# this is a text pass (not indexing) and $self->{'no_text'} is set.
sub print_stats {
    my $self = shift (@_);

    my $out  = $self->{'outhandle'};
    my $proc = $self->{'buildproc'};

    my $indexing_text = $proc->get_indexing_text();
    my $index         = $proc->get_index();
    my $total         = $proc->get_num_bytes();
    my $processed     = $proc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $out $heading;
    print $out "Total bytes in collection: $total\n";
    print $out "Total bytes in $index: $processed\n";

    if ($processed < 50 && ($indexing_text || !$self->{'no_text'})) {
        # the condition above guarantees that when we are not indexing,
        # text compression was wanted, so two messages cover every case
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";
        print $out "***************\n";
        print $out $warning;
        print $out " Was this your intention?\n";
        print $out "***************\n";
    }
}
1113
11141;
1115
1116
Note: See TracBrowser for help on using the repository browser.