source: main/tags/2.40/gsdl/perllib/mgppbuilder.pm@ 21110

Last change on this file since 21110 was 4811, checked in by kjdon, 21 years ago

levels are now specified using upper or lower case, eg Section or section. if levels aren't specified, use document, otherwise use only what is specified eg levels section will only give section level. we use Doc, Sec, Para when passing the stuff to mgpp. the build.cfg file now contains indexlevels and textlevel entries - these give the actual names used by mgpp, and mean that the c++ code no longer has to assume them. collection meta can be specified for the levels, otherwise _textdocument_, _textsection_ and _textparagraph_ will be used.

  • Property svn:keywords set to Author Date Id Revision
File size: 36.8 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# mgpp and the plugins both write to STDOUT/STDERR; switch the two streams to
# autoflush while this module is loaded so their output does not get out of
# sync.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Turn autoflush back off when the interpreter shuts down.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably a maximum document size handed to mgpp - confirm before relying
# on it.
$maxdocsize = 12000;

# Two lookups share this table:
#   lower-case level name (as used in collect.cfg) -> level tag given to mgpp
#   level tag given to mgpp                        -> default display macro
%level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

#$doc_level = "Doc";
#$sec_level = "Sec";
#$para_level = "Para";

# File suffixes that are considered part of a finished index directory
# (consulted by build_index when it lists files it regards as unwanted).
%wanted_index_files = map { ($_ => 1) }
    qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# Maps a metadata/field name to the short tag used for it in the index.
# Short tags, the query operators (AND, OR, NOT, NEAR) and the level tags
# (Doc, Sec, Para) are also present as keys with the value 1, marking them
# as names that cannot be used as field tags.
# TODO (from the original author): let users add their own entries via a
# file or the collection configuration.
%static_indexfield_map = (
    'Title'        => 'TI',
    'Subject'      => 'SU',
    'Creator'      => 'CR',
    'Organization' => 'ORG',
    'Source'       => 'SO',
    'Howto'        => 'HT',
    'ItemTitle'    => 'IT',
    'ProgNumber'   => 'PN',
    'People'       => 'PE',
    'allfields'    => 'ZZ',
    'text'         => 'TX',

    # reserved names
    (map { ($_ => 1) } qw(TI SU CR ORG SO HT IT PN PE ZZ TX)),
    (map { ($_ => 1) } qw(AND OR NOT NEAR)),
    (map { ($_ => 1) } qw(Doc Sec Para)),
);
106
# Constructor.
#
# Arguments (positional):
#   $class               - class name to bless into
#   $collection          - collection name (used in messages and file names)
#   $source_dir          - directory handed to the plugins as the source
#   $build_dir           - directory the build is written to
#   $verbosity           - verbosity level for progress messages
#   $maxdocs             - passed through to plugin::begin/plugin::read
#   $debug, $keepold     - flags consulted by init and the build steps
#   $allclassifications  - passed through to classify::output_classify_info
#   $outhandle           - handle for messages (defaults to STDERR)
#   $no_text             - true to tell the buildproc not to store text
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, normalises the index, subcollection,
# language and level settings, loads plugins and classifiers, and creates the
# document processor (a collection specific "<collection>buildproc" if one
# exists under $GSDLCOLLECTDIR/perllib, otherwise mgppbuildproc).
#
# Dies if collect.cfg is missing, if neither a document nor a section level
# is configured, if no plugins load, or if the buildproc cannot be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    # The default is the handle *name* (which is what the bareword STDERR
    # evaluated to here): build_index compares $outhandle with the string
    # "STDERR", so this must stay a plain string.
    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'no_text'            => $no_text,
        'notbuilt'           => [],    # indexes not built
        'indexfieldmap'      => \%static_indexfield_map,
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    my $cfg = $self->{'collect_cfg'};

    # indexes are specified separated by spaces, but mgpp builds a single
    # index, so they are joined into one comma separated entry
    $cfg->{'indexes'} = [ join (',', @{ $cfg->{'indexes'} || [] }) ];

    # one index per subcollection: "fields:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my @base = @{$cfg->{'indexes'}};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@base) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # one index per language: "fields:subcollection:language"; when there
    # are no subcollections an empty subcollection field is inserted
    if (defined $cfg->{'languages'}) {
        my @base = @{$cfg->{'indexes'}};
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@base) {
                push (@{$cfg->{'indexes'}}, $index . $separator . $language);
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (first occurrence wins, order is kept)
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = [];
    if (defined $cfg->{'levels'}) {
        foreach my $level (@{$cfg->{'levels'}}) {
            # $level aliases the configuration entry, so this lower-cases
            # collect_cfg->{'levels'} in place (kept as in the original)
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    # the level used as the document level: document if present,
    # otherwise section
    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }
    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";

    # get the list of plugins for this collection and load them
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Direct class-method call; a constructor failure propagates as a die,
    # exactly as the former string eval + "die $@" did.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
257
# Prepares the build directory: unless the builder is in debug mode or was
# asked to keep the old build, the build directory is removed, recreated,
# and given an empty "text" subdirectory.
sub init {
    my ($self) = @_;

    # leave any existing build untouched in debug / keepold mode
    return if ($self->{'debug'} || $self->{'keepold'});

    my $build_dir = $self->{'build_dir'};

    # remove any old builds
    &util::rm_r($build_dir);
    &util::mk_all_dir($build_dir);

    # make the text directory
    &util::mk_all_dir("$build_dir/text");
}
271
# Records the strip_html setting on the builder and forwards it to the
# document processor. Returns whatever the buildproc's set_strip_html returns.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
279
# Creates the compressed text for the collection under <build_dir>/text.
#
# $textindex is handed to the buildproc via set_index.
#
# Three steps, each feeding the plugins' output to an external program:
#   1. mgpp_passes -T1         collects the text statistics
#   2. mgpp_compression_dict   creates the compression dictionary
#   3. mgpp_passes -T2         compresses the text
# In debug mode nothing is executed; the document stream goes to STDOUT.
#
# Dies if a required executable is missing from $GSDLHOME/bin/$GSDLOS or a
# pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpp_passes
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them.
    # Every option is followed by a space so that consecutive -J/-K options
    # do not run together on the command line.
    my $doc_level = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);

    # The handle is passed around by name (a plain string), which is what the
    # buildproc receives; the second pass re-opens the same named handle.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # closing the pipe waits for mgpp_passes to finish (never close STDOUT)
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # re-open the same named handle for the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
386
# Decides whether $index should be built.
#
# Each entry of the collection's "dontbuild" setting is used as a pattern
# that must match the whole index description. On the first match the
# index's directory name (from index_mapping) is appended to the "notbuilt"
# list and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless ($index =~ /^$pattern$/);

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
402
# Builds the indexes of the collection.
#
# If $indexname contains a word character only that index is built,
# otherwise every index listed in the collection configuration. The mapping
# from index description to directory name is (re)created first and stored
# in $self->{'index_mapping'}; indexes rejected by want_built are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
430
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of descriptions of the form
# "fields:subcollection:languages" (the last two parts are optional).
#
# Returns a reference to a hash containing
#   <description>                => directory name
#   <fields>                     => processed index name (always 'idx')
#   <subcollection>, <languages> => their processed short forms
#   'indexmap', 'subcollectionmap', 'languagemap'
#                                => per-part maps (created when first used)
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                                => the parts in order of first appearance
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # processed name -> original value, kept separately for each part.
    # Each part needs its own hash; make_unique looks them up independently.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # there is only ever one index now, and it is called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = lc ($self->process_field ($subcollection));

        # next comes a processed version of the language if there is one.
        my $plang = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the language part of this description in the order list
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
502
# Returns a short, processed version of a field specification.
#
# - undefined fields, or fields without any word character, give "".
# - a field with two or more comma separated components gives the first
#   character of each of its first two components.
# - a single component gives its first character followed by the next
#   consonant found after it; if there is no such consonant, its first two
#   characters are used instead.
sub process_field {
    my ($self, $field) = @_;

    return "" if (!defined ($field) || $field !~ /\w/);

    my @parts = split (/,/, $field);
    if (@parts > 1) {
        my $abbreviation = "";
        foreach my $part (@parts[0, 1]) {
            (my $initial = $part) =~ s/^(.).*$/$1/;
            $abbreviation .= $initial;
        }
        return $abbreviation;
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    if (!defined ($first) || !defined ($second)) {
        ($first, $second) = $field =~ /^(.)(.)/;
    }
    return "$first$second";
}
525
# Produces an alternative directory name for $index after a collision.
#
# $namehash maps, per part ('index', 'subcollection', 'languages'), a
# processed name to the original value it was derived from. The parts are
# examined in that order; the first one whose current processed name
# ($$indexref, $$subref, $$langref) is registered for a different original
# value is advanced with get_next_version (modifying the referenced
# scalar). At most one part is changed per call.
#
# Returns the concatenation of the three (possibly updated) processed names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $original) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $original) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
540
# Advances the name referenced by $nameref to its next numbered variant,
# modifying it in place:
#   - a name ending in two digits has that number incremented
#     ("ab10" -> "ab11");
#   - a name ending in one digit has the digit incremented, with 9 rolling
#     over to 10 ("ab3" -> "ab4", "ab9" -> "ab10");
#   - any other name has its last character replaced by 0 ("ab" -> "a0").
# Callers loop until the resulting name is unused, so every call has to
# change the name.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1;
        $num++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        if ($num == 9) {
            # only one trailing digit exists in this branch, so a two-digit
            # pattern could never match here; replace the single 9
            $$nameref =~ s/\d$/10/;
        } else {
            $num++;
            $$nameref =~ s/\d$/$num/;
        }
    } else {
        $$nameref =~ s/.$/0/;
    }
}
556
# Builds a single index.
#
# $index is an index description ("fields:subcollection:languages"); its
# directory name is taken from $self->{'index_mapping'}, so build_indexes
# (or create_index_mapping) must have run first.
#
# Steps, each using the external mgpp programs:
#   mgpp_passes -I1        creates the index dictionary
#   mgpp_perf_hash_build   creates the perfect hash function
#   mgpp_passes -I2        inverts the text
#   mgpp_weights_build     creates the weights file
#   mgpp_invf_dict         creates the 'on-disk' stemmed dictionary
#   mgpp_stem_idx -s1..3   creates the stem indexes
# followed by make_final_field_list and a listing of files whose suffix is
# not in %wanted_index_files. In debug mode no program is run and the
# document stream is written to STDOUT instead.
#
# Dies if a required executable is missing, a pipe cannot be opened, or the
# index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names and possibly the doc name for mgpp_passes
    my $doc_level = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level) {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $wanted (@languages) {
        # a leading "!" negates the language
        my $not = 0;
        if ($wanted =~ s/^\!//) {
            $not = 1;
        }
        foreach my $lang (@{$self->{'collect_cfg'}->{'languages'}}) {
            if ($lang eq $wanted) {
                if ($not) {
                    push (@$indexexparr, "!Language/$wanted/");
                } else {
                    push (@$indexexparr, "Language/$wanted/");
                }
                last;
            }
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);

    # The handle is passed around by name (a plain string), which is what the
    # buildproc receives; the second pass re-opens the same named handle.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_weights_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_stem_idx_exe") {
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # define the final field lists
        $self->make_final_field_list();

        # report unwanted files. The actual removal is commented out, so
        # nothing is deleted; the files are only listed at verbosity > 2.
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir ($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
}
744
# Creates the collection's info database and processes associated files.
#
# The database text is piped into txt2db, writing
# <build_dir>/text/<collection>.ldb on little endian machines and .bdb
# otherwise (in debug mode the text goes to STDOUT and txt2db is not run).
# Written, in order:
#   - the [collection] entry: collection metadata, the index field names
#     and the level names;
#   - whatever the buildproc emits in 'infodb' mode for the documents read
#     by the plugins;
#   - the classification information;
#   - a [browselist] entry listing the documents.
#
# Dies if txt2db is missing or cannot be started.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so
    # (ie if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }
    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor; the handle is passed around by name
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});

    $self->{'buildproc'}->reset();

    # do the collection info
    print $handle "[collection]\n";

    # first do the collection meta stuff - everything without a dot
    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            next if ($cmeta =~ /^\./); # for now, ignore ones with dots
            my ($metadata_entry) = $self->create_language_db_map($cmeta, $cmeta);
            # write the entry to the file
            print $handle $metadata_entry;
        }
    }

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta (".<fieldname>"). if that is not
    # defined, the metadata name is used
    my $field_entry = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # a value of 1 marks a reserved name rather than a field tag
        next if $shortfield eq 1;

        # we need to check if some coll meta has been defined
        my $collmeta = ".$longfield";
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            $field_entry .= $self->create_language_db_map($collmeta, $shortfield);
        } else { # use the metadata names, or the text macros for allfields and textonly
            if ($longfield eq "allfields") {
                $field_entry .= "<$shortfield>_query:textallfields_\n";
            } elsif ($longfield eq "text") {
                $field_entry .= "<$shortfield>_query:texttextonly_\n";
            } else {
                $field_entry .= "<$shortfield>$longfield\n";
            }
        }
    }
    print $handle $field_entry;

    # now add the level names
    my $level_entry = "";
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $collmeta = ".$level"; # based on the level name as currently stored
        $level =~ tr/A-Z/a-z/; # make it lower case (modifies the config entry in place)
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            $level_entry .= $self->create_language_db_map($collmeta, $levelid);
        } else {
            # use the default macro
            $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
        }
    }
    print $handle $level_entry;

    # end the collection entry
    print $handle "\n" . ('-' x 70) . "\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    # output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";", @doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar(@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close ($handle) if !$self->{'debug'};
}
878
# Builds the info database lines for one piece of collection metadata.
#
# $metaname selects the entry in collect_cfg->{'collectionmeta'}; its keys
# are language specifiers. $mapname is the tag name to emit.
#
# Result (a string of newline terminated lines):
#   "<mapname>value"       for a specifier containing "default" (the value
#                          stored under the key 'default'), always placed
#                          first;
#   "<mapname:xx>value"    for each specifier of the form "[l=xx]".
# If no default specifier exists, the value of the first specifier iterated
# (hash order) is used as the default line.
sub create_language_db_map {
    my ($self, $metaname, $mapname) = @_;

    my $collmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    my @specifiers = keys (%{$collmeta->{$metaname}});

    # the "default default" is whatever entry comes first
    my $fallback = "";
    $fallback = $collmeta->{$metaname}->{$specifiers[0]} if (@specifiers);

    my $have_default = 0;
    my $entries = "";
    foreach my $lang (@specifiers) {
        if ($lang =~ /default/) {
            # the default entry goes first
            $have_default = 1;
            $entries = "<$mapname>" . $collmeta->{$metaname}->{'default'} . "\n" . $entries;
            next;
        }

        my ($code) = $lang =~ /^\[l=(\w*)\]$/;
        if ($code) {
            $entries .= "<$mapname:$code>" . $collmeta->{$metaname}->{$lang} . "\n";
        }
    }

    # if we haven't found a default, put one in
    $entries = "<$mapname>$fallback\n" . $entries unless ($have_default);

    return $entries;
}
# Does nothing in this class beyond unpacking its invocant.
# NOTE(review): presumably a hook for collection specific builders to
# override - confirm against the callers.
# The original's implicit return value (the invocant) is kept.
sub collect_specific {
    my ($self) = @_;
    return $self;
}
918
# At the end of building, the buildproc's indexfieldmap holds all the
# field mappings (plus some extras), and its indexfields holds the fields
# that were actually indexed. From these, and from the index definition in
# the collection configuration, this makes
#   build_cfg->{'indexfields'}   - the ordered list of indexed fields
#   build_cfg->{'indexfieldmap'} - the matching "field->TAG" strings
# which are used for the build.cfg file and the collection meta definition.
#
# Fields are taken in the order given in the index definition. The special
# name "metadata" expands to every field the buildproc indexed that is not
# itself named in the definition; "text" and "allfields" always map to TX
# and ZZ.
sub make_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # every field named in the index definition, in order, plus a lookup so
    # that the expansion of "metadata" can skip fields that are already named
    my @specified_order = map { split (',', $_) } @{$self->{'collect_cfg'}->{'indexes'}};
    my %specified = map { ($_ => 1) } @specified_order;

    my @indexfieldmap = ();
    my @indexfields = ();

    foreach my $field (@specified_order) {
        my @names = ($field);
        my %fixed_tag = ('text' => 'TX', 'allfields' => 'ZZ');

        if ($field eq "metadata") {
            # expand to the indexed fields that were not explicitly named;
            # these always take their tag from the buildproc's map
            @names = grep { !defined $specified{$_} }
                     keys %{$self->{'buildproc'}->{'indexfields'}};
            %fixed_tag = ();
        }

        foreach my $name (@names) {
            my $tag = exists $fixed_tag{$name}
                ? $fixed_tag{$name}
                : $self->{'buildproc'}->{'indexfieldmap'}->{$name};
            push (@indexfieldmap, $name . "->" . $tag);
            push (@indexfields, "$name");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
969
970
# Recreate the field list from an existing build.cfg file.  We look first in
# the build directory, then in $GSDLCOLLECTDIR/index.  If there is no
# build.cfg in either place we can't do the field list (there is unlikely to
# be any index anyway), and $self->{'build_cfg'} is left as an empty hash.
#
# Otherwise $self->{'build_cfg'}->{'indexfields'} and ->{'indexfieldmap'}
# are set to copies of the lists read from the file, and every well-formed
# "field->shortname" mapping is also stored in the buildproc's indexfieldmap.
# Mapping entries without "->" are kept in the list but are not added to the
# buildproc's map.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$mapping");
            # only a "name->shortname" entry can go into the lookup map;
            # without this check a malformed entry would be stored under
            # an empty key
            if ($mapping =~ /^(.*)\-\>(.*)$/) {
                $self->{'buildproc'}->{'indexfieldmap'}->{$1} = $2;
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
# Assemble the build information and write it to build.cfg in the build
# directory.
#
# Starts from $self->{'build_cfg'} when it is defined (it already includes
# indexfieldmap and indexfields) and adds: builddate, buildtype, indexlevels,
# textlevel, numdocs, numbytes, indexmap, subcollectionmap and languagemap
# (the last two only when non-empty) and notbuilt.  Level names are
# translated through the file-level %level_map.
sub make_auxiliary_files {
    my $self = shift (@_);

    my $build_cfg = {};
    # this already includes indexfieldmap and indexfields
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # the build directory is where build.cfg goes (mk_all_dir presumably
    # creates it if it does not exist yet)
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp"; #do we need this??

    # store the level info
    # NB: plain $level_map{...} lookups - the old %level_map->{...} form
    # is a fatal error on newer perls
    my @indexlevels = ();
    foreach my $level (@{$self->{'levelorder'}}) {
        push (@indexlevels, $level_map{$level});
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;

    if ($self->{'levels'}->{'section'}) {
        $build_cfg->{'textlevel'} = $level_map{'section'};
    } else {
        $build_cfg->{'textlevel'} = $level_map{'document'};
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my $index_mapping = $self->{'index_mapping'};

    my @indexmap = ();
    foreach my $index (@{$index_mapping->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$index_mapping->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$index_mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $index_mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$index_mapping->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $index_mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # write out the build information; the path is built the same way as
    # in read_final_field_list
    &cfgread::write_cfg_file(&util::filename_cat($self->{'build_dir'}, "build.cfg"),
                             $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes|textlevel)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels)$');

}
1077
# No-op in this class: only unpacks the invocant and does nothing else.
sub deinit {
    my $self = shift;
}
1081
# Print a short statistics report for the current pass to the output handle:
# which pass it was (indexing or text compression), the total number of
# bytes in the collection and the number of bytes processed for this index.
# When fewer than 50 bytes were processed - and text was expected, i.e. we
# are indexing or 'no_text' is not set - a warning banner is printed too.
sub print_stats {
    my $self = shift (@_);

    my $out = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $is_indexing = $buildproc->get_indexing_text();
    my $index_name = $buildproc->get_index();
    my $total_bytes = $buildproc->get_num_bytes();
    my $processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $is_indexing
        ? "Stats (Creating index $index_name)\n"
        : "Stats (Compressing text from $index_name)\n";
    print $out $heading;
    print $out "Total bytes in collection: $total_bytes\n";
    print $out "Total bytes in $index_name: $processed_bytes\n";

    # text is expected unless this is a compression pass with no_text set
    my $text_expected = ($is_indexing || !$self->{'no_text'});

    if ($processed_bytes < 50 && $text_expected) {
        my $warning = $is_indexing
            ? "WARNING: There is very little or no text to process for $index_name\n"
            : "WARNING: There is very little or no text to compress\n";

        foreach my $line ("***************\n",
                          $warning,
                          " Was this your intention?\n",
                          "***************\n") {
            print $out $line;
        }
    }

}
1111
11121;
1113
1114
Note: See TracBrowser for help on using the repository browser.