source: other-projects/trunk/protemix/perllib/ptmxbuilder.pm@ 14162

Last change on this file since 14162 was 3172, checked in by sjboddie, 20 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 31.2 KB
Line 
1###########################################################################
2#
3# ptmxbuilder.pm -- ptmxbuilder object (a variant of mgppbuilder.pm)
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# ptmxbuilder is identical to mgppbuilder except for a few new entries in
27# %static_indexfield_map
28
29package ptmxbuilder;
30
31use classify;
32use cfgread;
33use colcfg;
34use plugin;
35use util;
36use FileHandle;
37
38
# Switch autoflush on for STDOUT and STDERR as soon as the module is
# compiled, so that output from mg does not get out of sync with output
# from the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off for both streams when the interpreter exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
50
# Document size limit. No code in this file reads it; a comment in
# compress_text refers to it as the "-b" limit for mgpp_passes.
$maxdocsize = 12000;

# Suffixes of the index files that build_index considers "wanted" when it
# scans an index directory after a build.
%wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 i il w wa);

# Map from long metadata field names to their two-letter index codes.
# change this so a user can add their own ones in via a file or cfg
%static_indexfield_map = (
    'Title'        => 'TI',
    'Subject'      => 'SU',
    'Creator'      => 'CR',
    'Organization' => 'OR',
    'Source'       => 'SO',
    'Howto'        => 'HT',
    'ItemTitle'    => 'IT',
    'ProgNumber'   => 'PN',
    'People'       => 'PE',
    'Class1'       => 'CL',
    'Class2'       => 'CA',
    'Class3'       => 'CS',
    'TextOnly'     => 'TX',
);

# Every two-letter code is itself a key of the map, with the value 1.
{
    my @short_codes = values %static_indexfield_map;
    @static_indexfield_map{@short_codes} = (1) x scalar(@short_codes);
}
94
# Constructor.
#
# Arguments (after the class name): collection name, source directory,
# build directory, verbosity, maxdocs, debug flag, keepold flag,
# allclassifications flag, output handle (defaults to STDERR) and the
# no_text flag (defaults to 0).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and language, removes duplicate indexes, records the
# indexing levels, loads the plugins and classifiers, and instantiates the
# document processor (a collection-specific "<collection>buildproc" if one
# exists under $GSDLCOLLECTDIR/perllib, otherwise mgppbuildproc).
#
# Dies if collect.cfg is missing, if no plugins could be loaded, or if the
# document processor cannot be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    # The handle is stored as the plain name "STDERR"; build_index compares
    # the stored value against that string.
    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text   = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'no_text'            => $no_text,
        'notbuilt'           => [],    # indexes not built
        'indexfieldmap'      => \%static_indexfield_map,
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n"
        if (!-e $colcfgname);
    my $cfg = &colcfg::read_collect_cfg ($colcfgname);
    $self->{'collect_cfg'} = $cfg;

    # sort out subcollection indexes: one "index:subcollection" entry for
    # every (subcollection, index) pair
    if (defined $cfg->{'indexsubcollections'}) {
        my $plain_indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$plain_indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes; when there are no subcollections an
    # empty subcollection field is inserted (hence the double colon)
    if (defined $cfg->{'languages'}) {
        my $plain_indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$plain_indexes) {
                my $separator =
                    defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
                push (@{$cfg->{'indexes'}}, $index . $separator . $language);
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %seen = ();
    my @all_indexes = @{$cfg->{'indexes'}};
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @all_indexes ];

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    if (defined $cfg->{'levels'}) {
        foreach my $level (@{$cfg->{'levels'}}) {
            $self->{'levels'}->{$level} = 1;
        }
    }

    # get the list of plugins for this collection and load them all
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them all
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dontgdbm_field (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dontgdbm_field} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my $buildprocdir  = "$ENV{'GSDLHOME'}/perllib";
    my $buildproctype = "mgppbuildproc";
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # The class name is only known at run time, so the constructor call is
    # assembled as a string; it refers to the lexicals above by name.
    eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
         "\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
    die "$@" if $@;

    return $self;
}
223
# Prepares the build directory. Unless the builder is in debug or keepold
# mode, any previous build is removed and the build directory and its
# "text" subdirectory are (re)created.
sub init {
    my ($self) = @_;

    if (!($self->{'debug'} || $self->{'keepold'})) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$self->{'build_dir'}/text");
    }
}
237
# Records the strip_html setting on the builder and forwards it to the
# document processor. Returns whatever the document processor returns.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_strip_html($strip);
}
245
# Creates the compressed text for the collection in <build_dir>/text.
#
# $textindex is handed to the document processor as the index to take the
# text from. Three steps are run:
#   1. mgpp_passes -T1 collects the text statistics,
#   2. mgpp_compression_dict builds the compression dictionary,
#   3. mgpp_passes -T2 compresses the text.
# In debug mode no external program is started; the document processor
# writes both passes to STDOUT instead.
#
# Dies if a required executable is missing from
# $GSDLHOME/bin/$GSDLOS or if a pipe to mgpp_passes cannot be opened.
sub compress_text {
    my ($self, $textindex) = @_;

    my $outhandle = $self->{'outhandle'};

    # Full paths are used for the existence checks and error messages; the
    # commands themselves are started by bare name.
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = $self->{'levels'}->{'Section'} ? "-K Section " : "";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # ---- pass 1: collect the statistics for the text ----
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);

    # Handles are passed around by name (as the rest of this module does).
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'ptmxbuilder::PIPEOUT';
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # The pipe is closed exactly once (STDOUT is left open in debug mode).
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # ---- dictionary + pass 2 set-up ----
    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the (named) pipe handle for the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    # ---- pass 2: compress the text ----
    $buildproc->reset();
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
348
# Decides whether $index should be built.
#
# Each entry of the "dontbuild" list in collect.cfg is used as a pattern
# anchored at both ends. On the first match the directory name mapped to
# the index is appended to $self->{'notbuilt'} and 0 is returned.
# Returns 1 when nothing matches or no "dontbuild" list is configured.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
364
# Builds the indexes of the collection.
#
# If $indexname contains at least one word character only that index is
# built; otherwise every index listed in collect.cfg is considered.
# The description-to-directory mapping is (re)created first and stored in
# $self->{'index_mapping'}; indexes rejected by want_built are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
392
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "fields[:subcollection[:languages]]". Returns a reference to a hash
# containing:
#   <index description>       => directory name for that index
#   <fields>, <subcollection>,
#   <languages>               => their processed (short) names
#   indexmap / subcollectionmap / languagemap
#                             => hashes from each part to its short name
#   indexmaporder / subcollectionmaporder / languagemaporder
#                             => the parts in order of first appearance
# Short names come from process_field; directory-name collisions are
# resolved by repeatedly calling make_unique.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # Short names handed out so far, per part. Each slot must be its own
    # hash: initialising them with '' would (without strict refs) make all
    # three dereference one and the same symbolic global hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my ($pindex) = $self->process_field($fields);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        # (make_unique updates $pindex/$psub/$plang through the references)
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the language part of THIS index description
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
463
# Returns a processed (abbreviated) version of a field.
#
# - An undefined field, or one without any word character, gives "".
# - A field with two or more comma-separated components gives the first
#   character of each of the first two components.
# - Otherwise the result is the first character of the field followed by
#   the next consonant after it; if there is no such consonant the first
#   two characters are used instead.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;

    if (scalar @components < 2) {
        # single component: first character plus the next consonant
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        unless (defined $first && defined $second) {
            ($first, $second) = $field =~ /^(.)(.)/;
        }
        return "$first$second";
    }

    # several components: initials of the first two
    my @initials = @components[0, 1];
    foreach my $component (@initials) {
        $component =~ s/^(.).*$/$1/;
    }
    return join("", @initials);
}
486
# Produces a new candidate directory name for $index after a collision.
#
# $namehash holds, under the keys 'index', 'subcollection' and
# 'languages', the short names already handed out and the full part each
# belongs to. The first short name (checked in that order) that is
# registered for a different part than the one in $index is advanced with
# get_next_version; at most one is changed per call. The short names are
# passed by reference and modified in place. Returns their concatenation.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        ['index',         $indexref, "$fields"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $part (@parts) {
        my ($kind, $shortref, $full) = @$part;
        if ($namehash->{$kind}->{$$shortref} ne $full) {
            $self->get_next_version ($shortref);
            last;
        }
    }

    return $$indexref . $$subref . $$langref;
}
501
# Advances the short name referenced by $nameref to its next version,
# modifying it in place:
#   - a name ending in two digits has that two-digit number incremented,
#   - a name ending in one digit has that digit incremented (9 becomes 10),
#   - any other name has its last character replaced by "0".
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # Only ONE trailing digit is present here, so it is that single
        # digit that has to be replaced - also when rolling over from 9 to
        # 10. (Substituting on two digits would never match and leave the
        # name unchanged, so the caller's collision loop could not end.)
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
517
# Builds one index.
#
# $index is an index description ("fields[:subcollections[:languages]]");
# its directory is looked up in $self->{'index_mapping'}. The steps are:
#   1. mgpp_passes -I1        (index dictionary)
#   2. mgpp_perf_hash_build   (perfect hash function)
#   3. mgpp_passes -I2        (inverting the text)
#   4. mgpp_weights_build, mgpp_invf_dict, mgpp_stem_idx (-s1, -s2, -s3)
#   5. a scan of the index directory for files with unwanted suffixes
# In debug mode no external program is started; both passes are written
# to STDOUT and steps 2, 4 and 5 are skipped.
#
# Dies if a required executable is missing from $GSDLHOME/bin/$GSDLOS, if
# a pipe cannot be opened, or if the index directory cannot be read.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});
    # computed as in the original code; not referenced again below
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff: full paths of the tools (used for the
    # existence checks and error messages; commands are run by bare name)
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my %tool_path = ();
    foreach my $tool (qw(mgpp_passes mgpp_perf_hash_build mgpp_weights_build
                         mgpp_invf_dict mgpp_stem_idx)) {
        $tool_path{$tool} = &util::filename_cat ($exedir, "$tool$exe");
    }

    # define the section names for mgpasses
    my $mgpp_passes_sections = join ("",
        map  { "-K $_ " }
        grep { $_ eq "Section" || $_ eq "Paragraph" }
        keys (%{$self->{'levels'}}));

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my (undef, $subcollection_spec, $language_spec) = split (":", $index);

    my @subcollections = defined $subcollection_spec
        ? split (/,/, $subcollection_spec) : ();
    foreach my $subcollection (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcollection};
        push (@$indexexparr, $expression) if defined ($expression);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = defined $language_spec ? split (/,/, $language_spec) : ();
    foreach my $wanted (@languages) {
        # a leading "!" negates the language expression
        my $negated = ($wanted =~ s/^\!//) ? 1 : 0;
        foreach my $configured (@{$self->{'collect_cfg'}->{'languages'}}) {
            next unless $configured eq $wanted;
            push (@$indexexparr,
                  $negated ? "!Language/$wanted/" : "Language/$wanted/");
            last;
        }
    }

    # common part of both mgpp_passes command lines
    my $passes_pipe = "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\"";

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);

    # Handles are passed around by name (as the rest of this module does).
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$tool_path{'mgpp_passes'}" ||
            !open (PIPEOUT, "$passes_pipe -I1 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $tool_path{'mgpp_passes'}\n";
        }
        $handle = 'ptmxbuilder::PIPEOUT';
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$tool_path{'mgpp_perf_hash_build'}") {
            die "mgppbuilder::build_index - couldn't run $tool_path{'mgpp_perf_hash_build'}\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the (named) pipe handle for the second pass
        if (!-e "$tool_path{'mgpp_passes'}" ||
            !open ($handle, "$passes_pipe -I2 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $tool_path{'mgpp_passes'}\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    # everything below needs the external tools
    return if $self->{'debug'};

    close ($handle);

    # create the weights file
    print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$tool_path{'mgpp_weights_build'}") {
        die "mgppbuilder::build_index - couldn't run $tool_path{'mgpp_weights_build'}\n";
    }
    system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

    # create 'on-disk' stemmed dictionary
    print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$tool_path{'mgpp_invf_dict'}") {
        die "mgppbuilder::build_index - couldn't run $tool_path{'mgpp_invf_dict'}\n";
    }
    system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra");

    # creates stem index files for the various stemming methods
    print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$tool_path{'mgpp_stem_idx'}") {
        die "mgppbuilder::build_index - couldn't run $tool_path{'mgpp_stem_idx'}\n";
    }
    foreach my $stem_method (1, 2, 3) {
        system ("mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
    }

    # remove unwanted files
    # NOTE: the actual removal is commented out, so files with an unwanted
    # suffix are only reported (at verbosity > 2), not deleted.
    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
    opendir (my $dirhandle, $tmpdir) || die
        "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
    foreach my $file (readdir($dirhandle)) {
        next if $file =~ /^\./;
        my ($suffix) = $file =~ /\.([^\.]+)$/;
        next unless defined $suffix;
        next if defined $wanted_index_files{$suffix};
        # delete it!
        print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
        #&util::rm (&util::filename_cat ($tmpdir, $file));
    }
    closedir ($dirhandle);
}
699
# Creates the collection information database and processes associated
# files.
#
# Everything is written as text to a pipe into txt2db (or to STDOUT in
# debug mode): a "[collection]" record with the collectionmeta entries and
# the index field macros (only if collect.cfg defines collectionmeta), the
# records produced by the document processor in 'infodb' mode, the
# classification information and finally a "[browselist]" record listing
# all documents.
#
# Dies if txt2db is missing or the pipe to it cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name (extension depends on the byte order of the machine)
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s@/@\\@g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
    if (scalar(keys %{$buildproc->{'indexfieldmap'}}) == 0) {
        # check build.cfg to see if indexfields have been filled in
        my $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building/build.cfg");
        if (-e $buildconfigfile) {
            my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);
            if (defined $buildcfg->{'indexfields'}) {
                foreach my $indexed_field (@{$buildcfg->{'indexfields'}}) {
                    $buildproc->{'indexfields'}->{$indexed_field} = 1;
                }
            }
            if (defined $buildcfg->{'indexfieldmap'}) {
                # entries have the form "<field>-><code>"
                foreach my $map_entry (@{$buildcfg->{'indexfieldmap'}}) {
                    my ($from, $to) = $map_entry =~ /^(.*)\-\>(.*)$/;
                    $buildproc->{'indexfieldmap'}->{$from} = $to;
                }
            }
        }
    }

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    # Handles are passed around by name (as the rest of this module does).
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'ptmxbuilder::PIPEOUT';
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});

    $buildproc->reset();

    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    if (defined $collectionmeta) {

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $meta_key (keys (%$collectionmeta)) {
            # Keys starting with "." refer to an index, subcollection or
            # language and are written under their mapped (short) name;
            # all other keys are written under their own name.
            my $tagname = "";
            my $bare_key = $meta_key;
            if ($bare_key =~ s/^\.//) {
                if (!defined $self->{'index_mapping'}->{$bare_key}) {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$bare_key' - ignored\n";
                    next; #ignore this one
                }
                $tagname = $self->{'index_mapping'}->{$bare_key};
            }
            else {
                $tagname = $meta_key; # just using the same name
            }

            my $values = $collectionmeta->{$meta_key};
            my $defaultfound = 0;
            my $is_first = 1;
            my $default = "";
            my $metadata_entry = "";

            #iterate through the languages
            foreach my $lang_key (keys (%$values)) {
                if ($is_first) {
                    $is_first = 0;
                    #set the default default to the first entry
                    $default = $values->{$lang_key};
                }
                if ($lang_key =~ /default/) {
                    $defaultfound = 1;
                    #the default entry goes first
                    $metadata_entry = "<$tagname>" .
                        $values->{'default'} . "\n" . $metadata_entry;
                    next;
                }
                # language specific entries are keyed as "[l=xx]"
                my ($lang_code) = $lang_key =~ /^\[l=(\w*)\]$/;
                if ($lang_code) {
                    $metadata_entry .= "<$tagname:$lang_code>" .
                        $values->{$lang_key} . "\n";
                }
            }
            #if we haven't found a default, put one in
            unless ($defaultfound) {
                $metadata_entry = "<$tagname>$default\n" . $metadata_entry;
            }
            #write the entry to the file
            print $handle $metadata_entry;
        }

        #add the indexfieldmap macros to [collection]
        # eg <TI>Title
        # <SU>Subject
        # these may be overidden for other langs if add to macro files
        my $field_entry = "";
        foreach my $longfield (keys %{$buildproc->{'indexfieldmap'}}) {
            my $shortfield = $buildproc->{'indexfieldmap'}->{$longfield};
            # entries with the value 1 are the short codes themselves
            next if $shortfield eq 1;
            $field_entry .= "<$shortfield>$longfield\n";
        }
        print $handle $field_entry;

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    #output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";", @doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar(@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close ($handle) if !$self->{'debug'};
}
862
# Does nothing beyond reading its invocant, which is also the value the
# sub evaluates to.
sub collect_specific {
    my $self = $_[0];
}
866
# Writes <build_dir>/build.cfg, the file describing the finished build.
#
# The file records the build date and type, the number of documents and
# bytes reported by the document processor, the index / subcollection /
# language maps (as "name->shortname" strings, in the order stored in
# $self->{'index_mapping'}), the list of indexes that were not built, and
# the indexed fields together with their short codes.
sub make_auxiliary_files {
    my ($self) = @_;

    # A fresh configuration hash for every call. (Collecting the values in
    # a package-global hash would let 'indexfields' grow, and stale
    # subcollection/language maps survive, from one call to the next.)
    my $build_cfg = {};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp";

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my $index_mapping = $self->{'index_mapping'};

    my @indexmap = ();
    foreach my $index (@{$index_mapping->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" . $index_mapping->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$index_mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $index_mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$index_mapping->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $index_mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # store the indexfieldmap information
    my $indexfields   = $self->{'buildproc'}->{'indexfields'};
    my @indexfieldmap = ();
    #add all fields bit - sort based on keys. text only is put at front
    if (defined $indexfields->{'TextOnly'}) {
        push (@indexfieldmap, "TextOnly->TX");
    }
    foreach my $field (sort keys %$indexfields) {
        next if $field eq "TextOnly";
        push (@indexfieldmap, $field . "->" .
              $self->{'buildproc'}->{'indexfieldmap'}->{$field});
    }
    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;

    #store the indexed field information
    foreach my $field (sort keys %$indexfields) {
        push (@{$build_cfg->{'indexfields'}}, $field);
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');
}
933
# Does nothing beyond reading its invocant, which is also the value the
# sub evaluates to.
sub deinit {
    my $self = $_[0];
}
937
# Prints the statistics of the pass that has just finished to the
# builder's output handle: which index was created (or compressed from),
# the total number of bytes in the collection and the number of bytes
# processed. A warning block follows when fewer than 50 bytes were
# processed, unless this was a compression pass with text storage
# switched off (no_text).
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $headline = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $headline;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # nothing to warn about?
    return unless ($num_processed_bytes < 50);
    return unless ($indexing_text || !$self->{'no_text'});

    my $warning = $indexing_text
        ? "WARNING: There is very little or no text to process for $index\n"
        : "WARNING: There is very little or no text to compress\n";

    print $outhandle "***************\n";
    print $outhandle $warning;
    print $outhandle " Was this your intention?\n";
    print $outhandle "***************\n";
}
967
9681;
969
970
Note: See TracBrowser for help on using the repository browser.