source: tags/gsdl-2_30d-distribution/gsdl/perllib/mgppbuilder.pm@ 2308

Last change on this file since 2308 was 1917, checked in by kjm18, 23 years ago

minor changes

  • Property svn:keywords set to Author Date Id Revision
File size: 26.8 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
BEGIN {
    # Turn autoflush on for both standard streams so that output written
    # by mg does not get out of sync with output written by the plugins.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}
42
END {
    # Switch autoflush back off on both standard streams at exit.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# Maximum document size setting. It is not read by any code visible in
# this file.
our $maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been
# built; build_index deletes files carrying any other suffix.
our %wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 i il tw w wa);

# Maps metadata field names to their two-letter index field codes.
# Every code is also entered as a key in its own right (with value 1).
# TODO: change this so a user can add their own ones in via a file or cfg
our %static_indexfield_map = map { ($_->[0] => $_->[1], $_->[1] => 1) } (
    ['Title',        'TI'],
    ['Subject',      'SU'],
    ['Creator',      'CR'],
    ['Organization', 'OR'],
    ['Source',       'SO'],
    ['Howto',        'HT'],
    ['ItemTitle',    'IT'],
    ['ProgNumber',   'PN'],
    ['People',       'PE'],
    ['TextOnly',     'TX'],
);
86
# Creates an mgppbuilder object for a collection.
#
# Arguments (positional):
#   $class              - class to bless the new object into
#   $collection         - collection name
#   $source_dir         - directory the plugins read documents from
#   $build_dir          - directory the build is written to
#   $verbosity          - verbosity level for progress messages
#   $maxdocs            - passed through to plugin::begin / plugin::read
#   $debug              - when true, output goes to STDOUT instead of
#                         being piped into the mgpp executables
#   $keepold            - when true, init() leaves an existing build alone
#   $allclassifications - passed to classify::output_classify_info
#   $outhandle          - handle for progress messages (default STDERR)
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes
# by subcollection and by language, removes duplicate indexes, loads the
# plugins and classifiers, and creates the document processor: a
# collection-specific "<collection>buildproc" if one exists under
# $GSDLCOLLECTDIR/perllib, otherwise mgppbuildproc from $GSDLHOME/perllib.
#
# Dies if collect.cfg is missing, if no plugins were loaded, or if the
# buildproc module cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;

    $outhandle = STDERR unless defined $outhandle;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'notbuilt'=>[],    # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes: every index is built once per
    # subcollection
    if (defined $cfg->{'indexsubcollections'}) {
        my $indexes = $cfg->{'indexes'} || [];
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index is built once per language
    if (defined $cfg->{'languages'}) {
        my $indexes = $cfg->{'indexes'} || [];
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$language");
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'} || []} ];

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    if (defined $cfg->{'levels'}) {
        foreach my $level (@{$cfg->{'levels'}}) {
            $self->{'levels'}->{$level} = 1;
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # A class-method call on the class name held in $buildproctype. This
    # needs no string eval and cannot be mis-parsed as a call to this
    # package's own new(); any error from the constructor propagates.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
207
# Prepares the build directory. Unless the builder is in debug mode or
# was asked to keep the old build, the build directory is removed and
# recreated, together with its "text" subdirectory.
sub init {
    my $self = shift;

    my $preserve_old = $self->{'debug'} || $self->{'keepold'};
    if (!$preserve_old) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
221
# Records the strip_html setting on this builder and forwards it to the
# document processor.
#   $strip - the setting to store and pass on
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
229
# Builds the compressed text for the collection in two mg_passes runs
# (-T1 collects text statistics, -T2 compresses), with
# mg_compression_dict run in between to create the compression
# dictionary.
#
#   $textindex - index description handed to the buildproc's set_index
#
# In debug mode nothing is executed: the document processor writes to
# STDOUT instead of to a pipe into mg_passes.
# Dies if one of the executables is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $builddir = $self->{'build_dir'};
    my $basefilename = "text/$self->{'collection'}";

    # mgpp can't work on windows at the moment, so no translation of
    # "/" to "\" is done on $basefilename or $builddir here.

    # define the section names for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mg_passes_sections = "";
    if ($self->{'levels'}->{'Section'}) {
        $mg_passes_sections .= "-K Section ";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics (mg_passes -T1)\n" if ($self->{'verbosity'} >= 1);

    # The handle is given to the buildproc by name; the second pass
    # reopens the same PIPEOUT handle, so the buildproc keeps writing
    # to the right place without being told again.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections -d $builddir -f $basefilename -T1")) {
            die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # closing the pipe waits for the -T1 pass to finish
    close ($handle) unless $self->{'debug'};

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("$mg_compression_dict_exe -d $builddir -f $basefilename -S -H -2 -k 5120");

        # The second pass gets the same section options as the first.
        # (This used to interpolate a variable that was never defined,
        # which silently dropped "-K Section" from the -T2 command.)
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe $mg_passes_sections -f $basefilename -d $builddir -T2")) {
            die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text (mg_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
326
# Decides whether an index should be built.
#   $index - index description to check
# Each "dontbuild" entry of collect.cfg is used as a pattern that must
# match the whole description. On a match the index's directory name is
# appended to the "notbuilt" list and 0 is returned; otherwise 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
342
# Builds either one named index or every index listed in collect.cfg.
#   $indexname - optional; when it contains a word character only this
#                index is built
# The mapping from index descriptions to directory names is recreated
# first and stored in $self->{'index_mapping'}. Indexes rejected by
# want_built are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $single = (defined $indexname && $indexname =~ /\w/);
    my $indexes = $single ? [$indexname] : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        $self->build_index($index);
    }
}
370
# Creates directory names for each of the index descriptions.
#
#   $indexes - reference to a list of index descriptions of the form
#              "fields[:subcollection[:languages]]"
#
# Returns a reference to a hash that holds
#   <index description>       => directory name
#   'indexmap'                => { fields        => processed name }
#   'subcollectionmap'        => { subcollection => processed name }
#   'languagemap'             => { languages     => processed name }
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                             => the keys of those maps in the order
#                                they were first seen
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # One hash per name component, recording which original value owns
    # each processed name. These must be three separate hash references:
    # initialising them to '' made all three resolve to the same
    # symbolic-reference hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my ($pindex) = $self->process_field($fields);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the same key that was stored in languagemap
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
432
# Returns a processed (abbreviated) version of a field.
#   $field - comma-separated list of components
# If the field has two or more components the result is the first
# character of each of the first two components. If it has only one
# component the result is its first character followed by the next
# consonant, or by the second character when no later consonant exists.
# A field that is a single character is returned as that character.
# Returns "" when $field is undefined or contains no word character.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # work on copies of the first two components
        my @initials = @components[0, 1];
        foreach my $initial (@initials) {
            $initial =~ s/^(.).*$/$1/;
        }
        return join("", @initials);
    }

    my ($first, $next) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $next) = $field =~ /^(.)(.)/ unless defined $first && defined $next;

    # a field of a single character has no second character to add
    ($first) = $field =~ /^(.)/ unless defined $first;
    $first = "" unless defined $first;
    $next = "" unless defined $next;

    return "$first$next";
}
455
# Changes one component of a colliding directory name.
#   $namehash  - reference to { 'index' => {...}, 'subcollection' => {...},
#                'languages' => {...} }, mapping processed names to the
#                values they were created from
#   $index     - index description "fields[:subcollection[:languages]]"
#   $indexref, $subref, $langref
#              - references to the three processed name components;
#                one of them is modified in place
# The first component whose processed name is recorded for a different
# original value is moved on to its next version. If none differs, the
# index component is moved on anyway so that a caller looping until the
# name is unique always makes progress.
# Returns the three components joined together.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    # Compare missing values as "". This is what "ne" did with undef.
    my $same_as = sub {
        my ($recorded, $wanted) = @_;
        $recorded = "" unless defined $recorded;
        $wanted = "" unless defined $wanted;
        return $recorded eq $wanted;
    };

    if (!$same_as->($namehash->{'index'}->{$$indexref}, $fields)) {
        $self->get_next_version ($indexref);
    } elsif (!$same_as->($namehash->{'subcollection'}->{$$subref}, $subcollection)) {
        $self->get_next_version ($subref);
    } elsif (!$same_as->($namehash->{'languages'}->{$$langref}, $languages)) {
        $self->get_next_version ($langref);
    } else {
        # every component matches what is already recorded; returning
        # the same name again would loop forever in the caller
        $self->get_next_version ($indexref);
    }
    return "$$indexref$$subref$$langref";
}
470
# Moves a name on to its next version, modifying it in place.
#   $nameref - reference to the name
# A name ending in digits has that number incremented (the last two
# digits when there are two or more, so "t9" becomes "t10" and "t10"
# becomes "t11"). Any other name has its last character replaced by
# "0"; an empty name becomes "0".
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # ++ on the captured string keeps a leading zero ("01" -> "02")
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # Also covers "9" -> "10". The old special case for 9 tried to
        # replace two trailing digits, which can never be present in
        # this branch, and so left the name unchanged.
        my $num = $1; $num ++;
        $$nameref =~ s/\d$/$num/;
    } elsif ($$nameref !~ s/.$/0/) {
        # nothing to replace (empty name): still produce a new name
        $$nameref .= "0";
    }
}
486
# Builds one index in its own subdirectory of the build directory.
#
#   $index - index description "fields[:subcollection]"; its directory
#            name is looked up in $self->{'index_mapping'}
#
# Steps: mg_passes -I1 (index dictionary), mg_perf_hash_build,
# mg_passes -I2 (inversion), mg_weights_build, mg_invf_dict,
# mg_stem_idx for stem methods 1 to 3, then removal of every file in the
# index directory whose suffix is not listed in %wanted_index_files.
#
# In debug mode no executable is run: the document processor writes to
# STDOUT and the method returns after the second read of the documents.
# Dies if an executable is missing, a pipe cannot be opened, or the
# index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $builddir = $self->{'build_dir'};

    my $basefilename = &util::filename_cat ($indexdir,
                                            $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";

    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");

    # define the section names for mgpasses
    my $mg_passes_sections = "";
    foreach my $level (keys (%{$self->{'levels'}})) {
        if ($level eq "Section" || $level eq "Paragraph") {
            $mg_passes_sections .= "-K $level ";
        }
    }

    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $name (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$name})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$name});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection
    # NOTE(review): this adds an expression for every configured
    # language, not only for the language named in $index - confirm
    # that this is intended.
    foreach my $configured (@{$self->{'collect_cfg'}->{'languages'} || []}) {
        # Work on a copy. The loop variable aliases the configuration
        # entry, so stripping the "!" in place removed it for good and
        # every later index got the opposite expression.
        my $language = $configured;
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mg_passes -I1)\n" if ($self->{'verbosity'} >= 1);

    # The handle is given to the buildproc by name; the -I2 pass reopens
    # the same PIPEOUT handle, so the buildproc keeps writing to the
    # right place without being told again.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections -d $builddir -f $basefilename -I1")) {
            die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -d $builddir -f $basefilename");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe $mg_passes_sections -d $builddir -f $basefilename -I2")) {
            die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mg_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("$mg_weights_build_exe -d $builddir -f $basefilename");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("$mg_invf_dict_exe -d $builddir -f $basefilename");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        foreach my $stem_method (1, 2, 3) {
            system ("$mg_stem_idx_exe -b 4096 -s$stem_method -d $builddir -f $basefilename");
        }

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
}
645
# Creates the collection's info database and processes associated files.
#
# The document processor is switched to 'infodb' mode and its output is
# piped into txt2db, which writes <build_dir>/text/<collection>.ldb on
# little-endian machines and .bdb otherwise. In debug mode the output
# goes to STDOUT and txt2db is not run.
# When collect.cfg has collectionmeta, a "[collection]" entry holding
# that metadata and the index field map is written before the documents.
# The classification information is written after them.
# Dies if txt2db is missing or the pipe cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    my $textdir = &util::filename_cat($build_dir, "text");
    my $assocdir = &util::filename_cat($build_dir, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        my $runnable = -e "$txt2db_exe" && open (PIPEOUT, "| $txt2db_exe $fulldbname");
        if (!$runnable) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});

    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $key (keys (%$collectionmeta)) {
            # a key with a leading "." names an index; it is written
            # under that index's directory name
            my $name = $key;
            if ($name =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$name}) {
                    print $handle "<$self->{'index_mapping'}->{$name}>" .
                        $collectionmeta->{$key} . "\n";
                    print $outhandle "have .section entry in collect file\n";
                } else {
                    print $outhandle "mgppbuilder: warning bad collectionmeta option '$name' - ignored\n";
                }
            } else {
                print $handle "<$name>$collectionmeta->{$name}\n";
            }
        }

        # print out the indexfield mapping
        my $indexfieldmap = $self->{'indexfieldmap'};
        foreach my $field (keys (%$indexfieldmap)) {
            my $shortname = $indexfieldmap->{$field};
            print $handle "<$shortname>$field\n";
        }
        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
733
# Hook for collection-specific processing; does nothing in this class.
sub collect_specific {
    my $self = shift;
}
737
# Writes <build_dir>/build.cfg, which describes the finished build:
# build date and type, number of documents and bytes, the index,
# subcollection and language maps, the indexes that were not built, and
# the indexed fields with their field map.
#
# The document counts and the field information are taken from the
# buildproc; the maps come from $self->{'index_mapping'}.
sub make_auxiliary_files {
    my $self = shift (@_);

    # A fresh hash for every call. The configuration used to be built up
    # in a package global, so a second call in the same process appended
    # the indexed fields to the list left over from the first call.
    my $build_cfg = {};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp";

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    foreach my $field (keys %{$self->{'buildproc'}->{'indexfields'}}) {
        push (@indexfieldmap, "$field\-\>$self->{'buildproc'}->{'indexfieldmap'}->{$field}");
    }
    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;

    # store the indexed field information
    foreach my $field (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
        push (@{$build_cfg->{'indexfields'}}, $field);
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');
}
800
# Counterpart of init; there is nothing to clean up in this class.
sub deinit {
    my $self = shift;
}
804
# Prints the statistics of the pass that has just been run to the
# builder's output handle: the index name, the number of bytes in the
# collection and the number of bytes processed for this index. A warning
# is added when fewer than 50 bytes were processed.
sub print_stats {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $headline = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $headline;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50) {
        my $consequence = $indexing_text
            ? "This may cause an error while attempting to build the index\n"
            : "This may cause an error while attempting to compress the text\n";

        print $outhandle "***************\n";
        print $outhandle "WARNING: There is very little or no text to process for $index\n";
        print $outhandle $consequence;
        print $outhandle "***************\n";
    }

}
834
8351;
836
837
Note: See TracBrowser for help on using the repository browser.