source: trunk/gsdl/perllib/mgppbuilder.pm@ 1852

Last change on this file since 1852 was 1852, checked in by kjm18, 23 years ago

heaps of changes

  • Property svn:keywords set to Author Date Id Revision
File size: 26.4 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# While this module is loaded both standard streams are unbuffered, so that
# output produced by the mg tools does not get out of sync with output
# produced by the plugins.
BEGIN {
    STDERR->autoflush(1);
    STDOUT->autoflush(1);
}

# Switch autoflush back off for both streams when the interpreter exits.
END {
    STDERR->autoflush(0);
    STDOUT->autoflush(0);
}
47
# A comment in compress_text refers to this as the maximum document size
# (the mg_passes -b value); nothing else in this file reads it.
$maxdocsize = 12000;

# Suffixes of the files that are kept in an index directory.  build_index
# deletes every file in that directory whose suffix is not listed here.
%wanted_index_files = map { ($_ => 1) }
    qw(td t tl ti idb ib1 ib2 ib3 i il tw w wa);

# Mapping from metadata field names to their two-letter index field codes.
# Every code is also present as a key of its own (with the value 1).
# change this so a user can add their own ones in via a file or cfg
%static_indexfield_map = ();
{
    my @name_code_pairs = (
        'Title'        => 'TI',
        'Subject'      => 'SU',
        'Creator'      => 'CR',
        'Organization' => 'OR',
        'Source'       => 'SO',
        'Howto'        => 'HT',
        'ItemTitle'    => 'IT',
        'ProgNumber'   => 'PN',
        'People'       => 'PE',
        'TextOnly'     => 'TX',
    );
    while (my ($name, $code) = splice (@name_code_pairs, 0, 2)) {
        $static_indexfield_map{$name} = $code;
        $static_indexfield_map{$code} = 1;
    }
}
86
# Construct an mgppbuilder for one collection.
#
# Arguments (after the class name): collection name, source directory,
# build directory, verbosity, maximum number of documents, debug flag,
# keepold flag, allclassifications flag and an optional output handle for
# progress messages (STDERR when omitted).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, combines the configured indexes
# with the configured subcollections and languages, loads the plugins and
# classifiers named in the configuration and creates the document processor
# (a collection specific <collection>buildproc when one exists in the
# collection's perllib directory, mgppbuildproc otherwise).
#
# Dies when collect.cfg is missing, when no plugin was loaded or when the
# document processor cannot be created.
sub new {
    my $class = shift (@_);
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;

    $outhandle = STDERR unless defined $outhandle;

    # create an mgppbuilder object
    my $self = {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'notbuilt'           => [],     # indexes not built
        'indexfieldmap'      => \%static_indexfield_map,
    };
    bless $self, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    -e $colcfgname
        or die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # Replaces the configured index list by every "index:suffix"
    # combination, grouped by suffix.
    my $expand_indexes = sub {
        my ($suffixes) = @_;
        my $plain_indexes = $self->{'collect_cfg'}->{'indexes'};
        my @expanded = ();
        for my $suffix (@$suffixes) {
            for my $plain_index (@$plain_indexes) {
                push (@expanded, "$plain_index:$suffix");
            }
        }
        $self->{'collect_cfg'}->{'indexes'} = \@expanded;
    };

    # sort out subcollection indexes first, then language subindexes
    for my $option ('indexsubcollections', 'languages') {
        my $suffixes = $self->{'collect_cfg'}->{$option};
        $expand_indexes->($suffixes) if defined $suffixes;
    }

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        $self->{'levels'}->{$_} = 1 for @{$self->{'collect_cfg'}->{'levels'}};
    }

    # load all the plugins listed for this collection
    my $plugins = defined $self->{'collect_cfg'}->{'plugin'}
        ? $self->{'collect_cfg'}->{'plugin'}
        : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins);
    unless (scalar (@{$self->{'pluginfo'}})) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # load all the classifiers listed for this collection
    my $classifiers = defined $self->{'collect_cfg'}->{'classify'}
        ? $self->{'collect_cfg'}->{'classify'}
        : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        $self->{'dontgdbm'}->{$_} = 1 for @{$self->{'collect_cfg'}->{'dontgdbm'}};
    }

    # load up the document processor for building: a buildproc class
    # created for this collection wins over the standard mgpp buildproc
    my $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
    my $buildproctype = "mgppbuildproc";
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # the evaluated text refers to the lexicals above by name
    eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
         "\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
    die "$@" if $@;

    return $self;
}
195
# Prepare the build directory.  Unless the builder is in debug mode or was
# asked to keep the old build, the build directory is removed, recreated and
# given a fresh "text" subdirectory.
sub init {
    my ($self) = @_;

    unless ($self->{'debug'} || $self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
209
# Record the strip_html setting on the builder and pass it on to the
# document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
217
# Create the compressed text for the collection.
#
# $textindex is handed to the document processor as the index to process.
#
# The documents are read twice: once piped into "mg_passes -T1" to collect
# the text statistics and once piped into "mg_passes -T2" to compress the
# text; mg_compression_dict is run between the two passes.  In debug mode
# no external program is started and the document processor writes to
# STDOUT instead.  Dies when one of the executables is missing or a pipe
# cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $builddir = $self->{'build_dir'};
    my $basefilename = "text/$self->{'collection'}";

    # mgpp can't work on windows at the moment, so no path separator
    # conversion is done on $builddir and $basefilename.

    # define the section names for mg_passes
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mg_passes_sections = "";
    if ($self->{'levels'}->{'Section'}) {
        $mg_passes_sections .= "-K Section ";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mg_passes -T1)\n" if ($self->{'verbosity'} >= 1);

    # The handle is passed around by name so that the document processor,
    # which lives in another package, can print to it.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections -d $builddir -f $basefilename -T1")) {
            die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # one close is enough: $handle names PIPEOUT
    close ($handle) unless $self->{'debug'};

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("$mg_compression_dict_exe -d $builddir -f $basefilename -S -H -2 -k 5120");

        # The second pass gets the same -K options as the first one.
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe $mg_passes_sections -f $basefilename -d $builddir -T2")) {
            die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mg_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
314
# Decide whether $index should be built.
#
# Returns 0 when $index matches (as an anchored regular expression) one of
# the "dontbuild" entries of the collection configuration; in that case the
# directory name mapped to the index is also appended to the builder's
# "notbuilt" list.  Returns 1 otherwise.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    for my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
330
# Build either the single index named by $indexname (when it contains at
# least one word character) or every index listed in the collection
# configuration.  The index-to-directory mapping is recreated for the
# chosen indexes first; indexes rejected by want_built are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    for my $wanted (@$indexes) {
        unless ($self->want_built($wanted)) {
            print $outhandle "\n*** ignoring index $wanted\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            my $subdir = $self->{'index_mapping'}->{$wanted};
            print $outhandle "\n*** building index $wanted in subdirectory " . "$subdir\n";
        }
        $self->build_index($wanted);
    }
}
358
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "fields[:subcollection[:languages]]".
#
# Returns a reference to a hash which holds
#   - for every index description its (unique) directory name,
#   - 'indexmap', 'subcollectionmap' and 'languagemap': the processed short
#     name of each distinct fields / subcollection / languages component,
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder': the
#     components in the order in which they were first seen.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # Short names handed out so far, one separate table per component.
    # (Initialising these with '' made all three act as symbolic
    # references to one and the same package hash.)
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my ($pindex) = $self->process_field($fields);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the component that was just mapped
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
420
# Returns a short, processed version of a field description.
#
# An undefined field, or one without any word character, gives "".
# A field with two or more comma separated components gives the first
# character of each of its first two components.  A field with a single
# component gives its first character followed by the next consonant that
# occurs in it, or its first two characters when no consonant follows.
sub process_field {
    my ($self, $field) = @_;

    unless (defined ($field) && $field =~ /\w/) {
        return "";
    }

    my @components = split /,/, $field;

    if (scalar @components < 2) {
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        unless (defined $first && defined $second) {
            ($first, $second) = $field =~ /^(.)(.)/;
        }
        return "$first$second";
    }

    # only the first two components contribute
    my @leading = @components[0, 1];
    foreach my $component (@leading) {
        $component =~ s/^(.).*$/$1/;
    }
    return join("", @leading);
}
443
# Called when the directory name built from the three short names collides
# with one already in use.
#
# $namehash holds, per component kind ('index', 'subcollection',
# 'languages'), the description each short name was handed out for.  The
# first short name (in that order) which is recorded for a different
# description than the matching part of $index is moved on to its next
# version through get_next_version.  $indexref, $subref and $langref are
# references to the short names and are changed in place.
#
# Returns the three short names joined together.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @checks = (
        ['index',         $indexref, "$fields"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    for my $check (@checks) {
        my ($kind, $nameref, $description) = @$check;
        if ($namehash->{$kind}->{$$nameref} ne $description) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
458
# Move the short name referenced by $nameref on to its next version, in
# place:
#   - a name ending in two digits has that two digit number incremented,
#   - a name ending in a single digit has that digit incremented, with 9
#     rolling over to 10,
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # string increment, so a leading zero ("05" -> "06") is kept
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # Only one trailing digit is present here, so exactly that digit
        # is replaced; this is what lets 9 become 10 instead of leaving
        # the name unchanged.
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
474
# Build one index.
#
# $index is an index description ("fields[:subcollection...]"); its
# directory name is looked up in the builder's index mapping.
#
# The documents are read twice: once piped into "mg_passes -I1" to create
# the index dictionary and once piped into "mg_passes -I2" to invert the
# text.  mg_perf_hash_build runs between the passes; mg_weights_build,
# mg_invf_dict and mg_stem_idx (for stem methods 1 to 3) run afterwards.
# Finally every file in the index directory whose suffix is not listed in
# %wanted_index_files is deleted.  In debug mode no external program is
# started and the document processor writes to STDOUT.  Dies when an
# executable is missing, a pipe cannot be opened or the index directory
# cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $builddir = $self->{'build_dir'};

    my $basefilename = &util::filename_cat ($indexdir,
                                            $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";

    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");

    # define the section names for mg_passes
    my $mg_passes_sections = "";
    foreach my $level (keys (%{$self->{'levels'}})) {
        if ($level eq "Section" || $level eq "Paragraph") {
            $mg_passes_sections .= "-K $level ";
        }
    }

    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    # (no path separator conversion for windows is done here)

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for the configured languages
    # NOTE(review): every configured language is added for every index;
    # the language part of $index is not consulted - confirm this is
    # intended.
    my $cfg_languages = $self->{'collect_cfg'}->{'languages'};
    $cfg_languages = [] unless defined $cfg_languages;
    foreach my $cfg_language (@$cfg_languages) {
        # Work on a copy: the loop variable aliases the configuration
        # entry, and stripping the "!" there would lose the negation for
        # every index built after this one.
        my $language = $cfg_language;
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mg_passes -I1)\n" if ($self->{'verbosity'} >= 1);

    # The handle is passed around by name so that the document processor,
    # which lives in another package, can print to it.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections -d $builddir -f $basefilename -I1")) {
            die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -d $builddir -f $basefilename");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe $mg_passes_sections -d $builddir -f $basefilename -I2")) {
            die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mg_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("$mg_weights_build_exe -d $builddir -f $basefilename");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("$mg_invf_dict_exe -d $builddir -f $basefilename");

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("$mg_stem_idx_exe -b 4096 -s1 -d $builddir -f $basefilename");
        system ("$mg_stem_idx_exe -b 4096 -s2 -d $builddir -f $basefilename");
        system ("$mg_stem_idx_exe -b 4096 -s3 -d $builddir -f $basefilename");

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
633
# Create the collection information database and process associated files.
#
# Makes sure the "text" and "assoc" directories exist below the build
# directory, initialises the classifiers and pipes the following into
# txt2db (or prints it to STDOUT in debug mode): a [collection] record with
# the collection metadata and the index field mapping (only when the
# configuration has collectionmeta), the records produced by the document
# processor in 'infodb' mode, and the classification information.
# Dies when txt2db is missing or cannot be started.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name; the extension depends on the byte order of the machine
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # choose where the database records go
    my $handle = STDOUT;
    unless ($self->{'debug'}) {
        unless (-e "$txt2db_exe" && open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});

    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

        unless (defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        for my $metaname (keys (%$collectionmeta)) {
            my $metavalue = $collectionmeta->{$metaname};

            if (substr ($metaname, 0, 1) ne '.') {
                print $handle "<$metaname>$metavalue\n";
                next;
            }

            # names starting with a dot describe an index and are stored
            # under the directory name of that index
            my $indexname = substr ($metaname, 1);
            my $indexdir = $self->{'index_mapping'}->{$indexname};
            if (defined $indexdir) {
                print $handle "<$indexdir>" . $metavalue . "\n";
                print $outhandle "have .section entry in collect file\n";
            } else {
                print $outhandle "mgppbuilder: warning bad collectionmeta option '$indexname' - ignored\n";
            }
        }

        # print out the indexfield mapping
        for my $field (keys (%{$self->{'indexfieldmap'}})) {
            $shortname = $self->{'indexfieldmap'}->{$field};
            print $handle "<$shortname>$field\n";
        }
        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
721
# Does nothing in this class beyond taking the object off the argument list.
# NOTE(review): presumably a hook for collection specific builders - confirm.
sub collect_specific {
    my $self = shift;
}
725
# Write the build configuration file "build.cfg" into the build directory.
#
# The file records the build date and type ("mgpp"), the number of
# documents and bytes reported by the document processor, the index,
# subcollection and language mappings created by create_index_mapping, the
# indexes that were not built, and the index fields seen by the document
# processor together with their mapping.
sub make_auxiliary_files {
    my $self = shift (@_);

    # A fresh structure for every call.  (This used to be an undeclared
    # package variable, so list entries such as 'indexfields' piled up
    # when the method was called more than once.)
    my $build_cfg = {};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp";

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, $index . '->' . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . '->' .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . '->' .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    foreach my $field (keys %{$self->{'buildproc'}->{'indexfields'}}) {
        push (@indexfieldmap, $field . '->' . $self->{'buildproc'}->{'indexfieldmap'}->{$field});
    }
    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;

    # store the indexed field information (the key stays absent when no
    # field was indexed)
    my @indexfields = sort keys %{$self->{'buildproc'}->{'indexfields'}};
    $build_cfg->{'indexfields'} = \@indexfields if scalar (@indexfields);

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');
}
788
# Counterpart of init; there is nothing to clean up in this class.
sub deinit {
    my $self = shift;
}
792
# Print the byte counts reported by the document processor for the pass
# that has just finished to the builder's output handle, followed by a
# warning when fewer than 50 bytes were processed.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50) {
        my $consequence = $indexing_text
            ? "This may cause an error while attempting to build the index\n"
            : "This may cause an error while attempting to compress the text\n";

        print $outhandle "***************\n";
        print $outhandle "WARNING: There is very little or no text to process for $index\n";
        print $outhandle $consequence;
        print $outhandle "***************\n";
    }

}
822
8231;
824
825
Note: See TracBrowser for help on using the repository browser.