source: main/tags/2.30/gsdl/perllib/mgppbuilder.pm@ 24168

Last change on this file since 24168 was 1772, checked in by kjm18, 23 years ago

removed Paragraph stuff - now only has Document and Section; added </Document>

tags at end of docs; the metadata fields that are indexed are written to

indexfields line in build.cfg

  • Property svn:keywords set to Author Date Id Revision
File size: 30.0 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGPPBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
35
# Switch STDOUT and STDERR to autoflush while this module is loaded so that
# output from the mg tools does not get out of sync with plugin output.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Turn autoflush back off on both streams when the interpreter shuts down.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
47
# Package variable holding a maximum document size figure. It is only
# mentioned in comments elsewhere in this file ("-b $maxdocsize").
$maxdocsize = 12000;

# Suffixes of the index files that are worth keeping.
# TODO (original author's note): this list needs updating.
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
61
62
# Constructor for an mgppbuilder object.
#
# Positional arguments:
#   $class              - class to bless into
#   $collection         - collection name (used in messages and to look for
#                         a "<collection>buildproc.pm")
#   $source_dir         - directory handed to the plugins
#   $build_dir          - directory the build is written to
#   $verbosity          - message level
#   $maxdocs            - passed through to plugin::begin / plugin::read
#   $debug, $keepold    - flags stored on the object
#   $allclassifications - stored on the object
#   $outhandle          - handle for messages (defaults to STDERR)
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and by language, loads plugins and classifiers, records the
# "dontgdbm" fields and creates the build-time document processor.
#
# Dies if collect.cfg does not exist, if no plugins were loaded, or if the
# document processor cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;

    # a glob reference works everywhere a handle is expected and does not
    # rely on barewords being turned into strings
    $outhandle = \*STDERR unless defined $outhandle;

    # create an mgppbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'notbuilt'           => []    # indexes not built
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes, then language subindexes: each
    # configured list multiplies the current index list, appending
    # ":<entry>" to every index
    # NOTE(review): when only 'languages' is configured the language ends up
    # as the second ":" component, which create_index_mapping reads as the
    # subcollection - confirm this is intended.
    foreach my $listname ('indexsubcollections', 'languages') {
        next unless defined $self->{'collect_cfg'}->{$listname};

        my $baseindexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $entry (@{$self->{'collect_cfg'}->{$listname}}) {
            foreach my $index (@$baseindexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$entry");
            }
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $field (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$field} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # direct class-method call: no string eval, and no indirect-object
    # syntax that could be mistaken for a call to this package's own new()
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
162
# Prepares the build directory. Unless the object is in debug mode or was
# asked to keep the old build, the build directory is removed, recreated,
# and given a "text" subdirectory.
sub init {
    my $self = shift (@_);

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds and start with an empty directory
        &util::rm_r ($build_dir);
        &util::mk_all_dir ($build_dir);

        # make the text directory
        &util::mk_all_dir ("$build_dir/text");
    }
}
176
# Builds the compressed text and a single index for the collection directly
# in the build directory, using two passes of mg_passes (-T1 -I1, then
# -T2 -I2) followed by the weights, stemmed dictionary and stem index tools.
#
# Arguments:
#   $textindex - only echoed in the first diagnostic line
#   $indexname - index specification handed to the document processor; if
#                it contains no word characters a fixed default is used
#
# In debug mode the documents are written to STDOUT instead of being piped
# to mg_passes, and the compression dictionary / perfect hash steps are
# skipped.
# NOTE(review): the weights, dictionary and stem index tools are still run
# (and still die if missing) in debug mode - confirm that is intended.
#
# Dies if a required executable is missing or a pipe cannot be opened.
# A tool or pipe that reports failure only produces a warning on the
# object's output handle.
sub build_collection {
    my $self = shift (@_);
    my ($textindex, $indexname) = @_;

    my $outhandle = $self->{'outhandle'};
    my $verbose = ($self->{'verbosity'} >= 1);

    print $outhandle "build_col, textindex=$textindex, indexname=$indexname\n";
    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    my $exe = &util::get_os_exe ();

    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "$self->{'collection'}";

    # text and index files both live directly in the build directory, so
    # both prefixes get the same Windows path conversion
    my $fulltextprefix = $self->{'build_dir'};
    my $fullindexprefix = $self->{'build_dir'};
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
        $fullindexprefix =~ s/\//\\/g;
    }

    # runs an external tool; a non-zero status is reported, not fatal
    my $run_tool = sub {
        my ($command) = @_;
        my $status = system ($command);
        if ($status != 0) {
            print $outhandle "mgppbuilder::build_collection - warning: " .
                "command returned status $status: $command\n";
        }
    };

    # indexname got from command line arg. if not specified, its "", so use
    # ones stated in cfg file
    my $indexes = [];
    if (!(defined $indexname && $indexname =~ /\w/)) {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
        # NOTE(review): hard-coded field list rather than anything derived
        # from the configured indexes - looks like a development leftover
        $indexname = "Title,Organization,Magazine,text";
    }
    else {
        push @$indexes, $indexname;
    }
    print $outhandle "indexes are: @$indexes\n";

    print $outhandle "\n*** mg_passes: first pass\n" if $verbose;
    print $outhandle "fulltextprefix=$fulltextprefix\n";
    # carry out the first pass of mg_passes
    print $outhandle "\n    collecting text statistics\n" if $verbose;

    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -K Section -T1 -I1 -d $fulltextprefix -f $basefilename")) {
            die "mgppbuilder::build_collection - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # Assume that only going to build one index for now. so index will be
    # anything specified in cfg file
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($indexname);
    $self->{'buildproc'}->set_indexing_text (1); # not used at the moment I think
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # close the first-pass pipe exactly once (and never STDOUT)
    if (!$self->{'debug'}) {
        close ($handle) or print $outhandle "mgppbuilder::build_collection - " .
            "warning: first pass of mg_passes did not exit cleanly\n";
    }

    if (!$self->{'debug'}) {
        # create the compression dictionary
        print $outhandle "\n    creating the compression dictionary\n" if $verbose;
        if (!-e "$mg_compression_dict_exe") {
            die "mgppbuilder::build_collection - couldn't run $mg_compression_dict_exe\n";
        }
        $run_tool->("$mg_compression_dict_exe -d $fulltextprefix -f $basefilename");

        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgppbuilder::build_collection - couldn't run $mg_perf_hash_build_exe\n";
        }
        $run_tool->("$mg_perf_hash_build_exe -d $fullindexprefix -f $basefilename");

        # second pass: compress the text and invert it
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -K Section -d $fulltextprefix -f $basefilename -T2 -I2")) {
            die "mgppbuilder::build_collection - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();

    print $outhandle "\n    compressing the text\n" if $verbose;
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    if (!$self->{'debug'}) {
        close ($handle) or print $outhandle "mgppbuilder::build_collection - " .
            "warning: second pass of mg_passes did not exit cleanly\n";
    }

    # create the weights file
    print $outhandle "\n    create the weights file\n" if $verbose;
    if (!-e "$mg_weights_build_exe") {
        die "mgppbuilder::build_collection - couldn't run $mg_weights_build_exe\n";
    }
    $run_tool->("$mg_weights_build_exe -d $fullindexprefix -f $basefilename ");

    # create 'on-disk' stemmed dictionary
    print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if $verbose;
    if (!-e "$mg_invf_dict_exe") {
        die "mgppbuilder::build_collection - couldn't run $mg_invf_dict_exe\n";
    }
    $run_tool->("$mg_invf_dict_exe -d $fullindexprefix -f $basefilename");

    # creates stem index files for the various stemming methods
    print $outhandle "\n    creating stem indexes\n" if $verbose;
    if (!-e "$mg_stem_idx_exe") {
        die "mgppbuilder::build_collection - couldn't run $mg_stem_idx_exe\n";
    }
    foreach my $stem_method (1, 2, 3) {
        $run_tool->("$mg_stem_idx_exe -b 4096 -s$stem_method -d $fullindexprefix -f $basefilename");
    }
}
325
326
# For mgpp with more than one index: creates the compressed text under
# "<build_dir>/text/<collection>" with two text passes of mg_passes (-T1,
# then -T2) and a compression dictionary built between them.
#
# Arguments:
#   $textindex - index specification handed to the document processor
#
# In debug mode the documents are written to STDOUT and no external tool is
# run. Dies if a required executable is missing or a pipe cannot be opened.
# A tool or pipe that reports failure only produces a warning on the
# object's output handle. Finishes by calling print_stats.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};
    my $verbose = ($self->{'verbosity'} >= 1);

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $builddir = $self->{'build_dir'};
    my $basefilename = "text/$self->{'collection'}";

    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $basefilename =~ s/\//\\/g;
        $builddir =~ s/\//\\/g;
    }

    print $outhandle "\n*** creating the compressed text\n" if $verbose;

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics\n" if $verbose;

    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -K Section -d $builddir -f $basefilename -T1")) {
            die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    if (!$self->{'debug'}) {
        # close the statistics pipe exactly once (and never STDOUT)
        close ($handle) or print $outhandle "mgppbuilder::compress_text - " .
            "warning: first pass of mg_passes did not exit cleanly\n";

        # create the compression dictionary
        # the compression dictionary is built by assuming the stats are from a
        # seed dictionary (-S), if a novel word is encountered it is spelled
        # out (-H), and the resulting dictionary must be less than 5 meg with
        # the most frequent words being put into the dictionary first
        # (-2 -k 5120)
        # note: these options are left over from the mg version
        print $outhandle "\n    creating the compression dictionary\n" if $verbose;
        if (!-e "$mg_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        my $dict_command = "$mg_compression_dict_exe -d $builddir -f $basefilename -S -H -2 -k 5120";
        my $dict_status = system ($dict_command);
        if ($dict_status != 0) {
            print $outhandle "mgppbuilder::compress_text - warning: " .
                "command returned status $dict_status: $dict_command\n";
        }

        # reopen the pipe for the compression pass
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -K Section -f $basefilename -d $builddir -T2")) {
            die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text\n" if $verbose;
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    if (!$self->{'debug'}) {
        close ($handle) or print $outhandle "mgppbuilder::compress_text - " .
            "warning: second pass of mg_passes did not exit cleanly\n";
    }

    $self->print_stats();
}
413
# Decides whether an index should be built.
#
# Each entry of the collection's 'dontbuild' configuration list is used as
# a pattern that must match the whole index description. On the first
# match the index's mapped directory name is recorded in the object's
# 'notbuilt' list and 0 is returned; otherwise 1 is returned.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $pattern (@$patterns) {
        next unless $index =~ /^$pattern$/;

        # remember which directory was skipped
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
429
# Builds the indexes for the collection.
#
# Arguments:
#   $indexname - a single index description to build; when it is undefined
#                or contains no word characters the indexes listed in the
#                collection configuration are used instead
#
# Stores the index-description-to-directory mapping on the object, then
# calls build_index for every index that want_built accepts.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    my $verbose = ($self->{'verbosity'} >= 1);
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if $verbose;
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if $verbose;
        $self->build_index($index);
    }
}
460
# Creates directory names for each of the index descriptions.
#
# Arguments:
#   $indexes - reference to a list of index descriptions of the form
#              "fields[:subcollection[:languages]]"
#
# Returns a reference to a hash holding
#   <index description>     => directory name
#   'indexmap'              => { fields        => processed name }
#   'subcollectionmap'      => { subcollection => processed name }
#   'languagemap'           => { languages     => processed name }
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                           => lists recording first-seen order
#
# Directory names are the lower-cased output of process_field for each
# component, joined together; clashes are resolved through make_unique.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # one private table per component; seeding these with '' would make all
    # three resolve to the same symbolic package hash
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    # descriptions already given a directory in this call
    my %done = ();

    foreach my $index (@$indexes) {
        # a repeated description keeps the directory it already has (asking
        # for another unique name for it could never succeed)
        next if $done{$index};

        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my ($pindex) = $self->process_field($fields);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the language component of this index
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
        $done{$index} = 1;
    }

    return \%mapping;
}
522
# Returns a processed (abbreviated) version of a field.
#
# If the field has only one comma-separated component the result is its
# first character followed by the next consonant of the field, falling back
# to its first two characters when no consonant follows. If it has two or
# more components the result is the first character of each of the first
# two components. A one-character field is returned as it is.
#
# Returns "" when the field is undefined or contains no word character.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # first character of each of the first two components
        my @initials = @components[0, 1];
        foreach my $part (@initials) {
            $part =~ s/^(.).*$/$1/;
        }
        return join("", @initials);
    }

    # first character plus the next consonant ...
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    # ... or simply the first two characters
    ($first, $second) = $field =~ /^(.)(.)/
        unless defined $first && defined $second;
    return "$first$second" if defined $first && defined $second;

    # only one character available: use it rather than an empty name
    return $field =~ /^(.)/ ? $1 : "";
}
545
# Produces another candidate directory name for an index whose current
# name clashes with one already in use.
#
# Arguments:
#   $namehash  - hash with 'index', 'subcollection' and 'languages' tables
#                mapping processed names to the values they were made from
#   $index     - index description "fields[:subcollection[:languages]]"
#   $indexref, $subref, $langref
#              - references to the processed name of each component; one of
#                them is advanced in place with get_next_version
#
# The first component whose processed name is registered for a different
# value is the one advanced. If every component is registered for exactly
# this index's values, the index component is advanced so that the returned
# name always differs from the clashing one.
#
# Returns the three processed names joined together.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    if ($namehash->{'index'}->{$$indexref} ne "$fields") {
        $self->get_next_version ($indexref);
    } elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $self->get_next_version ($subref);
    } elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $self->get_next_version ($langref);
    } else {
        # nothing distinguishes this index from the registered one; still
        # change the name, otherwise the caller would loop forever
        $self->get_next_version ($indexref);
    }
    return "$$indexref$$subref$$langref";
}
560
# Advances a processed name, in place, to its next numbered version.
#
# Arguments:
#   $nameref - reference to the name to change
#
# A name ending in digits has that number incremented ("t0" -> "t1",
# "t9" -> "t10", "t10" -> "t11"). Any other name has its last character
# replaced by "0" ("tt" -> "t0"); an empty name becomes "0".
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d+)$/) {
        # string increment, so a leading zero survives ("00" -> "01")
        my $num = $1;
        $num++;
        $$nameref =~ s/\d+$/$num/;
    } elsif (length ($$nameref)) {
        $$nameref =~ s/.$/0/;
    } else {
        # nothing to replace: start numbering from scratch
        $$nameref = "0";
    }
}
576
# Builds one index in its own subdirectory of the build directory.
#
# Arguments:
#   $index - index description "fields[:subcollection...]"; its directory
#            name is looked up in the object's 'index_mapping'
#
# Two passes of mg_passes are made (-I1 to build the index dictionary, -I2
# to invert the text) with mg_perf_hash_build between them, followed by the
# weights, stemmed dictionary and stem index tools. print_stats is called
# after each pass. In debug mode the documents are written to STDOUT and no
# external tool is run.
#
# Dies if a required executable is missing or a pipe cannot be opened.
# A tool that returns a non-zero status only produces a warning on the
# object's output handle.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbose = ($self->{'verbosity'} >= 1);

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $builddir = $self->{'build_dir'};

    my $basefilename = &util::filename_cat ($indexdir,
                                            $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";

    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $builddir =~ s/\//\\/g;
        $basefilename =~ s/\//\\/g;
    }

    # runs an external tool; a non-zero status is reported, not fatal
    my $run_tool = sub {
        my ($command) = @_;
        my $status = system ($command);
        if ($status != 0) {
            print $outhandle "mgppbuilder::build_index - warning: " .
                "command returned status $status: $command\n";
        }
    };

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add an expression for every language listed in the collection
    # configuration. Work on a copy of each entry: stripping the leading
    # "!" from the configuration itself would turn the entry into its
    # opposite for every later call.
    my $languages = $self->{'collect_cfg'}->{'languages'};
    foreach my $entry (defined $languages ? @$languages : ()) {
        my $language = $entry;
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary\n" if $verbose;
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -K Section -d $builddir -f $basefilename -I1")) {
            die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        $run_tool->("$mg_perf_hash_build_exe -d $builddir -f $basefilename");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -K Section -d $builddir -f $basefilename -I2")) {
            die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text\n" if $verbose;

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if $verbose;
        if (!-e "$mg_weights_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        $run_tool->("$mg_weights_build_exe -d $builddir -f $basefilename");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if $verbose;
        if (!-e "$mg_invf_dict_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        $run_tool->("$mg_invf_dict_exe -d $builddir -f $basefilename");

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if $verbose;
        if (!-e "$mg_stem_idx_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        foreach my $stem_method (1, 2, 3) {
            $run_tool->("$mg_stem_idx_exe -b 4096 -s$stem_method -d $builddir -f $basefilename");
        }

        # remove unwanted files (disabled in the original source)
#       my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
#       opendir (DIR, $tmpdir) || die
#           "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
#       foreach $file (readdir(DIR)) {
#           next if $file =~ /^\./;
#           my ($suffix) = $file =~ /\.([^\.]+)$/;
#           if (defined $suffix && !defined $wanted_index_files{$suffix}) {
#               # delete it!
#               print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
#               &util::rm (&util::filename_cat ($tmpdir, $file));
#           }
#       }
#       closedir (DIR);
    }
}
725
# Creates the collection's info database and processes associated files.
#
# Everything the document processor and the classifiers write is piped to
# txt2db, which is given "<build_dir>/text/<collection>.ldb" on a
# little-endian machine and ".bdb" otherwise. In debug mode the text is
# written to STDOUT instead. When the collection configuration defines
# 'collectionmeta', a "[collection]" record is written first.
#
# Dies if txt2db does not exist or the pipe cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # directories for the database and for associated files
    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s{/}{\\}g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # choose where the database text goes
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        # index directory names are needed for the ".<index>" entries
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
        foreach my $metaname (keys (%$collectionmeta)) {
            if ($metaname !~ /^\./) {
                # ordinary collection metadata
                print $handle "<$metaname>$collectionmeta->{$metaname}\n";
                next;
            }

            # a name starting with "." refers to an index description
            my $indexdesc = substr ($metaname, 1);
            if (defined $self->{'index_mapping'}->{$indexdesc}) {
                print $handle "<$self->{'index_mapping'}->{$indexdesc}>" .
                    $collectionmeta->{".$indexdesc"} . "\n";
                print $outhandle "have .section entry in collect file\n";
            } else {
                print $outhandle "mgppbuilder: warning bad collectionmeta option '$indexdesc' - ignored\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
806
# Placeholder: takes the object off the argument list and does no work in
# this class.
sub collect_specific {
    my $self = shift;
}
810
# Writes "<build_dir>/build.cfg", which describes the finished build.
#
# Recorded: build date, build type ("mgpp"), number of documents and bytes
# reported by the document processor, the index / subcollection / language
# name-to-directory maps (in first-seen order), the indexes that were not
# built and the indexed metadata fields.
#
# The configuration is collected in a fresh hash on every call, so nothing
# carries over from a previous call.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp";

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my $index_mapping = $self->{'index_mapping'};

    my @indexmap = ();
    foreach my $index (@{$index_mapping->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" . $index_mapping->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only written when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$index_mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $index_mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$index_mapping->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $index_mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # store the indexed field information
    foreach my $field (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
        push (@{$build_cfg->{'indexfields'}}, $field);
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt|indexfields)$');

}
865
# Placeholder: takes the object off the argument list and does no work in
# this class.
sub deinit {
    my $self = shift;
}
869
# Prints statistics from the document processor to the object's output
# handle: which index was processed, the total bytes in the collection and
# the bytes processed for the index. A warning block is added when fewer
# than 50 bytes were processed.
sub print_stats {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    # the heading depends on whether an index or the text was being built
    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50) {
        my $consequence = $indexing_text
            ? "This may cause an error while attempting to build the index\n"
            : "This may cause an error while attempting to compress the text\n";

        print $outhandle "***************\n";
        print $outhandle "WARNING: There is very little or no text to process for $index\n";
        print $outhandle $consequence;
        print $outhandle "***************\n";
    }

}
899
9001;
901
902
Note: See TracBrowser for help on using the repository browser.