source: trunk/gsdl/perllib/mgbuilder.pm@ 6761

Last change on this file since 6761 was 6584, checked in by kjdon, 20 years ago

Fiddled around with segmenting for Chinese text. Haven't changed how the
segmentation is done, or what character ranges are used.
But when it's done is now controlled by collect.cfg. There is a new
option, separate_cjk, values true or false, default false. Segmentation
is only done if this is set to true. This is passed as a global option to
all plugins by the import.pl script, so the user just needs to add it
once to the config file, not as an option to all plugins.
The queryaction uses this option too, to determine whether or not to segment
the query.

  • Property svn:keywords set to Author Date Id Revision
File size: 32.4 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Turn autoflush on for STDOUT and STDERR while this module is in use so
# that output from mg doesn't get out of sync with output from the plugins.
BEGIN {
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

# Switch autoflush back off when the program finishes.
END {
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# Passed to mg_passes with -b as the maximum document size.
$maxdocsize = 12000;

# File suffixes that are kept in an index directory once the index has
# been built; build_index deletes files with any other suffix.
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Constructor for the MGBuilder object.
#
# Arguments (after the class name): collection name, source directory,
# build directory, verbosity, maxdocs, debug flag, keepold flag,
# allclassifications flag, output handle (defaults to STDERR), no_text
# flag (defaults to 0), fail handle (defaults to STDERR) and gli flag
# (defaults to 0).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, works out the full list of
# indexes, loads the plugins and classifiers and creates the buildproc
# object. Dies if collect.cfg is missing, if no plugins could be loaded
# or if the buildproc object cannot be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    $outhandle  = STDERR unless defined $outhandle;
    $no_text    = 0      unless defined $no_text;
    $failhandle = STDERR unless defined $failhandle;
    $gli        = 0      unless defined $gli;

    # create an mgbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'no_text'            => $no_text,
        'failhandle'         => $failhandle,
        'notbuilt'           => {},    # indexes not built
        'gli'                => $gli,
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n"
        unless -e $colcfgname;
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    $self->{'collect_cfg'}->{'indexes'} = []
        unless defined ($self->{'collect_cfg'}->{'indexes'});
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes: every index is combined with every
    # subcollection, giving "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my @expanded = ();
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            push (@expanded, map { "$_:$subcollection" } @{$cfg->{'indexes'}});
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # sort out language subindexes: every index is combined with every
    # language. If there is no subcollection part, an empty subcollection
    # field is added so the language always ends up in the last field.
    if (defined $cfg->{'languages'}) {
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        my @expanded = ();
        foreach my $language (@{$cfg->{'languages'}}) {
            push (@expanded, map { $_ . $separator . $language } @{$cfg->{'indexes'}});
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence is the one that is kept)
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];

    # no indexes have been specified so we'll build a "dummy:text" index
    push (@{$cfg->{'indexes'}}, "dummy:text")
        if scalar (@{$cfg->{'indexes'}}) == 0;

    # get the list of plugins for this collection
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];

    # build up the extra global options for the plugins
    my @global_opts = ();
    my $separate_cjk = $cfg->{'separate_cjk'};
    if (defined $separate_cjk && $separate_cjk =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts);
    if (scalar (@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    my %dontgdbm = ();
    if (defined ($cfg->{'dontgdbm'})) {
        $dontgdbm{$_} = 1 foreach @{$cfg->{'dontgdbm'}};
    }
    $self->{'dontgdbm'} = \%dontgdbm;

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my $collectperllib = "$ENV{'GSDLCOLLECTDIR'}/perllib";
    my ($buildprocdir, $buildproctype) =
        (-e "$collectperllib/${collection}buildproc.pm")
        ? ($collectperllib, "${collection}buildproc")
        : ("$ENV{'GSDLHOME'}/perllib", "mgbuildproc");
    require "$buildprocdir/$buildproctype.pm";

    # the class name is only known at run time, so call new through it
    eval {
        $self->{'buildproc'} = $buildproctype->new ($collection,
            $source_dir, $build_dir, $verbosity, $outhandle);
    };
    die "$@" if $@;

    return $self;
}
201
# Prepares the build directory. Unless the builder is in debug mode or
# was asked to keep the old build, the existing build directory is removed
# and recreated together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    unless ($self->{'debug'} || $self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r ($build_dir);
        &util::mk_all_dir ($build_dir);

        # make the text directory
        &util::mk_all_dir ("$build_dir/text");
    }
}
215
# Creates the compressed text for the collection.
#
# $textindex is handed to the buildproc as the index to process.
#
# The documents are sent through the plugins twice: first piped to
# "mg_passes -T1" to collect the text statistics, then (after
# mg_compression_dict has built the compression dictionary) piped to
# "mg_passes -T2" to compress the text. In debug mode nothing is run and
# the buildproc output goes to STDOUT instead.
#
# Dies if one of the mg executables is missing or a pipe cannot be opened.
# When the gli flag is set, progress is reported on STDERR as Stage/Phase
# tags.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat ($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat ($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    # maxnumeric from collect.cfg is only used if it is a plain number
    my $maxnumeric = 4;
    if (defined ($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n    collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    # NOTE(review): the existence test uses the full path in $exedir but the
    # command itself is started by bare name - presumably found via PATH.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
            # self-closing tag, the same form as every other FatalError
            # reported by this module
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text (0);
    } else {
        $self->{'buildproc'}->set_store_text (1);
    }
    $self->{'buildproc'}->reset ();
    &plugin::begin ($self->{'pluginfo'}, $self->{'source_dir'},
                    $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end ($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats ();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset ();
    # compress the text
    print $outhandle "\n    compressing the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats ();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
318
# Decides whether $index should be built.
#
# Every entry of the "dontbuild" list in collect.cfg is used as a pattern
# that has to match the whole index name. On the first match the index is
# recorded in $self->{'notbuilt'} and 0 is returned; otherwise 1 is
# returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
335
# Builds the indexes for the collection.
#
# If $indexname is given (and contains a word character) only that index
# is built, otherwise every index listed in collect_cfg->{'indexes'} is
# considered. The mapping from index descriptions to directory names is
# (re)created first; indexes rejected by want_built are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbose = ($self->{'verbosity'} >= 1);

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built ($index)) {
            print $outhandle "\n*** ignoring index $index\n" if $verbose;
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if $verbose;
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index ($index);
    }
}
364
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "level:fields[:subcollection[:languages]]".
#
# Returns a reference to a hash that maps
#   - each full index description to its directory name,
#   - "level:fields", each subcollection and each language to their
#     processed short forms (also collected under 'indexmap',
#     'subcollectionmap' and 'languagemap'),
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder' to
#     lists recording the order in which those entries were first seen.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # pnames remembers, separately for the index, subcollection and language
    # parts, which original value each processed name was created from. The
    # slots have to start out as hash references: starting them as '' makes
    # every later $pnames{...}{...} access a symbolic reference, so the three
    # tables are no longer kept apart.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
436
# Returns a processed (shortened) version of a field.
# If the field has only one component the processed version will contain
# the first character and the next consonant of that component (or simply
# its first two characters if there is no later consonant). Otherwise it
# will contain the first character of each of the first two
# comma-separated components.
# An undefined field, or one without any word character, gives "".
sub process_field {
    my ($self, $field) = @_;

    return "" if !defined ($field) || $field !~ /\w/;

    my @components = split /,/, $field;

    if (scalar @components < 2) {
        # single component: first character plus the next consonant
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        unless (defined $first && defined $second) {
            ($first, $second) = $field =~ /^(.)(.)/;
        }
        return "$first$second";
    }

    # several components: initial character of each of the first two
    my @initials = ();
    foreach my $component (@components[0, 1]) {
        (my $initial = $component) =~ s/^(.).*$/$1/;
        push (@initials, $initial);
    }
    return join ("", @initials);
}
459
# Tries to turn a clashing directory name into a unique one.
#
# $namehash holds, under 'index', 'subcollection' and 'languages', the
# original value each processed name was created from. $indexref, $subref
# and $langref are references to the three processed parts of the name
# for $index. The first part whose recorded original differs (string
# comparison) from the corresponding field of $index is moved on to its
# next version in place; at most one part is changed per call.
# Returns the three parts joined together.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # find the first part that was created from a different original
    my $clashing;
    if ($namehash->{'index'}->{$$indexref} ne "$level:$gran") {
        $clashing = $indexref;
    }
    elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $clashing = $subref;
    }
    elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $clashing = $langref;
    }

    $self->get_next_version ($clashing) if defined $clashing;

    return join ("", $$indexref, $$subref, $$langref);
}
474
# Moves the name referenced by $nameref on to its next version, in place.
#
#   - a name ending in two digits has that two-digit number incremented
#     ("d09" -> "d10", "d99" -> "d100"),
#   - a name ending in a single digit has that digit incremented
#     ("d0" -> "d1", "d9" -> "d10"),
#   - any other name has its last character replaced by 0 ("dt" -> "d0").
#
# The name is always changed; callers loop on this until they find an
# unused name, so a call that left the name alone would never terminate.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # string increment, so a leading zero is kept ("01" -> "02")
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # only one trailing digit here, so only that digit can be replaced;
        # 9 rolls over to 10
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
490
# Builds a single index.
#
# $index is an index description of the form
# "level:fields[:subcollection[:languages]]"; its directory name is looked
# up in $self->{'index_mapping'}.
#
# The documents are sent through the plugins twice: first piped to
# "mg_passes -N1" to create the index dictionary, then (after
# mg_perf_hash_build) piped to "mg_passes -N2" to invert the text.
# Afterwards mg_weights_build, mg_invf_dict and mg_stem_idx are run and
# files with suffixes not listed in %wanted_index_files are deleted from
# the index directory. In debug mode none of the programs are run and the
# buildproc output goes to STDOUT.
#
# If the index dictionary (.id file) was not produced, the index is
# recorded in $self->{'notbuilt'} and the sub returns early. Dies if one
# of the mg executables is missing or a pipe cannot be opened. With the
# gli flag set, the Stage opened by build_indexes is closed on every exit
# path.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat ($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat ($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    # maxnumeric from collect.cfg is only used if it is a plain number
    my $maxnumeric = 4;
    if (defined ($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($level, $fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    # this puts a separate Language/en entry in for each language in the list
    # is this what we want?
    # should we just have one entry with Language/en,es/ ??

    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $lang (@languages) {
        # a leading "!" negates the language expression
        my $not = 0;
        if ($lang =~ s/^\!//) {
            $not = 1;
        }
        if ($not) {
            push (@$indexexparr, "!Language/$lang/");
        } else {
            push (@$indexexparr, "Language/$lang/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text (1);

    $self->{'buildproc'}->reset ();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats ();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
        $self->{'notbuilt'}->{$index} = 1;
        # the Stage for this index was opened by build_indexes and has to
        # be closed here as well, like on every other way out of this sub
        print STDERR "</Stage>\n" if $self->{'gli'};
        return;
    }
    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset ();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mg_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateStemmedDic'/>\n" if $self->{'gli'};
        if (!-e "$mg_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");


        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mg_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir (DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
690
# Creates the info database and processes associated files.
#
# The records are piped to txt2db, which writes
# <build_dir>/text/<collection>.ldb or .bdb (.ldb when
# util::is_little_endian returns true); in debug mode they are printed to
# STDOUT instead. The output consists of the [collection] record built
# from the collectionmeta entries of collect.cfg, whatever the buildproc
# writes in 'infodb' mode while the documents are read, the classification
# information and a [browselist] record listing all documents.
#
# Dies if txt2db is missing or the pipe cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat ($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat ($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian () ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s@/@\\@g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat ($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # work out where the records go
    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    }
    else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text (1);
    $buildproc->reset ();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

        # the index mapping is needed for the ".index" style entries
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%$collectionmeta)) {
            # names starting with "." are written out under the mapped
            # name of the index (etc.) they refer to, all other names
            # are used as they are
            my $cmetamap = $cmeta;
            (my $stripped = $cmeta) =~ s/^\.//;
            if ($stripped ne $cmeta) {
                if (!defined $self->{'index_mapping'}->{$stripped}) {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$stripped' - ignored\n";
                    next; #ignore this one
                }
                $cmetamap = $self->{'index_mapping'}->{$stripped};
            }

            my $values = $collectionmeta->{$cmeta};
            my $metadata_entry = "";
            my $default = "";
            my $defaultfound = 0;
            my $first = 1;

            #iterate through the languages
            foreach my $lang (keys (%$values)) {
                if ($first) {
                    #set the default default to the first entry
                    $default = $values->{$lang};
                    $first = 0;
                }
                if ($lang =~ /default/) {
                    #the default entry goes first
                    $defaultfound = 1;
                    $metadata_entry = "<$cmetamap>" . $values->{'default'} . "\n" . $metadata_entry;
                    next;
                }
                my ($l) = $lang =~ /^\[l=(\w*)\]$/;
                if ($l) {
                    $metadata_entry .= "<$cmetamap:$l>" . $values->{$lang} . "\n";
                }
            }

            #if we haven't found a default, put one in
            $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry
                unless $defaultfound;

            #write the entry to the file
            print $handle $metadata_entry;
        }

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'},
                                     $self->{'gli'});

    #output doclist
    my @doclist = $buildproc->get_doc_list ();
    my $docs = join (";", @doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar (@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    close ($handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
823
# Empty hook: does nothing here apart from taking the object off the
# argument list.
sub collect_specific {
    my $self = shift;
}
827
# Writes <build_dir>/build.cfg.
#
# The file records the build date, the number of documents and bytes
# reported by the buildproc, the word and section counts parsed from the
# output of mgstat (if it can be run), the index, subcollection and
# language maps, the list of indexes that were not built and the
# maxnumeric setting.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # collected in a lexical hash reference so that nothing is left over
    # from an earlier call
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs ();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes ();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat ($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    my $pipein;
    if (!-e "$mgstat_exe" || !open ($pipein, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        while (defined (my $line = <$pipein>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                $build_cfg->{'numwords'} = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                $build_cfg->{'numsections'} = $1;
            }
        }
        close ($pipein);
    }

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # the indexes that were skipped or could not be created
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    # maxnumeric from collect.cfg is only used if it is a plain number
    $build_cfg->{'maxnumeric'} = 4;
    if (defined ($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $build_cfg->{'maxnumeric'} = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    # write out the build information
    &cfgread::write_cfg_file ("$self->{'build_dir'}/build.cfg", $build_cfg,
                              '^(builddate|numdocs|numbytes|numwords|numsections|maxnumeric)$',
                              '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

    print STDERR "</Stage>\n" if $self->{'gli'};
}
909
# Empty hook: does nothing here apart from taking the object off the
# argument list.
sub deinit {
    my $self = shift;
}
913
# Prints the byte counts reported by the buildproc for the pass that has
# just finished to the output handle. If fewer than 50 bytes were
# processed (and text was meant to be indexed or stored) a warning is
# printed as well, and reported on STDERR when the gli flag is set.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text ();
    my $index = $buildproc->get_index ();
    my $num_bytes = $buildproc->get_num_bytes ();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes ();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # nothing more to say unless there was (almost) no text where some
    # was expected
    my $text_expected = ($indexing_text || !$self->{'no_text'});
    return unless $num_processed_bytes < 50 && $text_expected;

    print $outhandle "***************\n";
    if ($indexing_text) {
        print $outhandle "WARNING: There is very little or no text to process for $index\n";
    }
    else {
        print $outhandle "WARNING: There is very little or no text to compress\n";
    }
    print $outhandle "         Was this your intention?\n";
    print $outhandle "***************\n";
    print STDERR "<Warning name='LittleOrNoText'>\n" if $self->{'gli'};
}
943
9441;
Note: See TracBrowser for help on using the repository browser.