source: trunk/gsdl/perllib/mgbuilder.pm@ 2576

Last change on this file since 2576 was 2506, checked in by dmm9, 23 years ago

added writing of collection document list to db (OID browselist)

  • Property svn:keywords set to Author Date Id Revision
File size: 27.0 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Switch autoflush on for STDOUT and STDERR while the module is in use so
# that output from mg does not get out of sync with output from the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush off again when the interpreter shuts down.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
46
# Value handed to the -b option of every mg_passes invocation in this module
# (the comments at those call sites describe it as a 12 meg maximum
# document size).
our $maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built.
# build_index deletes every file in an index directory whose suffix is not
# one of these keys.
our %wanted_index_files =
    map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Constructor.
#
# Positional arguments:
#   $class              - class to bless the new object into
#   $collection         - collection name (used for file names and to look
#                         for a collection specific buildproc)
#   $source_dir         - directory handed to the plugins as the source
#   $build_dir          - directory the build is written to
#   $verbosity          - progress messages are printed when this is >= 1
#   $maxdocs            - passed through to plugin::begin / plugin::read
#   $debug              - when true, output goes to STDOUT instead of being
#                         piped into the mg / txt2db executables
#   $keepold            - when true, init() leaves an existing build alone
#   $allclassifications - passed through to classify::output_classify_info
#   $outhandle          - handle for messages (defaults to STDERR)
#   $no_text            - when true, text is not stored while compressing
#                         (defaults to 0)
#
# Reads $ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg, expands the index list by
# subcollection and language, loads the plugins and classifiers, and creates
# the document processor.  Dies if collect.cfg is missing, if no plugins
# could be loaded, or if the document processor cannot be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    # the handle is stored by name and used symbolically by print
    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>[]       # indexes not built
                     }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes: every index is repeated once for
    # each indexsubcollections entry, as "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index is repeated once for each
    # language
    if (defined $cfg->{'languages'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($cfg->{'indexsubcollections'})) {
                    push (@{$cfg->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$cfg->{'indexes'}}, "${index}::$language");
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence is kept, so the configured order is preserved)
    my %seen = ();
    my @configured = @{$cfg->{'indexes'}};
    $cfg->{'indexes'} = [];
    foreach my $index (@configured) {
        next if defined $seen{$index};
        push (@{$cfg->{'indexes'}}, $index);
        $seen{$index} = 1;
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $field (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$field} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # call the constructor through the class name held in $buildproctype;
    # this avoids evaluating a string that has the collection name
    # interpolated into it.  Any exception propagates to the caller.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
178
# Prepares the build directory.  Unless the builder is in debug mode or was
# asked to keep the old build, the existing build directory is removed and
# recreated together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    # debug runs and keepold builds leave whatever is already there
    return if $self->{'debug'} || $self->{'keepold'};

    # remove any old builds
    my $build_dir = $self->{'build_dir'};
    &util::rm_r ($build_dir);
    &util::mk_all_dir ($build_dir);

    # make the text directory
    &util::mk_all_dir ("$self->{'build_dir'}/text");
}
192
# Creates the compressed text for the collection.
#
# $textindex is handed to the document processor's set_index.
#
# The source documents are read twice: once piped into "mg_passes -T1" to
# collect text statistics, and - after mg_compression_dict has built the
# compression dictionary - once piped into "mg_passes -T2" to compress the
# text.  In debug mode nothing is piped to mg; the document processor
# writes to STDOUT instead and the dictionary step is skipped.
# Dies if a required executable is missing or a pipe cannot be opened.
sub compress_text {
    my ($self, $textindex) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat ($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe =
        &util::filename_cat ($exedir, "mg_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, "text"));
    my $fulltextprefix =
        &util::filename_cat ($self->{'build_dir'}, "text/$self->{'collection'}");

    # windows wants backslashes in the prefix; elsewhere mg gets "-d /"
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # -b $maxdocsize sets the maximum document size to be 12 meg
    my $passes_cmd = "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    # handles are passed around by name and used symbolically
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        unless (-e $mg_passes_exe && open (PIPEOUT, "$passes_cmd -T1 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->reset();

    &plugin::begin ($self->{'pluginfo'}, $self->{'source_dir'},
                    $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end ($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    unless ($self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        unless (-e $mg_compression_dict_exe) {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the (named) pipe for the second pass
        unless (-e $mg_passes_exe && open ($handle, "$passes_cmd -T2 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    # compress the text
    $buildproc->reset();
    print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
278
# Decides whether an index should be built.
#
# Each entry of the collection's "dontbuild" configuration is used as a
# regular expression that must match the whole of $index.  On the first
# match the index's directory name (from index_mapping) is recorded in the
# "notbuilt" list and 0 is returned.  Returns 1 when nothing matches or
# when no "dontbuild" list is configured.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined $patterns;

    foreach my $pattern (@$patterns) {
        next unless $index =~ /^$pattern$/;
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
294
# Builds the collection's indexes.
#
# When $indexname contains at least one word character only that index is
# built; otherwise every index listed in the collection configuration is.
# The index-to-directory mapping is (re)created first and stored in
# $self->{'index_mapping'}.  Indexes rejected by want_built are reported
# and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [$indexname];
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built ($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        $self->build_index ($index);
    }
}
322
# creates directory names for each of the index descriptions
#
# $indexes is a reference to a list of index descriptions of the form
# "level:fields[:subcollection[:language]]".
#
# Returns a reference to a hash containing
#   - one entry per index description, giving its directory name
#   - 'indexmap', 'subcollectionmap' and 'languagemap': hashes from
#     "level:fields" / subcollection / language to their processed names
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder':
#     the keys of those maps in the order they were first seen
#   - top-level entries for each "level:fields", subcollection and
#     language, also giving the processed name
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # processed name -> original name, kept separately for each part of
    # the index description.  Each part needs its own hash: initialising
    # these to '' made all three resolve (as symbolic references) to one
    # shared global hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique; make_unique alters
        # $pindex, $psub or $plang in place and returns the new directory name
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used so far for the collision checks
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
394
# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component (or simply its first two characters when no
# consonant follows) - otherwise it will contain the first
# character of each of the first two components.
# an undefined field, or one without any word character, gives "".
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split (/,/, $field);

    if (scalar (@components) < 2) {
        # single component: first character plus the next consonant
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        unless (defined $first && defined $second) {
            # no consonant after the first character
            ($first, $second) = $field =~ /^(.)(.)/;
        }
        return "$first$second";
    }

    # several components: only the first two are used
    my @leading = @components[0, 1];
    foreach my $component (@leading) {
        $component =~ s/^(.).*$/$1/;
    }
    return join ("", @leading);
}
417
# Alters one part of a colliding directory name and returns the new name.
#
# $namehash maps, for each of 'index', 'subcollection' and 'languages',
# processed names to the original names they were assigned to.  $index is
# the full index description; $indexref, $subref and $langref are
# references to the processed index, subcollection and language names.
#
# The parts are examined in that order, and the first one whose processed
# name is not registered to this index's original name is advanced with
# get_next_version (modifying the referenced scalar).  At most one part is
# changed per call.  Returns the three parts joined together.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    my @parts = (['index',         $indexref, "$level:$gran"],
                 ['subcollection', $subref,   $subcollection],
                 ['languages',     $langref,  $languages]);

    foreach my $part (@parts) {
        my ($kind, $nameref, $original) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $original) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
432
# Advances the name referenced by $nameref to its next version, in place.
#
#   - a name ending in two digits has that number incremented
#   - a name ending in one digit has that digit incremented; "9" becomes
#     "10", which makes the name one character longer
#   - any other name has its last character replaced by "0"
#
# The "9" case previously tried to replace two trailing digits, which can
# never match in the single-digit branch, so the name was left unchanged
# and the caller's collision loop could not terminate.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        if ($num == 9) {$$nameref =~ s/\d$/10/;}
        else {$num ++; $$nameref =~ s/\d$/$num/;}
    } else {
        $$nameref =~ s/.$/0/;
    }
}
448
# Builds a single index.
#
# $index is an index description of the form
# "level:fields[:subcollection[:language]]"; its directory name is taken
# from $self->{'index_mapping'}.
#
# The source documents are read twice: once piped into "mg_passes -N1" to
# create the index dictionary, and - after mg_perf_hash_build has run -
# once piped into "mg_passes -N2" to invert the text.  The weights file,
# the on-disk stemmed dictionary and the stem indexes are then created and
# files with unwanted suffixes are removed from the index directory.
# In debug mode the document processor writes to STDOUT and none of the mg
# executables are run.  Dies if a required executable is missing, a pipe
# cannot be opened or the index directory cannot be read.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat ($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat ($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = ($index =~ /^paragraph/i) ? 3 : 2;

    # collect the index expressions for any subcollections and
    # languages this index belongs to
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($level, $fields, $subcollection, $language) = split (":", $index);

    my @subcollections = defined $subcollection ? split (/,/, $subcollection) : ();
    foreach my $name (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$name};
        push (@$indexexparr, $expression) if defined $expression;
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index.  A leading "!" negates the language.
    my @languages = defined $language ? split (/,/, $language) : ();
    foreach my $wanted (@languages) {
        my $negation = ($wanted =~ s/^\!//) ? "!" : "";
        if (grep { $_ eq $wanted } @{$self->{'collect_cfg'}->{'languages'}}) {
            push (@$indexexparr, "${negation}Language/$wanted/");
        }
    }

    # command shared by both mg_passes runs
    my $passes_cmd = "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
        "-$index_level";

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);

    # handles are passed around by name and used symbolically
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        unless (-e $mg_passes_exe &&
                open (PIPEOUT, "$passes_cmd -m 32 -s 0 -G -t 10 -N1 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text (1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    unless ($self->{'debug'}) {
        # create the perfect hash function
        unless (-e $mg_perf_hash_build_exe) {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the (named) pipe for the second pass
        unless (-e $mg_passes_exe &&
                open ($handle, "$passes_cmd -c 3 -G -t 10 -N2 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    # everything below needs the mg executables
    return if $self->{'debug'};

    close ($handle);

    # create the weights file
    print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
    unless (-e $mg_weights_build_exe) {
        die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
    }
    system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

    # create 'on-disk' stemmed dictionary
    print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
    unless (-e $mg_invf_dict_exe) {
        die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
    }
    system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

    # creates stem index files for the various stemming methods
    print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
    unless (-e $mg_stem_idx_exe) {
        die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
    }
    foreach my $method (1, 2, 3) {
        system ("mg_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
    }

    # remove unwanted files: anything with a suffix that is not listed
    # in %wanted_index_files (files without a suffix are kept)
    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
    opendir (DIR, $tmpdir) || die
        "mgbuilder::build_index - couldn't read directory $tmpdir\n";
    foreach my $file (readdir (DIR)) {
        next if $file =~ /^\./;
        my ($suffix) = $file =~ /\.([^\.]+)$/;
        next unless defined $suffix;
        next if defined $wanted_index_files{$suffix};

        # delete it!
        print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
        &util::rm (&util::filename_cat ($tmpdir, $file));
    }
    closedir (DIR);
}
621
# Creates the collection's info database and processes associated files.
#
# The text written here is piped into txt2db (or sent to STDOUT in debug
# mode) and consists of
#   - a [collection] record built from the "collectionmeta" configuration,
#     if there is any
#   - whatever the document processor writes in 'infodb' mode while the
#     plugins read the source documents
#   - the classification information from the classifiers
#   - a [browselist] record listing the documents returned by the
#     document processor's get_doc_list
# Dies if txt2db is missing or the pipe to it cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $separator = "\n" . ('-' x 70) . "\n";

    my $textdir = &util::filename_cat ($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat ($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name; the extension depends on the byte order of the machine
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat ($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    # (handles are passed around by name and used symbolically)
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        unless (-e $txt2db_exe && open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text (1);
    $buildproc->reset();

    # output the collection level metadata
    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    if (defined $collectionmeta) {

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $key (keys (%$collectionmeta)) {
            # keys starting with "." name an index, subcollection or
            # language and are written under its mapped name
            my $name = $key;
            if ($name =~ s/^\.//) {
                my $mapped = $self->{'index_mapping'}->{$name};
                if (defined $mapped) {
                    print $handle "<$mapped>" . $collectionmeta->{$key} . "\n";
                }
                else {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$name' - ignored\n";
                }
            } else {
                print $handle "<$key>$collectionmeta->{$key}\n";
            }
        }
        print $handle $separator;
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    # output doclist
    my @doclist = $buildproc->get_doc_list();
    my $numdocs = scalar (@doclist);
    my $docs = join (";", @doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . $numdocs . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle $separator;

    close ($handle) if !$self->{'debug'};
}
717
# Hook for collection specific processing; this implementation does
# nothing beyond taking the object off the argument list.
sub collect_specific {
    my $self = shift;
}
721
# Writes build.cfg into the build directory.
#
# The file records the build date, the document and byte counts reported
# by the document processor, the word and section counts reported by
# mgstat (when it can be run), the index / subcollection / language name
# mappings and the list of indexes that were not built.
#
# The configuration is collected in a hash that is private to each call.
# It used to be accumulated in an autovivified package global, so optional
# entries from an earlier call in the same process could leak into a later
# build.cfg.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    my $outhandle = $self->{'outhandle'};
    my $index_mapping = $self->{'index_mapping'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    my $pipe;
    if (!-e "$mgstat_exe" || !open ($pipe, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        # the extra stats are optional, so this is only a warning
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <$pipe>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                $build_cfg->{'numwords'} = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                $build_cfg->{'numsections'} = $1;
            }
        }
        close ($pipe);
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$index_mapping->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" . $index_mapping->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only written when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$index_mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $index_mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$index_mapping->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $index_mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
788
# Counterpart of init; this implementation has nothing to clean up and
# only takes the object off the argument list.
sub deinit {
    my $self = shift;
}
792
# Prints statistics for the pass the document processor has just finished:
# the total number of bytes in the collection and the number of bytes that
# were processed for the current index.  When fewer than 50 bytes were
# processed a warning is added, unless the pass was a text compression
# pass with text storage switched off (no_text).
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # nothing more to say unless suspiciously little text was processed
    return unless $num_processed_bytes < 50;
    return unless $indexing_text || !$self->{'no_text'};

    my $warning = $indexing_text
        ? "WARNING: There is very little or no text to process for $index\n"
        : "WARNING: There is very little or no text to compress\n";
    print $outhandle "***************\n";
    print $outhandle $warning;
    print $outhandle " Was this your intention?\n";
    print $outhandle "***************\n";
}
821
8221;
Note: See TracBrowser for help on using the repository browser.