source: gsdl/trunk/perllib/mgbuilder.pm@ 19617

Last change on this file since 19617 was 17110, checked in by kjdon, 16 years ago

Changed the way CJK separation is done: it is no longer done in the plugins but is now an index option. cnseg is called from the filter_text method, and generate_index_options sets up the field in buildproc.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.7 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use basebuilder;
29use plugin;
30use strict; no strict 'refs';
31use util;
32
33
# mgbuilder is a basebuilder subclass; set up inheritance at compile time.
BEGIN {
    @mgbuilder::ISA = ('basebuilder');
}


# Suffixes of the index files that build_index keeps; any other file with a
# suffix found in an index directory is deleted once the index is built.
my %wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);

# Maximum document size handed to mg_passes (-b), shared with basebuilder.
my $maxdocsize = $basebuilder::maxdocsize;
51
52
# Constructor. Builds the underlying basebuilder object from the caller's
# arguments, re-blesses it into the requested class and marks its build
# type as "mg".
sub new {
    my $class = shift(@_);

    my $self = bless basebuilder->new(@_), $class;
    $self->{'buildtype'} = "mg";

    return $self;
}
62
# Returns the name of the default build processor for this builder.
sub default_buildproc {
    my ($self) = @_;

    return "mgbuildproc";
}
68
# Makes sure the collection configuration has a non-empty list of indexes.
# A missing list is created, and an empty one is given the single
# placeholder index "dummy:text".
sub generate_index_list {
    my ($self) = @_;

    $self->{'collect_cfg'}->{'indexes'} = []
        unless defined($self->{'collect_cfg'}->{'indexes'});

    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (scalar(@$indexes) == 0) {
        # no indexes have been specified so we'll build a "dummy:text" index
        push (@$indexes, "dummy:text");
    }

}
81
# Works out which term-folding options apply to this build and records them
# on the builder as 'casefold', 'stem', 'accentfold' and 'stemindexes'.
#
# With no 'indexoptions' in the collection configuration both casefolding
# and stemming are switched on. Otherwise each option is tested for "stem"
# first and for "casefold" only if that did not match. 'stemindexes' is the
# value recorded for the build cfg: 1 for casefold plus 2 for stem.
sub generate_index_options {
    my ($self) = @_;
    $self->SUPER::generate_index_options();

    my ($casefold, $stem) = (0, 0);
    my $options = $self->{'collect_cfg'}->{'indexoptions'};

    if (defined($options)) {
        foreach my $option (@$options) {
            if ($option =~ /stem/) {
                $stem = 1;
            }
            elsif ($option =~ /casefold/) {
                $casefold = 1;
            }
        }
    }
    else {
        # just use default options
        ($casefold, $stem) = (1, 1);
    }

    $self->{'casefold'} = $casefold;
    $self->{'stem'} = $stem;
    $self->{'accentfold'} = 0; #not yet implemented for mg

    # now we record this for the build cfg
    $self->{'stemindexes'} = ($casefold ? 1 : 0) + ($stem ? 2 : 0);
}
116
# Creates the compressed text for the collection, written with the prefix
# <build_dir>/text/<collection tail>.
#
# The documents are pushed through the plugins twice, each time into an
# mg_passes pipe: the first pass (-T1) collects text statistics,
# mg_compression_dict turns those into a compression dictionary, and the
# second pass (-T2) compresses the text. In debug mode no external program
# is started and the build processor writes to STDOUT instead.
#
# Arguments:
#   $textindex - index description passed on to the build processor's
#                set_index.
#
# Dies if the mg_passes or mg_compression_dict executable is missing from
# $GSDLHOME/bin/$GSDLOS, or if the mg_passes pipe cannot be opened. When
# running under GLI a <FatalError .../> element is written to STDERR first
# for the mg_passes failures.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    # Full paths are only used for the existence checks; the commands
    # themselves are run by bare program name.
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &util::filename_cat("text",$collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    # On Windows the prefix is given with backslashes; elsewhere the mg
    # tools are additionally passed "-d /".
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size
    print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    # The same $handle variable is reused for the second pass so that the
    # handle already given to the build processor stays the one in use.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mg_passes_exe" ||
            !open($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
            # The element must be self-closing, like every other FatalError
            # this module emits, so that the GLI output stays well formed.
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);


    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->reset();

    # first pass over the documents
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});


    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text (second pass over the documents)
    print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
222
223
# Creates directory names for each of the index descriptions.
#
# Arguments:
#   $indexes - reference to a list of index descriptions of the form
#              level:fields[:subcollections[:languages]]
#
# Returns a reference to a hash holding
#   - each full index description mapped to its directory name,
#   - 'indexmap', 'subcollectionmap' and 'languagemap', which map the
#     individual parts to their processed names, with the matching
#     '...maporder' lists recording first-seen order,
#   - each "level:fields", subcollection and language string mapped
#     directly to its processed name.
#
# A description that occurs more than once in $indexes is handled only the
# first time it is seen.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});
    my %seen = ();
    foreach my $index (@$indexes) {
        # A repeated description collides with its own earlier directory
        # name, and make_unique has nothing it can rename in that case, so
        # the uniqueness loop below would never finish. Its mapping already
        # exists, so skip it.
        next if $seen{$index}++;

        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique; make_unique updates
        # $pindex, $psub and $plang through the references it is given
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember which names are now taken, and by what
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$level:$gran";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
294
295
# Tries to resolve a directory-name collision for one index description.
#
# Arguments:
#   $namehash - hash with 'index', 'subcollection' and 'languages' tables,
#               each mapping a processed name to the value that owns it
#   $index    - the full description, level:fields[:subcollections[:languages]]
#   $indexref, $subref, $langref - references to the three processed name
#               parts; at most one of them is modified in place
#
# The parts are examined in the order index, subcollection, languages. The
# first one whose current name is not owned by this description's own value
# is advanced with get_next_version, and no further part is touched.
# Returns the three parts concatenated.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    my @parts = (
        [ 'index',         $indexref, "$level:$gran" ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $part (@parts) {
        my ($table, $nameref, $owner) = @$part;
        if ($namehash->{$table}->{$$nameref} ne $owner) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return join ("", $$indexref, $$subref, $$langref);
}
310
# Builds a single MG index.
#
# Arguments:
#   $index - index description, level:fields[:subcollections[:languages]];
#            its directory is looked up in $self->{'index_mapping'}
#
# The documents are run through the plugins twice, each time into an
# mg_passes pipe (-N1 to create the index dictionary, -N2 to invert the
# text). After that the weights file, the on-disk stemmed dictionary and
# the three stem indexes are created, and files whose suffix is not in
# %wanted_index_files are removed from the index directory.
#
# If the index dictionary (<prefix>.id) does not exist after the first
# pass, the index is recorded in $self->{'notbuilt'} and the sub returns
# early. Dies if a required mg executable is missing from
# $GSDLHOME/bin/$GSDLOS or if an mg_passes pipe cannot be opened.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $collect_tail);

    # Full paths of the mg tools. They are only used for existence checks
    # and error messages; the commands are run by bare program name.
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ tr{/}{\\};
    }
    else {
        $osextra = " -d /";
        # so mg_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null" if ($outhandle ne "STDERR");
    }

    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = ($index =~ /^paragraph/i) ? 3 : 2;

    # there may be subcollection info, and language info.
    my (undef, undef, $subcollection, $language) = split (":", $index);

    # collect the index expressions of the subcollections that are
    # defined in the collection configuration
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $name (split (/,/, $subcollection)) {
            my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$name};
            push (@$indexexparr, $expression) if defined ($expression);
        }
    }

    # the languages wanted in this index are passed on exactly as written
    # in the description, including any leading "!"
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my $langarr = [];
    push (@$langarr, split (/,/, $language)) if (defined $language);

    # progress message for the user plus the matching marker for GLI
    my $announce = sub {
        my ($message, $gli_marker) = @_;
        print $outhandle $message if ($self->{'verbosity'} >= 1);
        print STDERR $gli_marker if $self->{'gli'};
    };

    # report a tool that cannot be run, then give up
    my $cannot_run = sub {
        my ($gli_marker, $program) = @_;
        print STDERR $gli_marker if $self->{'gli'};
        die "mgbuilder::build_index - couldn't run $program\n";
    };

    # Both mg_passes pipes are opened on this one variable, so the handle
    # given to the build processor before the first pass is also the one
    # the second pass writes to.
    my $handle;
    my $start_mg_passes = sub {
        my ($pass_options) = @_;
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   $pass_options)) {
            $cannot_run->("<FatalError name='NoRunMGPasses'/>\n</Stage>\n", $mg_passes_exe);
        }
    };

    # one pass of all documents through the plugins into the build processor
    my $read_documents = sub {
        $self->{'buildproc'}->reset();
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
    };

    # Build index dictionary. Uses verbatim stem method
    $announce->("\n creating index dictionary\n", "<Phase name='CreatingIndexDic'/>\n");
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        $start_mg_passes->("-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra");
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);

    $read_documents->();
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # Check that the index dictionary (.id file) was produced; if not,
    # stop building this index so the rest of the build can carry on.
    # NOTE(review): this early return does not print the closing </Stage>
    # that the normal exit below prints - confirm that is intended.
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            $cannot_run->("<FatalError name='NoRunMGHash'/>\n</Stage>\n", $mg_perf_hash_build_exe);
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        $start_mg_passes->("-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra");
    }

    # invert the text
    $announce->("\n inverting the text\n", "<Phase name='InvertingText'/>\n");
    $read_documents->();

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        $announce->("\n create the weights file\n", "<Phase name='CreateTheWeights'/>\n");
        if (!-e "$mg_weights_build_exe") {
            $cannot_run->("<FatalError name='NoRunMGWeights'/>\n</Stage>\n", $mg_weights_build_exe);
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        $announce->("\n creating 'on-disk' stemmed dictionary\n", "<Phase name='CreateStemmedDic'/>\n");
        if (!-e "$mg_invf_dict_exe") {
            $cannot_run->("<FatalError name='NoRunMGInvf'/>\n</Stage>\n", $mg_invf_dict_exe);
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        $announce->("\n creating stem indexes\n", "<Phase name='CreatingStemIndx'/>\n");
        if (!-e "$mg_stem_idx_exe") {
            $cannot_run->("<FatalError name='NoRunMGStem'/>\n</Stage>\n", $mg_stem_idx_exe);
        }
        # currently mg won't work if we don't generate all the stem indexes,
        # so all three (-s1, -s2, -s3) are generated regardless of the
        # 'casefold' and 'stem' settings, but the fact isn't advertised
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            # files without a suffix, and files we want, are kept
            next unless defined $suffix;
            next if defined $wanted_index_files{$suffix};

            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            &util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
519
# Adds statistics reported by mgstat to the build configuration.
#
# Arguments:
#   $build_cfg - hash reference; 'numwords' and 'numsections' are set from
#                the "Words in collection [dict]" and "Documents" lines of
#                the mgstat output
#
# If the mgstat executable is missing from $GSDLHOME/bin/$GSDLOS, or the
# pipe cannot be opened, a warning is printed to the output handle and
# $build_cfg is left untouched.
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $input_file = &util::filename_cat ("text", $collect_tail);

    my $pipe;
    if (!-e "$mgstat_exe" || !open ($pipe, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
        return;
    }

    while (defined (my $line = <$pipe>)) {
        if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
            $build_cfg->{'numwords'} = $1;
        }
        elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
            $build_cfg->{'numsections'} = $1;
        }
    }
    close ($pipe);
}
546
5471;
548
549
550
Note: See TracBrowser for help on using the repository browser.