root/gsdl/trunk/perllib/mgbuilder.pm @ 17110

Revision 17110, 18.7 KB (checked in by kjdon, 11 years ago)

Changed the way CJK separation is done: it is no longer done in the plugins, but is now an indexoption. cnseg is called from the filter_text method, and generate_index_options sets up the field in buildproc.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use basebuilder;
29use plugin;
30use strict; no strict 'refs';
31use util;
32
33
# Set up inheritance at compile time: mgbuilder derives from basebuilder.
BEGIN {
    @mgbuilder::ISA = qw(basebuilder);
}
37
38
# Suffixes of the index files that are kept once an index has been built;
# build_index deletes every other suffixed file in the index directory.
my %wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);

# Maximum document size handed to mg_passes via -b (defined by basebuilder).
my $maxdocsize = $basebuilder::maxdocsize;
51
52
# Constructor.  Creates the object with basebuilder's constructor (all
# arguments are passed through unchanged), blesses it into the requested
# class and records "mg" as the build type.
#
# Returns the new builder object.
sub new {
    my $class = shift(@_);

    # Explicit class-method call instead of the indirect-object form
    # ("new basebuilder (...)"), whose parse depends on which sub and
    # package names the compiler has already seen.
    my $self = basebuilder->new(@_);
    bless $self, $class;

    $self->{'buildtype'} = "mg";
    return $self;
}
62
# Returns the string "mgbuildproc", the default buildproc name for this
# builder.
sub default_buildproc {
    my ($self) = @_;

    return 'mgbuildproc';
}
68
# Makes sure the collection configuration has a non-empty list of indexes:
# a missing list is created, and an empty list is given the single entry
# "dummy:text".
sub generate_index_list {
    my ($self) = @_;

    $self->{'collect_cfg'}->{'indexes'} = []
        unless defined($self->{'collect_cfg'}->{'indexes'});

    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    if (!@$indexes) {
        # no indexes have been specified so we'll build a "dummy:text" index
        push (@$indexes, "dummy:text");
    }
}
81
# Works out the casefold / stem settings for this build, after letting the
# parent class process its own index options.
#
# With no 'indexoptions' in the collection configuration both casefold and
# stem are switched on; otherwise each is switched on only if an option
# matching /casefold/ or /stem/ is listed.  The result is also recorded in
# $self->{'stemindexes'} as 1 (casefold) + 2 (stem).
sub generate_index_options {
    my ($self) = @_;
    $self->SUPER::generate_index_options();

    $self->{'accentfold'} = 0; #not yet implemented for mg

    my ($casefold, $stem) = (0, 0);
    my $options = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined($options)) {
        foreach my $option (@$options) {
            # stem is tested first; an option sets at most one of the flags
            if ($option =~ /stem/) {
                $stem = 1;
            }
            elsif ($option =~ /casefold/) {
                $casefold = 1;
            }
        }
    }
    else {
        # just use default options
        ($casefold, $stem) = (1, 1);
    }
    $self->{'casefold'} = $casefold;
    $self->{'stem'} = $stem;

    # now we record this for the build cfg
    $self->{'stemindexes'} = $self->{'casefold'} ? 1 : 0;
    if ($self->{'stem'}) {
        $self->{'stemindexes'} += 2;
    }
}
116
# Builds the compressed text for the collection in two passes over the
# documents.
#
# Pass 1 pipes every document through "mg_passes -T1" to collect text
# statistics, mg_compression_dict then creates the compression dictionary,
# and pass 2 pipes the documents through "mg_passes -T2".  In debug mode no
# external program is started and the document stream goes to STDOUT.
#
# Arguments:
#   $textindex - index specification passed to the buildproc's set_index
#
# Dies if a required MG executable is missing from the Greenstone bin
# directory or if mg_passes cannot be started.  Progress messages go to
# $self->{'outhandle'}; when $self->{'gli'} is set, XML progress markers are
# also written to STDERR.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # locate the MG executables
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &util::filename_cat("text",$collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size
    print $outhandle "\n    collecting text statistics\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        # the existence test uses the full path, but the command is started
        # by its bare name
        if (!-e "$mg_passes_exe" ||
            !open($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
            # self-closing element, like every other FatalError in this file
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    # set up the document processor: text mode, not indexing
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);

    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    }
    else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->reset();

    # first pass over the documents
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }
    else {
        # in debug mode $handle is still STDOUT from the first pass
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    # second pass over the documents
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
222
223
# Creates directory names for each of the index descriptions.
#
# Arguments:
#   $indexes - reference to a list of index descriptions of the form
#              "level:fields[:subcollection[:languages]]"
#
# Returns a reference to a hash holding
#   - each full index description mapped to its (unique) directory name,
#   - 'indexmap', 'subcollectionmap' and 'languagemap' hashes mapping
#     "level:fields", subcollection and language specs to their processed
#     names, together with 'indexmaporder', 'subcollectionmaporder' and
#     'languagemaporder' lists recording first-seen order,
#   - the same "level:fields", subcollection and language entries at the top
#     level of the hash.
#
# An index description that appears more than once is only processed the
# first time it is seen.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    # pnames records which spec owns each processed name (used by make_unique)
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});
    my %seen = ();

    foreach my $index (@$indexes) {
        # A repeated description already has its directory.  Without this
        # guard its name would collide with itself, make_unique would find
        # nothing to change, and the uniqueness loop below would never end.
        next if $seen{$index}++;

        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used so later collisions can be resolved
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$level:$gran";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
294
295
# Resolves a directory-name collision for one index description.
#
# Arguments:
#   $namehash - hash of 'index', 'subcollection' and 'languages' hashes, each
#               mapping a processed name to the spec that currently owns it
#   $index    - the full index description ("level:fields[:sub[:langs]]")
#   $indexref, $subref, $langref - references to the processed index,
#               subcollection and language names
#
# The first of the three processed names that is owned by a different spec
# than the one in $index is advanced with get_next_version (through its
# reference); at most one name is changed per call.
#
# Returns the concatenation of the three (possibly updated) names.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # [ namehash section, reference to processed name, spec it should belong to ]
    my @candidates = (
        [ 'index',         $indexref, "$level:$gran" ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages ],
    );

    foreach my $candidate (@candidates) {
        my ($section, $nameref, $owner) = @$candidate;
        if ($namehash->{$section}->{$$nameref} ne $owner) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return join ('', $$indexref, $$subref, $$langref);
}
310
# Builds one MG index.
#
# Steps: pipe all documents through "mg_passes -N1" to create the index
# dictionary, build the perfect hash function, pipe the documents through
# "mg_passes -N2" to invert the text, then create the weights file, the
# on-disk stemmed dictionary and the three stem indexes, and finally delete
# every file in the index directory whose suffix is not in
# %wanted_index_files.  In debug mode no external program is run and the
# document stream is written to STDOUT.
#
# Arguments:
#   $index - index description "level:fields[:subcollections[:languages]]";
#            its directory is looked up in $self->{'index_mapping'}
#
# If the index dictionary (.id file) was not produced, the index is recorded
# in $self->{'notbuilt'} and the function returns early.  Dies if a required
# MG executable is missing, mg_passes cannot be started, or the index
# directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    # there may be subcollection info, and language info.
    my ($level, $fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    # each language spec is passed on unchanged, including any leading "!"
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        # the existence test uses the full path, but the command is started
        # by its bare name
        if (!-e "$mg_passes_exe" ||
            !open($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                  "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole build doesn't fail.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
        # every other way out of this function closes the GLI stage, so
        # close it here too (the stage is presumably opened by the caller)
        print STDERR "</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }
    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mg_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateStemmedDic'/>\n" if $self->{'gli'};
        if (!-e "$mg_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mg_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        # currently mg won't work if we don't generate all the stem indexes
        # so we generate them whatever, but don't advertise the fact
        #if ($self->{'casefold'}) {
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        #}
        #if ($self->{'stem'}) {
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        #}
        #if ($self->{'casefold'} && $self->{'stem'}) {
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
        #}

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir)
            or die "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
519
# Adds statistics reported by mgstat to the build configuration.
#
# Arguments:
#   $build_cfg - hash reference; 'numwords' is set from mgstat's
#                "Words in collection [dict]" line and 'numsections' from its
#                "Documents" line, when those lines are present
#
# If the mgstat executable is missing or the pipe cannot be opened, a warning
# is printed to $self->{'outhandle'} and $build_cfg is left untouched.
sub build_cfg_extra {
    my $self = shift(@_);
    my ($build_cfg) = @_;

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $input_file = &util::filename_cat ("text", $collect_tail);

    # lexical handle, so no package-wide bareword filehandle is clobbered
    my $pipein;
    if (!-e "$mgstat_exe" ||
        !open ($pipein, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    }
    else {
        while (defined (my $line = <$pipein>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                $build_cfg->{'numwords'} = $1;
            }
            elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                $build_cfg->{'numsections'} = $1;
            }
        }
        close ($pipein);
    }
}
546
5471;
548
549
550
Note: See TracBrowser for help on using the browser.