root/gsdl/trunk/perllib/mgppbuilder.pm @ 15715

Revision 15715, 28.5 KB (checked in by mdewsnip, 12 years ago)

Added "use strict" and fixed various problems that it found.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGPPBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33
34
# Set up inheritance at compile time: mgppbuilder derives from basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
38
39
40
# Level lookup table holding two kinds of entry:
#   collection level name -> short level tag (passed to mgpp_passes via -J/-K
#                            and written to build.cfg)
#   short level tag       -> default display macro for that level
our %level_map;
foreach my $level_entry (['document',  'Doc',  '_textdocument_'],
                         ['section',   'Sec',  '_textsection_'],
                         ['paragraph', 'Para', '_textparagraph_']) {
    my ($long_name, $short_tag, $macro) = @$level_entry;
    $level_map{$long_name} = $short_tag;
    $level_map{$short_tag} = $macro;
}
47
# File suffixes that build_index regards as wanted when it scans the index
# directory after building; any other suffix is reported as unwanted.
our %wanted_index_files = map { ($_ => 1) }
    qw(td t tl ti idb ib1 ib2 ib3 ib4 ib5 ib6 ib7 i il w wa);
64
# Default index field map.
# TODO: let users supply their own entries through a file or the collection
# configuration.
our %static_indexfield_map = (
    # Each metadata name maps to its short index field code, and every short
    # code is itself entered with the value 1 so that it counts as taken.
    (map { ($_->[0] => $_->[1], $_->[1] => 1) } (
        ['Title',        'TI'],
        ['Subject',      'SU'],
        ['Creator',      'CR'],
        ['Organization', 'ORG'],
        ['Source',       'SO'],
        ['Howto',        'HT'],
        ['ItemTitle',    'IT'],
        ['ProgNumber',   'PN'],
        ['People',       'PE'],
        ['Coverage',     'CO'],
        ['allfields',    'ZZ'],
        ['text',         'TX'],
    )),
    # Query operators and the level names (Doc, Sec, Para) cannot be used as
    # field names, so they are reserved here as well.
    (map { ($_ => 1) } qw(AND OR NOT NEAR Doc Sec Para)),
);

# Local copy of basebuilder's maximum document size setting.
my $maxdocsize = $basebuilder::maxdocsize;
101
# Constructor. All arguments are handed to basebuilder's constructor and the
# resulting object is re-blessed into $class.
#
# On top of whatever basebuilder sets up, this records:
#   indexfieldmap - reference to the shared %static_indexfield_map
#   levels        - hash of lower-cased level names that are enabled
#   levelorder    - array ref of the same names, in configuration order
#   buildtype     - always "mgpp"
sub new {
    my $class = shift(@_);

    # direct method call rather than the ambiguous "new basebuilder (...)"
    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # must be an array ref: assigning an empty list to a scalar stores undef
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level ( @{$self->{'collect_cfg'}->{'levels'}} ) {
            # $level aliases the configuration entry, so this also lower-cases
            # the stored collect_cfg value (kept for backward compatibility)
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
128
# Collapse the configured indexes into a single combined index.
# The separate index specifications in collect_cfg->{'indexes'} are joined
# with ';' (plus a trailing ';') and the list is replaced by that one entry.
sub generate_index_list {
    my ($self) = @_;

    my @specified = @{$self->{'collect_cfg'}->{'indexes'}};

    my $combined = $self->{'collect_cfg'}->{'indexes'} = [];
    push (@$combined, join(';', @specified) . ";");
}
138
# Work out which index options (casefold, stem, accentfold) are enabled and
# record them on $self, together with the 'stemindexes' bit mask for the
# build configuration (casefold = 1, stem = 2, accentfold = 4).
# With no 'indexoptions' in the collection configuration all three are on.
sub generate_index_options {
    my ($self) = @_;

    # an option string enables the first of these names that it contains
    my @match_order = qw(stem casefold accentfold);

    my $options = $self->{'collect_cfg'}->{'indexoptions'};
    my $preset = defined($options) ? 0 : 1;
    foreach my $name (@match_order) {
        $self->{$name} = $preset;
    }

    if (defined($options)) {
        foreach my $option (@$options) {
            foreach my $name (@match_order) {
                if ($option =~ /$name/) {
                    $self->{$name} = 1;
                    last;
                }
            }
        }
    }

    # now we record this for the build cfg
    my %bit_for = ('casefold' => 1, 'stem' => 2, 'accentfold' => 4);
    $self->{'stemindexes'} = 0;
    foreach my $name (qw(casefold stem accentfold)) {
        $self->{'stemindexes'} += $bit_for{$name} if ($self->{$name});
    }
}
176
# Name of the document processor to use when none is configured.
sub default_buildproc {
    my ($self) = @_;

    return "mgppbuildproc";
}
182
# Create the compressed text for the collection.
#
# Arguments:
#   $textindex - index specification handed to the document processor
#
# Returns immediately when $self->{'no_text'} is set. Otherwise the documents
# are run through the plugins twice: once piped into "mgpp_passes -T1" to
# collect text statistics, then (after mgpp_compression_dict has built the
# dictionary) into "mgpp_passes -T2" to compress the text. In debug mode the
# document processor writes to STDOUT and no external program is run.
#
# Dies if a required mgpp executable is not present in the GSDL bin directory
# or if a pipe to mgpp_passes cannot be opened. When $self->{'gli'} is set,
# progress and error markers are printed to STDERR.
sub compress_text {

    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &util::filename_cat("text",$collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    # the two mgpp_passes invocations differ only in the pass option
    my $passes_command = sub {
        my ($pass) = @_;
        return "| mgpp_passes$exe  -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" $pass $osextra";
    };

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # (no maximum document size option is passed to mgpp_passes here)
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, $passes_command->("-T1"))) {
            # self-closed like every other FatalError marker in this module
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    # the database level is always section
    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if ($self->{'debug'}) {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }
    else {
        print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second (compression) pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, $passes_command->("-T2"))) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
301
302
# Extra step for index building: define the final field lists.
sub build_indexes_extra {
    my ($self) = @_;

    $self->make_final_field_list();
}
308
# Creates a directory name for each of the index descriptions.
#
# $indexes is an array ref of "fields:subcollection:languages" strings.
# Returns a hash ref that maps each full index description to its directory
# name, and that also carries 'indexmap', 'subcollectionmap' and 'languagemap'
# hashes with matching '...maporder' arrays.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = (
        'indexmaporder'         => [],
        'subcollectionmaporder' => [],
        'languagemaporder'      => [],
    );

    # dirnames is used to check for collisions. It starts off holding the
    # mandatory directory names.
    my %dirnames = ('text' => 'text', 'extra' => 'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and it is called 'idx'
        my $pindex = 'idx';

        # processed, lower-cased versions of the subcollection and language
        # parts, if there are any
        my $psub = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # check to be sure all index names are unique
        my $dirname = $pindex . $psub . $plang;
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping
        # thing - (the full index name (eg text:subcol:lang) is not used on
        # the query page) - these are used for collectionmeta later on
        unless (defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            $mapping{"$fields"} = $pindex unless (defined $mapping{"$fields"});
        }

        # subcollections and languages get identical treatment
        foreach my $part (['subcollection', $subcollection, $psub],
                          ['language',      $languages,     $plang]) {
            my ($kind, $long_name, $short_name) = @$part;
            next unless ($short_name =~ /\w/);
            next if (defined ($mapping{$kind . 'map'}{$long_name}));

            $mapping{$kind . 'map'}{$long_name} = $short_name;
            push (@{$mapping{$kind . 'maporder'}}, $long_name);
            $mapping{$long_name} = $short_name;
        }

        # remember what has been used, for later collision checks
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
378
# Resolve a directory name collision for $index.
#
# $namehash records which long name each processed index, subcollection and
# language name currently stands for. The first processed part (checked in
# that order) that is recorded against a different long name than this
# index's is advanced with get_next_version. $indexref, $subref and $langref
# are scalar refs that may be updated in place. Returns the concatenation of
# the three (possibly updated) parts.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        ['index',         $indexref, "$fields"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $long_name) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $long_name) {
            # only one part is advanced per call
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
393
394
# Build the mgpp index for one index specification.
#
# Arguments:
#   $index - "fields:subcollection:languages" index description; its
#            directory name is looked up in $self->{'index_mapping'}
#
# The documents are run through the plugins twice, piped into
# "mgpp_passes -I1" (index dictionary) and "mgpp_passes -I2" (inversion).
# Between and after the passes the perfect hash, weights, on-disk stemmed
# dictionary and stem index files are created with the other mgpp tools.
# In debug mode the document processor writes to STDOUT and no external
# program is run.
#
# If the index dictionary (.id file) is missing after the first pass, the
# index is recorded in $self->{'notbuilt'} and the sub returns early.
# Dies if a required mgpp executable is not present in the GSDL bin directory,
# if a pipe to mgpp_passes cannot be opened, or if the index directory cannot
# be read. When $self->{'gli'} is set, markers are printed to STDERR.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # the two mgpp_passes invocations differ only in the pass option
    my $passes_command = sub {
        my ($pass) = @_;
        return "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" $pass $osextra";
    };

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    # each language entry (including any leading '!' negation) is passed on
    # exactly as it was specified
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, $passes_command->("-I1"))) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, $passes_command->("-I2"))) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }

        # runs mgpp_stem_idx for one stem method (a bit mask: 1 = casefold,
        # 2 = stem, 4 = accentfold) and returns the raw system() status
        my $run_stem_idx = sub {
            my ($method) = @_;
            return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        };

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            my $status = $run_stem_idx->(4);
            # system() returns the wait status, so an exit code of 2 arrives
            # as 2 << 8; the raw comparison is kept for backward compatibility
            if ($status == 2 || ($status > 0 && ($status >> 8) == 2)) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} -= 4;
            }
        }
        my $fold_accents = ($accent_folding_enabled && $self->{'accentfold'});

        if ($self->{'casefold'}) {
            $run_stem_idx->(1);
            $run_stem_idx->(5) if ($fold_accents);
        }
        if ($self->{'stem'}) {
            $run_stem_idx->(2);
            $run_stem_idx->(6) if ($fold_accents);
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            $run_stem_idx->(3);
            $run_stem_idx->(7) if ($fold_accents);
        }

        # report unwanted files
        # NOTE(review): the actual removal is commented out, so the files are
        # only reported (at verbosity > 2), never deleted - confirm intent
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
625
626
# Add default display names for index fields, levels, subcollections and
# languages to $collection_infodb (a hash ref of key => [ value ]).
#
# A default is written only when the collection configuration has no
# collectionmeta entry for ".<name>". Index fields default to the metadata
# name (or the _query:..._ macros for allfields and text), levels default to
# their macro from %level_map, and subcollections and languages default to
# their own names.
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so (ie if infodb
    # called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # true when the user has supplied collectionmeta for the given key
    my $configured_meta = $self->{'collect_cfg'}->{'collectionmeta'};
    my $has_collmeta = sub {
        my ($key) = @_;
        return (defined $configured_meta && defined $configured_meta->{$key});
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the
    # metadata name
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # a field with no mapping would end up stored under the empty key
        next if !defined $shortfield;
        # entries whose value is 1 are short codes, not metadata names
        next if $shortfield eq 1;

        # don't output any that already have collection meta defined
        next if $has_collmeta->(".$longfield");

        if ($longfield eq "allfields") {
            $collection_infodb->{$shortfield} = [ "_query:textallfields_" ];
        } elsif ($longfield eq "text") {
            $collection_infodb->{$shortfield} = [ "_query:texttextonly_" ];
        } else {
            $collection_infodb->{$shortfield} = [ $longfield ];
        }
    }

    # now add the level names
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $collmeta = ".$level"; # based on the original specification
        # $level aliases the configuration entry, so this lower-cases the
        # stored value too (kept as is for backward compatibility)
        $level =~ tr/A-Z/a-z/;
        my $levelid = $level_map{$level}; # the actual value we used in the index
        # a level that is not in %level_map has nothing to output
        next if !defined $levelid;
        next if $has_collmeta->($collmeta);

        # use the default macro
        $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
    }

    # now add subcollection meta, then language meta
    foreach my $order_key ('subcollectionmaporder', 'languagemaporder') {
        foreach my $name (@{$self->{'index_mapping'}->{$order_key}}) {
            my $shortname = $self->{'index_mapping'}->{$name};
            next if $has_collmeta->(".$name");
            $collection_infodb->{$shortname} = [ $name ];
        }
    }
}
699
700
# Writes the "collection" entry to the info database: the metadata sets
# (prefixes) used in the collection, followed by the index related entries.
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    $self->get_collection_meta_indexes($collection_infodb);
    &dbutil::write_infodb_entry($infodb_handle, "collection", $collection_infodb);
}
712
713
# at the end of building, we have an indexfieldmap with all the mappings,
# plus some extras, and indexmap with any indexes in it that weren't
# specified in the index definition.  we want to make an ordered list of
# fields that are indexed, and a list of mappings that are used. this will
# be used for the build.cfg file, and for collection meta definition.
#
# Resets $self->{'build_cfg'} and fills in:
#   indexfields   - array ref of indexed field names
#   indexfieldmap - array ref of "name->shortcode" strings, in the same order
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    my @indexfields = ();
    my $specifiedfields = {};
    my @specifiedfieldorder = ();

    # go through the index definition and add each thing to a map, so we
    # can easily check if it is already specified - when doing the
    # metadata, we print out all the individual fields, but some may
    # already be specified in the index definition, so we dont want to add
    # those again.
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $index;
        $parts =~ s/:.*$//;
        foreach my $f (split(';', $parts)) {
            next if defined $specifiedfields->{$f};
            $specifiedfields->{$f} = 1;
            push (@specifiedfieldorder, "$f");
        }
    }

    # add all fields bit
    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # expand to every field the document processor indexed that was
            # not named explicitly; sorted so that the build.cfg contents do
            # not depend on hash iteration order
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                if (!defined $specifiedfields->{$newfield}) {
                    push (@indexfieldmap, "$newfield\-\>$self->{'buildproc'}->{'indexfieldmap'}->{$newfield}");
                    push (@indexfields, "$newfield");
                }
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text\-\>TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields\-\>ZZ");
            push (@indexfields, "allfields");
        } else {
            # any other field is kept only if it has a mapping
            my $ifm = $self->{'buildproc'}->{'indexfieldmap'};
            if (defined $ifm->{$field}) {
                push (@indexfieldmap, "$field\-\>$ifm->{$field}");
                push (@indexfields, "$field");
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
785
786
# recreate the field list from the build.cfg file, look first in building,
# then in index to find it. if there is no build.cfg, we can't do the field
# list (there is unlikely to be any index anyway.)
#
# Resets $self->{'build_cfg'}. When a build.cfg is found, its indexfields,
# indexfieldmap and indexmap lists are copied into it, and every
# "name->shortcode" entry of indexfieldmap is also stored in the document
# processor's indexfieldmap. When no build.cfg is found, build_cfg is left
# as an empty hash.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();
    my @indexmap = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }
    # we read the stuff in from the build.cfg file - if its there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");

    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile  = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we cant find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $field (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$field");
            # only a well-formed "name->shortcode" entry updates the mapping;
            # anything else would store undef under an undefined key
            if ($field =~ /^(.*)\-\>(.*)$/) {
                my ($name, $shortcode) = ($1, $2);
                $self->{'buildproc'}->{'indexfieldmap'}->{$name} = $shortcode;
            }
        }
    }

    if (defined $buildcfg->{'indexmap'}) {
        foreach my $field (@{$buildcfg->{'indexmap'}}) {
            push (@indexmap, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    $self->{'build_cfg'}->{'indexmap'} = \@indexmap;
}
839
840
# Add the mgpp specific entries to the build configuration hash $build_cfg:
# the number of sections reported by the document processor, the short tags
# of the indexed levels ('indexlevels'), the "name->tag" pairs for them
# ('levelmap'), and the text level.
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info, in the order the levels were configured
    my $indexlevels = [];
    my $levelmap = [];
    foreach my $level (@{$self->{'levelorder'}}) {
        push (@$indexlevels, $level_map{$level});
        push (@$levelmap, $level . "->" . $level_map{$level});
    }
    $build_cfg->{'indexlevels'} = $indexlevels;
    $build_cfg->{'levelmap'} = $levelmap;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
861
8621;
863
864
Note: See TracBrowser for help on using the browser.