root/main/trunk/greenstone2/perllib/mgppbuilder.pm @ 22352

Revision 22352, 27.7 KB (checked in by kjdon, 10 years ago)

Remove the "ex." prefix when generating index lists; build.cfg should not contain any "ex." prefixes. This fixes the problem where the index list had e.g. ex.Photographer while the collectionmeta in the config file had .Photographer, so the two did not match up.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33
34
# Set up inheritance at compile time: mgppbuilder derives from basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
38
39
40
# Two lookups share this table:
#   * long level name (as used in the collection configuration, lower-cased)
#     to the short tag handed to mgpp_passes via -J / -K, and
#   * short tag to the default display macro written to the collection
#     metadata when the collection does not define its own.
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

# File suffixes that build_index treats as wanted when it scans an index
# directory; anything else is reported as a deletion candidate.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb
    ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il
    w wa
);


# Maximum document size, taken from basebuilder.  Not referenced by the
# subs visible in this file.
my $maxdocsize = $basebuilder::maxdocsize;
67
# Constructor.  Builds the basebuilder object from the supplied arguments,
# re-blesses it into the requested class, and records which indexing levels
# the collection asked for.
#
# Sets on the object:
#   levels     - hash ref: lower-cased level name => 1
#   levelorder - array ref: the same names in configuration order
#   buildtype  - always "mgpp"
# When the collection configuration defines no levels, 'document' is used.
sub new {
    my $class = shift(@_);

    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    #$self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'}     = {};
    $self->{'levelorder'} = [];   # an array ref, so later pushes and derefs are well defined

    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
            # NOTE: $level aliases the configuration entry, so the stored
            # configuration itself is lower-cased here as well.
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
94
# Collapses the configured index specifications into the single combined
# index that mgpp uses: the entries are joined with ';' (plus a trailing
# ';') and every "ex." prefix at the start of a field name is removed.
# Does nothing when no indexes are configured.
sub generate_index_list {
    my ($self) = @_;

    # indexes are specified with spaces, but we put them into one index
    my $specs = $self->{'collect_cfg'}->{'indexes'};
    return unless defined $specs;

    my $combined = join(';', @$specs) . ";";

    # remove any ex. from index spec
    $combined =~ s/^ex\.//;
    $combined =~ s/([,;])ex\./$1/g;

    $self->{'collect_cfg'}->{'indexes'} = [ $combined ];
}
110
# Works out which of casefold / stem / accentfold are enabled and stores
# them on the object, after the parent class has done its own processing.
# With no 'indexoptions' in the collection configuration all three are on;
# otherwise only the ones named there are.  The combined setting is also
# stored in 'stemindexes' as a bit mask: casefold=1, stem=2, accentfold=4.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    my $configured = defined($self->{'collect_cfg'}->{'indexoptions'});

    # just use default options (everything on) when nothing is configured
    my $initial = $configured ? 0 : 1;
    $self->{'casefold'}   = $initial;
    $self->{'stem'}       = $initial;
    $self->{'accentfold'} = $initial;

    if ($configured) {
        foreach my $option (@{$self->{'collect_cfg'}->{'indexoptions'}}) {
            # the first matching name wins for each option string
            my $flag = ($option =~ /stem/)       ? 'stem'
                     : ($option =~ /casefold/)   ? 'casefold'
                     : ($option =~ /accentfold/) ? 'accentfold'
                     :                             undef;
            $self->{$flag} = 1 if defined $flag;
        }
    }

    # now we record this for the build cfg
    my $mask = 0;
    $mask += 1 if $self->{'casefold'};
    $mask += 2 if $self->{'stem'};
    $mask += 4 if $self->{'accentfold'};
    $self->{'stemindexes'} = $mask;
}
150
# Name of the build processor used by default for this builder.
sub default_buildproc {
    my ($self) = @_;

    return 'mgppbuildproc';
}
156
# Creates the compressed text for the collection.
#
# Three steps are run: mgpp_passes -T1 (collect text statistics),
# mgpp_compression_dict (build the compression dictionary) and
# mgpp_passes -T2 (compress the text).  The documents are fed to
# mgpp_passes through a pipe by the build processor.
#
# Arguments:
#   $textindex - index specification passed to the build processor
# Returns immediately when 'no_text' is set on the object.
# Dies when mgpp_passes or mgpp_compression_dict is not found under
# $GSDLHOME/bin/$GSDLOS, or when the pipe to mgpp_passes cannot be opened.
# In debug mode no external program is run; documents go to STDOUT.
sub compress_text {

    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &util::filename_cat("text",$collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    my ($handle);

    # Opens a pipe to mgpp_passes for the given pass flag (-T1 / -T2) on
    # $handle, or reports the failure and dies.  The pipe is always opened on
    # the same $handle so that the copy already given to the build processor
    # refers to the reopened pipe.  The existence test uses the full path
    # under $exedir, while the command itself is resolved by the shell.
    my $open_passes_pipe = sub {
        my ($pass_flag) = @_;
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "| mgpp_passes$exe  -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" $pass_flag $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    };

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        $open_passes_pipe->('-T1');
    }

    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if ($self->{'debug'}) {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }
    else {
        print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second pass
        $open_passes_pipe->('-T2');
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
275
276
# Extra step run as part of building indexes: works out the final list of
# indexed fields and stores it on the object.
sub build_indexes_extra {
    my ($self) = @_;

    # define the final field lists
    $self->make_final_field_list();
}
282
# Creates directory names for each of the index descriptions.
#
# Arguments:
#   $indexes - array ref of index descriptions, "fields[:subcollection[:languages]]"
# Returns a hash ref holding, for every index description, its directory
# name, plus the 'indexmap' / 'subcollectionmap' / 'languagemap' lookups,
# their '...maporder' lists, and direct long-name => short-name entries.
# An empty hash ref is returned when no indexes are given.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = ();

    return \%mapping unless scalar @$indexes;

    foreach my $order_key ('indexmaporder', 'subcollectionmaporder', 'languagemaporder') {
        $mapping{$order_key} = [];
    }

    # used to check for collisions; starts off with the mandatory
    # directory names
    my %used_dirnames = ('text' => 'text', 'extra' => 'extra');
    # short name => long name, per component, as consulted by make_unique
    my %short_names = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and it's called 'idx'
        my $pindex = 'idx';

        # processed, lower-cased versions of the subcollection and the
        # language, if there are any
        my $psub  = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # check to be sure all index names are unique
        my $dirname = $pindex . $psub . $plang;
        while (defined ($used_dirnames{$dirname})) {
            $dirname = $self->make_unique (\%short_names, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) - these are used for collectionmeta later on
        my $field_key = "$fields";
        if (!defined $mapping{'indexmap'}{$field_key}) {
            $mapping{'indexmap'}{$field_key} = $pindex;
            push (@{$mapping{'indexmaporder'}}, $field_key);
            $mapping{$field_key} = $pindex unless defined $mapping{$field_key};
        }

        foreach my $component (['subcollectionmap', 'subcollectionmaporder', $subcollection, $psub],
                               ['languagemap',      'languagemaporder',      $languages,     $plang]) {
            my ($map_key, $order_key, $long_name, $short_name) = @$component;
            next unless $short_name =~ /\w/ && !defined ($mapping{$map_key}{$long_name});

            $mapping{$map_key}{$long_name} = $short_name;
            push (@{$mapping{$order_key}}, $long_name);
            $mapping{$long_name} = $short_name;
        }

        $used_dirnames{$dirname} = $index;
        $short_names{'index'}->{$pindex}       = $field_key;
        $short_names{'subcollection'}->{$psub} = $subcollection;
        $short_names{'languages'}->{$plang}    = $languages;
    }

    return \%mapping;
}
354
# Resolves a directory-name collision for an index description.
#
# The index, subcollection and language short names are checked in that
# order; the first one already recorded in $namehash for a different long
# name is advanced through get_next_version (via the reference passed in).
# Returns the concatenation of the three (possibly updated) short names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @components = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $component (@components) {
        my ($kind, $short_ref, $long_name) = @$component;
        if ($namehash->{$kind}->{$$short_ref} ne $long_name) {
            $self->get_next_version ($short_ref);
            last;    # only one component is changed per call
        }
    }

    return join ('', $$indexref, $$subref, $$langref);
}
369
370
# Builds one mgpp index.
#
# Arguments:
#   $index - index description, "fields[:subcollection[:languages]]"; its
#            directory name is looked up in $self->{'index_mapping'}
#
# Steps: mgpp_passes -I1 (index dictionary), mgpp_perf_hash_build,
# mgpp_passes -I2 (inversion), mgpp_weights_build, mgpp_invf_dict and
# mgpp_stem_idx for each enabled stemming method.  In debug mode the
# documents are written to STDOUT and none of the external programs run.
#
# If the index dictionary (.id file) is missing after the first pass, the
# index is recorded in $self->{'notbuilt'} and the sub returns early.
# Dies when a required executable is not found under $GSDLHOME/bin/$GSDLOS,
# when a pipe to mgpp_passes cannot be opened, or when the index directory
# cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expression if this index belongs to a subcollection
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll_name (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name};
        push (@$indexexparr, $expression) if defined ($expression);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index.  Each entry (with any leading '!' that
    # marks a negated language) is passed through unchanged.
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    my $langarr = [ @languages ];

    my ($handle);

    # Opens a pipe to mgpp_passes for the given pass flag (-I1 / -I2) on
    # $handle, or reports the failure and dies.  The pipe is always opened on
    # the same $handle so that the copy already given to the build processor
    # refers to the reopened pipe.
    my $open_passes_pipe = sub {
        my ($pass_flag) = @_;
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" $pass_flag $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    };

    # Dies (after telling GLI, if it is listening) when an executable is missing.
    my $require_exe = sub {
        my ($path, $error_name) = @_;
        return if -e "$path";
        print STDERR "<FatalError name='$error_name'/>\n</Stage>\n" if $self->{'gli'};
        die "mgppbuilder::build_index - couldn't run $path\n";
    };

    # Runs mgpp_stem_idx for one stemming method and returns system()'s result.
    my $run_stem_idx = sub {
        my ($method) = @_;
        return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
    };

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        $open_passes_pipe->('-I1');
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        $require_exe->($mgpp_perf_hash_build_exe, 'NoRunMGHash');
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the pipe for the second pass
        $open_passes_pipe->('-I2');
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        $require_exe->($mgpp_weights_build_exe, 'NoRunMGWeights');
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
        $require_exe->($mgpp_invf_dict_exe, 'NoRunMGInvf');
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        $require_exe->($mgpp_stem_idx_exe, 'NoRunMGStem');

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            if ($run_stem_idx->(4) == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} -= 4;
            }
        }
        my $with_accentfold = $accent_folding_enabled && $self->{'accentfold'};

        if ($self->{'casefold'}) {
            $run_stem_idx->(1);
            $run_stem_idx->(5) if $with_accentfold;
        }
        if ($self->{'stem'}) {
            $run_stem_idx->(2);
            $run_stem_idx->(6) if $with_accentfold;
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            $run_stem_idx->(3);
            $run_stem_idx->(7) if $with_accentfold;
        }

        # report unwanted files (the actual removal is currently disabled)
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
601
602
# Adds default display names for index fields, levels, subcollections and
# languages to $collection_infodb (a hash ref: short name => [ value ]).
# A default is added only when the collection configuration has no
# collectionmeta entry for the corresponding ".name" key.
sub get_collection_meta_indexes
{
    my ($self, $collection_infodb) = @_;

    # define the indexed field mapping if not already done so (ie if infodb
    # called separately from build_index)
    $self->read_final_field_list() unless defined $self->{'build_cfg'};

    my $collmetadefined = defined $self->{'collect_cfg'}->{'collectionmeta'} ? 1 : 0;

    # true when the collection supplies its own collectionmeta for this key
    my $has_own_meta = sub {
        my ($key) = @_;
        return $collmetadefined
            && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$key};
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the
    # metadata name
    if (defined $self->{'build_cfg'}->{'indexfields'}) {
        foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
            my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
            next if $shortfield eq 1;

            # don't output any that have coll meta defined
            next if $has_own_meta->(".$longfield");

            my $display = ($longfield eq "allfields") ? "_query:textallfields_"
                        : ($longfield eq "text")      ? "_query:texttextonly_"
                        :                               $longfield;
            $collection_infodb->{$shortfield} = [ $display ];
        }
    }

    # now add the level names
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $meta_key = ".$level";   # taken before the level is lower-cased below
        $level =~ tr/A-Z/a-z/;      # make it lower case (modifies the configured entry in place)
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        next if $has_own_meta->($meta_key);

        # use the default macro
        $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
    }

    # now add subcoll meta, then language meta
    foreach my $order_key ('subcollectionmaporder', 'languagemaporder') {
        foreach my $longname (@{$self->{'index_mapping'}->{$order_key}}) {
            my $shortname = $self->{'index_mapping'}->{$longname};
            next if $has_own_meta->(".$longname");
            $collection_infodb->{$shortname} = [ $longname ];
        }
    }
}
677
678
# Writes the "collection" record to the info database: the metadata sets
# (prefixes) used in the collection plus the index-related display names.
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    $self->get_collection_meta_indexes($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
690
691
# At the end of building, we have an indexfieldmap with all the mappings,
# plus some extras, and indexfields with any fields in it that weren't
# specified in the index definition.  We want to make an ordered list of
# fields that are indexed, and a list of mappings that are used.  This is
# used for the build.cfg file and for the collection meta definition; the
# result is stored in $self->{'build_cfg'}:
#   indexfields   - array ref of field names, in index-definition order
#   indexfieldmap - array ref of "field->shortname" strings, same order
# Either key is left unset when its list would be empty.
#
# The special entry "metadata" expands to every processed field that is
# not named explicitly in the index definition; those are emitted in
# sorted order so that repeated builds produce the same build.cfg.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    my @indexfieldmap = ();
    my @indexfields = ();
    my %specified = ();
    my @specified_order = ();

    # go through the index definition and note each field, so we can
    # easily check if it is already specified - when expanding "metadata"
    # we emit all the individual fields, but some may already be named in
    # the index definition, so we don't want to add those again.
    foreach my $index_spec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $index_spec;
        $parts =~ s/:.*$//;

        foreach my $f (split(';', $parts)) {
            next if defined $specified{$f};
            $specified{$f} = 1;
            push (@specified_order, "$f");
        }
    }

    my $ifm = $self->{'buildproc'}->{'indexfieldmap'};

    foreach my $field (@specified_order) {
        if ($field eq "metadata") {
            # add all fields bit
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $specified{$newfield};
                push (@indexfieldmap, "$newfield\-\>$ifm->{$newfield}");
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text\-\>TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields\-\>ZZ");
            push (@indexfields, "allfields");
        } elsif (defined $ifm->{$field}) {
            # we only add in the ones that have been processed
            push (@indexfieldmap, "$field\-\>$ifm->{$field}");
            push (@indexfields, "$field");
        }
    }

    if (scalar @indexfieldmap) {
        $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    }
    if (scalar @indexfields) {
        $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    }
}
766
767
# Recreates the field lists from the build.cfg file (as returned by
# read_build_cfg).  $self->{'build_cfg'} is reset to an empty hash first;
# if there is no build.cfg it stays empty.  Otherwise 'indexfields',
# 'indexfieldmap' and 'indexmap' are copied across (missing ones become
# empty lists), and each "field->shortname" entry of indexfieldmap is also
# loaded into the build processor's indexfieldmap.
sub read_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # we read the stuff in from the build.cfg file - if it's there
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    my %lists = ();
    foreach my $key ('indexfields', 'indexfieldmap', 'indexmap') {
        $lists{$key} = [];
        next unless defined $buildcfg->{$key};
        push (@{$lists{$key}}, "$_") foreach @{$buildcfg->{$key}};
    }

    # split every "field->shortname" pair into the build processor's map
    foreach my $pair (@{$lists{'indexfieldmap'}}) {
        my ($f, $v) = $pair =~ /^(.*)\-\>(.*)$/;
        $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
    }

    $self->{'build_cfg'}->{$_} = $lists{$_} foreach keys %lists;
}
807
808
# Adds the mgpp-specific entries to the build configuration hash passed in:
#   numsections - as reported by the build processor
#   indexlevels - short tags of the configured levels, in order
#   levelmap    - "longname->shorttag" strings, in the same order
#   textlevel   - short tag of the section level
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $level (@{$self->{'levelorder'}}) {
        my $short = $level_map{$level};
        push (@indexlevels, $short);
        push (@levelmap, "$level\-\>$short");
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'}    = \@levelmap;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};

}
829
8301;
831
832
Note: See TracBrowser for help on using the browser.