root/gsdl/trunk/perllib/mgppbuilder.pm @ 17110

Revision 17110, 28.5 KB (checked in by kjdon, 11 years ago)

Changed the way CJK separation is done. It is no longer done in the plugins; it is now an index option. cnseg is called from the filter_text method, and generate_index_options sets up the field in buildproc.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33
34
# Set up inheritance at compile time: mgppbuilder derives from basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
38
39
40
# Two lookups share this hash:
#   - level name as written in the collection configuration -> tag used
#     for that level in the index (passed to mgpp_passes and build.cfg);
#   - index tag -> default display macro written to the collection info
#     database for that level.
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',

    'Doc'  => '_textdocument_',
    'Sec'  => '_textsection_',
    'Para' => '_textparagraph_',
);
47
# File suffixes that belong to a finished index.  build_index consults this
# set when it scans the index directory for left-over files.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il
    w wa
);
64
# Default mapping from index field names to the short codes used inside
# the index.  Each long name maps to its code; each code, each query
# keyword (AND, OR, NOT, NEAR) and each level tag (Doc, Sec, Para) maps to
# 1, which marks it as taken so that it cannot be used as a field name.
#
# TODO: let users add their own mappings via a file or the collection cfg.
our %static_indexfield_map = (
    # long field name => short code,  short code => reserved
    'Title'        => 'TI',   'TI'  => 1,
    'Subject'      => 'SU',   'SU'  => 1,
    'Creator'      => 'CR',   'CR'  => 1,
    'Organization' => 'ORG',  'ORG' => 1,
    'Source'       => 'SO',   'SO'  => 1,
    'Howto'        => 'HT',   'HT'  => 1,
    'ItemTitle'    => 'IT',   'IT'  => 1,
    'ProgNumber'   => 'PN',   'PN'  => 1,
    'People'       => 'PE',   'PE'  => 1,
    'Coverage'     => 'CO',   'CO'  => 1,
    'allfields'    => 'ZZ',   'ZZ'  => 1,
    'text'         => 'TX',   'TX'  => 1,

    # query keywords
    'AND'  => 1,
    'OR'   => 1,
    'NOT'  => 1,
    'NEAR' => 1,

    # level tags
    'Doc'  => 1,
    'Sec'  => 1,
    'Para' => 1,
);

# Copy of basebuilder's maximum document size, taken when this module is
# loaded.
my $maxdocsize = $basebuilder::maxdocsize;
101
# Constructor.  All arguments are handed to basebuilder's constructor; the
# resulting object is re-blessed into $class and extended with:
#   indexfieldmap - reference to the shared %static_indexfield_map
#   levels        - hash of lower-cased level names => 1
#   levelorder    - array ref of the same names, in configuration order
#   buildtype     - always "mgpp"
# When the collection configuration names no levels, 'document' is used.
sub new {
    my $class = shift(@_);

    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # must be an array reference: assigning an empty list here would
    # leave the entry undefined
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level ( @{$self->{'collect_cfg'}->{'levels'}} ) {
            # $level aliases the configuration entry, so this lower-cases
            # the name stored in collect_cfg as well
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
128
# Collapses the configured index list into a single entry.  The collection
# configuration lists the indexes separately, but they all go into one
# physical index, so the list is replaced by one string in which every
# original entry is followed by ';'.
sub generate_index_list {
    my ($self) = @_;

    my $separate = $self->{'collect_cfg'}->{'indexes'};

    my @combined = ();
    $self->{'collect_cfg'}->{'indexes'} = \@combined;
    push (@combined, join(';', @$separate) . ";");
}
138
# Works out which of casefold, stem and accentfold apply to this build and
# records them on the object, after letting the parent class process the
# index options first.
#
# With no 'indexoptions' in the collection configuration all three are
# switched on.  Otherwise each configured option switches on the first of
# stem, casefold, accentfold whose name it contains.  The combination is
# also stored as a bit mask in 'stemindexes' (casefold = 1, stem = 2,
# accentfold = 4) for the build configuration.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    foreach my $flag ('casefold', 'stem', 'accentfold') {
        $self->{$flag} = 0;
    }

    if (defined($self->{'collect_cfg'}->{'indexoptions'})) {
        OPTION:
        foreach my $option (@{$self->{'collect_cfg'}->{'indexoptions'}}) {
            # the test order matters: an option counts once, for the first
            # name it matches
            foreach my $flag ('stem', 'casefold', 'accentfold') {
                next unless $option =~ /$flag/;
                $self->{$flag} = 1;
                next OPTION;
            }
        }
    }
    else {
        # just use default options
        foreach my $flag ('casefold', 'stem', 'accentfold') {
            $self->{$flag} = 1;
        }
    }

    # now we record this for the build cfg
    my $mask = 0;
    $mask += 1 if $self->{'casefold'};
    $mask += 2 if $self->{'stem'};
    $mask += 4 if $self->{'accentfold'};
    $self->{'stemindexes'} = $mask;
}
178
# Returns the name of the build processor used with this builder when
# none is specified.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "mgppbuildproc";
    return $buildproc_name;
}
184
# Creates the compressed text of the collection under <build_dir>/text.
#
# The documents are streamed through mgpp_passes twice: a -T1 pass that
# collects text statistics and, once mgpp_compression_dict has built the
# compression dictionary, a -T2 pass that compresses the text.
#
# Parameters:
#   $textindex - index specification handed to the buildproc's set_index
#
# Does nothing when $self->{'no_text'} is set.  When $self->{'debug'} is
# set no external program is started and both passes write to STDOUT.
# Dies if mgpp_passes or mgpp_compression_dict is not found in
# $GSDLHOME/bin/$GSDLOS, or if the mgpp_passes pipe cannot be opened.
sub compress_text {

    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &util::filename_cat("text",$collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    # the part of the mgpp_passes command line shared by both passes
    my $mgpp_passes_cmd = "| mgpp_passes$exe  -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\"";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "$mgpp_passes_cmd -T1 $osextra")) {
            # self-closed like every other FatalError element in this file
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second (compression) pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$mgpp_passes_cmd -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
303
304
# Extra step run as part of index building: defines the final field lists
# by delegating to make_final_field_list.
sub build_indexes_extra {
    my ($self) = @_;

    return $self->make_final_field_list();
}
310
311# creates directory names for each of the index descriptions
# Builds the mapping from index specifications to directory names.
#
# Parameters:
#   $indexes - array ref of "fields:subcollection:languages" strings
#
# Returns a hash ref holding, for every specification, its directory name,
# plus the 'indexmap', 'subcollectionmap' and 'languagemap' sub-hashes, the
# matching '...maporder' arrays, and direct entries for each field list,
# subcollection and language.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my $mapping = {
        'indexmaporder'         => [],
        'subcollectionmaporder' => [],
        'languagemaporder'      => [],
    };

    # used to check for collisions; starts off with the mandatory
    # directory names
    my %taken = ('text' => 'text', 'extra' => 'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # processed, lower-cased versions of the subcollection and the
        # language, if there are any
        my $psub  = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # check to be sure all index names are unique
        my $dirname = "$pindex$psub$plang";
        $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang)
            while defined ($taken{$dirname});

        $mapping->{$index} = $dirname;

        # store the mapping orders as well as the maps.  The index,
        # subcollection and language parts are also stored on their own
        # (the full index name, eg text:subcol:lang, is not used on the
        # query page) - these are used for collectionmeta later on
        if (!defined $mapping->{'indexmap'}{"$fields"}) {
            $mapping->{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping->{'indexmaporder'}}, "$fields");
            $mapping->{"$fields"} = $pindex unless defined $mapping->{"$fields"};
        }
        if ($psub =~ /\w/ && !defined ($mapping->{'subcollectionmap'}{$subcollection})) {
            $mapping->{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping->{'subcollectionmaporder'}}, $subcollection);
            $mapping->{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping->{'languagemap'}{$languages})) {
            $mapping->{'languagemap'}{$languages} = $plang;
            push (@{$mapping->{'languagemaporder'}}, $languages);
            $mapping->{$languages} = $plang;
        }

        # remember what each processed name stands for, for make_unique
        $taken{$dirname} = $index;
        $pnames{'index'}->{$pindex}        = "$fields";
        $pnames{'subcollection'}->{$psub}  = $subcollection;
        $pnames{'languages'}->{$plang}     = $languages;
    }

    return $mapping;
}
380
# Resolves a directory-name collision for an index specification.
#
# Parameters:
#   $namehash - hash ref with 'index', 'subcollection' and 'languages'
#               sub-hashes recording what each processed name stands for
#   $index    - the "fields:subcollection:languages" specification
#   $indexref, $subref, $langref - references to the processed names
#
# The first processed name (index, then subcollection, then languages)
# that is recorded for something other than the matching part of $index
# is advanced through get_next_version.  Returns the three processed names
# concatenated.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $wanted) = @$part;
        next unless $namehash->{$kind}->{$$nameref} ne $wanted;

        # only the first clashing part gets a new version
        $self->get_next_version ($nameref);
        last;
    }

    return join ('', $$indexref, $$subref, $$langref);
}
395
396
# Builds the MGPP index described by $index, a
# "fields:subcollection:languages" string, in the directory that
# $self->{'index_mapping'} assigns to it under the build directory.
#
# Steps, in order: mgpp_passes -I1 (index dictionary),
# mgpp_perf_hash_build, mgpp_passes -I2 (inversion), mgpp_weights_build,
# mgpp_invf_dict, then mgpp_stem_idx once for each enabled combination of
# casefold, stem and accentfold.  The documents are fed to both
# mgpp_passes runs through plugin::read.
#
# If the .id dictionary file does not exist after the first pass, the
# index is recorded in $self->{'notbuilt'} and the sub returns early.
# Dies if a required executable is missing, if the mgpp_passes pipe cannot
# be opened, or if the index directory cannot be read.
# When $self->{'debug'} is set, no external program is run and the
# document stream is written to STDOUT.
397sub build_index {
398    my $self = shift (@_);
399    my ($index) = @_;
400    my $outhandle = $self->{'outhandle'};
401
402    # get the full index directory path and make sure it exists
403    my $indexdir = $self->{'index_mapping'}->{$index};
404    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
405
406    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
407    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
408                           $indexdir,
409                           $collect_tail);
# NOTE(review): $fulltextprefix is not used again in this sub.
410    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
411                           $collect_tail);
412
413    # get any os specific stuff
414    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
415
416    my $exe = &util::get_os_exe ();
417    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
418
419    # define the section names for mgpasses
420    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
421    if ($self->{'levels'}->{'paragraph'}) {
422    $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
423    }
424
# NOTE(review): the full paths below are used only for the existence
# checks and error messages; the commands themselves are started by bare
# name, so they are looked up through PATH.
425    my $mgpp_perf_hash_build_exe =
426    &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
427    my $mgpp_weights_build_exe =
428    &util::filename_cat ($exedir, "mgpp_weights_build$exe");
429    my $mgpp_invf_dict_exe =
430    &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
431    my $mgpp_stem_idx_exe =
432    &util::filename_cat ($exedir, "mgpp_stem_idx$exe");
433
434    my $maxnumeric = $self->{'maxnumeric'};
435
# $osextra is appended to every external command line below
436    my $osextra = "";
437    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
438    $fullindexprefix =~ s@/@\\@g;
439    } else {
440    $osextra = " -d /";
441    if ($outhandle ne "STDERR") {
442        # so mgpp_passes doesn't print to stderr if we redirect output
443        $osextra .= " 2>/dev/null";
444    }
445    }
446 
447    # get the index expression if this index belongs
448    # to a subcollection
449    my $indexexparr = [];
450    my $langarr = [];
451    # there may be subcollection info, and language info.
452    my ($fields, $subcollection, $language) = split (":", $index);
453    my @subcollections = ();
454    @subcollections = split /,/, $subcollection if (defined $subcollection);
455
# only subcollections defined in the collection configuration contribute
# an expression
456    foreach $subcollection (@subcollections) {
457    if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
458        push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
459    }
460    }
461   
462    # add expressions for languages if this index belongs to
463    # a language subcollection - only put languages expressions for the
464    # ones we want in the index
465   
466    my @languages = ();
467    my $language_metadata = "Language";
468    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
469    $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
470    }
471    @languages = split /,/, $language if (defined $language);
# a leading '!' is stripped and then put back, so each entry reaches
# $langarr with at most one leading '!'
472    foreach my $language (@languages) {
473    my $not=0;
474    if ($language =~ s/^\!//) {
475        $not = 1;
476    }
477    if($not) {
478        push (@$langarr, "!$language");
479    } else {
480        push (@$langarr, "$language");
481    }
482    }
483
484    # Build index dictionary. Uses verbatim stem method
485    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
486    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
487    my ($handle);
488    if ($self->{'debug'}) {
489    $handle = *STDOUT;
490    }
491    else {
492    if (!-e "$mgpp_passes_exe" ||
493        !open($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
494        print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
495        die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
496    }
497    }
498       
499    # db_level is always section
500    my $db_level = "section";
501
502    # set up the document processor
503    $self->{'buildproc'}->set_output_handle ($handle);
504    $self->{'buildproc'}->set_mode ('text');
505    $self->{'buildproc'}->set_index ($index, $indexexparr);
506    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
507    $self->{'buildproc'}->set_indexing_text (1);
508    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
509    $self->{'buildproc'}->set_levels ($self->{'levels'});
510    $self->{'buildproc'}->set_db_level ($db_level);   
511   
512    $self->{'buildproc'}->reset();
513    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
514           "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
515    close ($handle) unless $self->{'debug'};
516
517    $self->print_stats();
518
519    # now we check to see if the required files have been produced - if not we quit building this index so the whole build does not fail.
520    # we check on the .id file - index dictionary
521    my $dict_file = "$fullindexprefix.id";
522    if (!-e $dict_file) {
523    print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
524    print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
525    $self->{'notbuilt'}->{$index}=1;
526    return;
527    }
528
529    if (!$self->{'debug'}) {
530    # create the perfect hash function
531    if (!-e "$mgpp_perf_hash_build_exe") {
532        print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
533        die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
534    }
535    system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");
536
# reopen the pipe for the second (inversion) pass
537    if (!-e "$mgpp_passes_exe" ||
538        !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
539        print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
540        die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
541    }
542    }
543   
544    # invert the text
545    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n"  if ($self->{'verbosity'} >= 1);
546    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
547    $self->{'buildproc'}->reset();
548    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
549           "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
550
551    $self->print_stats ();
552   
553    if (!$self->{'debug'}) {
554
555    close ($handle);
556   
557    # create the weights file
558    print $outhandle "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
559    print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
560    if (!-e "$mgpp_weights_build_exe") {
561        print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
562        die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
563    }
564    system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");
565
566    # create 'on-disk' stemmed dictionary
567    print $outhandle "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
568    if (!-e "$mgpp_invf_dict_exe") {
569        print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
570        die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
571    }
572    system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );
573
574
575    # creates stem index files for the various stemming methods
576    print $outhandle "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
577    print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
578    if (!-e "$mgpp_stem_idx_exe") {
579        print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
580        die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
581    }
# the -s value is the same bit mask as 'stemindexes':
# casefold = 1, stem = 2, accentfold = 4
582    my $accent_folding_enabled = 1;
583    if ($self->{'accentfold'}) {
584        # the first time we do this, we test for accent folding enabled
# NOTE(review): system() returns the raw wait status, so a program that
# exits with code 2 yields 512 here, not 2.  Confirm how mgpp_stem_idx
# reports "accent folding not enabled"; the test may need ">> 8".
585        if (system ("mgpp_stem_idx$exe -b 4096 -s4 -f \"$fullindexprefix\" $osextra") == 2) {
586        # accent folding has not been enabled in mgpp
587        $accent_folding_enabled = 0;
588        $self->{'stemindexes'} -= 4;
589        }
590    }
591    if ($self->{'casefold'}) {
592        system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
593        if ($accent_folding_enabled && $self->{'accentfold'}) {
594        system ("mgpp_stem_idx$exe -b 4096 -s5 -f \"$fullindexprefix\" $osextra");
595        }
596    }
597    if ($self->{'stem'}) {
598        system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
599        if ($accent_folding_enabled && $self->{'accentfold'}) {
600        system ("mgpp_stem_idx$exe -b 4096 -s6 -f \"$fullindexprefix\" $osextra");
601        }
602    }
603    if ($self->{'casefold'} && $self->{'stem'}) {
604        system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
605        if ($accent_folding_enabled && $self->{'accentfold'}) {
606        system ("mgpp_stem_idx$exe -b 4096 -s7 -f \"$fullindexprefix\" $osextra");
607        }
608    }
609
610    # remove unwanted files
# NOTE(review): the util::rm call is commented out, so this loop only
# reports the files (at verbosity > 2) and deletes nothing.
611    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
612    opendir (DIR, $tmpdir) || die
613        "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
614    foreach my $file (readdir(DIR)) {
615        next if $file =~ /^\./;
616        my ($suffix) = $file =~ /\.([^\.]+)$/;
617        if (defined $suffix && !defined $wanted_index_files{$suffix}) {
618        # delete it!
619        print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
620        #&util::rm (&util::filename_cat ($tmpdir, $file));
621        }
622    }
623    closedir (DIR);
624    }
625    print STDERR "</Stage>\n" if $self->{'gli'};
626}   
627
628
# Adds the default display names for index fields, levels, subcollections
# and languages to the collection entry of the info database.
#
# Parameters:
#   $collection_infodb - hash ref to fill; each value is a one-element
#                        array ref
#
# An item is skipped when the collection configuration already supplies
# collectionmeta for it under ".<name>".  If the final field list has not
# been built yet ($self->{'build_cfg'} is undefined), it is first read
# through read_final_field_list.
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # first do the collection meta stuff - everything without a dot
    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
    }

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the metadata name
    my $collmeta = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}){
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # a field without a short name cannot be stored under a useful
        # key, and a value of 1 marks a reserved name rather than a mapping
        next if !defined $shortfield || $shortfield eq 1;

        # we need to check if some coll meta has been defined - don't output
        # any that have
        $collmeta = ".$longfield";
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            if ($longfield eq "allfields") {
                $collection_infodb->{$shortfield} = [ "_query:textallfields_" ];
            } elsif ($longfield eq "text") {
                $collection_infodb->{$shortfield} = [ "_query:texttextonly_" ];
            } else {
                $collection_infodb->{$shortfield} = [ $longfield ];
            }
        }
    }

    # now add the level names
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        $collmeta = ".$level"; # based on the original specification
        # $level aliases the configuration entry, so this lower-cases the
        # name stored in collect_cfg as well
        $level =~ tr/A-Z/a-z/; # make it lower case
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        # a level that is not in %level_map has no index tag and no
        # default macro, so there is nothing sensible to store for it
        next if !defined $levelid;
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta
    my $shortname = "";
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
            $collection_infodb->{$shortname} = [ $subcoll ];
        }
    }

    # now add language meta
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        $shortname = $self->{'index_mapping'}->{$lang};
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$lang"}) {
            $collection_infodb->{$shortname} = [ $lang ];
        }
    }
}
701
702
703# default is to output the metadata sets (prefixes) used in collection
# Writes the "collection" entry of the info database.  The entry is filled
# by get_collection_meta_sets and then get_collection_meta_indexes, and
# written through dbutil::write_infodb_entry using $self->{'infodbtype'}.
#
# Parameters:
#   $infodb_handle - handle passed on to dbutil::write_infodb_entry
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_entry = {};
    $self->get_collection_meta_sets($collection_entry);
    $self->get_collection_meta_indexes($collection_entry);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_entry);
}
714
715
716# at the end of building, we have an indexfieldmap with all the mappings,
717# plus some extras, and indexmap with any indexes in it that weren't
718# specified in the index definition.  we want to make an ordered list of
719# fields that are indexed, and a list of mappings that are used. this will
720# be used for the build.cfg file, and for collection meta definition we
721# store these in a build.cfg bit
# Builds the final, ordered list of indexed fields and their short-name
# mappings and stores both in a fresh $self->{'build_cfg'}:
#   indexfieldmap - array ref of "longname->shortname" strings
#   indexfields   - array ref of the long names, in the same order
#
# The order follows the index definitions in the collection configuration.
# The pseudo-field 'metadata' expands to every field the buildproc
# recorded that is not already named in the definitions; 'text' and
# 'allfields' have fixed codes; any other field is listed only if the
# buildproc's indexfieldmap has a mapping for it.
sub make_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # go through the index definition and note each field, so that we can
    # easily check whether it is already specified - the 'metadata'
    # expansion below must not add those again
    my %is_specified = ();
    my @specified_order = ();
    foreach my $definition (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        (my $fieldpart = $definition) =~ s/:.*$//;
        foreach my $name (split(';', $fieldpart)) {
            next if defined $is_specified{$name};
            $is_specified{$name} = 1;
            push (@specified_order, "$name");
        }
    }

    # the indexfieldmap information
    my @indexfieldmap = ();
    my @indexfields = ();
    my $record = sub {
        my ($longname, $shortname) = @_;
        push (@indexfieldmap, $longname . "->" . $shortname);
        push (@indexfields, "$longname");
    };

    # add all fields bit
    foreach my $name (@specified_order) {
        if ($name eq "metadata") {
            foreach my $found (keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $is_specified{$found};
                $record->($found, $self->{'buildproc'}->{'indexfieldmap'}->{$found});
            }
        }
        elsif ($name eq 'text') {
            $record->("text", "TX");
        }
        elsif ($name eq 'allfields') {
            $record->("allfields", "ZZ");
        }
        else {
            my $ifm = $self->{'buildproc'}->{'indexfieldmap'};
            $record->($name, $ifm->{$name}) if defined $ifm->{$name};
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
787
788
789# recreate the field list from the build.cfg file, look first in building,
790# then in index to find it. if there is no build.cfg, we can't do the field
791# list (there is unlikely to be any index anyway.)
# Recreates the field lists in $self->{'build_cfg'} from an existing
# build.cfg file.  The file is looked for in the build directory first and
# then in $GSDLCOLLECTDIR/index.  If neither exists, 'build_cfg' is left
# as an empty hash and the sub returns.
#
# Every "longname->shortname" entry read from the file is also stored in
# the buildproc's indexfieldmap; if that map is empty to begin with, it is
# first set to this builder's default indexfieldmap.
sub read_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    my $mapped_so_far = scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}});
    if ($mapped_so_far == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if its there
    my $cfg_path = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    unless (-e $cfg_path) {
        # try the index dir - but do we know where it is?? try here
        $cfg_path = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");

        # we cant find a config file - just ignore the field list
        return unless -e $cfg_path;
    }

    my $buildcfg = &colcfg::read_build_cfg( $cfg_path);

    my @indexfields = ();
    if (defined $buildcfg->{'indexfields'}) {
        push (@indexfields, "$_") foreach @{$buildcfg->{'indexfields'}};
    }

    my @indexfieldmap = ();
    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$entry");
            # split "longname->shortname" and remember the mapping
            my ($longname, $shortname) = $entry =~ /^(.*)\-\>(.*)$/;
            $self->{'buildproc'}->{'indexfieldmap'}->{$longname} = $shortname;
        }
    }

    my @indexmap = ();
    if (defined $buildcfg->{'indexmap'}) {
        push (@indexmap, "$_") foreach @{$buildcfg->{'indexmap'}};
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    $self->{'build_cfg'}->{'indexmap'} = \@indexmap;
}
841
842
# Adds the MGPP-specific entries to the build configuration.
#
# Parameters:
#   $build_cfg - hash ref that receives
#     numsections - section count reported by the buildproc
#     indexlevels - index tag of each level, in 'levelorder' order
#     levelmap    - "levelname->tag" strings in the same order
#     textlevel   - tag of the section level
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $levelname (@{$self->{'levelorder'}}) {
        my $tag = $level_map{$levelname};
        push (@indexlevels, $tag);
        push (@levelmap, $levelname . "->" . $tag);
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
863
8641;
865
866
Note: See TracBrowser for help on using the browser.