source: gsdl/trunk/perllib/mgppbuilder.pm@ 17110

Last change on this file since 17110 was 17110, checked in by kjdon, 16 years ago

changed way cjk separation is done. Not done in plugins any more, but is now an indexoption. cnseg called from filter_text method. generate_index_options sets up the field in buildproc

  • Property svn:keywords set to Author Date Id Revision
File size: 28.5 KB
RevLine 
[932]1###########################################################################
2#
[1852]3# mgppbuilder.pm -- MGBuilder object
[932]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
[10468]28use basebuilder;
[932]29use colcfg;
30use plugin;
[15715]31use strict; no strict 'refs';
[932]32use util;
33
[15715]34
# Establish inheritance at compile time: mgppbuilder is a basebuilder.
BEGIN {
    @mgppbuilder::ISA = ('basebuilder');
}
38
39
40
# Two lookups share this table:
#   * level name as written in the collection configuration -> short level
#     tag handed to mgpp_passes (-J / -K) and stored in build.cfg
#   * short level tag -> default display macro for that level
our %level_map = (
    document  => 'Doc',
    section   => 'Sec',
    paragraph => 'Para',

    Doc       => '_textdocument_',
    Sec       => '_textsection_',
    Para      => '_textparagraph_',
);
[1852]47
# File suffixes that belong in a finished index directory. build_index
# reports any file whose suffix is not listed here as a deletion candidate.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il
    w wa
);
64
# Default index field map.
#   long metadata name => short field code used in the index
#   short code / reserved word => 1, marking the name as already taken
#
# TODO: let users add their own entries via a file or the collection cfg.
our %static_indexfield_map = (
    # long name       short code, followed by the reservation of that code
    'Title'        => 'TI',    'TI'  => 1,
    'Subject'      => 'SU',    'SU'  => 1,
    'Creator'      => 'CR',    'CR'  => 1,
    'Organization' => 'ORG',   'ORG' => 1,
    'Source'       => 'SO',    'SO'  => 1,
    'Howto'        => 'HT',    'HT'  => 1,
    'ItemTitle'    => 'IT',    'IT'  => 1,
    'ProgNumber'   => 'PN',    'PN'  => 1,
    'People'       => 'PE',    'PE'  => 1,
    'Coverage'     => 'CO',    'CO'  => 1,
    'allfields'    => 'ZZ',    'ZZ'  => 1,
    'text'         => 'TX',    'TX'  => 1,

    # query operators - these cannot be used as field names
    'AND'  => 1,
    'OR'   => 1,
    'NOT'  => 1,
    'NEAR' => 1,

    # level names - these cannot be used as field names either
    'Doc'  => 1,
    'Sec'  => 1,
    'Para' => 1,
);

my $maxdocsize = $basebuilder::maxdocsize;
101
# Constructor. Builds a basebuilder with the caller's arguments, reblesses
# it into $class and adds the mgpp specific state:
#   indexfieldmap - reference to the shared %static_indexfield_map
#   levels        - hash of the (lower-cased) levels to index
#   levelorder    - the same levels in the order they were configured
#   buildtype     - always "mgpp"
# When the collection configuration names no levels, 'document' is used.
# Returns the new builder object.
sub new {
    my $class = shift(@_);

    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # an explicit array ref: assigning () to a scalar slot only stores undef
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level ( @{$self->{'collect_cfg'}->{'levels'}} ) {
            # $level aliases the configuration entry, so the configured
            # level names are lower-cased in place as well
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
128
# Collapses the configured index list into one entry. The fields of all
# specified indexes are joined with ';' (plus a trailing ';') and stored
# back as the only element of collect_cfg->{'indexes'}.
sub generate_index_list {
    my ($self) = @_;

    # indexes are specified separately, but mgpp keeps them in one index
    my @specified = @{ $self->{'collect_cfg'}->{'indexes'} };
    my $combined  = join(';', @specified) . ";";

    $self->{'collect_cfg'}->{'indexes'} = [];
    push (@{ $self->{'collect_cfg'}->{'indexes'} }, $combined);
}
138
# Works out which index options (stem, casefold, accentfold) are switched
# on and records them in $self, after letting the parent class process the
# options it handles.
#
# With no 'indexoptions' in the collection configuration all three are
# enabled. Otherwise each configured option enables the first of stem,
# casefold, accentfold (tested in that order) whose name it contains.
# $self->{'stemindexes'} receives the matching bit mask:
# casefold = 1, stem = 2, accentfold = 4.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    my @flag_names = ('casefold', 'stem', 'accentfold');
    my $requested  = $self->{'collect_cfg'}->{'indexoptions'};

    if (defined($requested)) {
        foreach my $flag (@flag_names) {
            $self->{$flag} = 0;
        }
        foreach my $option (@$requested) {
            # first match wins, in the same order the options were
            # always tested: stem, casefold, accentfold
            foreach my $candidate ('stem', 'casefold', 'accentfold') {
                if ($option =~ /$candidate/) {
                    $self->{$candidate} = 1;
                    last;
                }
            }
        }
    } else {
        # just use default options
        foreach my $flag (@flag_names) {
            $self->{$flag} = 1;
        }
    }

    # now we record this for the build cfg
    my %bit_for = ('casefold' => 1, 'stem' => 2, 'accentfold' => 4);
    my $mask = 0;
    foreach my $flag (@flag_names) {
        $mask += $bit_for{$flag} if ($self->{$flag});
    }
    $self->{'stemindexes'} = $mask;
}
178
# Name of the document processor class used when none is configured.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "mgppbuildproc";
    return $buildproc_name;
}
184
# Creates the compressed text of the collection.
#
# All documents are pushed through mgpp_passes twice: -T1 collects the text
# statistics, mgpp_compression_dict then builds the compression dictionary
# from them, and -T2 compresses the text.
#
# Arguments:
#   $textindex - index specification handed to the document processor
# Does nothing when $self->{'no_text'} is set.
# In debug mode no external program is started; the document processor
# writes to STDOUT instead and the dictionary step is skipped.
# Dies when a required mgpp executable is missing or mgpp_passes cannot be
# started. Progress tags are written to STDERR when $self->{'gli'} is set.
sub compress_text {
    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &util::filename_cat("text", $collect_tail);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and the doc name for mgpp_passes.
    # the compressor doesn't need to know about paragraphs - we never want
    # to retrieve them, so always use just the Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    # the part of the mgpp_passes command line shared by both passes.
    # the existence checks below use the full path; the program itself is
    # started by bare name
    my $passes_cmd = "mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\"";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my $handle;
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    elsif (!-e "$mgpp_passes_exe" ||
           !open($handle, '|-', "$passes_cmd -T1 $osextra")) {
        # self-closing, like every other FatalError tag written by this file
        print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
        die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
    }

    # the database level is always section
    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from the mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # start the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, '|-', "$passes_cmd -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
303
304
# Extra work done when the indexes are built: define the final field lists.
sub build_indexes_extra {
    my ($self) = @_;

    $self->make_final_field_list();
}
[5617]310
# Creates directory names for each of the index descriptions.
#
# Arguments:
#   $indexes - array ref of index specifications "fields:subcollection:languages"
# Returns a hash ref holding
#   <index spec>           => directory name of that index
#   indexmap / subcollectionmap / languagemap => long name -> short name
#   *maporder              => the long names in first-seen order
#   <fields>, <subcollection>, <languages>    => their short names (these
#                             are used for the collection metadata later on)
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and it's called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            my $candidate = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
            # make_unique changes nothing when this specification has the
            # same fields, subcollection and languages as the one that
            # already owns the name (a duplicated entry). Looping again
            # could never terminate, so the duplicate shares the directory.
            last if $candidate eq $dirname;
            $dirname = $candidate;
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) - these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
380
# Resolves a directory name collision for $index.
#
# The index, subcollection and language short names (passed by reference)
# are checked in that order against the long names already registered for
# them in $namehash. The first short name that is registered for a
# different long name is moved on to its next version; at most one of the
# three is changed per call. Returns the short names concatenated.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        [ 'index',         $indexref, "$fields" ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages ],
    );

    foreach my $part (@parts) {
        my ($kind, $shortref, $longname) = @$part;
        next unless ($namehash->{$kind}->{$$shortref} ne $longname);
        $self->get_next_version ($shortref);
        last;
    }

    return $$indexref . $$subref . $$langref;
}
395
396
# Builds one mgpp index.
#
# Steps: index dictionary (mgpp_passes -I1), perfect hash function
# (mgpp_perf_hash_build), text inversion (mgpp_passes -I2), weights file
# (mgpp_weights_build), on-disk stemmed dictionary (mgpp_invf_dict) and one
# stem index per enabled casefold/stem/accentfold combination
# (mgpp_stem_idx).
#
# Arguments:
#   $index - index specification "fields:subcollection:languages"; its
#            directory is looked up in $self->{'index_mapping'}
# Returns early, marking the index in $self->{'notbuilt'}, when the first
# pass produced no index dictionary (.id) file.
# In debug mode no external program is started; the document processor
# writes to STDOUT instead.
# Dies when a required mgpp executable is missing, mgpp_passes cannot be
# started or the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpp_passes
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to a subcollection
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll_name (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name};
        push (@$indexexparr, $expression) if (defined ($expression));
    }

    # add expressions for languages if this index belongs to a language
    # subcollection. Each entry is passed on exactly as it was written,
    # including any leading '!' (negation).
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    my $langarr = [ @languages ];

    # the part of the mgpp_passes command line shared by both passes.
    # the existence checks below use the full paths; the programs
    # themselves are started by bare name
    my $passes_cmd = "mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\"";

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my $handle;
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    elsif (!-e "$mgpp_passes_exe" ||
           !open($handle, '|-', "$passes_cmd -I1 $osextra")) {
        print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
        die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, '|-', "$passes_cmd -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            my $status = system ("mgpp_stem_idx$exe -b 4096 -s4 -f \"$fullindexprefix\" $osextra");
            # system() returns the raw wait status, with the exit code of
            # the program in the high byte: an exit code of 2 shows up as
            # 512. The plain comparison with 2 is kept as well so nothing
            # that used to be recognised is lost.
            # NOTE(review): assumes mgpp_stem_idx exits with code 2 when
            # accent folding is not compiled in - confirm against mgpp.
            if ($status == 2 || ($status != -1 && ($status >> 8) == 2)) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} -= 4;
            }
        }

        # stem methods still to build: 1 = casefold, 2 = stem, 3 = both,
        # each followed by its accent folded variant (+4) where wanted
        my $with_accents = ($accent_folding_enabled && $self->{'accentfold'});
        my @stem_methods = ();
        if ($self->{'casefold'}) {
            push (@stem_methods, 1);
            push (@stem_methods, 5) if ($with_accents);
        }
        if ($self->{'stem'}) {
            push (@stem_methods, 2);
            push (@stem_methods, 6) if ($with_accents);
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            push (@stem_methods, 3);
            push (@stem_methods, 7) if ($with_accents);
        }
        foreach my $method (@stem_methods) {
            system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        }

        # report unwanted files. The removal itself is disabled, so the
        # files are only listed (at verbosity above 2) and stay in place.
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir)
            or die "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
627
[15709]628
# Adds the default display names of index fields, levels, subcollections
# and languages to the collection's info database entry.
#
# Arguments:
#   $collection_infodb - hash ref to fill; each value is a one element
#                        array ref
# A default is only added when the collection configuration holds no
# collectionmeta entry ".<name>" for that item. Index fields whose short
# name is unknown, and levels without an entry in %level_map, are skipped.
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so (ie if infodb
    # called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # has any collection meta been defined at all?
    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
    }

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the
    # metadata name
    my $collmeta = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # no short name known: there is nothing sensible to key the entry on
        next if !defined $shortfield;
        # a value of 1 marks a reserved name, not a real mapping
        next if $shortfield eq '1';

        # we need to check if some coll meta has been defined - don't output
        # any that have
        $collmeta = ".$longfield";
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            if ($longfield eq "allfields") {
                $collection_infodb->{$shortfield} = [ "_query:textallfields_" ];
            } elsif ($longfield eq "text") {
                $collection_infodb->{$shortfield} = [ "_query:texttextonly_" ];
            } else {
                $collection_infodb->{$shortfield} = [ $longfield ];
            }
        }
    }

    # now add the level names
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        $collmeta = ".$level"; # based on the level name as currently stored
        # make it lower case ($level aliases the configuration entry, so
        # the stored level name is lower-cased too)
        $level =~ tr/A-Z/a-z/;
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        next if !defined $levelid; # not a level we know about
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta
    my $shortname = "";
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
            $collection_infodb->{$shortname} = [ $subcoll ];
        }
    }

    # now add language meta
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        $shortname = $self->{'index_mapping'}->{$lang};
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$lang"}) {
            $collection_infodb->{$shortname} = [ $lang ];
        }
    }
}
[932]701
[15709]702
# Writes the "collection" entry of the info database: the metadata sets
# used in the collection plus the index related names.
#
# Arguments:
#   $infodb_handle - handle of the info database being written
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    $self->get_collection_meta_indexes($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
[7150]714
[15709]715
# At the end of building we have an indexfieldmap with all the mappings,
# plus some extras. From it and the index definition this makes an ordered
# list of the fields that are indexed and a list of the mappings that are
# used, and stores both in $self->{'build_cfg'} ('indexfields' and
# 'indexfieldmap', entries of the form "long->short"). They are used for
# the build.cfg file and for the collection meta definition.
#
# The special field "metadata" expands to every field the document
# processor indexed that is not named in the index definition itself. The
# expansion is sorted by field name so the result is the same on every run.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # the two lists that end up in build_cfg
    my @indexfieldmap = ();
    my @indexfields = ();

    # go through the index definition and note each field, so we can
    # easily check if it is already specified - when expanding "metadata"
    # we list all the individual fields, but some may already be named in
    # the index definition, and we don't want to add those again.
    my %is_specified = ();
    my @specifiedfieldorder = ();
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $index;
        $parts =~ s/:.*$//;
        foreach my $name (split(';', $parts)) {
            next if defined $is_specified{$name};
            $is_specified{$name} = 1;
            push (@specifiedfieldorder, "$name");
        }
    }

    my $ifm = $self->{'buildproc'}->{'indexfieldmap'};

    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # sorted: hash key order is not stable between runs
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $is_specified{$newfield};
                push (@indexfieldmap, $newfield . "->" . $ifm->{$newfield});
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text->TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields->ZZ");
            push (@indexfields, "allfields");
        } elsif (defined $ifm->{$field}) {
            # fields without a mapping were never indexed - leave them out
            push (@indexfieldmap, $field . "->" . $ifm->{$field});
            push (@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
787
788
# Recreates the field lists in $self->{'build_cfg'} from a build.cfg file:
# first the one in the build directory, then index/build.cfg below
# $ENV{'GSDLCOLLECTDIR'}. Without a build.cfg the field lists cannot be
# made and build_cfg is left empty (there is unlikely to be any index
# anyway). Every "long->short" entry that is read is also put into the
# document processor's indexfieldmap.
sub read_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # set the default mapping if the document processor has none yet
    unless (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}})) {
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        # we can't find a config file - just ignore the field list
        return unless (-e $buildconfigfile);
    }

    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);

    my @indexfields = ();
    if (defined $buildcfg->{'indexfields'}) {
        @indexfields = map { "$_" } @{$buildcfg->{'indexfields'}};
    }

    my @indexfieldmap = ();
    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$entry");
            # split "long->short" and remember the mapping
            my ($longname, $shortname) = $entry =~ /^(.*)\-\>(.*)$/;
            $self->{'buildproc'}->{'indexfieldmap'}->{$longname} = $shortname;
        }
    }

    my @indexmap = ();
    if (defined $buildcfg->{'indexmap'}) {
        @indexmap = map { "$_" } @{$buildcfg->{'indexmap'}};
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    $self->{'build_cfg'}->{'indexmap'} = \@indexmap;
}
[10158]841
[10468]842
# Adds the mgpp specific entries to the build configuration.
#
# Arguments:
#   $build_cfg - hash ref of build.cfg values, extended in place with
#     numsections - section count reported by the document processor
#     indexlevels - short tags of the indexed levels, in configured order
#     levelmap    - "level->tag" entries for the same levels
#     textlevel   - tag of the section level
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $level_name (@{$self->{'levelorder'}}) {
        my $level_tag = $level_map{$level_name};
        push (@indexlevels, $level_tag);
        push (@levelmap, $level_name . "->" . $level_map{$level_name});
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
863
8641;
865
866
Note: See TracBrowser for help on using the repository browser.