source: gsdl/trunk/perllib/mgppbuilder.pm@ 14934

Last change on this file since 14934 was 14934, checked in by davidb, 16 years ago

Changes to allow statistics to be calculated for metadata coverage, i.e. for each document, which metadata set prefixes are used, which fields within those prefixes are used, and how many times. This is then aggregated over all the documents, and the summary is stored as collection-level metadata.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.4 KB
RevLine 
[932]1###########################################################################
2#
[1852]3# mgppbuilder.pm -- MGBuilder object
[932]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
[10468]28use basebuilder;
[932]29use classify;
30use cfgread;
31use colcfg;
32use plugin;
33use util;
[1694]34use FileHandle;
[932]35
# Set up inheritance at compile time: mgppbuilder derives from basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
39
40
41
# One table, two lookups:
#   level name as lower-cased from the configuration -> short level tag
#   short level tag                                   -> default display macro
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

# File suffixes that build_index regards as wanted when it scans an index
# directory after building.
our %wanted_index_files = map { ($_ => 1) }
    qw(td t tl ti idb ib1 ib2 ib3 ib4 ib5 ib6 ib7 i il w wa);

# Built-in metadata name -> short index field code.
# TODO: let a user add their own entries via a file or the cfg.
my %field_abbreviation = (
    'Title'        => 'TI',
    'Subject'      => 'SU',
    'Creator'      => 'CR',
    'Organization' => 'ORG',
    'Source'       => 'SO',
    'Howto'        => 'HT',
    'ItemTitle'    => 'IT',
    'ProgNumber'   => 'PN',
    'People'       => 'PE',
    'Coverage'     => 'CO',
    'allfields'    => 'ZZ',
    'text'         => 'TX',
);

# The static index field map holds the abbreviations above, plus an entry
# with value 1 for every name that cannot be used as a field name: each
# short code already taken, the query operators AND, OR, NOT and NEAR, and
# the level tags Doc, Sec and Para.
our %static_indexfield_map = (
    %field_abbreviation,
    (map { ($_ => 1) } values %field_abbreviation),
    (map { ($_ => 1) } qw(AND OR NOT NEAR Doc Sec Para)),
);

# Local copy of basebuilder's maximum document size setting.
my $maxdocsize = $basebuilder::maxdocsize;
102
# Constructor.
# All arguments after the class name are passed straight on to the
# basebuilder constructor; the resulting object is re-blessed into $class
# and then given the mgpp-specific settings:
#   indexfieldmap - reference to the shared %static_indexfield_map
#   levels        - hash of lower-cased level names => 1
#   levelorder    - array ref of the same names in configuration order
#   buildtype     - "mgpp"
# Returns the new object.
sub new {
    my $class = shift(@_);

    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # must be an array ref: assigning () here would leave the slot undef
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
            # $level aliases the element of the configuration array, so
            # this also lower-cases the entry held in collect_cfg.
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else {
        # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
129
# Collapse the configured index list into a single combined index.
# Indexes are specified as separate entries, but here they all go into one
# index: the entries of collect_cfg->{'indexes'} are joined with ';' (with
# a trailing ';' appended) and the list is replaced by a one-element list
# holding that string.
sub generate_index_list {
    my ($self) = @_;

    my $configured = $self->{'collect_cfg'}->{'indexes'};
    my $combined = join(';', @$configured) . ";";

    my @single_index = ();
    $self->{'collect_cfg'}->{'indexes'} = \@single_index;
    return push (@single_index, $combined);
}
139
# Work out which of stemming, case folding and accent folding are wanted.
# With no 'indexoptions' entry in collect_cfg all three are switched on.
# Otherwise each listed option switches on the first of stem, casefold,
# accentfold (tested in that order) whose name it contains.
# The result is recorded in $self->{'stem'}, {'casefold'}, {'accentfold'}
# and, for the build cfg, as the bit mask $self->{'stemindexes'}
# (casefold = 1, stem = 2, accentfold = 4).
sub generate_index_options {
    my ($self) = @_;

    # order matters: an option is matched against these in turn
    my @flag_names = qw(stem casefold accentfold);
    my $requested = $self->{'collect_cfg'}->{'indexoptions'};

    if (!defined $requested) {
        # just use default options
        $self->{$_} = 1 foreach @flag_names;
    } else {
        $self->{$_} = 0 foreach @flag_names;
        foreach my $option (@$requested) {
            foreach my $flag (@flag_names) {
                next unless $option =~ /$flag/;
                $self->{$flag} = 1;
                last;
            }
        }
    }

    # now we record this for the build cfg
    my $mask = 0;
    $mask += 1 if $self->{'casefold'};
    $mask += 2 if $self->{'stem'};
    $self->{'stemindexes'} = $mask;
    if ($self->{'accentfold'}) {
        $self->{'stemindexes'} += 4;
    }
}
177
# Name of the build processor to use with this builder.
sub default_buildproc {
    my ($self) = @_;
    return 'mgppbuildproc';
}
183
# Create the compressed text for the collection.
#
# Argument: $textindex - handed to the build processor via set_index().
#
# Does nothing when $self->{'no_text'} is set. Otherwise the documents are
# read twice through the plugins:
#   1. piped to "mgpp_passes -T1" to collect text statistics,
#   2. after mgpp_compression_dict has built the compression dictionary,
#      piped to "mgpp_passes -T2" to compress the text.
# When $self->{'debug'} is set no external program is run and the build
# processor's output goes to STDOUT instead.
# Progress goes to $self->{'outhandle'} (according to 'verbosity') and, if
# $self->{'gli'} is set, as XML-style tags on STDERR.
# Dies if a required executable is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    # define the section names and the doc name for mgpp_passes.
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them, so always use the Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    # The handle is kept as a *name*. The build processor is given the
    # handle only once (set_output_handle below), while the pipe is opened
    # a second time under the same name for the -T2 pass.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        # NOTE(review): existence is tested on the full path under
        # $exedir, but the command is started by bare name (found via the
        # search path) - confirm this is intended.
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    my $gdbm_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_gdbm_level ($gdbm_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # close the pipe exactly once (and never in debug mode, where it was
    # not opened)
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe under the same handle name for the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    } else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
302
303
# Extra step that goes with building the indexes: define the final field
# lists (see make_final_field_list).
sub build_indexes_extra {
    my ($self) = @_;
    return $self->make_final_field_list();
}
[5617]309
# Creates directory names for each of the index descriptions.
#
# Argument: $indexes - array ref of index descriptions of the form
#           "fields[:subcollection[:languages]]".
# Returns a reference to a hash holding
#   {<index description>}      => directory name
#   {'indexmap'}{<fields>}     => processed index name (always 'idx')
#   {'subcollectionmap'}{...}  => processed subcollection name
#   {'languagemap'}{...}       => processed language name
#   {'indexmaporder'}, {'subcollectionmaporder'}, {'languagemaporder'}
#                              => array refs recording first-seen order
# and, for use by the collection metadata later on, direct entries mapping
# each fields / subcollection / languages string to its processed name.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = (
        'indexmaporder'         => [],
        'subcollectionmaporder' => [],
        'languagemaporder'      => [],
    );

    # %taken_dirs is used to check for collisions. It starts off with the
    # mandatory directory names.
    my %taken_dirs = ('text' => 'text', 'extra' => 'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and it's called 'idx'
        my $pindex = 'idx';

        # next come processed versions of the subcollection and of the
        # language, if there are any
        my $psub  = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # make sure every directory name is unique
        my $dirname = $pindex . $psub . $plang;
        $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang)
            while defined ($taken_dirs{$dirname});

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps. The index,
        # subcollection and language strings also go straight into the
        # mapping (the full index name, eg text:subcol:lang, is not used
        # on the query page).
        unless (defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            $mapping{"$fields"} = $pindex unless defined $mapping{"$fields"};
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used, for later collision checks
        $taken_dirs{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
379
# Resolve a directory name collision for one index description.
#
# $namehash maps, per kind ('index', 'subcollection', 'languages'), each
# processed name already in use to the original string it was made from.
# $indexref, $subref and $langref refer to the processed names of the
# current description $index ("fields:subcollection:languages").
# The first of the three processed names that is recorded against a
# different original string is bumped through get_next_version(); at most
# one name is changed per call. Returns the three (possibly updated)
# names concatenated.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        ['index',         $indexref, "$fields"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $original) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $original) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
394
395
# Build one index.
#
# Argument: $index - index description "fields[:subcollection[:languages]]";
#           its directory comes from $self->{'index_mapping'}.
#
# Steps (external programs are skipped when $self->{'debug'} is set, in
# which case the build processor's output goes to STDOUT):
#   1. pipe the documents to "mgpp_passes -I1" (index dictionary)
#   2. mgpp_perf_hash_build (perfect hash function)
#   3. pipe the documents to "mgpp_passes -I2" (invert the text)
#   4. mgpp_weights_build, mgpp_invf_dict, mgpp_stem_idx
#   5. scan the index directory for unwanted files
# If step 1 leaves no "<prefix>.id" file, the index is recorded in
# $self->{'notbuilt'} and the sub returns early.
# Progress goes to $self->{'outhandle'} (according to 'verbosity') and, if
# $self->{'gli'} is set, as XML-style tags on STDERR.
# Dies if a required executable is missing, a pipe cannot be opened, or the
# index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpp_passes
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to a subcollection
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subname (split /,/, $subcollection) {
            my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subname};
            push (@$indexexparr, $expression) if defined ($expression);
        }
    }

    # add the languages if this index belongs to a language subcollection.
    # Each entry is passed on exactly as written, including any leading
    # '!' marking a negated language.
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my $langarr = defined ($language) ? [split /,/, $language] : [];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # The handle is kept as a *name*. The build processor is given the
    # handle only once (set_output_handle below), while the pipe is opened
    # a second time under the same name for the -I2 pass.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # gdbm_level is always section
    my $gdbm_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_gdbm_level ($gdbm_level);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if
    # not we quit building this index so the whole process doesn't fall
    # over. We check on the .id file - the index dictionary.
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the pipe under the same handle name for the second pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }

        # run mgpp_stem_idx for one stemming method number (-s<N>) and
        # hand back whatever system() returned
        my $run_stem_idx = sub {
            my ($method) = @_;
            return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        };

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            # NOTE(review): system() returns the wait status, in which an
            # exit code of 2 appears as 512 (2 << 8). As written, this
            # compares the raw status with 2 - confirm that is what
            # mgpp_stem_idx's failure looks like on the target platforms.
            if ($run_stem_idx->(4) == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} -= 4;
            }
        }
        my $fold_accents_too = $accent_folding_enabled && $self->{'accentfold'};

        # method numbers: 1 = casefold, 2 = stem, 3 = both; adding 4
        # gives the same method combined with accent folding
        my @stem_methods = (
            [1, $self->{'casefold'}],
            [2, $self->{'stem'}],
            [3, $self->{'casefold'} && $self->{'stem'}],
        );
        foreach my $entry (@stem_methods) {
            my ($method, $wanted) = @$entry;
            next unless $wanted;
            $run_stem_idx->($method);
            $run_stem_idx->($method + 4) if $fold_accents_too;
        }

        # look for unwanted files. The removal itself is currently
        # disabled, so this only reports (at verbosity > 2) what would go.
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
624
# Write the collection-level metadata block to $handle.
#
# Only outputs entries that cannot be generated from collectionmeta: an
# entry is written for an index field, level, subcollection or language
# only when collect_cfg has no collectionmeta at all, or none under the
# key ".<name>".
# Output, in order: index field entries ("<TI>Title"), level entries
# ("<Sec>_textsection_"), subcollection entries, language entries and a
# final newline, wrapped between output_collection_meta_start/_sets and
# output_collection_meta_end.
sub output_collection_meta {
    my $self = shift(@_);
    my ($handle) = @_;

    # define the indexed field mapping if not already done so (ie if
    # infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # do the collection info
    $self->output_collection_meta_start($handle);
    $self->output_collection_meta_sets($handle);

    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
    }

    # true when the collection configuration supplies nothing for $key,
    # so a default entry has to be written here
    my $needs_default = sub {
        my ($key) = @_;
        return (!$collmetadefined
                || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{$key});
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. If that is not defined, uses
    # the metadata name
    my $field_entry = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # a value of 1 marks a reserved name rather than a real mapping
        next if $shortfield eq 1;

        next unless $needs_default->(".$longfield");
        if ($longfield eq "allfields") {
            $field_entry .= "<$shortfield>_query:textallfields_\n";
        } elsif ($longfield eq "text") {
            $field_entry .= "<$shortfield>_query:texttextonly_\n";
        } else {
            $field_entry .= "<$shortfield>$longfield\n";
        }
    }
    print $handle $field_entry;

    # now add the level names
    my $level_entry = "";
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        # the collectionmeta key is based on the level as specified
        my $collmeta = ".$level";
        # lower-case a copy for the lookup; $level aliases the
        # configuration entry, which must not be altered here
        (my $level_name = $level) =~ tr/A-Z/a-z/;
        # find the actual value we used in the index
        my $levelid = $level_map{$level_name};
        if ($needs_default->($collmeta)) {
            # use the default macro
            $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
        }
    }
    print $handle $level_entry;

    # now add subcoll meta
    my $subcoll_entry = "";
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if ($needs_default->(".$subcoll")) {
            $subcoll_entry .= "<$shortname>$subcoll\n";
        }
    }
    print $handle $subcoll_entry;

    # now add language meta
    my $lang_entry = "";
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        if ($needs_default->(".$lang")) {
            $lang_entry .= "<$shortname>$lang\n";
        }
    }
    print $handle "$lang_entry\n";

    $self->output_collection_meta_end($handle);
}
[7150]708
# At the end of building, the build processor holds an indexfieldmap with
# all the mappings (plus some extras) and a set of the fields it indexed.
# From these and the index definition this makes an ordered list of the
# fields that are indexed and a list of the "name->code" mappings that are
# used. Both are stored in $self->{'build_cfg'} ('indexfields' and
# 'indexfieldmap'), for the build.cfg file and for the collection meta
# definition.
sub make_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # Go through the index definition and note each field named in it, in
    # order. When expanding "metadata" every individual field is listed,
    # but some may already be named in the index definition, and those
    # must not be added again.
    my %in_definition = ();
    my @definition_order = ();
    foreach my $index_spec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        (my $field_part = $index_spec) =~ s/:.*$//;
        foreach my $name (split(';', $field_part)) {
            next if defined $in_definition{$name};
            $in_definition{$name} = 1;
            push (@definition_order, "$name");
        }
    }

    my @indexfieldmap = ();
    my @indexfields = ();
    # record one indexed field together with its short code
    my $record = sub {
        my ($longname, $shortname) = @_;
        push (@indexfieldmap, $longname . '->' . $shortname);
        push (@indexfields, "$longname");
    };

    my %fixed_code = ('text' => 'TX', 'allfields' => 'ZZ');

    foreach my $field (@definition_order) {
        if ($field eq "metadata") {
            # add every field the build processor indexed, unless the
            # index definition already names it
            foreach my $newfield (keys %{$self->{'buildproc'}->{'indexfields'}}) {
                if (!defined $in_definition{$newfield}) {
                    $record->($newfield, $self->{'buildproc'}->{'indexfieldmap'}->{$newfield});
                }
            }
        } elsif (defined $fixed_code{$field}) {
            $record->($field, $fixed_code{$field});
        } else {
            # an ordinary field counts only if it has a mapping
            my $ifm = $self->{'buildproc'}->{'indexfieldmap'};
            if (defined $ifm->{$field}) {
                $record->($field, $ifm->{$field});
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
780
781
# Recreate the field lists from a build.cfg file, looking first in the
# build directory and then in "index" under $ENV{'GSDLCOLLECTDIR'}. If
# there is no build.cfg, the field lists cannot be made (there is unlikely
# to be any index anyway) and $self->{'build_cfg'} is left empty.
# Otherwise 'indexfields', 'indexfieldmap' and 'indexmap' are copied into
# $self->{'build_cfg'}, and every "name->code" entry of indexfieldmap is
# also loaded into the build processor's indexfieldmap.
sub read_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # set the default mapping if the build processor has none yet
    unless (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}})) {
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        # we can't find a config file - just ignore the field list
        return unless -e $buildconfigfile;
    }

    my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);

    # copy a build.cfg list as plain strings (empty if the key is absent)
    my $strings_of = sub {
        my ($key) = @_;
        return () unless defined $buildcfg->{$key};
        return map ("$_", @{$buildcfg->{$key}});
    };

    my @indexfields = $strings_of->('indexfields');

    my @indexfieldmap = $strings_of->('indexfieldmap');
    foreach my $pair (@indexfieldmap) {
        my ($longname, $shortname) = $pair =~ /^(.*)\-\>(.*)$/;
        $self->{'buildproc'}->{'indexfieldmap'}->{$longname} = $shortname;
    }

    my @indexmap = $strings_of->('indexmap');

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    $self->{'build_cfg'}->{'indexmap'} = \@indexmap;
}
[10158]834
[10468]835
# Add the mgpp-specific entries to the build configuration hash $build_cfg:
#   numsections - as reported by the build processor
#   indexlevels - level tags, in the order of $self->{'levelorder'}
#   levelmap    - "name->tag" strings in the same order
#   textlevel   - the section level tag
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $level_name (@{$self->{'levelorder'}}) {
        my $level_tag = $level_map{$level_name};
        push (@indexlevels, $level_tag);
        push (@levelmap, $level_name . '->' . $level_tag);
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    # text level (and gdbm level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
856
8571;
858
859
Note: See TracBrowser for help on using the repository browser.