source: trunk/gsdl/perllib/mgppbuilder.pm@ 12448

Last change on this file since 12448 was 12340, checked in by kjdon, 18 years ago

maxnumeric is set using set_maxnumeric (by buildcol.pl) rather than the builder looking directly in the collect.cfg file

  • Property svn:keywords set to Author Date Id Revision
File size: 28.2 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use classify;
30use cfgread;
31use colcfg;
32use plugin;
33use util;
34use FileHandle;
35
# Set up inheritance at compile time: mgppbuilder derives from basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
39
40
41
# Two lookups share this one hash:
#   * level name as written in collect.cfg  -> tag used for that level
#     when talking to the mgpp tools (e.g. 'document' -> 'Doc')
#   * that tag -> default display macro for the level
#     (e.g. 'Doc' -> '_textdocument_')
our %level_map = (
    # configured level name -> index tag
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',

    # index tag -> default macro
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);
48
49#$doc_level = "Doc";
50#$sec_level = "Sec";
51#$para_level = "Para";
52
# File suffixes of the index files that should be kept after an index has
# been built. build_index() reports any file in the index directory whose
# suffix is not listed here as a candidate for deletion.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb ib1 ib2 ib3
    i il
    w wa
);
65
# Default mapping from metadata/index field names to the short codes used
# for them in the index.
#
# Each long name maps to its short code, and each short code is itself
# present with the value 1, which marks the code as taken. The query
# operators (AND, OR, NOT, NEAR) and the level tags (Doc, Sec, Para) are
# also marked as taken, because they cannot be used as field names.
#
# TODO: let a user add their own entries via a file or the collection cfg.
our %static_indexfield_map = (
    # long field name  => short code,   short code marked as taken
    'Title'        => 'TI',    'TI'  => 1,
    'Subject'      => 'SU',    'SU'  => 1,
    'Creator'      => 'CR',    'CR'  => 1,
    'Organization' => 'ORG',   'ORG' => 1,
    'Source'       => 'SO',    'SO'  => 1,
    'Howto'        => 'HT',    'HT'  => 1,
    'ItemTitle'    => 'IT',    'IT'  => 1,
    'ProgNumber'   => 'PN',    'PN'  => 1,
    'People'       => 'PE',    'PE'  => 1,
    'Coverage'     => 'CO',    'CO'  => 1,
    'allfields'    => 'ZZ',    'ZZ'  => 1,
    'text'         => 'TX',    'TX'  => 1,

    # query operators - not usable as field names
    'AND'  => 1,
    'OR'   => 1,
    'NOT'  => 1,
    'NEAR' => 1,

    # level tags - not usable as field names
    'Doc'  => 1,
    'Sec'  => 1,
    'Para' => 1,
);

# Maximum document size, taken from the base builder.
my $maxdocsize = $basebuilder::maxdocsize;
102
# Constructor.
#
# All arguments after the class name (collection, source_dir, build_dir,
# verbosity, maxdocs, debug, keepold, remove_empty_classifications,
# outhandle, no_text, failhandle, gli) are handed unchanged to the
# basebuilder constructor; the resulting object is then re-blessed into
# $class and the mgpp specific fields are filled in:
#   indexfieldmap - reference to %static_indexfield_map
#   levels        - hash of the (lower-cased) level names in use
#   levelorder    - the same level names, in configuration order
#   doc_level     - 'document', or 'section' if no document level is given
#   buildtype     - always 'mgpp'
#
# Dies if neither a document nor a section level is configured.
sub new {
    my $class = shift(@_);

    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # must be an array reference: assigning an empty list here would
    # leave the field undefined
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        # the loop variable aliases the configured entries, so the level
        # names stored in collect_cfg are lower-cased in place as well
        foreach my $level ( @{$self->{'collect_cfg'}->{'levels'}} ) {
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    # the document level is 'document' when present, otherwise 'section'
    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
142
# Collapses the configured list of indexes into a single entry.
# The indexes are specified separately in the collection configuration,
# but they are all put into one index, so the list is replaced by a
# one-element list holding the original entries joined with ';'.
sub generate_index_list {
    my ($self) = @_;

    my $cfg = $self->{'collect_cfg'};
    my $combined = join(';', @{ $cfg->{'indexes'} });

    $cfg->{'indexes'} = [];
    return push (@{ $cfg->{'indexes'} }, $combined);
}
152
# Returns the name of the build processor used by this builder.
sub default_buildproc {
    return "mgppbuildproc";
}
158
# Creates the compressed text for the collection.
#
# The documents are run through the plugins twice, with the build
# processor output piped into mgpp_passes: first with -T1 to collect text
# statistics, then (after mgpp_compression_dict has been run) with -T2 to
# compress the text.
#
# Arguments:
#   $textindex - passed on to the build processor's set_index()
#
# Does nothing when $self->{'no_text'} is set. In debug mode no external
# program is started; the build processor writes to STDOUT instead.
# Dies if a required executable does not exist or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $outhandle = $self->{'outhandle'};
    my $debug = $self->{'debug'};
    my $gli = $self->{'gli'};
    my $verbose = ($self->{'verbosity'} >= 1);
    my $maxnumeric = $self->{'maxnumeric'};

    # full paths are only used to check that the executables exist; the
    # programs themselves are started by bare name
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $doc_level = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }
    $mgpp_passes_sections .= "-K SENT ";

    # common part of both mgpp_passes invocations
    my $passes_cmd = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\"";

    print $outhandle "\n*** creating the compressed text\n" if $verbose;
    print STDERR "<Stage name='CompressText'>\n" if $gli;

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if $verbose;
    print STDERR "<Phase name='CollectTextStats'/>\n" if $gli;

    # handles are passed around by name, as elsewhere in this file
    my $handle;
    if ($debug) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "$passes_cmd -T1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # gdbm_level
    my $gdbm_level = "document";
    if ($self->{'levels'}->{'section'}) {
        $gdbm_level = "section";
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->set_gdbm_level ($gdbm_level);
    $buildproc->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $gli);
    &plugin::end($self->{'pluginfo'});

    # close the pipe exactly once; in debug mode nothing was opened
    close ($handle) unless $debug;

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$debug) {
        print $outhandle "\n    creating the compression dictionary\n" if $verbose;
        print STDERR "<Phase name='CreatingCompress'/>\n" if $gli;
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        # NOTE(review): the exit status of this command is not checked
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second pass; $handle still names the
        # handle that was given to the build processor above
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$passes_cmd -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $gli;
    }

    $buildproc->reset();

    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if $verbose;
    print STDERR "<Phase name='CompressingText'/>\n" if $gli;

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $gli);
    close ($handle) unless $debug;

    $self->print_stats();
    print STDERR "</Stage>\n" if $gli;
}
288
289
# Extra step after the indexes have been built: define the final field
# lists (see make_final_field_list).
sub build_indexes_extra {
    my ($self) = @_;
    $self->make_final_field_list();
}
295
# Creates directory names for each of the index descriptions.
#
# Arguments:
#   $indexes - reference to a list of index specifications of the form
#              "fields[:subcollection[:languages]]"
#
# Returns a reference to a hash holding
#   * each full index specification    -> its directory name
#   * 'indexmap', 'subcollectionmap', 'languagemap'
#                                      -> original name to processed name
#   * 'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                                      -> the original names in first-seen order
#   * the individual fields / subcollection / languages strings
#                                      -> their processed names
#
# A specification that appears more than once in $indexes is only mapped
# once.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    # index specifications that have already been given a directory
    my %seen_index = ();

    foreach my $index (@$indexes) {
        # A repeated specification already has its directory. It must be
        # skipped: make_unique() only changes a name whose origin differs
        # from the one recorded, so for an exact duplicate the collision
        # loop below would never terminate.
        next if $seen_index{$index}++;

        my ($fields, $subcollection, $languages) = split (":", $index);

        # now we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = lc ($self->process_field ($subcollection));

        # next comes a processed version of the language if there is one.
        my $plang = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember which original names the processed names came from, so
        # that make_unique() can tell which part of a clashing name to change
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
367
# Resolves a clash between index directory names.
#
# Arguments:
#   $namehash - hash with 'index', 'subcollection' and 'languages' entries,
#               each mapping a processed name to the original name it was
#               made from
#   $index    - the full index specification "fields:subcollection:languages"
#   $indexref, $subref, $langref
#             - references to the processed index, subcollection and
#               language names
#
# The first processed name (checked in the order index, subcollection,
# languages) that is recorded for a different original name is advanced
# with get_next_version(); at most one name is changed per call.
# Returns the three processed names concatenated.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @candidates = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $candidate (@candidates) {
        my ($kind, $nameref, $original) = @$candidate;
        if ($namehash->{$kind}->{$$nameref} ne $original) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
382
383
# Builds one index.
#
# The documents are run through the plugins twice, with the build
# processor output piped into mgpp_passes (-I1 to build the index
# dictionary, -I2 to invert the text). The perfect hash, weights,
# on-disk stemmed dictionary and stem index tools are run along the way.
#
# Arguments:
#   $index - index specification "fields[:subcollections[:languages]]";
#            its directory is looked up in $self->{'index_mapping'}
#
# If the index dictionary (.id file) is not produced by the first pass,
# the index is recorded in $self->{'notbuilt'} and the sub returns early.
# In debug mode the build processor writes to STDOUT and no external
# program is run.
# Dies if a required executable does not exist, a pipe cannot be opened,
# or the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $debug = $self->{'debug'};
    my $gli = $self->{'gli'};
    my $verbose = ($self->{'verbosity'} >= 1);

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff
    # (full paths are only used to check that the executables exist; the
    # programs themselves are started by bare name)
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    # define the section names and possibly the doc name for mgpasses
    my $doc_level = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level) {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # common part of both mgpp_passes invocations
    my $passes_cmd = "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\"";

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
        push (@$indexexparr, $expression) if defined $expression;
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    # each entry is passed on exactly as written, including a leading '!'
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if $verbose;
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $gli;

    # handles are passed around by name, as elsewhere in this file
    my $handle;
    if ($debug) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "$passes_cmd -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # gdbm_level
    my $gdbm_level = "document";
    if ($self->{'levels'}->{'section'}) {
        $gdbm_level = "section";
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->set_gdbm_level ($gdbm_level);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $gli);
    close ($handle) unless $debug;

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fail.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $gli;
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$debug) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        # NOTE(review): the exit status of the system() calls in this sub
        # is not checked
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the pipe for the second pass; $handle still names the
        # handle that was given to the build processor above
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "$passes_cmd -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if $verbose;
    print STDERR "<Phase name='InvertingText'/>\n" if $gli;
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $gli);

    $self->print_stats ();

    if (!$debug) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if $verbose;
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $gli;
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if $verbose;
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if $verbose;
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $gli;
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $gli;
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        foreach my $stem_method (1, 2, 3) {
            system ("mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
        }

        # report unwanted files (the actual removal is currently disabled)
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
    print STDERR "</Stage>\n" if $gli;
}
597
# Writes the [collection] entry to $handle.
#
# Now only outputs entries that cannot be generated from collectionmeta -
# e.g. if someone has specified 'metadata' as an index. An entry is written
# for each index field, level, subcollection and language that has no
# collectionmeta entry named ".<name>":
#   index fields    <short code>long name (special macros are used for
#                   'allfields' and 'text')
#   levels          <level tag>default level macro
#   subcollections  <processed name>original name
#   languages       <processed name>original name
# The entry is terminated by a blank line and a line of 70 dashes.
#
# Arguments:
#   $handle - the file handle to print to
sub output_collection_meta {
    my $self = shift(@_);
    my ($handle) = @_;

    # define the indexed field mapping if not already done so (ie if infodb
    # called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # do the collection info
    print $handle "[collection]\n";

    # has any collection meta been defined at all?
    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
    }

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the
    # metadata name
    my $field_entry = "";
    my $collmeta = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # a field without a short code cannot be given an entry, and a
        # value of 1 only marks a reserved name
        next if !defined $shortfield || $shortfield eq 1;

        # we need to check if some coll meta has been defined - don't output
        # any that have
        $collmeta = ".$longfield";
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            if ($longfield eq "allfields") {
                $field_entry .= "<$shortfield>_query:textallfields_\n";
            } elsif ($longfield eq "text") {
                $field_entry .= "<$shortfield>_query:texttextonly_\n";
            } else {
                $field_entry .= "<$shortfield>$longfield\n";
            }
        }
    }
    print $handle $field_entry;

    # now add the level names
    my $level_entry = "";
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        $collmeta = ".$level"; # based on the level name as currently stored
        $level =~ tr/A-Z/a-z/; # make it lower case (changes the stored name too)
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        next if !defined $levelid; # unknown level - nothing sensible to output
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            # use the default macro
            $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
        }
    }
    print $handle $level_entry;

    # now add subcoll meta
    my $subcoll_entry = "";
    my $shortname = "";
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
            $subcoll_entry .= "<$shortname>$subcoll\n";
        }
    }
    print $handle $subcoll_entry;

    # now add language meta
    my $lang_entry = "";
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        $shortname = $self->{'index_mapping'}->{$lang};
        if (!$collmetadefined || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$lang"}) {
            $lang_entry .= "<$shortname>$lang\n";
        }
    }
    print $handle $lang_entry;

    # end the collection entry
    print $handle "\n" . ('-' x 70) . "\n";
}
682
# At the end of building we have an indexfieldmap with all the mappings,
# plus some extras, and the build processor's indexfields with any fields
# that weren't named in the index definition. This makes an ordered list of
# the fields that are indexed and a list of the mappings that are used, and
# stores them in $self->{'build_cfg'} as
#   indexfields   - list of field names
#   indexfieldmap - list of "name->code" strings
# These are used for the build.cfg file and for the collection meta
# definition.
#
# Fields appear in the order they are named in the index definition; the
# special field 'metadata' is expanded, in sorted order, to every field
# recorded by the build processor that is not named explicitly.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    my @indexfields = ();
    my $specifiedfields = {};
    my @specifiedfieldorder = ();

    # go through the index definition and add each thing to a map, so we
    # can easily check if it is already specified - when doing the
    # metadata, we print out all the individual fields, but some may
    # already be specified in the index definition, so we dont want to add
    # those again.
    foreach my $indexspec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $indexspec;
        $parts =~ s/:.*$//;
        foreach my $f (split(';', $parts)) {
            if (!defined $specifiedfields->{$f}) {
                $specifiedfields->{$f} = 1;
                push (@specifiedfieldorder, "$f");
            }
        }
    }

    my $ifm = $self->{'buildproc'}->{'indexfieldmap'};

    # add all fields bit
    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # sorted so that the generated lists do not depend on hash order
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                if (!defined $specifiedfields->{$newfield}) {
                    push (@indexfieldmap, $newfield . "->" . $ifm->{$newfield});
                    push (@indexfields, "$newfield");
                }
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text->TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields->ZZ");
            push (@indexfields, "allfields");
        } else {
            # any other field is only listed if it has a mapping
            if (defined $ifm->{$field}) {
                push (@indexfieldmap, $field . "->" . $ifm->{$field});
                push (@indexfields, "$field");
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
754
755
# Recreates the field lists from a build.cfg file and stores them in
# $self->{'build_cfg'} ('indexfields' and 'indexfieldmap'). Each
# "name->code" entry of the file's indexfieldmap is also entered into the
# build processor's indexfieldmap.
#
# The file is looked for first in the build directory, then in
# $GSDLCOLLECTDIR/index. If there is no build.cfg we can't do the field
# list (there is unlikely to be any index anyway); $self->{'build_cfg'} is
# then left as an empty hash.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if its there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");

    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        # NOTE(review): this path contains no collection name - confirm
        # that GSDLCOLLECTDIR/index is really the intended location
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we cant find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$entry");
            my ($f, $v) = $entry =~ /^(.*)\-\>(.*)$/;
            # an entry that is not of the form name->code cannot be entered
            # into the mapping
            next unless defined $f;
            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
800
801
# Writes the build information in $build_cfg to build.cfg in the build
# directory, using cfgread::write_cfg_file.
#
# Arguments:
#   $build_cfg - hash reference holding the build information
sub write_cfg_file {
    my ($self, $build_cfg) = @_;

    my $cfg_file = "$self->{'build_dir'}/build.cfg";

    # Two patterns select which keys of $build_cfg are written.
    # NOTE(review): presumably the first selects single-valued entries and
    # the second list-valued ones - confirm against cfgread::write_cfg_file
    my $first_pattern =
        '^(builddate|buildtype|numdocs|numsections|numbytes|textlevel|indexstem|maxnumeric)$';
    my $second_pattern =
        '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels|levelmap)$';

    &cfgread::write_cfg_file($cfg_file, $build_cfg, $first_pattern, $second_pattern);
}
812
# Adds the mgpp specific entries to the build configuration:
#   numsections - as reported by the build processor
#   indexlevels - the tag of each level, in configuration order
#   levelmap    - "level name->tag" for each level, in the same order
#   textlevel   - the section tag if a section level is in use,
#                 otherwise the document tag
#
# Arguments:
#   $build_cfg - hash reference that is filled in
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    for my $name (@{$self->{'levelorder'}}) {
        my $tag = $level_map{$name};
        push (@indexlevels, $tag);
        push (@levelmap, $name . "->" . $tag);
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    my $text_level_name = $self->{'levels'}->{'section'} ? 'section' : 'document';
    $build_cfg->{'textlevel'} = $level_map{$text_level_name};
}
836
8371;
838
839
Note: See TracBrowser for help on using the repository browser.