source: trunk/gsdl/perllib/mgppbuilder.pm@ 12270

Last change on this file since 12270 was 11996, checked in by davidb, 18 years ago

Extra tweak to code (if statement added) to help cope with the situation of
a collection being built with no documents.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.1 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use classify;
30use cfgread;
31use colcfg;
32use plugin;
33use util;
34use FileHandle;
35
# Set up inheritance at compile time: mgppbuilder is a basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
39
40
41
# Two lookups share this one table:
#   level name as written in the collection configuration -> short level tag
#   short level tag -> default display macro for that level
our %level_map = (
    document  => 'Doc',
    section   => 'Sec',
    paragraph => 'Para',
    Doc       => '_textdocument_',
    Sec       => '_textsection_',
    Para      => '_textparagraph_',
);
48
49#$doc_level = "Doc";
50#$sec_level = "Sec";
51#$para_level = "Para";
52
# File suffixes that belong to a finished index.  build_index checks the
# suffix of every file in the index directory against this set.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb ib1 ib2 ib3
    i il
    w wa
);
65
# Default mapping from metadata field names to the short codes used for
# them in the index.
#
# Every name that must not be handed out as a field code is also present
# with the value 1: the short codes themselves, the query operators
# (AND, OR, NOT, NEAR) and the level tags (Doc, Sec, Para).
#
# TODO: let users add their own mappings via a file or the collection
# configuration.
our %static_indexfield_map = (
    # metadata name => short code
    Title        => 'TI',
    Subject      => 'SU',
    Creator      => 'CR',
    Organization => 'ORG',
    Source       => 'SO',
    Howto        => 'HT',
    ItemTitle    => 'IT',
    ProgNumber   => 'PN',
    People       => 'PE',
    Coverage     => 'CO',
    allfields    => 'ZZ',
    text         => 'TX',

    # reserved names => 1
    (map { $_ => 1 } qw(
        TI SU CR ORG SO HT IT PN PE CO ZZ TX
        AND OR NOT NEAR
        Doc Sec Para
    )),
);

# File-local copy of the maximum document size defined by basebuilder.
my $maxdocsize = $basebuilder::maxdocsize;
102
# Constructor.
#
# Arguments after the class name are passed to basebuilder's constructor
# unchanged.  In order they are: $collection, $source_dir, $build_dir,
# $verbosity, $maxdocs, $debug, $keepold, $remove_empty_classifications,
# $outhandle, $no_text, $failhandle, $gli.
#
# On top of what basebuilder sets up, the object records:
#   indexfieldmap - reference to the shared %static_indexfield_map
#   levels        - hash of the (lower-cased) levels to index
#   levelorder    - the same levels, in the order they were configured
#   doc_level     - "document", or "section" when no document level exists
#   buildtype     - always "mgpp"
#
# Dies when neither a document nor a section level is configured.
sub new {
    my $class = shift(@_);

    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level ( @{$self->{'collect_cfg'}->{'levels'}} ) {
            # $level aliases the configuration entry, so the configured
            # level names themselves end up lower-cased as well.
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
142
# Collapse the configured index list into a single entry.
#
# The collection configuration lists indexes separately, but they are all
# put into one index, so the list is replaced by a one-element list
# holding the original entries joined with ';'.
sub generate_index_list {
    my ($self) = @_;

    my $separate_indexes = $self->{'collect_cfg'}->{'indexes'};
    my $combined = join(';', @$separate_indexes);

    $self->{'collect_cfg'}->{'indexes'} = [];
    push (@{$self->{'collect_cfg'}->{'indexes'}}, $combined);
}
152
# Name of the build processor used with this builder.
sub default_buildproc {
    my ($self) = @_;

    return 'mgppbuildproc';
}
158
# Produce the compressed text of the collection by sending every document
# through mgpp_passes twice: the -T1 pass collects text statistics, then
# mgpp_compression_dict builds the compression dictionary, and the -T2 pass
# compresses the text.
#
# Arguments:
#   $textindex - index specification given to the build processor
#                (set_index) for the text passes.
#
# Returns immediately when $self->{'no_text'} is set.  In debug mode no
# external program is started and both passes write to STDOUT instead.
# Dies when mgpp_passes or mgpp_compression_dict is not found in the
# Greenstone bin directory, or when a pipe to mgpp_passes cannot be opened.
sub compress_text {
    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # The full paths are only used to check that the programs are
    # installed; the commands themselves are started by bare name.
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }
    $mgpp_passes_sections .= "-K SENT ";
    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # (no maximum document size option is passed to mgpp_passes here)
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    # File handles are passed around by name in this module.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        $handle = 'mgppbuilder::PIPEOUT';
        #print $outhandle "trying to run (compress 1) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    # gdbm_level
    my $gdbm_level = "document";
    if ($self->{'levels'}->{'section'}) {
        $gdbm_level = "section";
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_gdbm_level ($gdbm_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Closing the pipe ends the first pass; STDOUT is left open in debug mode.
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the same named handle for the second pass; the build
        # processor keeps writing to it
        #print $outhandle "trying to run (compress 2) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
286
287
# Extra index-building step for this builder: work out the final list of
# indexed fields and store it in $self->{'build_cfg'}.
sub build_indexes_extra {
    my ($self) = @_;

    # define the final field lists
    return $self->make_final_field_list();
}
293
# Create a directory name for each of the index descriptions.
#
# Arguments:
#   $indexes - reference to a list of index specifications of the form
#              "fields[:subcollection[:languages]]".
#
# Returns a reference to a hash that holds
#   - each full index specification            => its directory name
#   - 'indexmap', 'subcollectionmap', 'languagemap'
#                                              => original name => short name
#   - 'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                                              => original names in first-seen order
#   - each fields / subcollection / languages string => its short name
#     (these are used for collectionmeta later on)
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);
        # the directory name starts with a processed version of index fields
        #my ($pindex) = $self->process_field($fields);
        #$pindex = lc ($pindex);
        # now we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            my $candidate = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);

            # make_unique only renames a component that is already in use
            # for something else.  When it changes nothing, this
            # specification repeats one that is already mapped to
            # $dirname, and trying again would loop forever; reuse the
            # existing directory instead.
            last if $candidate eq $dirname;

            $dirname = $candidate;
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember which original names own the short names just used
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
365
# Resolve a directory-name collision for one index specification.
#
# Arguments:
#   $namehash  - hash with 'index', 'subcollection' and 'languages'
#                sub-hashes mapping short names already in use to the
#                original names that own them
#   $index     - index specification "fields[:subcollection[:languages]]"
#   $indexref, $subref, $langref
#              - references to the current short names of the three parts
#
# The first part (index, then subcollection, then languages) whose short
# name is recorded for a different original name is advanced with
# get_next_version; at most one part changes per call.
#
# Returns the three short names concatenated.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    # [ category in $namehash, reference to short name, original name ]
    my @parts = (
        ['index',         $indexref, "$fields"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $part (@parts) {
        my ($category, $shortref, $original) = @$part;
        if ($namehash->{$category}->{$$shortref} ne $original) {
            $self->get_next_version ($shortref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
380
381
# Build one index with the mgpp tool chain.
#
# Arguments:
#   $index - index specification "fields[:subcollections[:languages]]";
#            subcollections and languages are comma separated.
#
# Steps: mgpp_passes -I1 (index dictionary), mgpp_perf_hash_build,
# mgpp_passes -I2 (inversion), mgpp_weights_build, mgpp_invf_dict and
# mgpp_stem_idx for stem methods 1 to 3.  In debug mode no external
# program is started and the passes write to STDOUT.
#
# If the index dictionary file (.id) is missing after the first pass the
# index is recorded in $self->{'notbuilt'} and the sub returns early.
# Dies when a required executable is missing, a pipe cannot be opened or
# the index directory cannot be read.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};

    # Reporting helpers: progress text goes to the output handle at
    # verbosity >= 1, XML status tags go to STDERR when the gli flag is set.
    my $progress = sub {
        my ($message) = @_;
        print {$outhandle} $message if ($self->{'verbosity'} >= 1);
    };
    my $gli_tag = sub {
        my ($xml) = @_;
        print STDERR $xml if $self->{'gli'};
    };
    my $cannot_run = sub {
        my ($error_name, $program) = @_;
        $gli_tag->("<FatalError name='$error_name'/>\n</Stage>\n");
        die "mgppbuilder::build_index - couldn't run $program\n";
    };

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, $indexdir));
    my $fullindexprefix =
        &util::filename_cat ($self->{'build_dir'}, $indexdir, $self->{'collection'});
    # not used below; kept so the sequence of calls is unchanged
    my $fulltextprefix =
        &util::filename_cat ($self->{'build_dir'}, "text", $self->{'collection'});

    # The full paths are only used for existence checks; the commands
    # themselves are started by bare name.
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat ($exedir, "mgpp_passes$exe");

    # define the section names and possibly the doc name for mgpasses:
    # -J names the document level, -K every other level
    my ($doc_level) = $self->{'doc_level'};
    my $mgpp_passes_sections = "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        next if $level eq $doc_level;
        $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
    }

    my $mgpp_perf_hash_build_exe = &util::filename_cat ($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe   = &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe       = &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe        = &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
        # so mgpp_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null" if ($outhandle ne "STDERR");
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my @subcollections = ();
    @subcollections = split (/,/, $subcollection) if (defined $subcollection);
    my $indexexparr = [];
    foreach my $subcoll_name (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name};
        push (@$indexexparr, $expression) if (defined ($expression));
    }

    # add expressions for languages if this index belongs to a language
    # subcollection.  Entries are handed on exactly as written, including
    # a leading '!' for negation.
    my $language_metadata =
        defined ($self->{'collect_cfg'}->{'language_metadata'})
        ? $self->{'collect_cfg'}->{'language_metadata'}
        : "Language";
    my @languages = ();
    @languages = split (/,/, $language) if (defined $language);
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    $progress->("\n creating index dictionary (mgpp_passes -I1)\n");
    $gli_tag->("<Phase name='CreatingIndexDic'/>\n");

    # File handles are passed around by name in this module.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    }
    else {
        my $first_pass = "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra";
        $cannot_run->('NoRunMGPasses', $mgpp_passes_exe)
            unless (-e "$mgpp_passes_exe" && open (PIPEOUT, $first_pass));
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # gdbm_level
    my $gdbm_level = $self->{'levels'}->{'section'} ? "section" : "document";

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($self->{'levels'});
    $buildproc->set_gdbm_level ($gdbm_level);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now check that the required files have been produced - if not, give
    # up on this index so that the rest of the build can carry on.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    unless (-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        $gli_tag->("<Warning name='NoIndex'/>\n</Stage>\n");
        $self->{'notbuilt'}->{$index} = 1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        $cannot_run->('NoRunMGHash', $mgpp_perf_hash_build_exe)
            unless (-e "$mgpp_perf_hash_build_exe");
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the named handle for the second pass
        my $second_pass = "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra";
        $cannot_run->('NoRunMGPasses', $mgpp_passes_exe)
            unless (-e "$mgpp_passes_exe" && open ($handle, $second_pass));
    }

    # invert the text
    $progress->("\n inverting the text (mgpp_passes -I2)\n");
    $gli_tag->("<Phase name='InvertingText'/>\n");
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {
        close ($handle);

        # create the weights file
        $progress->("\n create the weights file\n");
        $gli_tag->("<Phase name='CreateTheWeights'/>\n");
        $cannot_run->('NoRunMGWeights', $mgpp_weights_build_exe)
            unless (-e "$mgpp_weights_build_exe");
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        $progress->("\n creating 'on-disk' stemmed dictionary\n");
        $cannot_run->('NoRunMGInvf', $mgpp_invf_dict_exe)
            unless (-e "$mgpp_invf_dict_exe");
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        $progress->("\n creating stem indexes\n");
        $gli_tag->("<Phase name='CreatingStemIndx'/>\n");
        $cannot_run->('NoRunMGStem', $mgpp_stem_idx_exe)
            unless (-e "$mgpp_stem_idx_exe");
        foreach my $stem_method (1, 2, 3) {
            system ("mgpp_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
        }

        # report unwanted files; the actual removal is disabled, so
        # nothing is deleted here
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            next unless (defined $suffix);
            next if (defined $wanted_index_files{$suffix});
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            #&util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir (DIR);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
593
# Write the [collection] entry of the collection metadata to $handle.
#
# Only names that cannot be produced from collectionmeta are written: a
# field, level, subcollection or language is skipped when the collection
# configuration has a collectionmeta entry ".<name>" for it.
#
# Arguments:
#   $handle - output file handle
sub output_collection_meta {
    my ($self, $handle) = @_;

    # define the indexed field mapping if not already done so (ie if
    # infodb called separately from build_index)
    $self->read_final_field_list() unless (defined $self->{'build_cfg'});

    # do the collection info
    print $handle "[collection]\n";

    my $collmetadefined = (defined $self->{'collect_cfg'}->{'collectionmeta'}) ? 1 : 0;

    # true when no collectionmeta entry exists for the given key, i.e. the
    # default line has to be written
    my $needs_default = sub {
        my ($collmeta_key) = @_;
        return (!$collmetadefined
                || !defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta_key});
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # 'allfields' and 'text' get macro names instead of the field name
    my %macro_for = ('allfields' => '_query:textallfields_',
                     'text'      => '_query:texttextonly_');
    my $field_entry = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # a value of 1 marks a reserved name, not a real field
        next if $shortfield eq 1;
        next unless $needs_default->(".$longfield");

        my $label = exists $macro_for{$longfield} ? $macro_for{$longfield} : $longfield;
        $field_entry .= "<$shortfield>$label\n";
    }
    print $handle $field_entry;

    # now add the level names
    my $level_entry = "";
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $collmeta_key = ".$level";      # based on the name as it is at this point
        $level =~ tr/A-Z/a-z/;             # make it lower case (changes the configured entry too)
        my $levelid = $level_map{$level};  # find the actual value we used in the index
        next unless $needs_default->($collmeta_key);

        # use the default macro
        $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
    }
    print $handle $level_entry;

    # now add subcollection meta, then language meta
    foreach my $order_key ('subcollectionmaporder', 'languagemaporder') {
        my $entries = "";
        foreach my $name (@{$self->{'index_mapping'}->{$order_key}}) {
            my $shortname = $self->{'index_mapping'}->{$name};
            $entries .= "<$shortname>$name\n" if $needs_default->(".$name");
        }
        print $handle $entries;
    }

    # end the collection entry
    print $handle "\n" . ('-' x 70) . "\n";
}
678
# Work out the final, ordered list of indexed fields.
#
# At the end of building, the build processor's indexfieldmap holds all
# the field mappings (plus some extras).  From the index definition in the
# collection configuration this sub derives
#   $self->{'build_cfg'}->{'indexfields'}   - indexed field names, in order
#   $self->{'build_cfg'}->{'indexfieldmap'} - matching "name->code" strings
# which are used for the build.cfg file and for the collection meta
# definition.
sub make_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # Collect every field named in the index definition, once each and in
    # order of first appearance.  The lookup hash also lets the 'metadata'
    # expansion below skip fields that were named explicitly.
    my %is_specified = ();
    my @specified_order = ();
    foreach my $indexspec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $fieldpart = $indexspec;
        $fieldpart =~ s/:.*$//;

        foreach my $name (split (';', $fieldpart)) {
            next if (defined $is_specified{$name});
            $is_specified{$name} = 1;
            push (@specified_order, "$name");
        }
    }

    # fields whose code does not come from the processor's map
    my %fixed_code = ('text' => 'TX', 'allfields' => 'ZZ');

    my @indexfieldmap = ();
    my @indexfields = ();
    foreach my $field (@specified_order) {
        if ($field eq "metadata") {
            # add all fields bit: every field the processor indexed that
            # was not named explicitly
            foreach my $newfield (keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if (defined $is_specified{$newfield});
                my $code = $self->{'buildproc'}->{'indexfieldmap'}->{$newfield};
                push (@indexfieldmap, $newfield . '->' . $code);
                push (@indexfields, "$newfield");
            }
        }
        elsif (exists $fixed_code{$field}) {
            push (@indexfieldmap, $field . '->' . $fixed_code{$field});
            push (@indexfields, $field);
        }
        else {
            # only fields the processor has a code for are listed
            my $ifm = $self->{'buildproc'}->{'indexfieldmap'};
            next unless (defined $ifm->{$field});
            push (@indexfieldmap, $field . '->' . $ifm->{$field});
            push (@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
750
751
# Recreate the field list from a build.cfg file.
#
# build.cfg is looked for first in the build directory and then in the
# "index" directory under $ENV{'GSDLCOLLECTDIR'}.  If neither exists the
# field list is left empty ($self->{'build_cfg'} is an empty hash) - there
# is unlikely to be any index anyway.
#
# On success this sets
#   $self->{'build_cfg'}->{'indexfields'}   - field names from build.cfg
#   $self->{'build_cfg'}->{'indexfieldmap'} - "name->code" strings from build.cfg
# and copies each name => code pair into the build processor's
# indexfieldmap.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if its there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");

    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $field (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$field");

            # entries have the form "name->code"; anything else is kept in
            # the list above but cannot be added to the processor's map
            if ($field =~ /^(.*)\-\>(.*)$/) {
                $self->{'buildproc'}->{'indexfieldmap'}->{$1} = $2;
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
796
797
# Write the build information to build.cfg in the build directory.
#
# Arguments:
#   $build_cfg - reference to the hash of build settings to write
sub write_cfg_file {
    my ($self, $build_cfg) = @_;

    my $cfg_path = "$self->{'build_dir'}/build.cfg";

    # Key patterns handed to cfgread.  NOTE(review): the first looks like
    # the single-valued settings and the second the list-valued ones -
    # confirm against cfgread::write_cfg_file.
    my $first_keys = '^(builddate|buildtype|numdocs|numsections|numbytes|textlevel|indexstem|maxnumeric)$';
    my $second_keys = '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels|levelmap)$';

    # write out the build information
    &cfgread::write_cfg_file ($cfg_path, $build_cfg, $first_keys, $second_keys);
}
808
# Add the mgpp-specific settings to the build configuration.
#
# Arguments:
#   $build_cfg - reference to the build settings hash; it gains
#                  numsections - section count reported by the build processor
#                  indexlevels - level tags, in configured order
#                  levelmap    - "level->tag" strings, in configured order
#                  textlevel   - tag of the section level when sections are
#                                indexed, otherwise of the document level
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@indexlevels, @levelmap);
    foreach my $level (@{$self->{'levelorder'}}) {
        my $tag = $level_map{$level};
        push (@indexlevels, $tag);
        push (@levelmap, $level . '->' . $tag);
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;
    $build_cfg->{'levelmap'} = \@levelmap;

    my $text_level_name = $self->{'levels'}->{'section'} ? 'section' : 'document';
    $build_cfg->{'textlevel'} = $level_map{$text_level_name};
}
832
8331;
834
835
Note: See TracBrowser for help on using the repository browser.