source: gsdl/trunk/perllib/mgppbuilder.pm@ 14934

Last change on this file since 14934 was 14934, checked in by davidb, 16 years ago

Changes to allow statistic calculations for metadata coverage, i.e. for each document, which metadata set prefixes are used, which fields within those prefixes are used, and how many times. This is then aggregated over all the documents and the summary is stored as collection-level metadata.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.4 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use classify;
30use cfgread;
31use colcfg;
32use plugin;
33use util;
34use FileHandle;
35
# Register basebuilder as the parent class while the module is being compiled.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
39
40
41
# Two lookups share this table:
#   configured level name -> level tag passed to the mgpp tools
#   level tag             -> macro name written to the collection metadata
our %level_map = (
    # level name => level tag
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',

    # level tag => display macro
    'Doc'  => '_textdocument_',
    'Sec'  => '_textsection_',
    'Para' => '_textparagraph_',
);
48
# File suffixes that build_index treats as wanted when it scans the index
# directory; any other suffix is reported as unwanted.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il
    w wa
);
65
# Default index field map, installed on every builder object by new().
#
# Each metadata name maps to its short field code, and each short code is
# itself present with the value 1 so that it counts as already taken.
# The query keywords (AND, OR, NOT, NEAR) and the level tags (Doc, Sec, Para)
# are listed as well - these cannot be used as field names.
#
# TODO: change this so a user can add their own entries via a file or cfg.
our %static_indexfield_map = (
    # metadata name => short code,   short code => taken
    'Title'        => 'TI',          'TI'  => 1,
    'Subject'      => 'SU',          'SU'  => 1,
    'Creator'      => 'CR',          'CR'  => 1,
    'Organization' => 'ORG',         'ORG' => 1,
    'Source'       => 'SO',          'SO'  => 1,
    'Howto'        => 'HT',          'HT'  => 1,
    'ItemTitle'    => 'IT',          'IT'  => 1,
    'ProgNumber'   => 'PN',          'PN'  => 1,
    'People'       => 'PE',          'PE'  => 1,
    'Coverage'     => 'CO',          'CO'  => 1,
    'allfields'    => 'ZZ',          'ZZ'  => 1,
    'text'         => 'TX',          'TX'  => 1,

    # query keywords
    'AND'  => 1,
    'OR'   => 1,
    'NOT'  => 1,
    'NEAR' => 1,

    # level tags
    'Doc'  => 1,
    'Sec'  => 1,
    'Para' => 1,
);
100
# Copy of $basebuilder::maxdocsize taken when this module is loaded.
# NOTE(review): nothing in the visible part of this file reads it (it is only
# mentioned in a comment in compress_text) - confirm before relying on it.
my $maxdocsize = $basebuilder::maxdocsize;
102
# Constructor.
#
# All arguments are passed straight through to the basebuilder constructor;
# the resulting object is re-blessed into the requested class and then
# extended with the mgpp specific state:
#   indexfieldmap - reference to the shared %static_indexfield_map
#   levels        - hash of lower-cased level names that were requested
#   levelorder    - array ref of the same names, in configuration order
#   buildtype     - always "mgpp"
#
# Returns the new object.
sub new {
    my $class = shift(@_);

    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # must be an array reference: assigning () here would store undef
    $self->{'levelorder'} = [];

    my $configured_levels = $self->{'collect_cfg'}->{'levels'};
    if (defined $configured_levels) {
        foreach my $level (@$configured_levels) {
            # $level aliases the configuration entry, so the configured
            # level names themselves are lower-cased in place
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
129
# Collapse the configured index list into one combined entry.
#
# The indexes are specified separately in the configuration, but mgpp builds
# a single index: the entries are joined with ';', a trailing ';' is added,
# and the configured list is replaced by a one-element list holding it.
sub generate_index_list {
    my ($self) = @_;

    my $collect_cfg = $self->{'collect_cfg'};
    my $configured = $collect_cfg->{'indexes'};
    my $combined = join(';', @$configured) . ";";

    $collect_cfg->{'indexes'} = [];
    push (@{$collect_cfg->{'indexes'}}, $combined);
}
139
# Work out which of the stem / casefold / accentfold options are enabled.
#
# With no 'indexoptions' in the collection configuration all three are
# switched on. Otherwise each configured option switches on the first flag
# whose name it contains, tested in the order stem, casefold, accentfold.
# The result is also recorded as a bit mask in 'stemindexes'
# (casefold = 1, stem = 2, accentfold = 4) for the build cfg.
sub generate_index_options {
    my ($self) = @_;

    # order matters: it is the order in which option text is matched
    my @flags = ('stem', 'casefold', 'accentfold');
    my $requested = $self->{'collect_cfg'}->{'indexoptions'};

    if (defined $requested) {
        $self->{$_} = 0 foreach @flags;
        foreach my $option (@$requested) {
            foreach my $flag (@flags) {
                next unless $option =~ /$flag/;
                $self->{$flag} = 1;
                last;
            }
        }
    } else {
        # just use default options
        $self->{$_} = 1 foreach @flags;
    }

    # now we record this for the build cfg
    my $bits = 0;
    $bits += 1 if $self->{'casefold'};
    $bits += 2 if $self->{'stem'};
    $bits += 4 if $self->{'accentfold'};
    $self->{'stemindexes'} = $bits;
}
177
# Name of the build processor used by this builder.
sub default_buildproc {
    return 'mgppbuildproc';
}
183
# Create the compressed text for the collection.
#
# Parameter: $textindex - passed on to the build processor's set_index().
#
# Does nothing when 'no_text' is set. Otherwise the documents are run through
# the plugins twice: the first pass feeds "mgpp_passes -T1" (text statistics),
# then mgpp_compression_dict builds the dictionary, and the second pass feeds
# "mgpp_passes -T2" (the actual compression). In debug mode nothing is
# executed and the build processor writes to STDOUT instead.
#
# Dies when one of the required executables is missing or a pipe cannot be
# opened; when 'gli' is set, progress and error tags are printed to STDERR.
sub compress_text {

    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        # the existence test uses the full path, the command itself is
        # started by name
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            # self-closing tag, like every other FatalError tag in this file
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    my $gdbm_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_gdbm_level ($gdbm_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # $handle names PIPEOUT here, so one close finishes the first pass
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the same handle for the second pass; the build processor
        # was given this handle above and is not told about it again
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
302
303
# Extra step for index building: define the final field lists.
sub build_indexes_extra {
    my ($self) = @_;

    $self->make_final_field_list();
}
309
# Creates a directory name for each of the index descriptions.
#
# Parameter: $indexes - array ref of "fields[:subcollection[:languages]]".
# Returns a hash ref holding
#   <index description>                      => directory name
#   indexmap / subcollectionmap / languagemap => part => processed name
#   *maporder                                 => parts in first-seen order
# plus a direct entry for each fields / subcollection / language part.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = (
        'indexmaporder'         => [],
        'subcollectionmaporder' => [],
        'languagemaporder'      => [],
    );

    # directory names already handed out, used to check for collisions;
    # seeded with the mandatory directory names
    my %dirnames = ('text' => 'text', 'extra' => 'extra');
    # processed name => original part, consulted by make_unique
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and it is called 'idx'
        my $pindex = 'idx';
        # processed, lower-cased subcollection and language parts (if any)
        my $psub = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # keep adjusting the parts until the directory name is unused
        my $dirname = $pindex . $psub . $plang;
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps. The index,
        # subcollection and language parts are recorded individually too
        # (the full index name, eg text:subcol:lang, is not used on the
        # query page) - these are used for collectionmeta later on
        unless (defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            $mapping{"$fields"} = $pindex unless defined $mapping{"$fields"};
        }

        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }

        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used, for the collision checks
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
379
# Helper for create_index_mapping: adjust one part of a clashing directory
# name and return the re-assembled name.
#
# The parts are examined in the order index, subcollection, languages. The
# first one whose processed name is registered in $namehash for a different
# original value is advanced with get_next_version(); the rest are left
# alone. $indexref, $subref and $langref are references to the processed
# names and may be modified.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    my @parts = (
        ['index',         $indexref, "$fields"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $original) = @$part;
        next unless $namehash->{$kind}->{$$nameref} ne $original;
        $self->get_next_version ($nameref);
        last;
    }

    return "$$indexref$$subref$$langref";
}
394
395
# Build one mgpp index.
#
# Parameter: $index - index description "fields[:subcollections[:languages]]"
#            whose directory name is looked up in $self->{'index_mapping'}.
#
# Steps: index dictionary (mgpp_passes -I1), perfect hash, inversion
# (mgpp_passes -I2), weights file, on-disk stemmed dictionary and the stem
# index files. In debug mode no program is run and the build processor
# writes to STDOUT.
#
# If the index dictionary (.id file) is missing after the first pass, the
# index is recorded in $self->{'notbuilt'} and the sub returns early.
# Dies when a required executable is missing or a pipe cannot be opened;
# when 'gli' is set, progress and error tags are printed to STDERR.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # full paths of the executables; used for the existence checks only,
    # the commands themselves are started by name
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $maxnumeric = $self->{'maxnumeric'};

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
        push (@$indexexparr, $expression) if defined $expression;
    }

    # add expressions for languages if this index belongs to a language
    # subcollection. The entries are handed over exactly as written,
    # including any leading '!'.
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my $langarr = [];
    @$langarr = split /,/, $language if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # gdbm_level is always section
    my $gdbm_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_gdbm_level ($gdbm_level);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fail.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the same handle for the second pass; the build processor
        # was given this handle above and is not told about it again
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }

        # runs mgpp_stem_idx for one stem method (the -s value) and
        # returns the status reported by system()
        my $run_stem_idx = sub {
            my ($method) = @_;
            return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        };

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled.
            # system() returns the wait status, in which the program's exit
            # code sits in the high byte, so a result of 2 is looked for
            # there as well as in the raw value the old test compared.
            my $status = $run_stem_idx->(4);
            if ($status == 2 || ($status >> 8) == 2) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} -= 4;
            }
        }
        my $with_accentfold = ($accent_folding_enabled && $self->{'accentfold'});

        if ($self->{'casefold'}) {
            $run_stem_idx->(1);
            $run_stem_idx->(5) if $with_accentfold;
        }
        if ($self->{'stem'}) {
            $run_stem_idx->(2);
            $run_stem_idx->(6) if $with_accentfold;
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            $run_stem_idx->(3);
            $run_stem_idx->(7) if $with_accentfold;
        }

        # report unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # NOTE(review): the removal itself is commented out, so the
                # file is only reported - confirm whether that is intended
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
624
# Write the collection level metadata to $handle.
#
# Parameter: $handle - filehandle the entries are printed to.
#
# Between output_collection_meta_start/_sets and output_collection_meta_end
# this prints default "<short>name" entries for index fields, levels,
# subcollections and languages. An entry is only printed when the collection
# configuration has no collectionmeta value for ".<name>", i.e. only for
# things that cannot be generated from collectionmeta (e.g. if someone has
# specified 'metadata' as an index).
sub output_collection_meta {
    my $self = shift(@_);
    my ($handle) = @_;

    # define the indexed field mapping if not already done so
    # (ie if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # do the collection info
    $self->output_collection_meta_start($handle);
    $self->output_collection_meta_sets($handle);

    # true when the configuration supplies collectionmeta for ".$name"
    my $collmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    my $has_collmeta = sub {
        my ($name) = @_;
        return (defined $collmeta && defined $collmeta->{".$name"});
    };

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses
    # the metadata name
    my $field_entry = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # no mapping at all would produce a malformed "<>" entry
        next if !defined $shortfield;
        next if $shortfield eq 1;

        # don't output any field that has coll meta defined
        next if $has_collmeta->($longfield);

        if ($longfield eq "allfields") {
            $field_entry .= "<$shortfield>_query:textallfields_\n";
        } elsif ($longfield eq "text") {
            $field_entry .= "<$shortfield>_query:texttextonly_\n";
        } else {
            $field_entry .= "<$shortfield>$longfield\n";
        }
    }
    print $handle $field_entry;

    # now add the level names
    my $level_entry = "";
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        # collectionmeta is looked up under the name as it stands in the
        # configuration; the lower-casing is done on a copy so the
        # configuration entry is not modified through the loop alias
        (my $lowered = $level) =~ tr/A-Z/a-z/;
        my $levelid = $level_map{$lowered}; # the actual value we used in the index
        # a level without a tag would produce a malformed "<>" entry
        next if !defined $levelid;
        next if $has_collmeta->($level);

        # use the default macro
        $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
    }
    print $handle $level_entry;

    # now add subcoll meta
    my $subcoll_entry = "";
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        next if $has_collmeta->($subcoll);
        $subcoll_entry .= "<$shortname>$subcoll\n";
    }
    print $handle $subcoll_entry;

    # now add language meta
    my $lang_entry = "";
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        next if $has_collmeta->($lang);
        $lang_entry .= "<$shortname>$lang\n";
    }
    print $handle "$lang_entry\n";

    $self->output_collection_meta_end($handle);
}
708
# At the end of building, the build processor holds an indexfieldmap with all
# the mappings (plus some extras) and the set of fields that were actually
# indexed. This sub turns them into an ordered list of indexed fields and a
# list of the "field->code" mappings that are used, and stores both in
# $self->{'build_cfg'} ('indexfields' and 'indexfieldmap') for the build.cfg
# file and the collection meta definition.
#
# Fields appear in the order of the index definition. The special entry
# 'metadata' expands to every indexed field that is not named explicitly in
# the definition; those are added in sorted order so the result does not
# depend on hash ordering and is the same from one build to the next.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    my @indexfieldmap = ();
    my @indexfields = ();

    # go through the index definition and note every field it names, so
    # that the 'metadata' expansion below does not add one of them again
    my $specifiedfields = {};
    my @specifiedfieldorder = ();
    foreach my $definition (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $definition;
        $parts =~ s/:.*$//;
        foreach my $f (split(';', $parts)) {
            next if defined $specifiedfields->{$f};
            $specifiedfields->{$f} = 1;
            push (@specifiedfieldorder, "$f");
        }
    }

    my $ifm = $self->{'buildproc'}->{'indexfieldmap'};

    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # add all the indexed fields that were not named explicitly
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $specifiedfields->{$newfield};
                push (@indexfieldmap, "$newfield\-\>$ifm->{$newfield}");
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text\-\>TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields\-\>ZZ");
            push (@indexfields, "allfields");
        } elsif (defined $ifm->{$field}) {
            # fields without a mapping are left out
            push (@indexfieldmap, "$field\-\>$ifm->{$field}");
            push (@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;

}
780
781
# Recreate the field lists from a build.cfg file.
#
# The file is looked for in the build directory first and then in
# $GSDLCOLLECTDIR/index. If neither exists the field lists cannot be
# produced (there is unlikely to be any index anyway) and the sub returns
# with an empty $self->{'build_cfg'}.
#
# Otherwise $self->{'build_cfg'} receives copies of the file's
# 'indexfields', 'indexfieldmap' and 'indexmap' lists, and every
# "field->code" entry of 'indexfieldmap' is also stored in the build
# processor's indexfieldmap.
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();
    my @indexmap = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if its there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");

    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we cant find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $field (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$field");
            my ($f, $v) = $field =~ /^(.*)\-\>(.*)$/;
            # an entry without "->" gives nothing to map; storing it would
            # put an undef value under an empty key in the field map
            next unless defined $f;
            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
        }
    }

    if (defined $buildcfg->{'indexmap'}) {
        foreach my $field (@{$buildcfg->{'indexmap'}}) {
            push (@indexmap, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    $self->{'build_cfg'}->{'indexmap'} = \@indexmap;
}
834
835
# Add the mgpp specific entries to the build configuration.
#
# Parameter: $build_cfg - hash ref that receives
#   numsections - section count reported by the build processor
#   indexlevels - level tags, in the configured level order
#   levelmap    - "level->tag" strings, in the same order
#   textlevel   - the section level tag
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my @levels = @{$self->{'levelorder'}};
    $build_cfg->{'indexlevels'} = [ map { $level_map{$_} } @levels ];
    $build_cfg->{'levelmap'} = [ map { "$_\-\>$level_map{$_}" } @levels ];

    # text level (and gdbm level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};

}
856
8571;
858
859
Note: See TracBrowser for help on using the repository browser.