source: main/trunk/greenstone2/perllib/mgppbuilder.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago

Moving the critical file-related functions (copy, rm, etc.) out of util.pm into their own proper class, FileUtils. Use of the old functions in util.pm will prompt deprecation warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering support for other filesystems (HTTP, HDFS, WebDAV, etc.). Updated some key files to use the new functions so that no deprecation messages are thrown when importing/building the demo collection 'out of the box'.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.5 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use basebuilder;
29use colcfg;
30use plugin;
31use strict; no strict 'refs';
32use util;
33use FileUtils;
34
35
# Establish the inheritance chain at compile time: mgppbuilder is a
# specialisation of basebuilder.
BEGIN {
    @mgppbuilder::ISA = qw(basebuilder);
}
39
40
41
# Two lookups share this table:
#   long level name (as written in the collection configuration) -> short
#     level tag handed to the mgpp tools and recorded in build.cfg
#   short level tag -> default display macro used for the collection metadata
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
    'Doc'       => '_textdocument_',
    'Sec'       => '_textsection_',
    'Para'      => '_textparagraph_',
);

# File suffixes that belong to a finished index; build_index treats every
# other suffix found in the index directory as unwanted.
our %wanted_index_files = map { $_ => 1 } qw(
    td t tl ti
    idb ib1 ib2 ib3 ib4 ib5 ib6 ib7
    i il
    w wa
);


my $maxdocsize = $basebuilder::maxdocsize;
68
# Constructor.  Creates the underlying basebuilder object, re-blesses it into
# the requested class and records which index levels the collection asks for.
#
# Arguments: the class name, followed by whatever the basebuilder constructor
#            expects (passed through untouched).
# Returns:   the blessed builder object, with
#              {'levels'}     - hash of lower-cased level name => 1
#              {'levelorder'} - array ref of the same names in config order
#              {'buildtype'}  - always "mgpp"
sub new {
    my $class = shift(@_);

    # Explicit method-call syntax; the indirect-object form
    # ("new basebuilder (@_)") relies on parser heuristics to decide what
    # "new" and "basebuilder" mean.
    my $self = basebuilder->new(@_);
    $self = bless $self, $class;

    #$self->{'indexfieldmap'} = \%static_indexfield_map;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    # This has to be an array reference.  Assigning the empty list "()" to a
    # hash slot stores undef, which left 'levelorder' undefined whenever the
    # configured level list was present but empty.
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
            # $level aliases the configuration entry, so the configured
            # level names are lower-cased in place (kept as-is on purpose).
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'buildtype'} = "mgpp";

    return $self;
}
95
# Collapses the configured index specifications into the single combined
# index that mgpp uses.
#
# Each entry of $self->{'collect_cfg'}->{'indexes'} has a leading "ex."
# namespace stripped from its metadata names (only when "ex." is the sole
# namespace, so "ex.dc.Title" and "flex.Image" are left alone).  The entries
# are then joined with ';' (plus a trailing ';') and stored back as a
# one-element list.  Nothing happens when no indexes are configured.
sub generate_index_list {
    my $self = shift (@_);

    # sort out the indexes
    # indexes are specified with spaces, but we put them into one index
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    return unless defined $indexes;

    # work on a copy so the caller's original list is not modified
    my @indexes_copy = @$indexes;
    foreach my $spec (@indexes_copy) {
        # introduce a space after every separator, so that each metadata
        # name is preceded by either start-of-string or a space
        $spec =~ s/([,;])/$1 /g;
        # replace all <ex.> at start of metanames or <, ex.> when in a
        # comma separated list
        $spec =~ s/(^| )ex\.([^.,:]+)(,|;|$)/$1$2$3/g;
        # remove the space introduced after every separator.  This must
        # cover the same separators as the first substitution; matching ':'
        # instead of ';' here used to leave "A; B" behind.
        $spec =~ s/([,;]) /$1/g;
    }

    my $single_index = join(';', @indexes_copy) . ";";
    $self->{'collect_cfg'}->{'indexes'} = [ $single_index ];
}
116
# Works out which term-folding options (stemming, case folding, accent
# folding) the indexes are built with, after letting the parent class do its
# own option handling.
#
# Sets $self->{'stem'}, {'casefold'} and {'accentfold'} to 0 or 1.  With no
# 'indexoptions' configured all three are on; otherwise an option switches on
# the first of stem / casefold / accentfold whose name it contains.
# $self->{'stemindexes'} receives the bitmask recorded for the build cfg:
# casefold = 1, stem = 2, accentfold = 4.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    # tested in this order; the first name an option matches wins
    my @option_names = ('stem', 'casefold', 'accentfold');
    my %enabled = map { $_ => 0 } @option_names;

    my $configured = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined($configured)) {
        foreach my $option (@$configured) {
            foreach my $name (@option_names) {
                if ($option =~ /$name/) {
                    $enabled{$name} = 1;
                    last;
                }
            }
        }
    } else {
        # just use default options
        %enabled = map { $_ => 1 } @option_names;
    }

    $self->{'casefold'} = $enabled{'casefold'};
    $self->{'stem'} = $enabled{'stem'};
    $self->{'accentfold'} = $enabled{'accentfold'};

    # now we record this for the build cfg
    $self->{'stemindexes'} = 0;
    $self->{'stemindexes'} += 1 if $self->{'casefold'};
    $self->{'stemindexes'} += 2 if $self->{'stem'};
    $self->{'stemindexes'} += 4 if $self->{'accentfold'};
}
156
# Returns the name of the build processor class used by this builder.
sub default_buildproc {
    my ($self) = @_;

    return 'mgppbuildproc';
}
162
# Creates the compressed text for the collection.
#
# Runs the documents through the build processor twice, piping its output to
# mgpp_passes: first with -T1 to collect text statistics, then (after
# mgpp_compression_dict has built the dictionary) with -T2 to compress.
# Does nothing when $self->{'no_text'} is set.
#
# Arguments: $textindex - index specification handed to the build processor
#                         through set_index().
# In debug mode no external program is run: the processor output of both
# passes goes to STDOUT and the dictionary step is skipped.
# Dies if mgpp_passes or mgpp_compression_dict is missing from
# $GSDLHOME/bin/$GSDLOS, or if the pipe to mgpp_passes cannot be opened.
# When $self->{'gli'} is set, progress markup is written to STDERR.
sub compress_text {
    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &FileUtils::filenameConcatenate("text", $collect_tail);
    my $fulltextprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them

    # always use Doc and Sec levels
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        # The existence test uses the full path, but the command itself is
        # started by name (resolved by the shell).
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            # self-closed like every other FatalError element in this file
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    my $db_level = "section";

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second (compressing) pass
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
283
284
# Hook run once the indexes have been built: works out the final list of
# indexed fields.
sub post_build_indexes {
    my ($self) = @_;

    # define the final field lists
    $self->make_final_field_list();
}
291
# Creates directory names for each of the index descriptions.
#
# Arguments: $indexes - array ref of index specifications of the form
#                       "fields[:subcollection[:languages]]".
# Returns:   hash ref.  Empty when $indexes is empty; otherwise it holds
#              <index spec>            => directory name
#              'indexmap'              => { fields => short index name }
#              'subcollectionmap'      => { subcollection => short name }
#              'languagemap'           => { languages => short name }
#              'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                                      => array refs giving first-seen order
#            plus direct fields/subcollection/languages => short name
#            entries, which are used for the collection metadata later on.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    return \%mapping if !(scalar @$indexes);

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            # The name is already taken by this very specification (it was
            # listed more than once): reuse its directory.  make_unique has
            # nothing to change in that case, so looping on would never end.
            last if $dirnames{$dirname} eq $index;
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$fields";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
363
# Resolves a directory-name collision for an index specification.
#
# Arguments: $namehash - hash ref with 'index', 'subcollection' and
#                        'languages' sub-hashes mapping the short names
#                        already handed out to the full names they stand for
#            $index    - the "fields:subcollection:languages" specification
#            $indexref, $subref, $langref
#                      - references to the current short names; one of them
#                        may be advanced to its next version in place
# Returns:   the three (possibly updated) short names concatenated.
#
# Only the first part whose short name is recorded for a different full name
# is advanced; if none qualifies the names come back unchanged.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    # [ section of $namehash, reference to the short name, full name wanted ]
    my @parts = (
        [ 'index',         $indexref, "$fields"      ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $part (@parts) {
        my ($section, $shortref, $fullname) = @$part;
        if ($namehash->{$section}->{$$shortref} ne $fullname) {
            $self->get_next_version ($shortref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
378
379
# Builds one mgpp index.
#
# Arguments: $index - index specification "fields[:subcollections[:languages]]";
#                     its directory comes from $self->{'index_mapping'}.
#
# Steps (external tools are skipped in debug mode, where the build processor
# output goes to STDOUT instead):
#   1. mgpp_passes -I1       build the index dictionary
#   2. mgpp_perf_hash_build  create the perfect hash function
#   3. mgpp_passes -I2       invert the text
#   4. mgpp_weights_build, mgpp_invf_dict, mgpp_stem_idx
#   5. report index-directory files whose suffix is not wanted
#
# If step 1 leaves no "<prefix>.id" dictionary file, the index is recorded in
# $self->{'notbuilt'} and the method returns early.
# Dies if a required executable is missing from $GSDLHOME/bin/$GSDLOS, if a
# pipe to mgpp_passes cannot be opened, or if the index directory cannot be
# read.  When $self->{'gli'} is set, progress markup is written to STDERR.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &FileUtils::filenameConcatenate($self->{'build_dir'},
                                                          $indexdir,
                                                          $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &FileUtils::filenameConcatenate($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "-J ". $level_map{"document"} ." -K " . $level_map{"section"} ." ";
    if ($self->{'levels'}->{'paragraph'}) {
        $mgpp_passes_sections .= "-K " . $level_map{'paragraph'}. " ";
    }

    my $mgpp_perf_hash_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &FileUtils::filenameConcatenate($exedir, "mgpp_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll_name (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    # Each entry is passed on exactly as written, including any leading "!".
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # db_level is always section
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->set_db_level ($db_level);

    $self->{'buildproc'}->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe -M $maxnumeric $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }

        # runs mgpp_stem_idx for one stem method number (-s<method>) and
        # returns what system() returned
        my $run_stem_idx = sub {
            my ($method) = @_;
            return system ("mgpp_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        };

        my $accent_folding_enabled = 1;
        if ($self->{'accentfold'}) {
            # the first time we do this, we test for accent folding enabled
            my $status = $run_stem_idx->(4);
            # system() returns the raw wait status; a program's exit code is
            # that value shifted right by eight bits.  The raw comparison
            # with 2 is kept for compatibility, and an exit code of 2 is now
            # recognised as well.
            # NOTE(review): assumes mgpp_stem_idx signals "no accent folding"
            # with exit code 2 - confirm against the mgpp sources.
            if ($status == 2 || ($status != -1 && ($status >> 8) == 2)) {
                # accent folding has not been enabled in mgpp
                $accent_folding_enabled = 0;
                $self->{'stemindexes'} -= 4;
            }
        }
        my $with_accents = ($accent_folding_enabled && $self->{'accentfold'});
        if ($self->{'casefold'}) {
            $run_stem_idx->(1);
            $run_stem_idx->(5) if $with_accents;
        }
        if ($self->{'stem'}) {
            $run_stem_idx->(2);
            $run_stem_idx->(6) if $with_accents;
        }
        if ($self->{'casefold'} && $self->{'stem'}) {
            $run_stem_idx->(3);
            $run_stem_idx->(7) if $with_accents;
        }

        # remove unwanted files
        my $tmpdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir)
            or die "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # The actual removal is disabled (see the commented-out
                # call); the file is only reported at high verbosity.
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&FileUtils::filenameConcatenate($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
614
615
# Adds default display names for index fields, levels, subcollections and
# languages to the collection-level info database entry.
#
# Arguments: $collection_infodb - hash ref to fill in; each value stored is a
#                                 one-element array ref.
#
# A default is written only when the collection configuration supplies no
# 'collectionmeta' entry for ".<name>".  Index fields with no short name (or
# a short name of 1) and levels missing from %level_map are skipped, so no
# entry is ever written under an empty key.
sub get_collection_meta_indexes
{
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    # define the indexed field mapping if not already done so
    # (i.e. if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }

    # first do the collection meta stuff - everything without a dot
    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
    }

    # true when the collection configuration already names this item
    my $has_collmeta = sub {
        my ($name) = @_;
        return ($collmetadefined
                && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$name});
    };

    #add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the metadata name
    if (defined $self->{'build_cfg'}->{'indexfields'}) {
        foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
            my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
            # a field with no short name has nothing to be filed under
            next if !defined $shortfield;
            next if $shortfield eq 1;

            # we need to check if some coll meta has been defined - don't output
            # any that have
            next if $has_collmeta->(".$longfield");

            if ($longfield eq "allfields") {
                $collection_infodb->{$shortfield} = [ "_query:textallfields_" ];
            } elsif ($longfield eq "text") {
                $collection_infodb->{$shortfield} = [ "_query:texttextonly_" ];
            } else {
                $collection_infodb->{$shortfield} = [ $longfield ];
            }
        }
    }

    # now add the level names
    # ("|| []" keeps a missing level list from being created in the config)
    foreach my $level (@{ $self->{'collect_cfg'}->{'levels'} || [] }) {
        my $collmeta = ".$level"; # based on the specification as it stands now
        # $level aliases the configuration entry, so this lower-cases the
        # configured name in place (kept as-is on purpose)
        $level =~ tr/A-Z/a-z/; # make it lower case
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        next if !defined $levelid; # not a level this builder knows about
        if (!$has_collmeta->($collmeta)) {
            # use the default macro
            $collection_infodb->{$levelid} = [ $level_map{$levelid} ];
        }
    }

    # now add subcoll meta
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (!$has_collmeta->(".$subcoll")) {
            $collection_infodb->{$shortname} = [ $subcoll ];
        }
    }

    # now add language meta
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        my $shortname = $self->{'index_mapping'}->{$lang};
        if (!$has_collmeta->(".$lang")) {
            $collection_infodb->{$shortname} = [ $lang ];
        }
    }
}
691
692
# Writes the collection-level entry of the info database: the metadata sets
# (prefixes) used in the collection plus the index-related display names.
#
# Arguments: $infodb_handle - handle passed straight through to
#                             dbutil::write_infodb_entry.
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_entry = {};
    $self->get_collection_meta_sets($collection_entry);
    $self->get_collection_meta_indexes($collection_entry);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_entry);
}
704
705
# At the end of building the build processor holds an indexfieldmap with all
# the long-name -> short-name mappings, plus the set of fields that were
# actually indexed.  This works out the ordered list of indexed fields and
# the mappings in use, and stores them in $self->{'build_cfg'} as
# 'indexfields' and 'indexfieldmap' (each only when non-empty).  They are
# used for the build.cfg file and for the collection meta definition.
#
# Order follows the index definition.  The pseudo-field "metadata" expands to
# every indexed field not named explicitly, in sorted order so that the
# result is the same on every run.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    my @indexfields = ();
    my $specifiedfields = {};
    my @specifiedfieldorder = ();

    # go through the index definition and add each thing to a map, so we
    # can easily check if it is already specified - when doing the
    # metadata, we print out all the individual fields, but some may
    # already be specified in the index definition, so we dont want to add
    # those again.
    foreach my $indexspec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $indexspec;
        $parts =~ s/:.*$//;
        foreach my $f (split(';', $parts)) {
            next if defined $specifiedfields->{$f};
            $specifiedfields->{$f} = 1;
            push (@specifiedfieldorder, "$f");
        }
    }

    #add all fields bit
    my $ifm = $self->{'buildproc'}->{'indexfieldmap'};

    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # hash key order differs from run to run, so sort to keep the
            # generated configuration stable
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $specifiedfields->{$newfield};
                push (@indexfieldmap, $newfield . "->" . $self->{'buildproc'}->{'indexfieldmap'}->{$newfield});
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text->TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields->ZZ");
            push (@indexfields, "allfields");
        } else {
            # we only add in the ones that have been processed
            if (defined $ifm->{$field}) {
                push (@indexfieldmap, $field . "->" . $ifm->{$field});
                push (@indexfields, "$field");
            }
        }
    }

    if (scalar @indexfieldmap) {
        $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    }

    if (scalar @indexfields) {
        $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    }
}
779
780
# Recreates the field lists from an existing build.cfg (as located by
# read_build_cfg).  If there is no build.cfg, we can't do the field list
# (there is unlikely to be any index anyway) and $self->{'build_cfg'} is
# left as an empty hash.
#
# Otherwise $self->{'build_cfg'} receives copies of the 'indexfieldmap',
# 'indexfields' and 'indexmap' lists (empty lists where the file has none),
# and every "long->short" line of the field map is also registered in the
# build processor's indexfieldmap.
sub read_final_field_list {
    my ($self) = @_;

    $self->{'build_cfg'} = {};

    # we read the stuff in from the build.cfg file - if its there
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    # stringified copy of one list from the file, or an empty list
    my $copy_list = sub {
        my ($key) = @_;
        return [] unless defined $buildcfg->{$key};
        return [ map { "$_" } @{$buildcfg->{$key}} ];
    };

    my $indexfields   = $copy_list->('indexfields');
    my $indexfieldmap = $copy_list->('indexfieldmap');
    my $indexmap      = $copy_list->('indexmap');

    foreach my $entry (@$indexfieldmap) {
        my ($longname, $shortname) = $entry =~ /^(.*)\-\>(.*)$/;
        $self->{'buildproc'}->{'indexfieldmap'}->{$longname} = $shortname;
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = $indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = $indexfields;
    $self->{'build_cfg'}->{'indexmap'} = $indexmap;
}
820
821
# Adds the mgpp-specific entries to the build configuration.
#
# Arguments: $build_cfg - hash ref that receives
#              'numsections' - section count reported by the build processor
#              'indexlevels' - short level tags, in level order
#              'levelmap'    - "long->short" strings, in level order
#              'textlevel'   - short tag of the section level
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();

    # store the level info
    my (@short_tags, @long_to_short);
    foreach my $level_name (@{$self->{'levelorder'}}) {
        my $short_tag = $level_map{$level_name};
        push (@short_tags, $short_tag);
        push (@long_to_short, $level_name . "->" . $level_map{$level_name});
    }
    $build_cfg->{'indexlevels'} = \@short_tags;
    $build_cfg->{'levelmap'} = \@long_to_short;

    # text level (and database level) is always section
    $build_cfg->{'textlevel'} = $level_map{'section'};
}
842
8431;
844
845
Note: See TracBrowser for help on using the repository browser.