source: trunk/gsdl/perllib/cfgread4gs3.pm@ 14020

Last change on this file since 14020 was 14020, checked in by xiao, 17 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 20.7 KB
Line 
1###########################################################################
2#
3# cfgread4gs3.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# reads in configuration files of xml form
27
28package cfgread4gs3;
29use strict;
30no strict 'refs';
31no strict 'subs';
32use XML::Parser;
33
# A mapping hash to resolve name discrepancies between gs2 and gs3:
# gs3 element/metadata name => key used in the gs2-style hash.
my $nameMap = {
    'key'                  => 'value',
    'creator'              => 'creator',
    'maintainer'           => 'maintainer',
    'public'               => 'public',
    'defaultIndex'         => 'defaultindex',
    'defaultLevel'         => 'defaultlevel',
    'name'                 => 'collectionname',
    'description'          => 'collectionextra',
    'smallicon'            => 'iconcollectionsmall',
    'icon'                 => 'iconcollection',
    'level'                => 'levels',
    'classifier'           => 'classify',
    'indexSubcollection'   => 'indexsubcollections',
    'indexLanguage'        => 'languages',
    'defaultIndexLanguage' => 'defaultlanguage',
    'index'                => 'indexes',
    'plugin'               => 'plugin',
    'indexOption'          => 'indexoptions',
    'searchType'           => 'searchtype',
    'languageMetadata'     => 'languagemetadata',
};

# The hash structure that sub read_cfg_file fills in and returns.
my $data = {};

# Elements that enclose a list of classifier/plugin entries.
my $repeatedBlock = '^(browse|pluginList)$';

# The "name" attribute values (creator, maintainer, public) are used to
# locate the element whose text content has to be stored.
my $currentLocation = "";
my $stringexp       = '^(creator|maintainer|public)$';

# The kind of block currently being read (indexOption, plugin, classifier).
my $currentLevel = "";

# Position of the current plugin/classifier entry within its enclosing block.
my $currentIndex = 0;

# Elements collected into a flat list of names.
my $arrayexp = '^(index|level|indexSubcollection|indexLanguage)$';

# Elements collected into a list of lists (entry name followed by its options).
my $arrayarrayexp = '^(plugin|classifier)$';

# Elements whose "name" attribute is stored as a single string.
my $defaults = '^(defaultIndex|defaultLevel|defaultIndexLanguage|languageMetadata)$';
75
# Start-tag handler: records the information carried by an element's
# attributes in $data.
#
# Arguments: ($expat, $element) - the parser object and the element name.
# The attributes of the element are read from the global hash %_; the keys
# used here are "name", "value", "type" and "filter".
#
# The tests are organised as TWO independent if/elsif chains (the second one
# starts at the indexOption/option test).  An element handled by the first
# chain is therefore still offered to the second chain; this is how the
# handler has always behaved and it is kept that way on purpose.
sub StartTag {
    my ($expat, $element) = @_;

    my $name   = $_{'name'};
    my $value  = $_{'value'};
    my $type   = $_{'type'};
    my $filter = $_{'filter'};    # only used for subcollections

    # ---- first chain --------------------------------------------------

    # Start of a block of classifiers/plugins: restart the entry counter.
    if ($element =~ /$repeatedBlock/) {
        $currentIndex = 0;
    }
    # creator/maintainer/public: remember which one we are in so that Text
    # can store the element's text content under the right key.
    elsif (defined $name and $name =~ /$stringexp/) {
        $currentLocation = $name;
    }
    # Default search index/level/indexLanguage and languageMetadata are
    # stored as plain strings.
    elsif ($element =~ /$defaults/) {
        if (defined $name and $name =~ /\w/) {
            $data->{$nameMap->{$element}} = $name;
        }
    }
    # Indexer in use (mgpp/mg/lucene) comes from the "type" attribute.
    elsif ($element eq "search") {
        $data->{'buildtype'} = $type;
    }
    # (Handling of <format name="searchType"> is disabled; it used to set
    #  $currentLocation = $name so that Text could split "plain,form".)

    # index|level|indexSubcollection|indexLanguage: flat list of names.
    elsif ($element =~ /$arrayexp/) {
        my $key = $nameMap->{$element};
        if (!defined $data->{$key}) {
            $data->{$key} = [];
        }
        # An element without a name attribute contributes nothing, rather
        # than an undef entry in the list.
        push (@{$data->{$key}}, $name) if (defined $name);
    }
    # indexOption block (accentfold/casefold/stem): its <option> children
    # are picked up by the second chain.
    elsif ($element eq "indexOption") {
        $currentLevel = "indexOption";
    }

    # ---- second chain -------------------------------------------------

    # <option> inside <indexOption>: flat list of option names.
    if ($currentLevel eq "indexOption" and $element eq "option") {
        my $key = $nameMap->{$currentLevel};
        if (!defined $data->{$key}) {
            $data->{$key} = [];
        }
        push (@{$data->{$key}}, $name) if (defined $name);
    }
    # subcollection: hash of name => filter.
    elsif ($element eq "subcollection") {
        if (!defined $data->{'subcollection'}) {
            $data->{'subcollection'} = {};
        }
        if (defined $name and $name =~ /\w/
            and defined $filter and $filter =~ /\w/) {
            $data->{'subcollection'}->{$name} = $filter;
        }
    }
    # classifier/plugin element: starts a new list whose first string is the
    # classifier/plugin name (e.g. AZList).
    elsif ($element =~ /$arrayarrayexp/) {
        $currentLevel = $element;
        my $key = $nameMap->{$element};
        if (!defined $data->{$key}) {
            $data->{$key} = [];
        }
        # The name is pushed even when undefined so that it always occupies
        # the first slot of the entry.
        push (@{$data->{$key}->[$currentIndex]}, $name);
    }
    # <option> inside a classifier/plugin: appended to the current entry as
    # the option name followed by its value.
    elsif ($currentLevel =~ /$arrayarrayexp/ and $element eq "option") {
        my $key = $nameMap->{$currentLevel};
        if (defined $name and $name =~ /\w/) {
            push (@{$data->{$key}->[$currentIndex]}, $name);
        }
        if (defined $value and $value =~ /\w/) {
            push (@{$data->{$key}->[$currentIndex]}, $value);
        }
    }
}
182
# End-tag handler: keeps the module-level parser state in step with the
# element that has just closed.
#
# Arguments: ($expat, $element) - the parser object and the element name.
sub EndTag {
    my ($expat, $element) = @_;
    my $endTags = q/^(browse|pluginList)$/;

    # Once a tag has closed, a pending creator/maintainer/public location
    # is stale.  Without this reset an empty element (one for which Text was
    # never called) would leave the location set, and the next run of text
    # -- typically layout whitespace -- would be stored as its value.
    # NOTE(review): relies on the Stream style delivering an element's text
    # to Text before EndTag is called - confirm against XML::Parser.
    $currentLocation = "";

    if ($element =~ /$endTags/) {
        # end of a classifier/plugin block
        $currentIndex = 0;
        $currentLevel = "";
    }
    # $arrayarrayexp contains classifier|plugin: move on to the next entry
    elsif ($element =~ /$arrayarrayexp/) {
        $currentIndex = $currentIndex + 1;
    }
    elsif ($element eq "indexOption") {
        # Leave the indexOption block so that <option> elements appearing
        # later in the file are not recorded as index options.
        $currentLevel = "";
    }
}
196
# Text handler: the text to process is read from $_.  Depending on
# $currentLocation it is stored in $data either as a plain string
# (creator, maintainer, public) or as a list (searchType).
sub Text {
    # Block metadataList (creator, maintainer, public): store the text and
    # forget the location so it is only used once.
    if (defined $currentLocation and $currentLocation =~ /$stringexp/) {
        $data->{ $nameMap->{$currentLocation} } = $_;
        undef $currentLocation;
    }

    # searchType: a comma-separated pair such as "plain,form".
    if (defined $currentLocation and $currentLocation =~ /searchType/) {
        # 'searchType' is mapped to 'searchtype'
        my $list_key = $nameMap->{$currentLocation};
        my ($first_type, $second_type) = split (",", $_);

        $data->{$list_key} = [] if (!defined $data->{$list_key});

        foreach my $search_type ($first_type, $second_type) {
            if (defined $search_type and $search_type =~ /\w/) {
                push @{ $data->{$list_key} }, $search_type;
            }
        }
    }
}
# Debugging aid: prints the contents of $data to the currently selected
# output handle, one item per line.
sub Display {
    # values stored as plain strings
    foreach my $key ('creator', 'maintainer', 'public',
                     'defaultindex', 'defaultlevel', 'buildtype') {
        print $data->{$key}."\n" if (defined $data->{$key});
    }

    # values stored as lists of strings
    foreach my $key ('searchtype', 'levels', 'indexsubcollections',
                     'indexes', 'indexoptions', 'languages') {
        print join(",", @{$data->{$key}})."\n" if (defined $data->{$key});
    }

    # languagemetadata is stored by StartTag as a single string, not a list;
    # dereferencing it as an array would silently print an empty line.
    print $data->{'languagemetadata'}."\n" if (defined $data->{'languagemetadata'});

    # plugins: one line per plugin, name followed by its options
    if (defined $data->{'plugin'}) {
        foreach my $plugin (@{$data->{'plugin'}}) {
            print join(",", @$plugin);
            print "\n";
        }
    }

    # classifiers: same layout as plugins
    if (defined $data->{'classify'}) {
        print "Classifiers: \n";
        foreach my $classifier (@{$data->{'classify'}}) {
            print join(",", @$classifier)."\n";
        }
    }

    if (defined $data->{'subcollection'}) {
        foreach my $key (keys %{$data->{'subcollection'}}) {
            print "subcollection ".$key." ".$data->{'subcollection'}->{$key}."\n";
        }
    }
}
# Doctype handler: rejects any document whose DOCTYPE name is not
# "DirectoryMetadata" or "GreenstoneDirectoryMetadata".
#
# Arguments: ($expat, $name, $sysid, $pubid, $internal); only $name is used.
# Dies (with the offending name in the message) when the name is not accepted.
#
# NOTE(review): the accepted names look inherited from the metadata.xml
# reader; a collectionConfig.xml that declares its own DOCTYPE would be
# rejected here - confirm this is intended.
sub Doctype {
    my ($expat, $name, $sysid, $pubid, $internal) = @_;

    # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
    # to be processed as well as the "DirectoryMetadata" files which should now
    # be created by import.pl
    if (!defined $name || $name !~ /^(Greenstone)?DirectoryMetadata$/) {
        my $shown = defined $name ? $name : "";
        die "cfgread4gs3::Doctype: unexpected document type '$shown'";
    }
}
266
# Replacement for the Char handler of XML::Parser::Stream.  According to
# the original authors the stock handler ends up treating $expat->{Text} as
# its return value, which slows things down significantly in some cases;
# this version appends the new characters and returns undef instead.
#
# Arguments: ($expat, $characters).
sub Char {
    my ($expat, $characters) = @_;

    if ($] < 5.008) {
        # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6
        # NOTE(review): "use bytes" is lexically scoped, so placed inside this
        # block it presumably does not affect the append below - confirm.
        use bytes;
    }

    $expat->{'Text'} .= $characters;
    return undef;
}
# Reads in the model collection configuration file, collectionConfig.xml,
# into a structure which complies with the one used by gs2 (i.e. one read
# in by &cfgread::read_cfg_file).
#
# Argument: $filename - path of the file to read.
# Returns undef when $filename does not end in "collectionConfig.xml" or is
# not a plain file; otherwise a reference to the hash filled in by the
# StartTag/EndTag/Text handlers (left empty, after a message on STDERR, when
# the file cannot be opened for reading).
sub read_cfg_file {
    my ($filename) = @_;

    # Start every read from a clean slate: the result hash and the parser
    # state are module-level variables and would otherwise carry over from
    # a previous (possibly aborted) parse.
    $data = {};
    $currentLocation = "";
    $currentLevel = "";
    $currentIndex = 0;

    if ($filename !~ /collectionConfig\.xml$/ || !-f $filename) {
        return undef;
    }

    # create XML::Parser object for parsing collectionConfig.xml
    my %parser_args = ('Style' => 'Stream',
                       'Handlers' => {'Char' => \&Char,
                                      'Doctype' => \&Doctype
                                     });
    if ($] >= 5.008) {
        # Perl 5.8 and later; Perl 5.6 is given no protocol encoding
        $parser_args{'ProtocolEncoding'} = 'ISO-8859-1';
    }
    my $parser = XML::Parser->new(%parser_args);

    # Probe readability first so that an unreadable file produces a message
    # rather than an exception from the parser.  The three-argument open
    # with a lexical handle keeps odd characters in the file name from
    # being interpreted as an open mode and avoids a global bareword handle.
    my $probe;
    if (!open ($probe, '<', $filename)) {
        print STDERR "cfgread::read_cfg_file couldn't read the cfg file $filename\n";
    } else {
        close ($probe);
        $parser->parsefile ($filename);
    }

    #&Display; print "***********";
    return $data;
}
316
317
# Writes one line to $filehandle: the strings in the array referenced by
# $line, concatenated without separators, followed by a newline.
sub write_line {
    my ($filehandle, $line) = @_;
    my $text = join ("", @$line);
    print $filehandle $text, "\n";
}
322
# Splits a list of "name->shortname" strings (as found in build.cfg maps)
# into a list of [name, shortname] pairs, in the original order.  Entries
# that are undefined or do not contain "->" are skipped.
sub _parse_map_entries {
    my ($entries) = @_;
    my @pairs = ();
    foreach my $entry (@$entries) {
        next unless (defined $entry);
        my ($name, $shortname) = $entry =~ /^(.*)\-\>(.*)$/;
        next unless (defined $name);
        push @pairs, [$name, $shortname];
    }
    return @pairs;
}

# Writes the gs3 build configuration file (a <buildConfig> XML document).
#
# Arguments:
#   $buildoutfile - path of the file to write
#   $buildcfg     - hash ref of build information (buildtype, numdocs,
#                   indexstem, indexmap/indexfieldmap, levelmap, searchtype,
#                   languagemap, subcollectionmap, stemindexes, maxnumeric)
#   $collectcfg   - hash ref of collection configuration; only defaultindex,
#                   defaultlevel, defaultlanguage and classify are used
#
# Prints a message on STDERR and dies when the output file cannot be opened.
sub write_cfg_file {
    my ($buildoutfile, $buildcfg, $collectcfg) = @_;

    # Three-argument open with a lexical handle: the file name can never be
    # taken for an open mode and no global bareword handle is shared with
    # other subs.
    my $out;
    if (!open ($out, '>', $buildoutfile)) {
        print STDERR "cfgread::write_cfg_file couldn't write the cfg file $buildoutfile\n";
        die;
    }
    # every output line goes through write_line
    my $emit = sub { &write_line($out, [@_]); };

    $emit->("<buildConfig xmlns:gsf=\"http://www.greenstone.org/greenstone3/schema/ConfigFormat\">");

    # output building metadata to build config file
    my $buildtype = "mgpp";
    if (defined $buildcfg->{"buildtype"}) {
        $buildtype = $buildcfg->{"buildtype"};
    }
    my $numdocs = "";
    if (defined $buildcfg->{"numdocs"}) {
        $numdocs = $buildcfg->{"numdocs"};
    }
    $emit->("<metadataList>");
    $emit->("<metadata name=\"numDocs\">", $numdocs, "</metadata>");
    $emit->("<metadata name=\"buildType\">", $buildtype, "</metadata>");
    $emit->("</metadataList>");

    my $service_type = "MGPP";
    if ($buildtype eq "mg") {
        $service_type = "MG";
    } elsif ($buildtype eq "lucene") {
        $service_type = "Lucene";
    }

    # output serviceRackList
    $emit->("<serviceRackList>");

    # do the search service
    $emit->("<serviceRack name=\"GS2", $service_type, "Search\">");
    if (defined $buildcfg->{'indexstem'}) {
        $emit->("<indexStem name=\"", $buildcfg->{'indexstem'}, "\" />");
    }

    # indexes
    my $indexmap = {};     # maps index name to shortname
    my @indexlist = ();    # keeps the order for indexes
    my $defaultindex = ""; # shortname of the default index

    my $maptype = "indexfieldmap";
    if ($buildtype eq "mg") {
        $maptype = "indexmap";
    }
    if (defined $buildcfg->{$maptype}) {
        my @index_pairs = &_parse_map_entries($buildcfg->{$maptype});
        foreach my $pair (@index_pairs) {
            $indexmap->{$pair->[0]} = $pair->[1];
            push @indexlist, $pair->[0];
        }
        # the first index is the default ...
        if (@index_pairs) {
            $defaultindex = $index_pairs[0]->[1];
        }
        # ... unless the user has assigned a default index that was built;
        # a user default missing from the map keeps the first index.
        my $user_index = $collectcfg->{"defaultindex"};
        if (defined $user_index && defined $indexmap->{$user_index}) {
            $defaultindex = $indexmap->{$user_index};
        }
    } else {
        print STDERR "$maptype not defined";
    }

    # for each index in indexList, write them out
    $emit->("<indexList>");
    foreach my $i (@indexlist) {
        $emit->("<index name=\"", $i, "\" ", "shortname=\"", $indexmap->{$i}, "\" />");
    }
    $emit->("</indexList>");

    # do default index only for mg
    if ($buildtype eq "mg") {
        $emit->("<defaultIndex shortname=\"", $defaultindex, "\" />");
    }

    # do indexOptionList
    if ($buildtype eq "mg" || $buildtype eq "mgpp") {
        $emit->("<indexOptionList>");
        my $stemindexes = 3; # default is stem and casefold
        if (defined $buildcfg->{'stemindexes'} && $buildcfg->{'stemindexes'} =~ /^\d+$/ ) {
            $stemindexes = $buildcfg->{'stemindexes'};
        }
        $emit->("<indexOption name=\"stemIndexes\" value=\"", $stemindexes, "\" />");

        my $maxnumeric = 4; # default
        if (defined $buildcfg->{'maxnumeric'} && $buildcfg->{'maxnumeric'} =~ /^\d+$/) {
            $maxnumeric = $buildcfg->{'maxnumeric'};
        }
        $emit->("<indexOption name=\"maxnumeric\" value=\"", $maxnumeric, "\" />");
        $emit->("</indexOptionList>");
    }

    # levelList
    my $levelmap = {};
    my @levellist = ();
    my $default_search_level = "";
    my $default_retrieve_level = "Doc"; # this is defaultGDBMLevel (also for the retrieve service)
    if ($buildtype eq "mgpp" || $buildtype eq "lucene") {
        if (defined $buildcfg->{'levelmap'}) {
            my @level_pairs = &_parse_map_entries($buildcfg->{'levelmap'});
            foreach my $pair (@level_pairs) {
                $levelmap->{$pair->[0]} = $pair->[1];
                push @levellist, $pair->[0];
            }
            if (@level_pairs) {
                $default_search_level = $level_pairs[0]->[1];
            }
        }
        # now if the user has assigned a default level that was built, use
        # it; an unknown level leaves the defaults above untouched
        my $user_level = $collectcfg->{"defaultlevel"};
        if (defined $user_level && defined $levelmap->{$user_level}) {
            $default_search_level = $levelmap->{$user_level};
            $default_retrieve_level = $default_search_level;
        }
    }
    # for each level in levelList, write them out
    if ($buildtype ne "mg") {
        $emit->("<levelList>");
        foreach my $lv (@levellist) {
            $emit->("<level name=\"", $lv, "\" shortname=\"", $levelmap->{$lv}, "\" />");
        }
        $emit->("</levelList>");
    }
    # add in defaultLevel as the same level as indexLevelList, making the reading job easier
    if ($buildtype eq "lucene" || $buildtype eq "mgpp") {
        $emit->("<defaultLevel shortname=\"", $default_search_level, "\" />");
    }
    $emit->("<defaultGDBMLevel shortname=\"", $default_retrieve_level, "\" />");

    # do searchTypeList
    if ($buildtype eq "mgpp" || $buildtype eq "lucene") {
        $emit->("<searchTypeList>");
        if (defined $buildcfg->{"searchtype"}) {
            foreach my $s (@{$buildcfg->{"searchtype"}}) {
                $emit->("<searchType name=\"", $s, "\" />");
            }
        } else {
            $emit->("<searchType name=\"plain\" />");
            $emit->("<searchType name=\"form\" />");
        }
        $emit->("</searchTypeList>");
    }

    # do indexLanguageList [in collect.cfg: languages; in build.cfg: languagemap]
    my $default_lang = "";
    my $default_lang_short = "";
    if (defined $buildcfg->{"languagemap"}) {
        $emit->("<indexLanguageList>");

        my %lang_short = ();
        my @lang_pairs = &_parse_map_entries($buildcfg->{"languagemap"});
        foreach my $pair (@lang_pairs) {
            $emit->("<indexLanguage name=\"", $pair->[0], "\" shortname=\"", $pair->[1], "\" />");
            $lang_short{$pair->[0]} = $pair->[1];
        }
        if (@lang_pairs) {
            $default_lang = $lang_pairs[0]->[0];       # name
            $default_lang_short = $lang_pairs[0]->[1]; # shortname
        }

        $emit->("</indexLanguageList>");
        # now if the user has assigned a default language (as "en", "ru" etc.)
        if (defined $collectcfg->{"defaultlanguage"}) {
            $default_lang = $collectcfg->{"defaultlanguage"};
            # keep the shortname in step with the chosen language instead of
            # leaving it at the shortname of the first language in the map
            if (defined $lang_short{$default_lang}) {
                $default_lang_short = $lang_short{$default_lang};
            }
        }
        $emit->("<defaultIndexLanguage name=\"", $default_lang, "\" shortname=\"", $default_lang_short, "\" />");
    }

    # do indexSubcollectionList
    my $default_subcol = ""; # declared here because the retrieve service needs it too
    if (defined $buildcfg->{'subcollectionmap'}) {
        $emit->("<indexSubcollectionList>");
        my $subcolmap = {};
        my @subcollist = ();
        my @subcol_pairs = &_parse_map_entries($buildcfg->{'subcollectionmap'});
        foreach my $pair (@subcol_pairs) {
            $subcolmap->{$pair->[0]} = $pair->[1];
            push @subcollist, $pair->[0];
        }
        if (@subcol_pairs) {
            $default_subcol = $subcol_pairs[0]->[1];
        }
        foreach my $sl (@subcollist) {
            $emit->("<indexSubcollection name=\"", $sl, "\" shortname=\"", $subcolmap->{$sl}, "\" />");
        }

        $emit->("</indexSubcollectionList>");
        $emit->("<defaultIndexSubcollection shortname=\"", $default_subcol, "\" />");
    }

    # close off search service
    $emit->("</serviceRack>");

    # do the retrieve service
    $emit->("<serviceRack name=\"GS2", $service_type, "Retrieve\">");

    # do default index
    if (defined $buildcfg->{"languagemap"}) {
        # NOTE(review): the attribute is called shortname but receives the
        # language name ($default_lang), as it always has - confirm which
        # value the reader of this file expects before changing it.
        $emit->("<defaultIndexLanguage shortname=\"", $default_lang, "\" />");
    }
    if (defined $buildcfg->{'subcollectionmap'}) {
        $emit->("<defaultIndexSubcollection shortname=\"", $default_subcol, "\" />");
    }
    if ($buildtype eq "mg") {
        $emit->("<defaultIndex shortname=\"", $defaultindex, "\" />");
    }

    if (defined $buildcfg->{'indexstem'}) {
        $emit->("<indexStem name=\"", $buildcfg->{'indexstem'}, "\" />");
    }
    if ($buildtype eq "mgpp" || $buildtype eq "lucene") {
        $emit->("<defaultLevel shortname=\"", $default_retrieve_level, "\" />");
    }
    $emit->("</serviceRack>");

    # do the browse service
    my $count = 1;
    my $phind = 0;
    my $started_classifiers = 0;

    my $classifiers = $collectcfg->{"classify"};
    $classifiers = [] unless (defined $classifiers);
    foreach my $cl (@$classifiers) {
        # the counter advances for every classifier, phind included
        my $name = "CL$count";
        $count++;
        my $classname = $cl->[0];
        $classname = "" unless (defined $classname);
        if ($classname =~ /^phind$/i) {
            $phind = 1;
            #should add it into coll config classifiers
            next;
        }

        if (not $started_classifiers) {
            $emit->("<serviceRack name=\"GS2Browse\">");
            if (defined $buildcfg->{'indexstem'}) {
                $emit->("<indexStem name=\"", $buildcfg->{'indexstem'}, "\" />");
            }
            $emit->("<classifierList>");
            $started_classifiers = 1;
        }

        my $content = ''; #use buttonname first, then metadata
        if ($classname eq "DateList") {
            $content = "Date";
        } else {
            for (my $j = 0; $j < scalar(@$cl); $j++) {
                my $arg = $cl->[$j];
                next unless (defined $arg);
                # an option given as the very last argument has no value;
                # it is ignored instead of reading past the end of the list
                my $argvalue = $cl->[$j+1];
                if ($arg eq "-buttonname") {
                    if (defined $argvalue) {
                        $content = $argvalue;
                    }
                    last;
                } elsif ($arg eq "-metadata") {
                    if (defined $argvalue) {
                        $content = $argvalue;
                    }
                }
            }
        }
        $emit->("<classifier name=\"", $name, "\" content=\"", $content, "\" />");
    }
    if ($started_classifiers) {
        # end the classifiers
        $emit->("</classifierList>");
        # close off the Browse service
        $emit->("</serviceRack>");
    }

    # the phind classifier is a separate service
    if ($phind) {
        $emit->("<serviceRack name=\"PhindPhraseBrowse\" />");
    }

    $emit->("</serviceRackList>");
    $emit->("</buildConfig>");

    # buffered write errors only show up when the handle is closed
    if (!close ($out)) {
        print STDERR "cfgread::write_cfg_file couldn't finish writing the cfg file $buildoutfile\n";
    }
}
629
630
631#########################################################
632
6331;
Note: See TracBrowser for help on using the repository browser.