source: main/trunk/greenstone2/perllib/collConfigxml.pm@ 29176

Last change on this file since 29176 was 29176, checked in by ak19, 10 years ago

For Solr: 1. collConfig.pm now reads any option sub-elements for index elements defined in collectionConfig.xml. 2. The solrfieldtype option sub-element to an index element can now be set by hand in collectionConfig.xml and its value will be used in the solr collection/etc/conf/schema.xml file for that index in that collection, instead of the old default of text_en_splitting for all fields.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.1 KB
RevLine 
[15600]1###########################################################################
2#
[20096]3# collConfigxml.pm --
[15600]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[14741]25
[15600]26# reads in configuration files of xml form
27
[20096]28package collConfigxml;
[15600]29use strict;
30no strict 'refs';
31no strict 'subs';
32
[23895]33use XMLParser;
[15600]34
# Translation table that resolves the naming differences between gs3 and gs2.
# Left-hand side:  the gs3 element name as it appears in collectionConfig.xml.
# Right-hand side: the internal (gs2) option name, used as the key in the
#                  structure built by this module.
my $nameMap = {
    key                  => 'value',
    creator              => 'creator',
    maintainer           => 'maintainer',
    public               => 'public',
    infodb               => 'infodbtype',
    defaultIndex         => 'defaultindex',
    defaultLevel         => 'defaultlevel',
    name                 => 'collectionname',
    description          => 'collectionextra',
    smallicon            => 'iconcollectionsmall',
    icon                 => 'iconcollection',
    level                => 'levels',
    classifier           => 'classify',
    indexSubcollection   => 'indexsubcollections',
    indexLanguage        => 'languages',
    defaultIndexLanguage => 'defaultlanguage',
    index                => 'indexes',
    indexfieldoptions    => 'indexfieldoptions',
    sort                 => 'sortfields',
    facet                => 'facetfields',
    plugin               => 'plugin',
    plugout              => 'plugout',
    indexOption          => 'indexoptions',
    searchType           => 'searchtype',
    languageMetadata     => 'languagemetadata',
    buildType            => 'buildtype',
    orthogonalBuildTypes => 'orthogonalbuildtypes',
};
# The hash structure that sub read_cfg_file returns.
my $data = {};

# Elements that open a block of repeated children (classifiers, plugins).
my $repeatedBlock = '^(browse|pluginList)$';

# The unique attribute values below are used to locate the text inside the
# elements creator, public, maintainer and inside a displayItem.
my $currentLocation  = "";
my $stringexp        = '^(creator|maintainer|public|buildType)$';
my $displayItemNames = '^(name|description)$';

# Options matching this pattern are set at the top level of the structure.
my $topleveloptionexp = '^(importOption|buildOption)$';

# Attributes remembered by the StartTag subroutine so that they are still
# available later on, in the Text (or EndTag) subroutines.
my $currentAttrRef = undef;

# Name of the enclosing element whose children are currently being handled.
my $currentLevel = "";

# Counts the elements with the same name within the same block
# ("plugin", "option").
my $currentIndex = 0;

# Elements matching structexp get a hashmap of option (name, value) pairs per
# index name such as allfields/ZZ or titles/TI, e.g.
#   <index name="allfields">
#     <displayItem ... />
#     <option name="solrfieldtype" value="text_ja" />
#   </index>
my $structexp = '^(index)$';

# Elements collected into a flat array of names. ("index" used to be part of
# this list; it is now covered by structexp above.)
my $arrayexp = '^(sort|facet|level|indexOption|indexSubcollection|indexLanguage|orthogonalBuildTypes)$';

# Elements collected into an array of arrays of strings ("buildOption" is
# deliberately not included here).
my $arrayarrayexp = '^(plugin|classifier)$';

# Add other element names that should be represented by hashes here.
my $hashexp = '^(subcollection)$';

# Add other (collectionmeta) element names that should be represented by
# hashes of hashes here.
my $hashhashexp = '^(displayItem)$';

# Elements whose name attribute supplies a default setting.
my $defaults = '^(defaultIndex|defaultLevel|defaultIndexLanguage|languageMetadata)$';
104
# Reads in the model collection configuration file, collectionConfig.xml,
# into a structure which complies with the one used by gs2 (i.e. one read
# in by &cfgread::read_cfg_file).
#
# Parameter:
#   $filename - path of the file to read; the name has to end in
#               "collectionConfig.xml" and refer to an existing plain file.
# Returns:
#   undef when $filename fails the checks above; otherwise a reference to
#   the file-scoped $data hash, filled in by the StartTag/EndTag/Text
#   handlers of this package. If the file cannot be opened a message is
#   written to STDERR and the (empty) hash is still returned.
#   A parse error raised by the parser is not caught here.
sub read_cfg_file {
    my ($filename) = @_;

    # $data is shared with the handlers, so start every read from scratch.
    $data = {};
    if ($filename !~ /collectionConfig\.xml$/ || !-f $filename) {
        # 'return undef' (rather than a bare return) is kept on purpose so
        # that existing callers see exactly the same value as before.
        return undef;
    }

    # Removed ProtocolEncoding (see MetadataXMLPlugin for details)

    # Create the XML::Parser object used for parsing collectionConfig.xml.
    # With the Stream style the parser calls StartTag, EndTag and Text in
    # the package named by 'Pkg'.
    my $parser = XML::Parser->new('Style'    => 'Stream',
                                  'Pkg'      => 'collConfigxml',
                                  'Handlers' => {'Char'    => \&Char,
                                                 'Doctype' => \&Doctype
                                  });

    # Probe readability with a lexical handle first. parsefile opens the
    # file itself, so the probe handle is closed again straight away.
    if (open(my $cfg_fh, '<', $filename)) {
        close($cfg_fh);
        $parser->parsefile($filename);
    }
    else {
        print STDERR "collConfigxml::read_cfg_file couldn't read the cfg file $filename: $!\n";
    }

    #&Display;
    return $data;
}
134
# Start-tag handler: called with the parser object and the name of the
# element being opened. The element's attributes are read from %_.
#
# Depending on the element (and on the block currently being processed, as
# tracked in $currentLevel) the attribute values are stored in $data under
# their gs2 names, or the file-scoped state ($currentLocation, $currentLevel,
# $currentIndex) is updated so that Text and EndTag can finish the job.
#
# Those branches marked with #@ are not expected to apply to the same
# element, which is why if/elsif is used to avoid unnecessary tests.
sub StartTag {
    my ($expat, $element) = @_;

    # See http://search.cpan.org/~msergeant/XML-Parser-2.36/Parser.pm#Stream
    # %_ is a hash of all the attributes of this element. We keep a reference
    # to it so the attributes can still be used when the textnode contents of
    # the element are handled in the subroutine Text.
    $currentAttrRef = \%_;

    my $name       = $_{'name'};
    my $value      = $_{'value'};
    my $type       = $_{'type'};
    my $orthogonal = $_{'orthogonal'};

    # for subcollections
    my $filter = $_{'filter'};

    # was this just a flax thing??
    my $assigned = $_{'assigned'};

    #@ Marking repeated block
    if ($element =~ /$repeatedBlock/) {
        $currentIndex = 0;
    }

    #@ handling block metadataList
    elsif (defined $name and $name =~ /$stringexp/) {
        $currentLocation = $name;
    }

    #@ handling default search index/level/indexLanguage and languageMetadata
    elsif ($element =~ /$defaults/) {
        if (defined $name and $name =~ /\w/) {
            $data->{$nameMap->{$element}} = $name;
        }
    }

    #@ handling the displayItems name and description (known as
    #@ collectionname and collectionextra in GS2)
    elsif ($element eq "displayItemList") {
        # storing the parent if it is displayItemList
        $currentLevel = "displayItemList";
    }
    elsif ($element =~ /$hashhashexp/) { # can expand on this to check for other collectionmeta elements
        # Either there is no "assigned" attribute, or assigned=true
        # (for displayItems).
        if (!defined $assigned || $assigned eq "true") {
            $currentLocation = $name;
        }
    }

    #@ Handling database type: gdbm or gdbm-txtgz, later jdbm.
    elsif ($element eq "infodb") {
        $data->{'infodbtype'} = $type;
    }

    #@ Handling indexer: mgpp/mg/lucene; stringexp
    #@ Handling orthogonal indexers: audioDB; arrayexp
    elsif ($element eq "search") {
        if ((defined $orthogonal) && ($orthogonal =~ m/^(true|on|1)$/i)) {
            push(@{$data->{'orthogonalbuildtypes'}}, $type);
        }
        else {
            $data->{'buildtype'} = $type;
        }
    }

    elsif ($element eq "store_metadata_coverage") {
        $data->{'store_metadata_coverage'} = $value;
    }

    #@ Handle sort|facet|level|indexOption|indexSubcollection|indexLanguage
    elsif ($element =~ /$arrayexp/) {
        my $key = $nameMap->{$element};
        if (!defined $data->{$key}) {
            $data->{$key} = [];
        }
        if (defined $name) {
            push(@{$data->{$key}}, $name);
        }
    }

    #@ Handle index, which can have options as children to be put in a map:
    #@ <option name="name" value="value" />
    elsif ($element =~ /$structexp/) {
        # Remember that we are inside an index so that its option children
        # are recognised by the next branch.
        $currentLevel = $element;

        # For GS2, 'indexes' should stay a plain array of index names.
        my $key = $nameMap->{$element}; # 'indexes'
        if (!defined $data->{$key}) {
            $data->{$key} = [];
        }
        if (defined $name) {
            push(@{$data->{$key}}, $name);
        }
    }

    #@ Handling the option elements in each index structure, if any, only for GS2
    elsif ($currentLevel =~ /$structexp/ && $element eq "option") {
        my $key = $nameMap->{$currentLevel."fieldoptions"}; # 'indexfieldoptions'

        # The last element of the 'indexes' array contains the name of the
        # index currently being processed, e.g. "allfields".
        my $indexKey  = $nameMap->{$currentLevel}; # 'indexes'
        my $indexes   = $data->{$indexKey};
        my $indexName = (defined $indexes && @{$indexes}) ? $indexes->[-1] : undef;

        if (!defined $data->{$key}) {
            $data->{$key} = {}; # 'indexfieldoptions' is a new hashmap
        }

        # Only record the option when it has a usable name and value AND
        # there is a named index to attach it to; without the last check the
        # option would be filed under an undefined key.
        if (defined $indexName
            and defined $name  and $name  =~ /\w/
            and defined $value and $value =~ /\w/) {
            if (!defined $data->{$key}->{$indexName}) {
                $data->{$key}->{$indexName} = {}; # indexfieldoptions -> allfields is a new hashmap
            }
            $data->{$key}->{$indexName}->{$name} = $value;
        }
    }

    # importOption and buildOption, just stored at top level, name=value,
    # as per gs2 version
    elsif ($element =~ /$topleveloptionexp/) {
        if (defined $name) {
            if (!defined $value) {
                # flag option, set to true
                $value = "true";
            }
            $data->{$name} = $value;
        }
    }

    #@ plugout options
    elsif ($element eq "plugout") {
        $currentLevel = "plugout";
        my $key = $nameMap->{$currentLevel};
        if (!defined $data->{$key}) {
            $data->{$key} = [];
        }
        if (defined $name and $name ne "") {
            push(@{$data->{$key}}, $name);
        }
        else {
            push(@{$data->{$key}}, "GreenstoneXMLPlugout");
        }
    }

    # NOTE: a second, independent if/elsif chain starts here. It is evaluated
    # for every element, including those already handled by a branch above.
    if ($currentLevel eq "plugout" and $element eq "option") {
        my $key = $nameMap->{$currentLevel};
        if (defined $name and $name ne "") {
            push(@{$data->{$key}}, $name);
        }
        if (defined $value and $value ne "") {
            push(@{$data->{$key}}, $value);
        }
    }

    #@ use hash of hash of strings: hashexp
    elsif ($element =~ /$hashexp/) {
        if (!defined $data->{$element}) {
            $data->{$element} = {};
        }
        if (defined $name and $name =~ /\w/) {
            if (defined $filter and $filter =~ /\w/) {
                $data->{$element}->{$name} = $filter;
            }
        }
    }

    #@ Handling each classifier/plugin element
    elsif ($element =~ /$arrayarrayexp/) {
        # find the gs2 mapping name
        $currentLevel = $element;
        my $key = $nameMap->{$element};

        # define an array of array of strings
        if (!defined $data->{$key}) {
            $data->{$key} = [];
        }

        # Push classifier/plugin name (e.g. AZList) into $data as the first
        # string of the entry at position $currentIndex.
        push(@{$data->{$key}->[$currentIndex]}, $name);
        if (defined $value and $value =~ /\w/) {
            # (A leftover debugging print of $value to STDOUT was removed.)
            push(@{$data->{$key}->[$currentIndex]}, $value);
        }
    }

    #@ Handling the option elements in each classifier/plugin element
    #@ (as the following strings)
    elsif ($currentLevel =~ /$arrayarrayexp/ and $element eq "option") {
        # find the gs2 mapping name for classifier and plugin
        my $key = $nameMap->{$currentLevel};

        if (defined $name and $name =~ /\w/) {
            push(@{$data->{$key}->[$currentIndex]}, $name);
        }
        if (defined $value and $value =~ /\w/) {
            push(@{$data->{$key}->[$currentIndex]}, $value);
        }
    }
}
351
# End-tag handler: called with the parser object and the name of the element
# being closed. Maintains the block-tracking state used by StartTag.
sub EndTag {
    my ($parser, $tag) = @_;

    # Closing one of these elements ends the current block: the per-block
    # counter and the remembered level are both cleared.
    # ("buildOptionList" is deliberately not part of this list.)
    my $blockClosers = q/^(browse|pluginList|displayItemList|indexOption)$/;

    if ($tag =~ m/$blockClosers/) {
        $currentIndex = 0;
        $currentLevel = "";
    }
    # $arrayarrayexp contains classifier|plugin: closing one of those moves
    # on to the next slot of the array of arrays.
    elsif ($tag =~ m/$arrayarrayexp/) {
        $currentIndex += 1;
    }
}
365
# Text handler: the text just parsed is available in $_. What happens to it
# is decided by $currentLocation (set in StartTag); nothing is done while
# $currentLocation is undefined.
sub Text {
    return unless defined $currentLocation;

    #@ Handling block metadataList (creator, maintainer, public)
    if ($currentLocation =~ /$stringexp/) {
        $data->{ $nameMap->{$currentLocation} } = $_;
        undef $currentLocation;
    }

    #@ Handling displayItem metadata that are children of displayItemList,
    # i.e. the collection's name and possibly its description
    # ('collectionextra' in GS2).
    elsif ($currentLevel eq "displayItemList" && $currentLocation =~ /$displayItemNames/) {
        my $itemName = $currentAttrRef->{'name'};
        my $language = $currentAttrRef->{'lang'};

        # This is how data->collectionmeta's language is set in Greenstone 2.
        # Need to be consistent, since export.pl accesses these values all in
        # the same way.
        $language = defined $language ? "[l=$language]" : 'default';

        # attribute name = 'name' || 'description'
        if (defined $itemName and $itemName =~ /$displayItemNames/) {
            # $nameMap turns 'name' into 'collectionname' and 'description'
            # into 'collectionextra'; the value stored is the text parsed.
            $data->{'collectionmeta'}->{ $nameMap->{$itemName} }->{$language} = $_;
        }
        undef $currentLocation;
    }

    #@ Handling searchtype: plain,form; arrayexp
    elsif ($currentLocation =~ /searchType/) {
        # map 'searchType' into 'searchtype'
        my $key = $nameMap->{$currentLocation};

        # split it by ','; only the first two pieces are looked at
        my ($plain, $form) = split(",", $_);

        $data->{$key} = [] if !defined $data->{$key};
        foreach my $kind ($plain, $form) {
            push(@{ $data->{$key} }, $kind) if (defined $kind and $kind =~ /\w/);
        }
    }
}
[15619]417
# This sub is for debugging purposes: it dumps the contents of the
# file-scoped $data structure. Takes no arguments. All output goes to STDERR
# so that it cannot get mixed into whatever the caller writes to STDOUT.
sub Display {
    # metadataList
    foreach my $k (keys %{$data}) {
        print STDERR "*** metadatalist key $k\n";
    }

    # Single-valued settings: printed only when present.
    my @scalarSettings = (
        ['creator',       'creator'],
        ['maintainer',    'maintainer'],
        ['public',        'public'],
        ['default index', 'defaultindex'],
        ['default level', 'defaultlevel'],
        ['build type',    'buildtype'],
    );
    foreach my $setting (@scalarSettings) {
        my ($label, $key) = @{$setting};
        print STDERR "*** $label: ".$data->{$key}."\n" if (defined $data->{$key});
    }

    # orthogonalbuildtypes is an array reference, so it has to be
    # dereferenced before joining.
    if (defined $data->{"orthogonalbuildtypes"}) {
        print STDERR "*** orthogonal build types: ".join(",", @{$data->{"orthogonalbuildtypes"}})."\n";
    }

    # List-valued settings: the heading is always printed, the values only
    # when present.
    my @listSettings = (
        ['search types',         'searchtype'],
        ['levels',               'levels'],
        ['index subcollections', 'indexsubcollections'],
        ['indexes',              'indexes'],
        ['index options',        'indexoptions'],
        ['languages',            'languages'],
    );
    foreach my $setting (@listSettings) {
        my ($label, $key) = @{$setting};
        print STDERR "*** $label: \n";
        print STDERR join(",", @{$data->{$key}})."\n" if (defined $data->{$key});
    }

    # languagemetadata is stored by StartTag as a plain string, not a list.
    print STDERR "*** language metadata: \n";
    print STDERR $data->{'languagemetadata'}."\n" if (defined $data->{'languagemetadata'});

    print STDERR "*** Plugins: \n";
    if (defined $data->{'plugin'}) {
        foreach my $plugin (@{$data->{'plugin'}}) {
            print STDERR join(",", @{$plugin})."\n";
        }
    }

    if (defined $data->{'classify'}) {
        print STDERR "*** Classifiers: \n";
        foreach my $classifier (@{$data->{'classify'}}) {
            print STDERR join(",", @{$classifier})."\n";
        }
    }

    if (defined $data->{'subcollection'}) {
        foreach my $key (keys %{$data->{'subcollection'}}) {
            print STDERR "subcollection ".$key." ".$data->{'subcollection'}->{$key}."\n";
        }
    }
}
# Doctype handler; it is registered with the parser in read_cfg_file.
# Dies, naming the offending document type, unless the document type name
# is CollectionConfig.
#
# Parameters: the parser object, the document type name, the system id, the
# public id and the internal subset. Only the name is looked at.
sub Doctype {
    my ($expat, $name, $sysid, $pubid, $internal) = @_;

    if (!defined $name || $name !~ /^CollectionConfig$/) {
        my $found = defined $name ? $name : '(undefined)';
        die "collConfigxml::Doctype: unexpected document type '$found', expected 'CollectionConfig'";
    }
}
480
# Replacement for the Char function in XML::Parser::Stream. It exists to
# overcome a problem where $expat->{Text} is treated as the return value,
# slowing things down significantly in some cases.
#
# Parameters: the parser object and the chunk of character data to append
# to its 'Text' buffer. Always returns undef.
sub Char {
    my ($expat, $chunk) = @_;

    if ($] < 5.008) {
        # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6
        # NOTE(review): 'use bytes' is lexically scoped, so as written it
        # covers only this block and not the append below - confirm intent.
        use bytes;
    }

    $expat->{'Text'} .= $chunk;
    return undef;
}
[15619]491
[15600]492
493
494
495#########################################################
496
4971;
Note: See TracBrowser for help on using the repository browser.