source: main/trunk/greenstone2/perllib/collConfigxml.pm@ 32221

Last change on this file since 32221 was 29461, checked in by ak19, 9 years ago

Dr Bainbridge fixed a bug where import/build perl script arguments that are purely punctuation are discarded. This became a problem, and was noticed by a member on the mailing list, when setting the removesuffix option on the List classifier to remove all starting period marks. Since the regex param contained only punctuation characters, the regex param value was discarded by collConfigxml.pm, but needed to be preserved. Other elements' param values do not need this special handling.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.1 KB
Line 
1###########################################################################
2#
3# collConfigxml.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# reads in configuration files of xml form
27
28package collConfigxml;
29use strict;
30no strict 'refs';
31no strict 'subs';
32
33use XMLParser;
34
35# A mapping hash to resolve name discrepancy between gs2 and gs3.
36# the first item is the gs3 element name from collectionConfig, the second one
37# is the internal name for the option
38my $nameMap = {"key" => "value",
39 "creator" => "creator",
40 "maintainer" => "maintainer",
41 "public" => "public",
42 "infodb" => "infodbtype",
43 "defaultIndex" => "defaultindex",
44 "defaultLevel" => "defaultlevel",
45 "name" => "collectionname",
46 "description" => "collectionextra",
47 "smallicon" => "iconcollectionsmall",
48 "icon" => "iconcollection",
49 "level" => "levels",
50 "classifier" => "classify",
51 "indexSubcollection" => "indexsubcollections",
52 "indexLanguage" => "languages",
53 "defaultIndexLanguage" => "defaultlanguage",
54 "index" => "indexes",
55 "indexfieldoptions" => "indexfieldoptions",
56 "sort" => "sortfields",
57 "defaultSort" => "defaultsort",
58 "facet" => "facetfields",
59 "plugin" => "plugin",
60 "plugout" => "plugout",
61 "indexOption" => "indexoptions",
62 "searchType" => "searchtype",
63 "languageMetadata" => "languagemetadata",
64 "buildType" => "buildtype",
65 "orthogonalBuildTypes" => "orthogonalbuildtypes",
66 };
67# A hash structure which is returned by sub read_cfg_file.
68my $data = {};
69
70my $repeatedBlock = q/^(browse|pluginList)$/;
71
72# use those unique attribute values to locate the text within the elements
73# creator, public, maintainer and within a displayItem.
74my $currentLocation = "";
75my $stringexp = q/^(creator|maintainer|public|buildType)$/;
76my $displayItemNames = q/^(name|description)$/;
77
78# these options get set at top level
79my $topleveloptionexp = q/^(importOption|buildOption)$/;
80
81# For storing the attributes during the StartTag subroutine, so that
82# we can use it later in Text (or EndTag) subroutines
83my $currentAttrRef = undef;
84
85my $currentLevel = "";
86
87# Count the elements with same name within the same block
88# ("plugin", "option")
89my $currentIndex = 0;
90
91my $structexp = q/^(index)$/;
92# structexp contains a hashmap of option(name, value) pairs per index name like allfields/ZZ or titles/TI
93# e.g. <index name="allfields">
94# <displayItem ... />
95# <option name="solrfieldtype" value="text_ja" />
96# </index>
97
98my $arrayexp = q/^(sort|facet|level|indexOption|indexSubcollection|indexLanguage|orthogonalBuildTypes)$/;
99#my $arrayexp = q/^(index|sort|facet|level|indexOption|indexSubcollection|indexLanguage|orthogonalBuildTypes)$/;
100my $arrayarrayexp = q/^(plugin|classifier)$/; #|buildOption)$/;
101my $hashexp = q/^(subcollection)$/; # add other element names that should be represented by hash expressions here
102my $hashhashexp = q/^(displayItem)$/; # add other (collectionmeta) element names that should be represented by hashes of hashes here.
103
104my $defaults = q/^(defaultIndex|defaultLevel|defaultSort|defaultIndexLanguage|languageMetadata)$/;
105
106# Reads in the model collection configuration file, collectionConfig.xml,
107# into a structure which complies with the one used by gs2 (i.e. one read
108# in by &cfgread::read_cfg_file).
109sub read_cfg_file {
110 my ($filename) = @_;
111 $data = {};
112 if ($filename !~ /collectionConfig\.xml$/ || !-f $filename) {
113 return undef;
114 }
115
116 # Removed ProtocolEncoding (see MetadataXMLPlugin for details)
117
118 # create XML::Parser object for parsing metadata.xml files
119 my $parser = new XML::Parser('Style' => 'Stream',
120 'Pkg' => 'collConfigxml',
121 'Handlers' => {'Char' => \&Char,
122 'Doctype' => \&Doctype
123 });
124 if (!open (COLCFG, $filename)) {
125 print STDERR "cfgread::read_cfg_file couldn't read the cfg file $filename\n";
126 } else {
127
128 $parser->parsefile ($filename);# (COLCFG);
129 close (COLCFG);
130 }
131
132 #&Display;
133 return $data;
134}
135
136sub StartTag {
137# Those marked with #@ will not be executed at the same time when this sub is being called
138# so that if/elsif is used to avoid unnecessary tests
139 my ($expat, $element) = @_;
140
141 # See http://search.cpan.org/~msergeant/XML-Parser-2.36/Parser.pm#Stream
142 # %_ is a hash of all the attributes of this element, we want to store them so we can use the attributes
143 # when the textnode contents of the element are parsed in the subroutine Text (that's the handler for Text).
144 $currentAttrRef = \%_;
145
146 my $name = $_{'name'};
147 my $value = $_{'value'};
148 my $type = $_{'type'};
149 my $orthogonal = $_{'orthogonal'};
150
151 # for subcollections
152 my $filter = $_{'filter'};
153
154 # was this just a flax thing??
155 my $assigned = $_{'assigned'};
156
157 #@ Marking repeated block
158 if ($element =~ /$repeatedBlock/) {
159 $currentIndex = 0;
160 }
161
162 #@ handling block metadataList
163 elsif (defined $name and $name =~ /$stringexp/){
164 $currentLocation = $name;
165 }
166 #@ handling default search index/level/indexLanguage and languageMetadata
167 elsif ($element =~ /$defaults/) {
168 if (defined $name and $name =~ /\w/) {
169 $data->{$nameMap->{$element}} = $name;
170 }
171 }
172
173 #@ handling the displayItems name and description (known as collectionname and collectionextra in GS2)
174 elsif($element eq "displayItemList") {
175 $currentLevel = "displayItemList"; # storing the parent if it is displayItemList
176 }
177 elsif($element =~ /$hashhashexp/) { # can expand on this to check for other collectionmeta elements
178 if((!defined $assigned) || (defined $assigned and $assigned =~ /\w/ and $assigned eq "true")) {
179 # either when there is no "assigned" attribute, or when assigned=true (for displayItems):
180 $currentLocation = $name;
181 }
182 }
183
184 #@ Handling database type: gdbm or gdbm-txtgz, later jdbm.
185 elsif ($element eq "infodb") {
186 $data->{'infodbtype'} = $type;
187 }
188
189 #@ Handling indexer: mgpp/mg/lucene; stringexp
190 #@ Handling orthogonal indexers: audioDB; arrayexp
191 elsif ($element eq "search") {
192 if ((defined $orthogonal) && ($orthogonal =~ m/^(true|on|1)$/i)) {
193 push(@{$data->{'orthogonalbuildtypes'}},$type);
194 }
195 else {
196 $data->{'buildtype'} = $type;
197 }
198 }
199
200 elsif ($element eq "store_metadata_coverage")
201 {
202## print STDERR "*&*&*&*&*& HERE &*&*&*&*&*&*";
203 $data->{'store_metadata_coverage'} = $value;
204 }
205
206 #@ Handling searchtype: plain,form; arrayexp
207 #elsif ($element eq "format" and defined $name and $name =~ /searchType/) {
208 #@ Handling searchtype: plain, form
209 #$currentLocation = $name;
210 #}
211
212 #@ Handle sort|facet|level|indexOption|indexSubcollection|indexLanguage
213 elsif ($element =~ /$arrayexp/) {
214 my $key = $nameMap->{$element}; #
215 if (!defined $data->{$key}) {
216 $data->{$key} = [];
217 }
218
219 if (defined $name) {
220 push (@{$data->{$key}},$name);
221 }
222 }
223
224 #@ Handle index which can have options as children to be put in a map: <option name="name" value="value" />
225 elsif ($element =~ /$structexp/) {
226 # find the gs2 mapping name
227 $currentLevel = $element;
228
229 # for GS2, 'indexes' should be an arrayexp, so maintain that part of the code as it is
230 my $key = $nameMap->{$element}; # 'indexes'
231 if (!defined $data->{$key}) {
232 $data->{$key} = [];
233 }
234
235 if (defined $name) {
236 push (@{$data->{$key}},$name);
237 }
238 }
239
240 #@ Handling the option elements in each index structure, if any, only for GS2
241 elsif ($currentLevel =~ /$structexp/ && $element eq "option") {
242 # find the gs2 mapping name for classifier and plugin
243 my $key = $nameMap->{$currentLevel."fieldoptions"}; # my $key = $currentLevel."fieldoptions"; # indexfieldoptions
244
245 # The last element of the 'indexes' array contains the name of the index currently being processed
246 # e.g. "allfields"
247 my $indexKey = $nameMap->{$currentLevel}; # 'indexes'
248 my $arrSize = scalar( @{$data->{$indexKey}} ); # length of 'indexes' array
249 my $indexName = @{$data->{$indexKey}}[$arrSize-1]; # name of index currently being processed in prev elsif
250
251 if (!defined $data->{$key}) {
252 $data->{$key} = {}; # 'indexoptions' is a new hashmap
253 }
254 if (defined $name and $name =~ /\w/ && defined $value and $value =~ /\w/) {
255 # we have a name and value to this option, add them as options associated with the current index
256
257 if (!defined $data->{$key}->{$indexName}) {
258 $data->{$key}->{$indexName} = {}; # indexoptions -> allfields is a new hashmap
259 }
260
261 $data->{$key}->{$indexName}->{$name} = $value;
262
263 #print STDERR "@@@ Found: Value: data->{'indexfieldoptions'}->{$indexName}->{$name}: " . $data->{'indexfieldoptions'}->{$indexName}->{$name} . "\n";
264 }
265 }
266
267 # importOption and buildOption, just stored at top level, name=value,
268 # as per gs2 version
269 elsif ($element =~ /$topleveloptionexp/) {
270 if (defined $name) {
271 if (!defined $value) {
272 # flag option, set to true
273 $value = "true";
274 }
275 $data->{$name} = $value;
276 }
277 }
278
279 #@ plugout options
280 elsif ($element eq "plugout") {
281 $currentLevel = "plugout";
282 my $key = $nameMap->{$currentLevel};
283 if (!defined $data->{$key}) {
284 $data->{$key} = [];
285 }
286 if(defined $name and $name ne ""){
287 push (@{$data->{$key}},$name);
288 }
289 else{
290 push (@{$data->{$key}},"GreenstoneXMLPlugout");
291 }
292 }
293 if ($currentLevel eq "plugout" and $element eq "option") {
294 my $key = $nameMap->{$currentLevel};
295 if (defined $name and $name ne ""){
296 push (@{$data->{$key}},$name);
297 }
298 if (defined $value and $value ne ""){
299 push (@{$data->{$key}},$value);
300 }
301 }
302
303 #@ use hash of hash of strings: hashexp
304 elsif ($element =~ /$hashexp/) {
305 if (!defined $data->{$element}) {
306 $data->{$element} = {};
307 }
308 if (defined $name and $name =~ /\w/) {
309 if (defined $filter and $filter =~ /\w/) {
310 $data->{$element}->{$name} = $filter;
311
312 }
313 }
314 }
315
316 #@ Handling each classifier/plugin element
317 elsif ($element =~ /$arrayarrayexp/) {
318 # find the gs2 mapping name
319 $currentLevel = $element;
320 my $key = $nameMap->{$element};
321
322 # define an array of array of strings foreach $k (@{$data->{$key}}) {
323 if (!defined $data->{$key}) {
324 $data->{$key} = [];
325 }
326
327 # Push classifier/plugin name (e.g. AZList) into $data as the first string
328 push (@{$data->{$key}->[$currentIndex]},$name);
329 if (defined $value and $value =~ /\w/) {
330 push (@{$data->{$key}->[$currentIndex]}, $value);
331 print "$value\n";
332 }
333 #print $currentIndex."indexup\n";
334 }
335
336 #@ Handling the option elements in each classifier/plugin element (as the following strings)
337 elsif ($currentLevel =~ /$arrayarrayexp/ and $element eq "option") {
338 # find the gs2 mapping name for classifier and plugin
339 my $key = $nameMap->{$currentLevel};
340
341 if (defined $name and $name =~ /\w/) {
342 push (@{$data->{$key}->[$currentIndex]}, $name);
343 }
344 if (defined $value and $value !~ /^\s*$/) {
345 push (@{$data->{$key}->[$currentIndex]}, $value);
346 }
347
348 }
349
350
351}
352
353sub EndTag {
354 my ($expat, $element) = @_;
355 my $endTags = q/^(browse|pluginList|displayItemList|indexOption)$/; #|buildOptionList)$/;
356 if ($element =~ /$endTags/) {
357 $currentIndex = 0;
358 $currentLevel = "";
359 }
360
361 # $arrayarrayexp contains classifier|plugin
362 elsif($element =~ /$arrayarrayexp/ ){
363 $currentIndex = $currentIndex + 1;
364 }
365}
366
367sub Text {
368 if (defined $currentLocation) {
369 #@ Handling block metadataList(creator, maintainer, public)
370 if($currentLocation =~ /$stringexp/){
371 #print $currentLocation;
372 my $key = $nameMap->{$currentLocation};
373 $data->{$key} = $_;
374 undef $currentLocation;
375 }
376
377 #@ Handling displayItem metadata that are children of displayItemList
378 # that means we will be getting the collection's name and possibly description ('collectionextra' in GS2).
379 elsif($currentLevel eq "displayItemList" && $currentLocation =~ /$displayItemNames/) {
380 my $lang = $currentAttrRef->{'lang'};
381 my $name = $currentAttrRef->{'name'};
382
383 # this is how data->collectionmeta's language is set in Greenstone 2.
384 # Need to be consistent, since export.pl accesses these values all in the same way
385 if(!defined $lang) {
386 $lang = 'default';
387 } else {
388 $lang = "[l=$lang]";
389 }
390
391 if(defined $name and $name =~ /$displayItemNames/) { # attribute name = 'name' || 'description'
392 # using $nameMap->$name resolves to 'collectionname' if $name='name' and 'collectionextra' if $name='description'
393 $data->{'collectionmeta'}->{$nameMap->{$name}}->{$lang} = $_; # the value is the Text parsed
394 #print STDERR "***Found: $nameMap->{$name} collectionmeta, lang is $lang. Value: $data->{'collectionmeta'}->{$nameMap->{$name}}->{$lang}\n";
395 }
396 undef $currentLocation;
397 }
398
399 #@ Handling searchtype: plain,form; arrayexp
400 elsif (defined $currentLocation and $currentLocation =~ /searchType/) {
401 # map 'searchType' into 'searchtype'
402 my $key = $nameMap->{$currentLocation};
403 # split it by ','
404 my ($plain, $form) = split (",", $_);
405
406 if (!defined $data->{$key}) {
407 $data->{$key} = [];
408 }
409 if (defined $plain and $plain =~ /\w/) {
410 push @{ $data->{$key} }, $plain;
411 }
412 if (defined $form and $form =~ /\w/) {
413 push @{ $data->{$key} }, $form;
414 }
415 }
416 }
417}
418
419# This sub is for debugging purposes
420sub Display {
421 # metadataList
422 foreach my $k (keys %{$data}) {
423 print STDERR "*** metadatalist key $k\n";
424 }
425
426 print STDERR "*** creator: ".$data->{'creator'}."\n" if (defined $data->{'creator'});
427 print STDERR "*** maintainer: ".$data->{"maintainer"}."\n" if (defined $data->{"maintainer"});
428 print STDERR "*** public: ".$data->{"public"}."\n" if (defined $data->{"public"});
429 print STDERR "*** default index: ".$data->{"defaultindex"}."\n" if (defined $data->{"defaultindex"});
430 print STDERR "*** default level: ".$data->{"defaultlevel"}."\n" if (defined $data->{"defaultlevel"});
431 print STDERR "*** build type: ".$data->{"buildtype"}."\n" if (defined $data->{"buildtype"});
432 print STDERR "*** orthogonal build types: ".join(",",$data->{"orthogonalbuildtypes"})."\n" if (defined $data->{"orthogonalbuildtypes"});
433 print STDERR "*** search types: \n";
434 print STDERR join(",",@{$data->{"searchtype"}})."\n" if (defined $data->{"searchtype"});
435 print STDERR "*** levels: \n";
436 print STDERR join(",",@{$data->{'levels'}})."\n" if (defined $data->{'levels'});
437 print STDERR "*** index subcollections: \n";
438 print STDERR join(",",@{$data->{'indexsubcollections'}})."\n" if (defined $data->{'indexsubcollections'});
439 print STDERR "*** indexes: \n";
440 print STDERR join(",",@{$data->{'indexes'}})."\n" if (defined $data->{'indexes'});
441 print STDERR "*** index options: \n";
442 print STDERR join(",",@{$data->{'indexoptions'}})."\n" if (defined $data->{'indexoptions'});
443 print STDERR "*** languages: \n";
444 print STDERR join(",",@{$data->{'languages'}})."\n" if (defined $data->{'languages'});
445 print STDERR "*** language metadata: \n";
446 print STDERR join(",",@{$data->{'languagemetadata'}})."\n" if (defined $data->{'languagemetadata'});
447
448 print STDERR "*** Plugins: \n";
449 if (defined $data->{'plugin'}) {
450 foreach $a (@{$data->{'plugin'}}) {
451 print join(",",@$a);
452 print "\n";
453 }
454 }
455
456 #print STDERR "*** Build options: \n";
457 #if (defined $data->{'store_metadata_coverage'}) {
458 #foreach $a (@{$data->{'store_metadata_coverage'}}) {
459 # print join(",",@$a,@$_);
460 # print "\n";
461 #}
462 #}
463
464 if (defined $data->{'classify'}) {
465 print STDERR "*** Classifiers: \n";
466 map { print join(",",@$_)."\n"; } @{$data->{'classify'}};
467 }
468
469 if (defined $data->{'subcollection'}) {
470 foreach my $key (keys %{$data->{'subcollection'}}) {
471 print "subcollection ".$key." ".$data->{'subcollection'}->{$key}."\n";
472 }
473 }
474}
475# is this actually used??
476sub Doctype {
477 my ($expat, $name, $sysid, $pubid, $internal) = @_;
478
479 die if ($name !~ /^CollectionConfig$/);
480}
481
482# This Char function overrides the one in XML::Parser::Stream to overcome a
483# problem where $expat->{Text} is treated as the return value, slowing
484# things down significantly in some cases.
485sub Char {
486 if ($]<5.008) {
487 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6
488 }
489 $_[0]->{'Text'} .= $_[1];
490 return undef;
491}
492
493
494
495
496#########################################################
497
4981;
Note: See TracBrowser for help on using the repository browser.