root/main/trunk/greenstone2/perllib/collConfigxml.pm @ 29176

Revision 29176, 17.1 KB (checked in by ak19, 5 years ago)

For Solr: 1. collConfig.pm now reads any option sub-elements for index elements defined in collectionConfig.xml. 2. The solrfieldtype option sub-element to an index element can now be set by hand in collectionConfig.xml and its value will be used in the solr collection/etc/conf/schema.xml file for that index in that collection, instead of the old default of text_en_splitting for all fields.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# collConfigxml.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# reads in configuration files of xml form
27
28package collConfigxml;
29use strict;
30no strict 'refs';
31no strict 'subs';
32
33use XMLParser;
34
# Translation table that resolves the naming differences between gs2 and gs3.
# Left-hand side : the gs3 element name as it appears in collectionConfig.xml.
# Right-hand side: the internal (gs2-style) name under which the option is
#                  stored in the hash returned by read_cfg_file.
my $nameMap = {
    'key'                  => 'value',
    'creator'              => 'creator',
    'maintainer'           => 'maintainer',
    'public'               => 'public',
    'infodb'               => 'infodbtype',
    'defaultIndex'         => 'defaultindex',
    'defaultLevel'         => 'defaultlevel',
    'name'                 => 'collectionname',
    'description'          => 'collectionextra',
    'smallicon'            => 'iconcollectionsmall',
    'icon'                 => 'iconcollection',
    'level'                => 'levels',
    'classifier'           => 'classify',
    'indexSubcollection'   => 'indexsubcollections',
    'indexLanguage'        => 'languages',
    'defaultIndexLanguage' => 'defaultlanguage',
    'index'                => 'indexes',
    'indexfieldoptions'    => 'indexfieldoptions',
    'sort'                 => 'sortfields',
    'facet'                => 'facetfields',
    'plugin'               => 'plugin',
    'plugout'              => 'plugout',
    'indexOption'          => 'indexoptions',
    'searchType'           => 'searchtype',
    'languageMetadata'     => 'languagemetadata',
    'buildType'            => 'buildtype',
    'orthogonalBuildTypes' => 'orthogonalbuildtypes',
};
# ---------------------------------------------------------------------------
# Parser state shared between the StartTag, EndTag and Text handlers.
# ---------------------------------------------------------------------------

# The hash structure that read_cfg_file fills in and returns.
my $data = {};

# Name attribute of the element whose text content is expected next
# (creator, maintainer, public, buildType, or the name of a displayItem).
# The Text handler uses it to decide where the text belongs.
my $currentLocation = "";

# Attributes of the most recent start tag, saved by StartTag so that they
# can still be consulted later in the Text (or EndTag) handler.
my $currentAttrRef;

# Name of the enclosing element that gives meaning to nested elements
# (e.g. "displayItemList", "index", "plugout", "plugin", "classifier").
my $currentLevel = "";

# Position of the plugin/classifier currently being read within its block;
# its "option" children are appended to the entry at this position.
my $currentIndex = 0;

# ---------------------------------------------------------------------------
# Patterns (kept as plain strings and interpolated into matches) that
# classify element and attribute names.
# ---------------------------------------------------------------------------

# Blocks whose start resets $currentIndex.
my $repeatedBlock = '^(browse|pluginList)$';

# Name attribute values whose element text is stored as a plain string.
my $stringexp = '^(creator|maintainer|public|buildType)$';

# displayItem names that are recorded as collection metadata.
my $displayItemNames = '^(name|description)$';

# Options that are stored at the top level of the result as name => value.
my $topleveloptionexp = '^(importOption|buildOption)$';

# Elements stored as an array of names that may additionally carry
# <option name="..." value="..."/> children, kept in a hash per index name
# (such as allfields/ZZ or titles/TI), e.g.
#   <index name="allfields">
#     <displayItem ... />
#     <option name="solrfieldtype" value="text_ja" />
#   </index>
my $structexp = '^(index)$';

# Elements stored as a flat array of names. "index" is not listed here
# because it is matched by $structexp instead.
my $arrayexp = '^(sort|facet|level|indexOption|indexSubcollection|indexLanguage|orthogonalBuildTypes)$';

# Elements stored as an array of arrays (name followed by its options).
my $arrayarrayexp = '^(plugin|classifier)$';

# Elements stored as a hash; add further element names here as needed.
my $hashexp = '^(subcollection)$';

# (collectionmeta) elements stored as a hash of hashes; add further element
# names here as needed.
my $hashhashexp = '^(displayItem)$';

# Elements that name a default (search index/level/index language) or the
# language metadata; their name attribute is stored as a single value.
my $defaults = '^(defaultIndex|defaultLevel|defaultIndexLanguage|languageMetadata)$';
104
# Reads in the model collection configuration file, collectionConfig.xml,
# into a structure which complies with the one used by gs2 (i.e. one read
# in by &cfgread::read_cfg_file).
#
# Parameters:
#   $filename - path of the configuration file; the name must end in
#               "collectionConfig.xml" and refer to an existing plain file.
#
# Returns:
#   undef if $filename is missing, is not named collectionConfig.xml or is
#   not a plain file; otherwise a reference to the hash filled in by the
#   StartTag/Text handlers. If the file cannot be opened a message is
#   printed to STDERR and the (empty) hash is returned.
#
# Errors raised by the XML parser or by the Doctype handler propagate to
# the caller.
sub read_cfg_file {
    my ($filename) = @_;

    # Start every parse from a clean slate, so that handler state left
    # behind by an earlier call cannot influence this one.
    $data            = {};
    $currentLocation = "";
    $currentLevel    = "";
    $currentIndex    = 0;
    $currentAttrRef  = undef;

    if (!defined $filename || $filename !~ /collectionConfig\.xml$/ || !-f $filename) {
        return undef;
    }

    # Removed ProtocolEncoding (see MetadataXMLPlugin for details)

    # Create the XML::Parser object for parsing collectionConfig.xml. The
    # Stream style dispatches to the StartTag, EndTag and Text subs of the
    # package named in 'Pkg'; Char and Doctype are installed explicitly.
    my $parser = XML::Parser->new(
        'Style'    => 'Stream',
        'Pkg'      => 'collConfigxml',
        'Handlers' => {
            'Char'    => \&Char,
            'Doctype' => \&Doctype,
        },
    );

    # The handle is only used to check that the file is readable; the
    # parser opens the file itself via parsefile.
    if (open(my $colcfg, '<', $filename)) {
        close($colcfg);
        $parser->parsefile($filename);
    }
    else {
        print STDERR "collConfigxml::read_cfg_file couldn't read the cfg file $filename: $!\n";
    }

    #&Display;
    return $data;
}
134
# Start-tag handler, called by the Stream style of XML::Parser for every
# opening element of collectionConfig.xml.
#
# Parameters:
#   $expat   - the parser object (unused here)
#   $element - the name of the element being opened
# The element's attributes are read from the global hash %_, which the
# Stream style fills in before calling this handler
# (see http://search.cpan.org/~msergeant/XML-Parser-2.36/Parser.pm#Stream).
#
# Depending on the element, the handler records values in $data and/or
# updates $currentLocation, $currentLevel and $currentIndex so that the
# Text and EndTag handlers and later StartTag calls know the context.
#
# The cases are mutually exclusive for a given element, so they are written
# as if/elsif chains to avoid unnecessary tests.
sub StartTag {
    my ($expat, $element) = @_;

    # Remember the attributes so that they can be used when the text
    # content of the element is handled in the Text subroutine.
    $currentAttrRef = \%_;

    my $name       = $_{'name'};
    my $value      = $_{'value'};
    my $type       = $_{'type'};
    my $orthogonal = $_{'orthogonal'};

    # for subcollections
    my $filter = $_{'filter'};

    # was this just a flax thing??
    my $assigned = $_{'assigned'};

    # Start of a repeated block (browse/pluginList): restart the counting
    # of the classifiers/plugins inside it.
    if ($element =~ /$repeatedBlock/) {
        $currentIndex = 0;
    }

    # metadataList entries (creator, maintainer, public, buildType): the
    # value arrives as text, so only note where that text belongs.
    elsif (defined $name and $name =~ /$stringexp/) {
        $currentLocation = $name;
    }

    # Default search index/level/indexLanguage and languageMetadata.
    elsif ($element =~ /$defaults/) {
        if (defined $name and $name =~ /\w/) {
            $data->{$nameMap->{$element}} = $name;
        }
    }

    # displayItems name and description (known as collectionname and
    # collectionextra in GS2): remember that the parent is displayItemList.
    elsif ($element eq "displayItemList") {
        $currentLevel = "displayItemList";
    }
    elsif ($element =~ /$hashhashexp/) { # can expand on this to check for other collectionmeta elements
        # Accepted either when there is no "assigned" attribute, or when
        # assigned=true (for displayItems).
        if (!defined $assigned || $assigned eq "true") {
            $currentLocation = $name;
        }
    }

    # Database type: gdbm or gdbm-txtgz, later jdbm.
    elsif ($element eq "infodb") {
        $data->{'infodbtype'} = $type;
    }

    # Indexer: mgpp/mg/lucene, stored as a string.
    # Orthogonal indexers (e.g. audioDB) are collected in an array.
    elsif ($element eq "search") {
        if (defined $orthogonal && $orthogonal =~ m/^(true|on|1)$/i) {
            push(@{$data->{'orthogonalbuildtypes'}}, $type);
        }
        else {
            $data->{'buildtype'} = $type;
        }
    }

    elsif ($element eq "store_metadata_coverage") {
        $data->{'store_metadata_coverage'} = $value;
    }

    # sort|facet|level|indexOption|indexSubcollection|indexLanguage|
    # orthogonalBuildTypes: append the name to the array for that element.
    elsif ($element =~ /$arrayexp/) {
        my $key = $nameMap->{$element};
        if (!defined $data->{$key}) {
            $data->{$key} = [];
        }
        if (defined $name) {
            push(@{$data->{$key}}, $name);
        }
    }

    # index, which can have options as children to be put in a map:
    # <option name="name" value="value" />
    elsif ($element =~ /$structexp/) {
        $currentLevel = $element;

        # For GS2, 'indexes' is still an array of index names.
        my $key = $nameMap->{$element}; # 'indexes'
        if (!defined $data->{$key}) {
            $data->{$key} = [];
        }
        if (defined $name) {
            push(@{$data->{$key}}, $name);
        }
    }

    # option elements inside an index structure, if any, only for GS2.
    elsif ($currentLevel =~ /$structexp/ && $element eq "option") {
        my $key = $nameMap->{$currentLevel . "fieldoptions"}; # 'indexfieldoptions'

        # The last element of the 'indexes' array is the name of the index
        # currently being processed, e.g. "allfields".
        my $indexes   = $data->{$nameMap->{$currentLevel}}; # 'indexes'
        my $indexName = (defined $indexes && @{$indexes}) ? $indexes->[-1] : undef;

        if (!defined $data->{$key}) {
            $data->{$key} = {};
        }

        # Only record complete name/value pairs, and only when there is an
        # index name to file them under.
        if (defined $indexName
            and defined $name  and $name  =~ /\w/
            and defined $value and $value =~ /\w/) {
            if (!defined $data->{$key}->{$indexName}) {
                $data->{$key}->{$indexName} = {};
            }
            $data->{$key}->{$indexName}->{$name} = $value;
        }
    }

    # importOption and buildOption, just stored at top level, name=value,
    # as per gs2 version.
    elsif ($element =~ /$topleveloptionexp/) {
        if (defined $name) {
            if (!defined $value) {
                # flag option, set to true
                $value = "true";
            }
            $data->{$name} = $value;
        }
    }

    # plugout: the array starts with the plugout name, falling back to
    # GreenstoneXMLPlugout when none is given.
    elsif ($element eq "plugout") {
        $currentLevel = "plugout";
        my $key = $nameMap->{$currentLevel};
        if (!defined $data->{$key}) {
            $data->{$key} = [];
        }
        if (defined $name and $name ne "") {
            push(@{$data->{$key}}, $name);
        }
        else {
            push(@{$data->{$key}}, "GreenstoneXMLPlugout");
        }
    }

    # NOTE(review): this is a plain "if", not an "elsif", so the chain
    # below is evaluated even when a branch above has already matched.
    # Kept as is to preserve behaviour - confirm whether it is intended.

    # plugout options: name and value are appended to the plugout array.
    if ($currentLevel eq "plugout" and $element eq "option") {
        my $key = $nameMap->{$currentLevel};
        if (defined $name and $name ne "") {
            push(@{$data->{$key}}, $name);
        }
        if (defined $value and $value ne "") {
            push(@{$data->{$key}}, $value);
        }
    }

    # subcollection: hash of name => filter.
    elsif ($element =~ /$hashexp/) {
        if (!defined $data->{$element}) {
            $data->{$element} = {};
        }
        if (defined $name and $name =~ /\w/ and defined $filter and $filter =~ /\w/) {
            $data->{$element}->{$name} = $filter;
        }
    }

    # Each classifier/plugin element: an array of arrays of strings.
    elsif ($element =~ /$arrayarrayexp/) {
        $currentLevel = $element;
        my $key = $nameMap->{$element};
        if (!defined $data->{$key}) {
            $data->{$key} = [];
        }

        # The classifier/plugin name (e.g. AZList) is the first string of
        # the entry at $currentIndex.
        push(@{$data->{$key}->[$currentIndex]}, $name);
        if (defined $value and $value =~ /\w/) {
            push(@{$data->{$key}->[$currentIndex]}, $value);
        }
    }

    # option elements of a classifier/plugin are appended as the following
    # strings of the same entry.
    elsif ($currentLevel =~ /$arrayarrayexp/ and $element eq "option") {
        my $key = $nameMap->{$currentLevel};
        if (defined $name and $name =~ /\w/) {
            push(@{$data->{$key}->[$currentIndex]}, $name);
        }
        if (defined $value and $value =~ /\w/) {
            push(@{$data->{$key}->[$currentIndex]}, $value);
        }
    }
}
351
# End-tag handler, called by the Stream style of XML::Parser for every
# closing element.
#
# Parameters:
#   $expat   - the parser object (unused here)
#   $element - the name of the element being closed
sub EndTag {
    my ($expat, $element) = @_;

    # Leaving one of these blocks ends the current context entirely.
    if ($element =~ /^(browse|pluginList|displayItemList|indexOption)$/) { #|buildOptionList)$/
        $currentIndex = 0;
        $currentLevel = "";
    }
    # A classifier or plugin has been completed: the next one goes into
    # the following slot of its array.
    elsif ($element =~ /$arrayarrayexp/) {
        $currentIndex += 1;
    }
}
365
# Text handler, called by the Stream style of XML::Parser with the text
# content in $_. Where the text is stored depends on $currentLocation (set
# by StartTag) and, for displayItems, on $currentLevel and the attributes
# saved in $currentAttrRef. Text arriving while $currentLocation is
# undefined is ignored.
sub Text {
    return unless defined $currentLocation;

    # metadataList block (creator, maintainer, public, buildType): the text
    # is the value itself.
    if ($currentLocation =~ /$stringexp/) {
        $data->{$nameMap->{$currentLocation}} = $_;
        undef $currentLocation;
    }

    # displayItem metadata that are children of displayItemList, i.e. the
    # collection's name and possibly its description ('collectionextra' in
    # GS2).
    elsif ($currentLevel eq "displayItemList" && $currentLocation =~ /$displayItemNames/) {
        my $name = $currentAttrRef->{'name'};
        my $lang = $currentAttrRef->{'lang'};

        # This is how data->collectionmeta's language is set in Greenstone 2.
        # It has to stay consistent, since export.pl accesses these values
        # all in the same way.
        my $langKey = defined $lang ? "[l=$lang]" : 'default';

        # $nameMap turns 'name' into 'collectionname' and 'description'
        # into 'collectionextra'; the value stored is the text parsed.
        if (defined $name and $name =~ /$displayItemNames/) {
            $data->{'collectionmeta'}->{$nameMap->{$name}}->{$langKey} = $_;
        }
        undef $currentLocation;
    }

    # searchtype: "plain,form" is split on ',' and stored as an array.
    elsif ($currentLocation =~ /searchType/) {
        # map 'searchType' into 'searchtype'
        my $key = $nameMap->{$currentLocation};
        my ($plain, $form) = split(",", $_);

        $data->{$key} = [] if !defined $data->{$key};
        foreach my $searchType ($plain, $form) {
            if (defined $searchType and $searchType =~ /\w/) {
                push @{ $data->{$key} }, $searchType;
            }
        }
    }
}
417
# This sub is for debugging purposes: it dumps the contents of $data (the
# structure built by the handlers) to STDERR. It takes no arguments.
sub Display {
    # metadataList
    foreach my $k (keys %{$data}) {
        print STDERR "*** metadatalist key $k\n";
    }

    # Values stored as plain strings.
    print STDERR "*** creator: ".$data->{'creator'}."\n" if (defined $data->{'creator'});
    print STDERR "*** maintainer: ".$data->{"maintainer"}."\n" if (defined $data->{"maintainer"});
    print STDERR "*** public: ".$data->{"public"}."\n" if (defined $data->{"public"});
    print STDERR "*** default index: ".$data->{"defaultindex"}."\n" if (defined $data->{"defaultindex"});
    print STDERR "*** default level: ".$data->{"defaultlevel"}."\n" if (defined $data->{"defaultlevel"});
    print STDERR "*** build type: ".$data->{"buildtype"}."\n" if (defined $data->{"buildtype"});

    # Values stored as array references.
    print STDERR "*** orthogonal build types: ".join(",",@{$data->{"orthogonalbuildtypes"}})."\n" if (defined $data->{"orthogonalbuildtypes"});
    print STDERR "*** search types: \n";
    print STDERR join(",",@{$data->{"searchtype"}})."\n" if (defined $data->{"searchtype"});
    print STDERR "*** levels: \n";
    print STDERR join(",",@{$data->{'levels'}})."\n" if (defined $data->{'levels'});
    print STDERR "*** index subcollections: \n";
    print STDERR join(",",@{$data->{'indexsubcollections'}})."\n" if (defined $data->{'indexsubcollections'});
    print STDERR "*** indexes: \n";
    print STDERR join(",",@{$data->{'indexes'}})."\n" if (defined $data->{'indexes'});
    print STDERR "*** index options: \n";
    print STDERR join(",",@{$data->{'indexoptions'}})."\n" if (defined $data->{'indexoptions'});
    print STDERR "*** languages: \n";
    print STDERR join(",",@{$data->{'languages'}})."\n" if (defined $data->{'languages'});

    # StartTag stores languagemetadata as a single string; an array
    # reference is accepted as well so that either form is shown correctly.
    print STDERR "*** language metadata: \n";
    my $languageMetadata = $data->{'languagemetadata'};
    if (defined $languageMetadata) {
        my $shown = (ref($languageMetadata) eq 'ARRAY')
            ? join(",", @{$languageMetadata})
            : $languageMetadata;
        print STDERR $shown."\n";
    }

    # Each plugin is an array: its name followed by its options.
    print STDERR "*** Plugins: \n";
    if (defined $data->{'plugin'}) {
        foreach my $plugin (@{$data->{'plugin'}}) {
            print STDERR join(",",@$plugin)."\n";
        }
    }

    #print STDERR "*** Build options: \n";
    #if (defined $data->{'store_metadata_coverage'}) {
    #foreach $a (@{$data->{'store_metadata_coverage'}}) {
    #    print join(",",@$a,@$_);
    #    print "\n";
    #}
    #}

    # Each classifier is an array: its name followed by its options.
    if (defined $data->{'classify'}) {
        print STDERR "*** Classifiers: \n";
        foreach my $classifier (@{$data->{'classify'}}) {
            print STDERR join(",",@$classifier)."\n";
        }
    }

    # Subcollections are stored as name => filter.
    if (defined $data->{'subcollection'}) {
        foreach my $key (keys %{$data->{'subcollection'}}) {
            print STDERR "subcollection ".$key." ".$data->{'subcollection'}->{$key}."\n";
        }
    }
}
# Doctype handler; read_cfg_file registers it with the XML parser as the
# 'Doctype' handler. It rejects documents whose DOCTYPE declaration does
# not name CollectionConfig.
#
# Parameters:
#   $expat    - the parser object (unused here)
#   $name     - the document type name from the DOCTYPE declaration
#   $sysid, $pubid, $internal - remaining DOCTYPE details (unused here)
#
# Dies with a message naming the offending doctype when $name is missing or
# is not exactly "CollectionConfig".
sub Doctype {
    my ($expat, $name, $sysid, $pubid, $internal) = @_;

    if (!defined $name || $name ne 'CollectionConfig') {
        my $found = defined $name ? $name : '';
        die "collConfigxml: unexpected DOCTYPE '$found', expected 'CollectionConfig'";
    }
}
480
# Character-data handler. It replaces the Char function of
# XML::Parser::Stream to work around a problem where $expat->{Text} is
# treated as the return value, which slows things down significantly in
# some cases.
#
# Parameters:
#   $expat - the parser object; the text is accumulated in $expat->{'Text'}
#   $chars - the chunk of character data to append
#
# Always returns undef.
sub Char {
    my ($expat, $chars) = @_;

    if ($] < 5.008) {
        # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6
        # NOTE(review): "use bytes" is a compile-time pragma scoped to the
        # enclosing block, so placed here it does not appear to cover the
        # append below - confirm the intent.
        use bytes;
    }

    $expat->{'Text'} .= $chars;
    return undef;
}
491
492
493
494
495#########################################################
496
4971;
Note: See TracBrowser for help on using the browser.