########################################################################### # # collConfigxml.pm -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # reads in configuration files of xml form package collConfigxml; use strict; no strict 'refs'; no strict 'subs'; # Wrapper that ensures the right version of XML::Parser is loaded given # the version of Perl being used. Need to distinguish between Perl 5.6 and # Perl 5.8 sub BEGIN { my $perl_dir; # Note: $] encodes the version number of perl if ($]>=5.010) { $perl_dir = "perl-5.10"; } elsif ($]>5.008) { # perl 5.8.1 or above $perl_dir = "perl-5.8"; } elsif ($]<5.008) { # assume perl 5.6 $perl_dir = "perl-5.6"; } else { print STDERR "Warning: Perl 5.8.0 is not a maintained release.\n"; print STDERR " Please upgrade to a newer version of Perl.\n"; $perl_dir = "perl-5.8"; } if ($ENV{'GSDLOS'} !~ /^windows$/i) { # Use push to put this on the end, so an existing XML::Parser will be used by default push (@INC, "$ENV{'GSDLHOME'}/perllib/cpan/$perl_dir"); } } use XML::Parser; # A mapping hash to resolve name discrepancy between gs2 and gs3. my $nameMap = {"key" => "value", "creator" => "creator", "maintainer" => "maintainer", "public" => "public", "infodb" => "infodbtype", "defaultIndex" => "defaultindex", "defaultLevel" => "defaultlevel", "name" => "collectionname", "description" => "collectionextra", "smallicon" => "iconcollectionsmall", "icon" => "iconcollection", "level" => "levels", "classifier" => "classify", "indexSubcollection" => "indexsubcollections", "indexLanguage" => "languages", "defaultIndexLanguage" => "defaultlanguage", "index" => "indexes", "plugin" => "plugin", "plugout" => "plugout", "indexOption" => "indexoptions", "searchType" => "searchtype", "languageMetadata" => "languagemetadata", "buildType" => "buildtype", "buildOption" => "buildOption" }; # A hash structure which is returned by sub read_cfg_file. my $data = {}; my $repeatedBlock = q/^(browse|pluginList)$/; #|buildOptionList)$/; # use those unique attribute values to locate the text within the elements # creator, public, maintainer and within a displayItem. my $currentLocation = ""; my $stringexp = q/^(creator|maintainer|public|buildType)$/; my $displayItemNames = q/^(name|description)$/; # For storing the attributes during the StartTag subroutine, so that # we can use it later in Text (or EndTag) subroutines my $currentAttrRef = undef; my $currentLevel = ""; # Count the elements with same name within the same block # ("plugin", "option") my $currentIndex = 0; my $arrayexp = q/^(index|level|indexSubcollection|indexLanguage)$/; my $arrayarrayexp= q/^(plugin|classifier)$/; #|buildOption)$/; my $hashexp = q/^(subcollection)$/; # add other element names that should be represented by hash expressions here my $hashhashexp = q/^(displayItem)$/; # add other (collectionmeta) element names that should be represented by hashes of hashes here. my $defaults = q/^(defaultIndex|defaultLevel|defaultIndexLanguage|languageMetadata)$/; # Reads in the model collection configuration file, collectionConfig.xml, # into a structure which complies with the one used by gs2 (i.e. one read # in by &cfgread::read_cfg_file). sub read_cfg_file { my ($filename) = @_; $data = {}; if ($filename !~ /collectionConfig\.xml$/ || !-f $filename) { return undef; } # create XML::Parser object for parsing metadata.xml files my $parser; if ($]<5.008) { # Perl 5.6 $parser = new XML::Parser('Style' => 'Stream', 'Handlers' => {'Char' => \&Char, 'Doctype' => \&Doctype }); } else { # Perl 5.8 $parser = new XML::Parser('Style' => 'Stream', 'ProtocolEncoding' => 'ISO-8859-1', 'Handlers' => {'Char' => \&Char, 'Doctype' => \&Doctype }); } if (!open (COLCFG, $filename)) { print STDERR "cfgread::read_cfg_file couldn't read the cfg file $filename\n"; } else { $parser->parsefile ($filename);# (COLCFG); close (COLCFG); } #&Display; return $data; } sub StartTag { # Those marked with #@ will not be executed at the same time when this sub is being called # so that if/elsif is used to avoid unnecessary tests my ($expat, $element) = @_; # See http://search.cpan.org/~msergeant/XML-Parser-2.36/Parser.pm#Stream # %_ is a hash of all the attributes of this element, we want to store them so we can use the attributes # when the textnode contents of the element are parsed in the subroutine Text (that's the handler for Text). $currentAttrRef = \%_; my $name = $_{'name'}; my $value = $_{'value'}; my $type = $_{'type'}; # for subcollections my $filter = $_{'filter'}; # was this just a flax thing?? my $assigned = $_{'assigned'}; #@ Marking repeated block if ($element =~ /$repeatedBlock/) { $currentIndex = 0; } #@ handling block metadataList elsif (defined $name and $name =~ /$stringexp/){ $currentLocation = $name; } #@ handling default search index/level/indexLanguage and languageMetadata elsif ($element =~ /$defaults/) { if (defined $name and $name =~ /\w/) { $data->{$nameMap->{$element}} = $name; } } #@ handling the displayItems name and description (known as collectionname and collectionextra in GS2) elsif($element eq "displayItemList") { $currentLevel = "displayItemList"; # storing the parent if it is displayItemList } elsif($element =~ /$hashhashexp/) { # can expand on this to check for other collectionmeta elements if((!defined $assigned) || (defined $assigned and $assigned =~ /\w/ and $assigned eq "true")) { # either when there is no "assigned" attribute, or when assigned=true (for displayItems): $currentLocation = $name; } } #@ Handling database type: gdbm or gdbm-txtgz, later jdbm. elsif ($element eq "infodb") { $data->{'infodbtype'} = $type; } #@ Handling indexer: mgpp/mg/lucene; stringexp elsif ($element eq "search") { $data->{'buildtype'} = $type; } #@ Handling searchtype: plain,form; arrayexp #elsif ($element eq "format" and defined $name and $name =~ /searchType/) { #@ Handling searchtype: plain, form #$currentLocation = $name; #} #@ Handle index|level|indexSubcollection|indexLanguage elsif ($element =~ /$arrayexp/) { my $key = $nameMap->{$element}; if (!defined $data->{$key}) { $data->{$key} = []; } push (@{$data->{$key}},$name); } #***************************************** elsif ($element eq "buildOption") { print STDERR "**** BUILD OPTION PAIR $name $value\n"; $data->{$name} = $value; } #@ indexoptions: accentfold/casefold/stem; arrayexp elsif ($element eq "indexOption") { $currentLevel = "indexOption"; } if ($currentLevel eq "indexOption" and $element eq "option") { my $key = $nameMap->{$currentLevel}; if (!defined $data->{$key}) { $data->{$key} = []; } push (@{$data->{$key}},$name); } #@ plugout options elsif ($element eq "plugout") { $currentLevel = "plugout"; my $key = $nameMap->{$currentLevel}; if (!defined $data->{$key}) { $data->{$key} = []; } if(defined $name and $name ne ""){ push (@{$data->{$key}},$name); } else{ push (@{$data->{$key}},"GreenstoneXMLPlugout"); } } if ($currentLevel eq "plugout" and $element eq "option") { my $key = $nameMap->{$currentLevel}; if (defined $name and $name ne ""){ push (@{$data->{$key}},$name); } if (defined $value and $value ne ""){ push (@{$data->{$key}},$value); } } #@ use hash of hash of strings: hashexp elsif ($element =~ /$hashexp/) { if (!defined $data->{$element}) { $data->{$element} = {}; } if (defined $name and $name =~ /\w/) { if (defined $filter and $filter =~ /\w/) { $data->{$element}->{$name} = $filter; } } } #@ Handling each classifier/plugin element elsif ($element =~ /$arrayarrayexp/) { # find the gs2 mapping name $currentLevel = $element; my $key = $nameMap->{$element}; # define an array of array of strings foreach $k (@{$data->{$key}}) { if (!defined $data->{$key}) { $data->{$key} = []; } # Push classifier/plugin name (e.g. AZList) into $data as the first string push (@{$data->{$key}->[$currentIndex]},$name); if (defined $value and $value =~ /\w/) { push (@{$data->{$key}->[$currentIndex]}, $value); print "$value\n"; } #print $currentIndex."indexup\n"; } #@ Handling the option elements in each classifier/plugin element (as the following strings) elsif ($currentLevel =~ /$arrayarrayexp/ and $element eq "option") { # find the gs2 mapping name for classifier and plugin my $key = $nameMap->{$currentLevel}; if (defined $name and $name =~ /\w/) { push (@{$data->{$key}->[$currentIndex]}, $name); } if (defined $value and $value =~ /\w/) { push (@{$data->{$key}->[$currentIndex]}, $value); } } } sub EndTag { my ($expat, $element) = @_; my $endTags = q/^(browse|pluginList|displayItemList)$/; #|buildOptionList)$/; if ($element =~ /$endTags/) { $currentIndex = 0; $currentLevel = ""; } # $arrayarrayexp contains classifier|plugin elsif($element =~ /$arrayarrayexp/ ){ $currentIndex = $currentIndex + 1; } } sub Text { if (defined $currentLocation) { #@ Handling block metadataList(creator, maintainer, public) if($currentLocation =~ /$stringexp/){ #print $currentLocation; my $key = $nameMap->{$currentLocation}; $data->{$key} = $_; undef $currentLocation; } #@ Handling displayItem metadata that are children of displayItemList # that means we will be getting the collection's name and possibly description ('collectionextra' in GS2). elsif($currentLevel eq "displayItemList" && $currentLocation =~ /$displayItemNames/) { my $lang = $currentAttrRef->{'lang'}; my $name = $currentAttrRef->{'name'}; # this is how data->collectionmeta's language is set in Greenstone 2. # Need to be consistent, since export.pl accesses these values all in the same way if(!defined $lang) { $lang = 'default'; } else { $lang = "[l=$lang]"; } if(defined $name and $name =~ /$displayItemNames/) { # attribute name = 'name' || 'description' # using $nameMap->$name resolves to 'collectionname' if $name='name' and 'collectionextra' if $name='description' $data->{'collectionmeta'}->{$nameMap->{$name}}->{$lang} = $_; # the value is the Text parsed #print STDERR "***Found: $nameMap->{$name} collectionmeta, lang is $lang. Value: $data->{'collectionmeta'}->{$nameMap->{$name}}->{$lang}\n"; } undef $currentLocation; } #@ Handling searchtype: plain,form; arrayexp elsif (defined $currentLocation and $currentLocation =~ /searchType/) { # map 'searchType' into 'searchtype' my $key = $nameMap->{$currentLocation}; # split it by ',' my ($plain, $form) = split (",", $_); if (!defined $data->{$key}) { $data->{$key} = []; } if (defined $plain and $plain =~ /\w/) { push @{ $data->{$key} }, $plain; } if (defined $form and $form =~ /\w/) { push @{ $data->{$key} }, $form; } } } } # This sub is for debugging purposes sub Display { # metadataList foreach my $k (keys %{$data}) { print STDERR "*** metadatalist key $k\n"; } print STDERR "*** creator: ".$data->{'creator'}."\n" if (defined $data->{'creator'}); print STDERR "*** maintainer: ".$data->{"maintainer"}."\n" if (defined $data->{"maintainer"}); print STDERR "*** public: ".$data->{"public"}."\n" if (defined $data->{"public"}); print STDERR "*** default index: ".$data->{"defaultindex"}."\n" if (defined $data->{"defaultindex"}); print STDERR "*** default level: ".$data->{"defaultlevel"}."\n" if (defined $data->{"defaultlevel"}); print STDERR "*** build type: ".$data->{"buildtype"}."\n" if (defined $data->{"buildtype"}); print STDERR "*** search types: \n"; print STDERR join(",",@{$data->{"searchtype"}})."\n" if (defined $data->{"searchtype"}); print STDERR "*** levels: \n"; print STDERR join(",",@{$data->{'levels'}})."\n" if (defined $data->{'levels'}); print STDERR "*** index subcollections: \n"; print STDERR join(",",@{$data->{'indexsubcollections'}})."\n" if (defined $data->{'indexsubcollections'}); print STDERR "*** indexes: \n"; print STDERR join(",",@{$data->{'indexes'}})."\n" if (defined $data->{'indexes'}); print STDERR "*** index options: \n"; print STDERR join(",",@{$data->{'indexoptions'}})."\n" if (defined $data->{'indexoptions'}); print STDERR "*** languages: \n"; print STDERR join(",",@{$data->{'languages'}})."\n" if (defined $data->{'languages'}); print STDERR "*** language metadata: \n"; print STDERR join(",",@{$data->{'languagemetadata'}})."\n" if (defined $data->{'languagemetadata'}); print STDERR "*** Plugins: \n"; if (defined $data->{'plugin'}) { foreach $a (@{$data->{'plugin'}}) { print join(",",@$a); print "\n"; } } #print STDERR "*** Build options: \n"; #if (defined $data->{'store_metadata_coverage'}) { #foreach $a (@{$data->{'store_metadata_coverage'}}) { # print join(",",@$a,@$_); # print "\n"; #} #} if (defined $data->{'classify'}) { print STDERR "*** Classifiers: \n"; map { print join(",",@$_)."\n"; } @{$data->{'classify'}}; } if (defined $data->{'subcollection'}) { foreach my $key (keys %{$data->{'subcollection'}}) { print "subcollection ".$key." ".$data->{'subcollection'}->{$key}."\n"; } } } # is this actually used?? sub Doctype { my ($expat, $name, $sysid, $pubid, $internal) = @_; die if ($name !~ /^CollectionConfig$/); } # This Char function overrides the one in XML::Parser::Stream to overcome a # problem where $expat->{Text} is treated as the return value, slowing # things down significantly in some cases. sub Char { if ($]<5.008) { use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6 } $_[0]->{'Text'} .= $_[1]; return undef; } ######################################################### 1;