########################################################################### # # OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin # # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
#
###########################################################################

package OAIPlug;

use BasPlug;
use unicode;
use util;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

use XMLPlug;

sub BEGIN {
    @OAIPlug::ISA = ('XMLPlug');
}

# Plugin argument descriptions handed to the parent constructor via
# $hashArgOptLists->{"ArgList"} (see new()).
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp() },
      ];

# Plugin option block handed to the parent constructor via
# $hashArgOptLists->{"OptList"} (see new()).
my $options = { 'name'     => "OAIPlug",
                'desc'     => "{OAIPlug.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };

# Maps a qualified-DC ("dcterms") element name to the simple-DC name it is
# filed under by remap_dcterms_metadata().  Built once at load time rather
# than on every call.
my %dcterm_mapping = (
    "alternative"           => "dc.title",
    "tableOfContents"       => "dc.description",
    "abstract"              => "dc.description",
    "created"               => "dc.date",
    "valid"                 => "dc.date",
    "available"             => "dc.date",
    "issued"                => "dc.date",
    "modified"              => "dc.date",
    "dateAccepted"          => "dc.date",
    "dateCopyrighted"       => "dc.date",
    "dateSubmitted"         => "dc.date",
    "extent"                => "dc.format",
    "medium"                => "dc.format",
    "isVersionOf"           => "dc.relation",
    "hasVersion"            => "dc.relation",
    "isReplacedBy"          => "dc.relation",
    "replaces"              => "dc.relation",
    "isRequiredBy"          => "dc.relation",
    "requires"              => "dc.relation",
    "isPartOf"              => "dc.relation",
    "hasPart"               => "dc.relation",
    "isReferencedBy"        => "dc.relation",
    "references"            => "dc.relation",
    "isFormatOf"            => "dc.relation",
    "hasFormat"             => "dc.relation",
    "conformsTo"            => "dc.relation",
    "spatial"               => "dc.coverage",
    "temporal"              => "dc.coverage",
    "audience"              => "dc.any",
    "accrualMethod"         => "dc.any",
    "accrualPeriodicity"    => "dc.any",
    "accrualPolicy"         => "dc.any",
    "instructionalMethod"   => "dc.any",
    "provenance"            => "dc.any",
    "rightsHolder"          => "dc.any",
    "mediator"              => "audience",
    "educationLevel"        => "audience",
    "accessRights"          => "dc.rights",
    "license"               => "dc.rights",
    "bibliographicCitation" => "dc.identifier",
);

# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to XMLPlug
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its ArgList / OptList
# Returns the XMLPlug-built object re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Arrow syntax instead of "new XMLPlug(...)": this package defines its
    # own new(), which makes the indirect-object form depend on parser
    # heuristics.
    my $self = XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}

# Returns the default regular expression (as a string) selecting the files
# this plugin handles: names ending in ".oai", case-insensitively.
sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)(\.oai)$^;
}

# Returns the document type name associated with this plugin.
sub get_doctype {
    my $self = shift(@_);

    return "OAI-PMH";
}

# Parser callback: start of a document.  Resets the per-document
# accumulators used by the tag/text callbacks below.
sub xml_start_document {
    my $self = shift (@_);
    $self->{'in_metadata_node'} = 0;
    $self->{'rawxml'} = "";
}

# Parser callback: end of a document.  Nothing to do.
sub xml_end_document {
}

# Parser callback: DOCTYPE declaration.  Only reports progress.
sub xml_doctype {
    my $self = shift(@_);

    my ($expat, $name, $sysid, $pubid, $internal) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    my $outhandle = $self->{'outhandle'};
    print $outhandle "OAIPlug: processing $self->{'file'}\n"
        if $self->{'verbosity'} > 1;
    # NOTE(review): only a newline is emitted in GLI mode; if a progress
    # marker was intended here, confirm against the other plugins.
    print STDERR "\n" if $self->{'gli'};
}

# Parser callback: opening tag.
#   $expat   - parser object (unused)
#   $element - element name
# Re-serialises the tag into 'rawxml' and, while inside a <metadata>
# element, into 'metadata_xml' as well.  The attributes are read from the
# global %_ -- presumably populated by the XML parser driving these
# callbacks; confirm in XMLPlug.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my %attr_hash = %_;

    # Attribute values are written unquoted, as before; keys are sorted so
    # the serialisation is deterministic.
    my $attr = "";
    foreach my $attr_name (sort keys %attr_hash) {
        $attr .= " $attr_name=$attr_hash{$attr_name}";
    }

    my $start_tag = "<$element$attr>";
    $self->{'rawxml'} .= $start_tag;

    if ($element eq "metadata") {
        # start a fresh buffer for this record's metadata
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'} = "";
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $start_tag;
    }
}

# Parser callback: closing tag.
#   $expat   - parser object (unused)
#   $element - element name
# Appends the closing tag to the accumulators and, when the <metadata>
# element closes, extracts its contents into $self->{'metadata'}.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    # The closing tag must really be appended: extract_oai_metadata()
    # relies on finding </metadata> and the matching </tag> of each field.
    my $end_tag = "</$element>";
    $self->{'rawxml'} .= $end_tag;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $end_tag;
    }

    if ($element eq "metadata") {
        my $textref = \$self->{'metadata_xml'};
        my $metadata = $self->{'metadata'};
        $self->extract_oai_metadata($textref, $metadata);
        $self->{'in_metadata_node'} = 0;
    }
}

# Parser callback: character data.  The text is read from the global $_
# (presumably set by the XML parser driving these callbacks; confirm in
# XMLPlug) and appended to the accumulators.
sub xml_text {
    my $self = shift(@_);
    my ($expat) = @_;

    $self->{'rawxml'} .= $_;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $_;
    }
}

# Reads one OAI record file.
#
# After the parent class has read the file, one of two things happens:
#  * if one of the non-http/ftp 'URL' metadata values names a file that
#    exists next to the record, the collected metadata is handed on to
#    plugin::read() for that file;
#  * otherwise a new document is built from the raw XML of the record.
#
# Returns 0 for a directory whose name ends in "srcdocs", undef when the
# parent class did not read the file, -1 when process() fails, the result
# of plugin::read() when the metadata is passed on, and 1 when a document
# was created.
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));

    # The xml_* callbacks in this file fill 'rawxml', 'ppmd_table' and
    # $metadata; they are presumably driven from the parent's read().
    return undef unless ($self->SUPER::read(@_));

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    my $url_array = $metadata->{'URL'};
    my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;

    my $srcdoc_exists = 0;
    my $srcdoc_pos = 0;
    my $filename_dir = &util::filename_head($filename);

    # Look for a local (non-http/ftp) URL naming an existing file; as
    # before, the last such entry wins.
    for my $i (0 .. $num_urls - 1) {
        next if ($url_array->[$i] =~ m/^(http|ftp):/);

        my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
        if (-e $src_filename) {
            $srcdoc_pos = $i;
            $srcdoc_exists = 1;
        }
    }

    if ($srcdoc_exists) {
        # Use the entry that was actually found on disk, not blindly the
        # first URL in the list.
        my $srcdoc_file = $url_array->[$srcdoc_pos];

        print $outhandle "OAIPlug: passing metadata on to $srcdoc_file\n"
            if ($self->{'verbosity'}>1);

        # Make pretty print metadata table stick with src filename
        my $ppmd_table = $self->{'ppmd_table'};
        $metadata->{'prettymd'} = [ $ppmd_table ];
        $self->{'ppmd_table'} = undef;

        return &plugin::read ($pluginfo, $filename_dir, $srcdoc_file,
                              $metadata, $processor, $maxdocs, $total_count, $gli);
    }

    # create a new document
    my $doc_obj = doc->new($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $textref = \$self->{'rawxml'};
    unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        # NOTE(review): only a newline is emitted in GLI mode; if an error
        # marker was intended here, confirm against the other plugins.
        print STDERR "\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    my $ppmd_table = $self->{'ppmd_table'};
    $doc_obj->add_utf8_metadata($top_section, "prettymd", $ppmd_table);
    $self->{'ppmd_table'} = undef;

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}

# do plugin specific processing of doc_obj
#
# Escapes the angle brackets of the text referenced by $textref (modifying
# it in place) and adds it as the text of the document's top section.
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # NOTE(review): only a newline is emitted in GLI mode; confirm intent.
    print STDERR "\n" if ($gli);
    print $outhandle "OAIPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    ## $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);

    # add text to document object

    # $$textref =~ s/<(.*?)>/$1 /g;

    # Escape the markup so the raw record is shown as text.  Both patterns
    # must be explicit: an empty pattern in s/// silently re-uses the last
    # successfully matched regular expression.
    $$textref =~ s/</&lt;/g;
    $$textref =~ s/>/&gt;/g;

    ## print STDERR "*** adding text: $$textref\n";

    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}

# Improvement is to merge this with newer version in MetadataPass

# Starts a new pretty-print metadata table in $self->{'ppmd_table'},
# discarding any previous contents.
sub open_prettyprint_metadata_table {
    my $self = shift(@_);

    my $att   = "width=100% cellspacing=2";
    my $style = "style=\'border-bottom: 4px solid #000080\'";

    $self->{'ppmd_table'} = "\n<table $att $style>";
}

# Appends one name/value row to the pretty-print metadata table.
#   $metaname       - metadata name shown in the first cell
#   $metavalue_utf8 - metadata value; passed through util::hyperlink_text()
#                     before being shown in the second cell
sub add_prettyprint_metadata_line {
    my $self = shift(@_);
    my ($metaname, $metavalue_utf8) = @_;

    ### $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
    $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= " <tr>\n";
    $self->{'ppmd_table'} .= " <td>\n";
    $self->{'ppmd_table'} .= " $metaname\n";
    $self->{'ppmd_table'} .= " </td>\n";
    $self->{'ppmd_table'} .= " <td>\n";
    $self->{'ppmd_table'} .= " $metavalue_utf8\n";
    $self->{'ppmd_table'} .= " </td>\n";
    $self->{'ppmd_table'} .= " </tr>\n";
}

# Terminates the pretty-print metadata table.
sub close_prettyprint_metadata_table {
    my $self = shift(@_);

    $self->{'ppmd_table'} .= "</table>\n";
}

# Maps a "dcterms.<name>" metadata name onto its simple Dublin Core parent.
#   $metaname - metadata name of the form "prefix.name"
# Returns "<parent>^<name>" when the prefix is "dcterms" and <name> is a
# known term; otherwise returns $metaname unchanged.
sub remap_dcterms_metadata {
    my $self = shift(@_);

    my ($metaname) = @_;

    my ($prefix, $name) = ($metaname =~ m/^(.*?)\.(.*?)$/);

    # $prefix is undefined when the name contains no '.'
    if (defined $prefix && $prefix eq "dcterms") {
        if (defined $dcterm_mapping{$name}) {
            return $dcterm_mapping{$name}."^".$name;
        }
    }
    return $metaname; # didn't get a match, return param passed in unchanged
}

# Extracts the fields of one <metadata>...</metadata> section.
#   $textref  - reference to the serialised XML of the section
#   $metadata - hash ref mapping metadata name to an array ref of values;
#               each extracted value is appended to it
# Every extracted name/value pair is also added to the pretty-print table,
# which is opened and closed here even when nothing is extracted.
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;
    my $outhandle = $self->{'outhandle'};

    # Only handles DC metadata

    $self->open_prettyprint_metadata_table();

    if ($$textref =~ m/<metadata[^>]*>(.*?)<\/metadata\s*>/s) {
        my $metadata_text = $1;

        # locate and remove outermost tag (ignoring any attribute information in top-level tag)
        my ($wrapper_metadata_xml, $inner_metadata_text)
            = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # Nothing to extract when the section has no wrapper element.
        if (defined $wrapper_metadata_xml) {

            # split tag into namespace and tag name; a wrapper without a
            # namespace prefix is used as-is
            my ($top_level_prefix) = ($wrapper_metadata_xml =~ m/^.*?:(.*?)$/);
            $top_level_prefix = $wrapper_metadata_xml
                unless (defined $top_level_prefix);

            if ($top_level_prefix !~ /dc$/) {
                print $outhandle "Warning: OAIPlug currently only designed for Dublin Core (or variant) metadata\n";
                print $outhandle " This recorded metadata section '$top_level_prefix' does not appear to match.\n";
                print $outhandle " Metadata assumed to be in form: <prefix:tag>value</prefix:tag> and will be converted\n";
                print $outhandle " into Greenstone metadata as prefix.tag = value\n";
            }

            # Walk the <tag ...>value</tag> pairs from left to right.
            while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/sg) {
                # if URL given for document as identifier metadata, store it ...
                # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);

                my $metaname  = $1;
                my $metavalue = $2;

                # print STDERR "*** metaname = $metaname\n";
                # print STDERR "*** metavalue = $metavalue\n";
                # $metaname =~ s/^(dc:)?(.)/\u$2/; # strip of optional prefix and uppercase first letter

                # "prefix:tag" becomes "prefix.tag"; an unprefixed tag is
                # given the wrapper's prefix
                $metaname =~ s/:/\./;
                if ($metaname !~ m/\./) {
                    $metaname = "$top_level_prefix.$metaname";
                }
                $metaname = $self->remap_dcterms_metadata($metaname);

                # if ($metaname eq "Identifier")
                # {
                #     # name clashes with GSDL reserved metadata name for hash id
                #     $metaname = "URL";
                # }

                if (defined $metadata->{$metaname}) {
                    push(@{$metadata->{$metaname}}, $metavalue);
                }
                else {
                    $metadata->{$metaname} = [ $metavalue ];
                }

                $self->add_prettyprint_metadata_line($metaname, $metavalue);
            }
        }
    }

    $self->close_prettyprint_metadata_table();
}

## we know from the file extension, so doesn't need to check the doctype
sub check_doctype {
    return 1;
}

1;