###########################################################################
#
# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package OAIPlug;

use strict;

use BasPlug;
use unicode;
use util;
use parsargv;

# Declared with 'our' so the assignment in BEGIN is legal under 'use strict'.
our @ISA;

sub BEGIN {
    @ISA = ('BasPlug');
}

# Option description appended to the option list built up by the
# constructor (see new() below).
my $options = {
    'name'     => "OAIPlug",
    'desc'     => "{OAIPlug.desc}",
    'inherits' => "yes"
};

# sub print_usage {
#     print STDERR "\n  usage: plugin OAIPlug [options]\n\n";
#     print STDERR "  currently no options:\n";
# }

# Constructor.
#
# Builds the object through BasPlug's constructor, tags it with the
# plugin type "OAIPlug", appends this plugin's option description to
# $self->{'option_list'} and parses the remaining arguments.
#
# Dies (with a bare "\n") after printing usage information if
# parsargv::parse reports that the arguments are invalid.
sub new {
    my $class = shift (@_);

    # Arrow syntax rather than "new BasPlug (...)": the indirect-object form
    # is parsed heuristically and is ambiguous here because this package
    # defines its own sub named "new".
    my $self = BasPlug->new($class, @_);
    $self->{'plugin_type'} = "OAIPlug";

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push(@{$option_list}, $options);

    if (!parsargv::parse(\@_, "allow_extra_options")) {
        print STDERR "\nIncorrect options passed to OAIPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");  # Use default resource bundle
        die "\n";
    }

    return bless $self, $class;
}

# Returns the default regular expression (as a string) selecting the files
# this plugin processes: names ending in ".oai", case-insensitively.
sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)(\.oai)$^;
}

# Reads one OAI record file.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs).
# $metadata is a hash ref; metadata extracted from the record is added to it.
#
# Returns:
#   0      for a directory whose name matches /.orig$/, a file matching
#          block_exp, or a file that yields no text;
#   undef  if the file does not match process_exp, is not a plain file, or
#          process() returns undef;
#   1      once a document has been built and handed to $processor;
#   otherwise whatever plugin::read returns, when the record's URL
#   (dc:identifier) does not start with "http:" and the work is passed on
#   to the plugin that handles the referenced file.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;

    return 0 if ((-d $filename) && ($filename =~ m/.orig$/));

    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'}++;
        return 0;
    }
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    ####
    # Above code exactly the same as in BasPlug
    #  => consider making supporting function?
    ###

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($filename, $encoding, $language, \$text);

    if (!length ($text)) {
        # The plugin name is spelled out, as in the other messages of this
        # file; there is no $plugin_name variable in scope here.
        print $outhandle "OAIPlug: ERROR: $file contains no text\n"
            if $self->{'verbosity'};
        return 0;
    }

    print $outhandle "OAIPlug: extracting metadata from $file\n"
        if ($self->{'verbosity'} > 1);

    $self->extract_oai_metadata(\$text, $metadata);

    # extract_oai_metadata() always stores array refs, but a value supplied
    # by the caller may be a plain scalar (NOTE(review): assumption, the
    # callers are not visible in this file), so accept both shapes rather
    # than dereferencing blindly.
    my $url_array = $metadata->{'URL'};
    my $url = (ref ($url_array) eq 'ARRAY') ? $url_array->[0] : $url_array;

    if (defined $url && ($url !~ m/^http:/)) {
        # The identifier is treated as a file name relative to the directory
        # holding the .oai file: strip the last path component of $filename.
        my $url_base_dir = $filename;
        $url_base_dir =~ s/^(.*)\/(.*?)$/$1/;

        print $outhandle "OAIPlug: passing metadata on to $url\n"
            if ($self->{'verbosity'} > 1);

        return plugin::read($pluginfo, $url_base_dir, $url,
                            $metadata, $processor, $maxdocs);
    }

    # create a new document
    my $doc_obj = doc->new($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "$self->{'plugin_type'}", "1");

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata($doc_obj, $top_section, $metadata);

    # do plugin specific processing of doc_obj
    return undef
        unless defined ($self->process(\$text, $pluginfo, $base_dir,
                                       $file, $metadata, $doc_obj));

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process($doc_obj);

    return 1; # processed the file
}

# do plugin specific processing of doc_obj
#
# Escapes the angle brackets in $$textref (modifying the caller's string in
# place) and adds the result as the text of the document's top section.
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    print $outhandle "OAIPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    # add text to document object
    # Each substitution names its pattern explicitly: an empty pattern would
    # silently re-run whichever regex last matched.
    $$textref =~ s/</&lt;/g;
    $$textref =~ s/>/&gt;/g;

    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}

# Extracts metadata from an OAI record and adds it to %$metadata.
#
# Takes the content of the first <metadata>...</metadata> element found in
# $$textref. If that content holds a <dc ...> or <oai_dc:dc ...> element,
# everything up to and including its opening tag, and its closing tag, are
# removed. Each remaining <name>value</name> pair is then stored under the
# element name with any leading "dc:" dropped and its first character
# upper-cased; values are kept in array refs, so repeated elements
# accumulate. "Identifier" is stored as "URL".
#
# Nothing is added if the text has no <metadata> element. $$textref itself
# is not modified. Returns nothing.
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;

    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata>/s) {
        # Lexical, so leftovers of one record cannot leak into the next call.
        my $metadata_text = $1;

        $metadata_text =~ s/^.*?<(oai_dc:)?dc.*?>(.*?)<\/(oai_dc:)?dc>.*?/$2/s;

        # Consume one element per iteration; $4 is the not-yet-parsed rest.
        while ($metadata_text =~ m/<(.*?)>(.*?)<\/(.*?)>(.*)/s) {
            my $metaname  = $1;
            my $metavalue = $2;
            $metadata_text = $4;

            $metaname =~ s/^(dc:)?(.)/\u$2/;
            if ($metaname eq "Identifier") {
                # name clashes with GSDL reserved metadata name for hash id
                $metaname = "URL";
            }

            my $existing = $metadata->{$metaname};
            if (!defined $existing) {
                $metadata->{$metaname} = [ $metavalue ];
            }
            elsif (ref ($existing) eq 'ARRAY') {
                push(@{$existing}, $metavalue);
            }
            else {
                # A value that is not an array ref is kept alongside the new
                # one instead of being dereferenced as if it were an array.
                $metadata->{$metaname} = [ $existing, $metavalue ];
            }
        }
    }

    return;
}

1;