########################################################################### # # WebDownload.pm -- base class for all the import plugins # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package OAIDownload; eval {require bytes}; # suppress the annoying "subroutine redefined" warning that various # plugins cause under perl 5.6 $SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)}; use strict; use WgetDownload; use XMLParser; use POSIX qw(tmpnam); use util; sub BEGIN { @OAIDownload::ISA = ('WgetDownload'); } my $arguments = [ { 'name' => "url", 'disp' => "{OAIDownload.url_disp}", 'desc' => "{OAIDownload.url}", 'type' => "string", 'reqd' => "yes"}, { 'name' => "set", 'disp' => "{OAIDownload.set_disp}", 'desc' => "{OAIDownload.set}", 'type' => "string", 'reqd' => "no"}, { 'name' => "get_doc", 'disp' => "{OAIDownload.get_doc_disp}", 'desc' => "{OAIDownload.get_doc}", 'type' => "flag", 'reqd' => "no"}, { 'name' => "max_records", 'disp' => "{OAIDownload.max_records_disp}", 'desc' => "{OAIDownload.max_records}", 'type' => "int", 'deft' => "500", 'range' => "1,", 'reqd' => "no"} ]; my $options = { 'name' => "OAIDownload", 'desc' => "{OAIDownload.desc}", 'abstract' => "no", 'inherits' => "yes", 'args' => $arguments }; my $self; my $strWgetOptions=""; sub new { my ($class) = shift (@_); my ($getlist,$inputargs,$hashArgOptLists) = @_; push(@$getlist, $class); if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});} if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; $self = (defined $hashArgOptLists)? new WgetDownload($getlist,$inputargs,$hashArgOptLists): new WgetDownload($getlist,$inputargs); if ($self->{'info_only'}) { # don't worry about any options etc return bless $self, $class; } my $parser = new XML::Parser('Style' => 'Stream', 'Handlers' => {'Char' => \&Char, 'Start' => \&OAI_StartTag, 'End' => \&OAI_EndTag }); $self->{'parser'} = $parser; # make sure the tmp directory that we will use later exists my $tmp_dir = "$ENV{GSDLHOME}/tmp"; if (! -e $tmp_dir) { &util::mk_dir($tmp_dir); } return bless $self, $class; } sub download { my ($self) = shift (@_); my ($hashGeneralOptions) = @_; print STDERR "here2"; $strWgetOptions = $self->getWgetOptions(); my $cmdWget = $strWgetOptions; my $strOutputDir =""; $strOutputDir = $hashGeneralOptions->{"cache_dir"}; my $strBasURL = $self->{'url'}; my $intMaxRecords = $self->{'max_records'}; my $blnDownloadDoc = $self->{'get_doc'}; print STDERR "<>\n"; my $strIDs = $self->getOAIIDs($strBasURL); if($strIDs eq "") { print STDERR "Error: No ID being found\n"; return 0; } my $aryIDs = $self->parseOAIIDs($strIDs); my $intIDs = 0; if($self->{'max_records'} < scalar(@$aryIDs)) { $intIDs = $self->{'max_records'}; } else { $intIDs = scalar(@$aryIDs); } print STDERR "<>\n"; $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc); my $tmp_file = "$ENV{GSDLHOME}/tmp/oai.tmp"; &util::rm($tmp_file); return 1; } sub getOAIIDs { my ($self,$strBasURL) = @_; my ($cmdWget); my $wgetOptions = $self->getWgetOptions(); $cmdWget = $wgetOptions; print STDERR "Gathering OAI identifiers.....\n"; if($self->{'set'} ne "") { $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=oai_dc&set=$self->{'set'}\" "; } else { $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=oai_dc\" "; } my $strIDs = $self->useWget($cmdWget); if (!defined $strIDs or $strIDs eq "" ){ print STDERR "Server information is unavailable.\n"; print STDERR "<>\n"; return; } print STDERR "<>\n"; $self->parse_xml($strIDs); return $strIDs; } sub parseOAIIDs { my ($self,$strIDs) = @_; print STDERR "Parsing OAI identifiers.....\n"; $strIDs =~ s/^.*?//s; $strIDs =~ s/^(.*<\/identifier>).*$/$1/s; my @aryIDs = (); while ($strIDs =~ m/(.*?)<\/identifier>(.*)$/s) { $strIDs = $2; push(@aryIDs,$1); } return \@aryIDs; } sub dirFileSplit { my ($self,$strFile) = @_; my @aryDirs = split("[/\]",$strFile); my $strLocalFile = pop(@aryDirs); my $strSubDirs = join("/",@aryDirs); return ($strSubDirs,$strLocalFile); } sub getOAIDoc { my ($self,$strRecord, $strSubDirPath) = @_; print STDERR "Gathering source documents.....\n"; # look out for identifier tag in metadata section if ($strRecord =~ m/(.*)<\/metadata>/s) { my $strMetaTag = $1; if ($strMetaTag =~ m/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>/s) { my $strDocURL = $2; my ($unused,$strDocFile) = $self->dirFileSplit($strDocURL); my $strSoureDirPath =""; $strSoureDirPath = &util::filename_cat($strSubDirPath,"srcdocs"); &util::mk_dir($strSoureDirPath) if (!-e "$strSoureDirPath"); my $strFullDocFilePath = &util::filename_cat($strSoureDirPath,$strDocFile); my $wget_cmd = $strWgetOptions." -q -O \"$strFullDocFilePath\" \"$strDocURL\""; my $strResponse = $self->useWget($wget_cmd,1); if($strResponse ne "") { print STDERR "Error occured while retriving OAI souce documents: $strResponse\n"; exit(-1); } $strRecord =~ s/(.*?)<(dc:)?identifier>$strDocURL<\/(dc:)?identifier>(.*?)<\/metadata>/$1$strDocURL<\/OrigURL>\n srcdocs\/$strDocFile<\/identifier>$4<\/metadata>/s; } else { print STDERR "\tNo souce document URL is specified in the OAI record (No (dc:)?identifier is provided)\n"; } } else { print STDERR "\tNo souce document URL is specified in the OAI record (No metadata field is provided)\n"; } } sub getOAIRecords { my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_; my $intDocCounter = 0; foreach my $strID ( @$aryIDs) { print STDERR "Gathering OAI record with ID:$strID.....\n"; my $cmdWget= $strWgetOptions." -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=oai_dc&identifier=$strID\""; my $strRecord = $self->useWget($cmdWget); my @fileDirs = split(":",$strID); # setup directories $strOutputDir =~ s/"//g; #" my $host =$self->{'url'}; $host =~ s/http:\/\///g; $host =~ s/:.*//g; my $midDir = join ("/",@fileDirs); my $strFileURL = "$strOutputDir/$host/".$midDir.".oai"; # prepare subdirectory for record (if needed) my ($strSubDirPath,$unused) = ("", ""); ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL); &util::mk_all_dir($strSubDirPath); my $ds = &util::get_dirsep(); if($blnDownloadDoc) { $self->getOAIDoc($strRecord,$strSubDirPath); } # save record open (OAIOUT,">$strFileURL") || die "Unable to save oai metadata record: $!\n"; print OAIOUT $strRecord; close(OAIOUT); print STDERR "Saving records to $strFileURL\n"; print STDERR "<>\n"; $intDocCounter ++; last if ($intDocCounter >= $intMaxRecords); } ($intDocCounter >= $intMaxRecords) ? print STDERR "Reach maximum download records, use -max_records to set the maximum.\n": print STDERR "Complete download meta record from $strBasURL\n"; print STDERR "<>\n"; } sub url_information { my ($self) = shift (@_); if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";} my $wgetOptions = $self->getWgetOptions(); my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\""; my $strIdentify = "verb=Identify"; my $strListSets = "verb=ListSets"; my $strIdentifyCMD = $strBaseCMD; $strIdentifyCMD =~ s/_OPTS_/$strIdentify/; my $strIdentifyText = $self->useWget($strIdentifyCMD); if (!defined $strIdentifyText or $strIdentifyText eq "" ){ print STDERR "Server information is unavailable.\n"; print STDERR "<>\n"; return; } print STDERR "General information:\n"; $self->parse_xml($strIdentifyText); my $strListSetCMD = $strBaseCMD; $strListSetCMD =~ s/_OPTS_/$strListSets/; my $strListSetsText = $self->useWget($strListSetCMD); print STDERR "List Information:\n"; $self->parse_xml($strListSetsText); } sub parse_xml { my ($self) = shift (@_); my ($strOutputText) = @_; #Open a temporary file to store OAI information, and store the information to the temp file my $name = "$ENV{GSDLHOME}/tmp/oai.tmp"; open(*OAIOUT,"> $name"); print OAIOUT $strOutputText; close(OAIOUT); $self->{'temp_file_name'} = $name; eval { $self->{'parser'}->parsefile("$name"); }; if ($@) { die "OAI: $name is not a well formed XML file ($@)\n"; } } END{ if($self->{'info'}) { unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!"; } } # This Char function overrides the one in XML::Parser::Stream to overcome a # problem where $expat->{Text} is treated as the return value, slowing # things down significantly in some cases. sub Char { use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+ $_[0]->{'Text'} .= $_[1]; if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) { $self->{'text'} .= $_[1]; $self->{'text'} =~ s/[\n]|([ ]{2,})//g; if($self->{'text'} ne "") { print STDERR " $self->{'subfield'}:($self->{'text'})\n"; } } return undef; } sub OAI_StartTag { my ($expat, $element, %attr) = @_; $self->{'subfield'} = $element; } sub OAI_EndTag { my ($expat, $element) = @_; $self->{'text'} = ""; $self->{'subfield'} = ""; } sub error { my ($self,$strFunctionName,$strError) = @_; { print "Error occoured in OAIDownload.pm\n". "In Function:".$strFunctionName."\n". "Error Message:".$strError."\n"; exit(-1); } } 1;