#!/usr/bin/perl -w ########################################################################### # # importfrom.pl -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # This program will contact the named DL server # and export its metadata and (optionally) it documents. # Currently only designed for OAI exporting BEGIN { die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'}; die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'}; unshift (@INC, "$ENV{'GSDLHOME'}/perllib"); } use colcfg; use util; use parsargv; use FileHandle; sub print_usage { print STDERR "\n usage: $0 [options] collection-name\n\n"; print STDERR " options:\n"; print STDERR " -verbosity number 0=none, 3=lots\n"; print STDERR " -getdoc Also download if source document if present\n"; print STDERR " -importdir directory Where the original material lives\n"; print STDERR " -keepold Will not destroy the current contents of the\n"; print STDERR " import directory (the default)\n"; print STDERR " -removeold Will remove the old contents of the import\n"; print STDERR " directory -- use with care\n"; print STDERR " -gzip Use gzip to compress exported documents\n"; print STDERR " (don't forget to include ZIPPlug in your plugin\n"; print STDERR " -maxdocs number Maximum number of documents to import\n"; print STDERR " -debug Print imported text to STDOUT\n"; print STDERR " -collectdir directory Collection directory (defaults to " . &util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n"; print STDERR " -out Filename or handle to print output status to.\n"; print STDERR " The default is STDERR\n\n"; } sub print_usage_old { my ($prog_name) = @_; print STDERR "Usage: $prog_name OAI-base-URL\n"; exit 1; } sub get_oai_ids { my ($base_url, $out) = @_; print $out "Requesting list of identifiers ...\n"; open (OAIIN,"wget -q -O - \"$base_url?verb=ListIdentifiers&metadataPrefix=oai_dc\" |") || die "wget request failed: $!\n"; my $li_record = ""; my $line; while (defined($line=)) { $li_record .= $line; # print $out $line; } close(OAIIN); print $out "... Done.\n"; return $li_record; } sub parse_oai_ids { my ($li_record, $out) = @_; # extract identifier list $li_record =~ s/^.*?//s; $li_record =~ s/^(.*<\/identifier>).*$/$1/s; my @ids = (); while ($li_record =~ m/(.*?)<\/identifier>(.*)$/s) { $li_record = $2; push(@ids,$1); } return \@ids; } sub dir_file_split { my ($file) = @_; my @dirs = split("/",$file); my $local_file = pop(@dirs); my $sub_dirs = join("/",@dirs); return ($sub_dirs,$local_file); } sub get_oai_document { my ($doc_url,$output_dir, $out) = @_; my ($id_dir,$id_fname) = dir_file_split($doc_url); print $out "Getting document $doc_url\n"; `mkdir $output_dir/.orig` if (!-e "$output_dir/.orig"); my $wget_cmd = "wget -q -O $output_dir/.orig/$id_fname \"$doc_url\""; (system($wget_cmd)==0) || print STDERR "Error: failed to execute $wget_cmd\n"; } sub get_oai_records { my ($base_url,$ids,$output_dir, $get_id, $maxdocs, $out) = @_; my $doc_count = 0; my $i; foreach $i ( @$ids ) { # wget it; my $url = "$base_url?verb=GetRecord&metadataPrefix=oai_dc"; $url .= "&identifier=$i"; print $out "Downloading metadata record for $i\n"; my $file_i = "$output_dir/$i.oai"; $file_i =~ s/:/\//g; # obtain record my $wget_cmd = "wget -q -O - \"$url\""; open (OAIIN,"$wget_cmd|") || die "wget request failed: $!\n"; my $i_record = ""; my $line; while (defined($line=)) { $i_record .= $line; } close(OAIIN); # prepare subdirectory for record (if needed) my ($i_dir,$unused) = dir_file_split($file_i); `mkdir -p $i_dir`; # look out for identifier tag in metadata section if ($i_record =~ m/(.*)<\/metadata>/s) { my $m_record = $1; if ($get_id) { if ($m_record =~ m/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>/s) { my $doc_url = $2; get_oai_document($doc_url,$i_dir, $out); my ($id_dir,$id_fname) = dir_file_split($doc_url); $i_record =~ s/(.*?)<(dc:)?identifier>$doc_url<\/(dc:)?identifier>(.*?)<\/metadata>/$1$doc_url<\/OrigURL>\n .orig\/$id_fname<\/identifier>$4<\/metadata>/s; } } } # save record open (OAIOUT,">$file_i") || die "Unable to save oai metadata record: $!\n"; print OAIOUT $i_record; close(OAIOUT); $doc_count++; last if ($doc_count == $maxdocs); } } sub main { my ($verbosity, $importdir, $keepold, $removeold, $gzip, $groupsize, $debug, $maxdocs, $collection, $configfilename, $collectcfg, $out, $collectdir); if (!parsargv::parse(\@ARGV, 'verbosity/\d+/2', \$verbosity, 'getdoc', \$getdoc, 'importdir/.*/', \$importdir, 'keepold', \$keepold, 'removeold', \$removeold, 'gzip', \$gzip, 'debug', \$debug, 'maxdocs/^\-?\d+/-1', \$maxdocs, 'collectdir/.*/', \$collectdir, 'out/.*/STDERR', \$out)) { &print_usage(); die "\n"; } my $close_out = 0; if ($out !~ /^(STDERR|STDOUT)$/i) { open (OUT, ">$out") || die "Couldn't open output file $out\n"; $out = 'import::OUT'; $close_out = 1; } $out->autoflush(1); # set removeold to false if it has been defined $removeold = 0 if ($keepold); # get and check the collection name if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") { &print_usage(); die "\n"; } # get acquire list my $acquire = []; $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg"); if (-e $configfilename) { $collectcfg = &colcfg::read_collect_cfg ($configfilename); if (defined $collectcfg->{'acquire'}) { $acquire = $collectcfg->{'acquire'}; } if (defined $collectcfg->{'importdir'} && $importdir eq "") { $importdir = $collectcfg->{'importdir'}; } if (defined $collectcfg->{'removeold'}) { if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) { $removeold = 1; } if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) { $removeold = 0; } } } else { die "Couldn't find the configuration file $configfilename\n"; } # fill in the default import directory if none # were supplied, turn all \ into / and remove trailing / $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq ""; $importdir =~ s/[\\\/]+/\//g; $importdir =~ s/\/$//; # remove the old contents of the import directory if needed if ($removeold && -e $importdir) { print $out "Warning - removing current contents of the import directory\n"; print $out " in preparation for the acquire\n"; sleep(5); # just in case... &util::rm_r ($importdir); } my $e; foreach $e ( @$acquire ) { my $acquire_type = shift @$e; my $acquire_src = undef; if ($acquire_type ne "OAI") { print STDERR "Warning: $acquire_type not currently supported. Skipping.\n"; next; } my $store_getdoc = $getdoc; if (!parsargv::parse($e, 'getdoc', \$getdoc, 'src/.*/', \$acquire_src)) { &print_usage(); die "\n"; } if (!defined $acquire_src) { print STDERR "Warning: Not -src flag defined. Skipping.\n"; next; } print $out "$acquire_type Acquire: from $acquire_src\n"; my $li_record = get_oai_ids($acquire_src,$out); my $ids = parse_oai_ids($li_record,$out); get_oai_records($acquire_src,$ids,$importdir, $getdoc, $maxdocs, $out); $getdoc = $store_getdoc; } close OUT if $close_out; } &main();