Context Navigation

← Previous Changeset
Next Changeset →

Changeset 9952

Timestamp:

2005-05-25T17:07:04+12:00 (19 years ago)

Author:

davidb

Message:

Additional features added to importfrom.pl. It can now download a named set
from an OAI server, specify the metadata format (metadataPrefix) to request,
and report general information about the server.

File:

: 1 edited

trunk/gsdl/bin/script/importfrom.pl (modified) (13 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/bin/script/importfrom.pl

-              r9699
+              r9952
 my $wgetopt = "";
+my $num_processed = 0;
 sub print_usage {
 …
+# Write a one-line usage message naming the program and its single
+# OAI-base-URL argument to STDERR, then terminate with exit status 1.
+# This routine never returns to its caller.
+sub print_usage_old
+{
+    my $prog_name = shift;
+
+    print STDERR "Usage: $prog_name OAI-base-URL\n";
+    exit(1);
+}
+sub get_oai_ids
+{
+    my ($base_url, $out) = @_;
+    print $out "Requesting list of identifiers ...\n";
+    open (OAIIN,"wget $wgetopt -q -O - \"$base_url?verb=ListIdentifiers&metadataPrefix=oai_dc\" |")
+# Write $text (expected to be XML) to $out, re-indented by xmllint when
+# that tool is available.
+#
+#   $text      - the XML text to display
+#   $out       - filehandle that receives the (formatted or verbatim) text
+#   $verbosity - warnings about falling back to verbatim output are only
+#                printed when this is greater than 1
+#
+# The text is written verbatim to $out whenever it cannot be formatted:
+# xmllint is not installed, it cannot be started, or it exits with an
+# error / produces no output.
+sub xml_pretty_print
+{
+    my ($text,$out,$verbosity) = @_;
+
+    if (system("xmllint --version >/dev/null 2>&1")!=0) {
+        if ($verbosity>1) {
+            print STDERR "Warning: Unable to find xmllint for pretty printing.\n";
+            print STDERR "         XML will be shown verbatim.\n\n";
+        }
+        print $out $text;
+        return;
+    }
+
+    # Capture xmllint's output rather than letting it inherit our STDOUT,
+    # so the formatted XML ends up on $out just like the verbatim fallback.
+    require IPC::Open2;
+
+    # If xmllint exits before reading its input, fail the write instead of
+    # killing the whole script with SIGPIPE.
+    local $SIG{PIPE} = 'IGNORE';
+
+    my ($pp_out, $pp_in);
+    my $pid = eval {
+        IPC::Open2::open2($pp_out, $pp_in, "xmllint", "--format", "-");
+    };
+    if (!$pid) {
+        print STDERR "Error running xmllint: $@\n\n";
+        print $out $text;
+        return;
+    }
+
+    # Send the whole document and close the write end before reading the
+    # result back, so xmllint sees end-of-input.
+    print $pp_in $text;
+    close($pp_in);
+
+    my $formatted = do { local $/; <$pp_out> };
+    close($pp_out);
+    waitpid($pid, 0);
+
+    if ($? != 0 || !defined $formatted || $formatted eq "") {
+        if ($verbosity>1) {
+            print STDERR "Warning: xmllint was unable to format the XML.\n";
+            print STDERR "         XML will be shown verbatim.\n\n";
+        }
+        print $out $text;
+        return;
+    }
+
+    print $out $formatted;
+}
+sub wget_oai_url
+{
+    my ($wget_cmd,$out,$verbosity) = @_;
+    if ($verbosity>2) {
+    print $out "  $wget_cmd\n";
+    }
+    open (OAIIN,"$wget_cmd |")
     || die "wget request failed: $!\n";
 …
     # print $out $line;
+    }
+    close(OAIIN);
+    return $li_record;
+}
+# Print general information about an OAI server to $out.
+#
+# Sends the "Identify" and "ListSets" requests to $base_url (each fetched
+# with wget through wget_oai_url) and writes every response, passed through
+# xml_pretty_print, underneath its own banner.
+#
+#   $base_url  - base URL of the OAI server; "?<query>" is appended to it
+#   $out       - filehandle the banners and responses are written to
+#   $verbosity - handed on unchanged to wget_oai_url and xml_pretty_print
+#
+# NOTE(review): $base_url is interpolated into a shell command line, so it
+# must come from a trusted source.
+sub oai_info
+{
+    my ($base_url,$out,$verbosity) = @_;
+
+    # One entry per report section: banner title and OAI query string.
+    # A "verb=ListMetadataFormats" section could be added here if wanted.
+    my @sections = (
+        ["General Information", "verb=Identify"],
+        ["Set Information",     "verb=ListSets"],
+    );
+
+    foreach my $section (@sections) {
+        my ($title, $opts) = @$section;
+
+        print $out "-------------------\n";
+        print $out "$title\n";
+        print $out "-------------------\n";
+
+        my $wget_cmd = "wget $wgetopt -q -O - \"$base_url?$opts\"";
+
+        # wget_oai_url closes its OAIIN pipe itself before returning, so
+        # there is nothing left to close here.
+        my $response_text = wget_oai_url($wget_cmd,$out,$verbosity);
+
+        xml_pretty_print($response_text,$out,$verbosity);
+    }
+}
+sub get_oai_ids
+{
+    my ($base_url, $set, $format, $out, $verbosity) = @_;
+    print $out "Requesting list of identifiers ...\n";
+    my $base_wget_cmd = "wget $wgetopt -q -O - \"$base_url?_OPTS_\"";
+    my $identifiers_cmd = $base_wget_cmd;
+    my $identifiers_opts = "verb=ListIdentifiers&metadataPrefix=$format";
+    if (defined $set && ($set ne "")) {
+    $identifiers_opts .= "&set=$set";
+    }
+    $identifiers_cmd =~ s/_OPTS_/$identifiers_opts/;
+    my $li_record = wget_oai_url($identifiers_cmd,$out,$verbosity);
     print $out "... Done.\n";
 …
 sub parse_oai_ids
+{
     my ($li_record, $out) = @_;
+    my ($li_record, $out, $verbosity) = @_;
     # extract identifier list
 …
     print $out "Getting document $doc_url\n";
+    my $srcdocs_dir = &util::filename_cat($output_dir,"srcdocs");
+    &util::mk_dir($srcdocs_dir)  if (!-e "$srcdocs_dir");
+    my $full_id_fname = &util::filename_cat($srcdocs_dir,$id_fname);
+    my $wget_cmd = "wget $wgetopt -q -O \"$full_id_fname\" \"$doc_url\"";
+    (system($wget_cmd)==0)
+    || print STDERR "Error: failed to execute $wget_cmd\n";
+    &util::mk_dir($output_dir)  if (!-e "$output_dir");
+    my $full_id_fname = &util::filename_cat($output_dir,$id_fname);
+    my $wget_cmd = "wget $wgetopt --quiet -O $full_id_fname \"$doc_url\"";
+    if (system($wget_cmd)!=0) {
+    print STDERR "Error: failed to execute $wget_cmd\n";
+    return 0;
+    }
+    return 1;
+}
 sub get_oai_records
+{
     my ($base_url,$ids,$output_dir, $get_id, $maxdocs, $out) = @_;
+    my ($base_url,$format, $ids,$output_dir, $get_id, $maxdocs, $out) = @_;
     my $doc_count = 0;
 …
+    {
     # wget it;
     my $url = "$base_url?verb=GetRecord&metadataPrefix=oai_dc";
+    my $url = "$base_url?verb=GetRecord&metadataPrefix=$format";
     $url .= "&identifier=$i";
     print $out "Downloading metadata record for $i\n";
 …
     close(OAIIN);
+    $num_processed++;
     # prepare subdirectory for record (if needed)
 …
         if ($get_id)
+        {
+        if ($m_record =~ m/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>/s)
+        my $got_doc = 0;
+        my @url_matches = ($m_record =~ m/<(?:dc:)?identifier>(.*?)<\/(?:dc:)?identifier>/gs);
+        foreach my $doc_url (@url_matches)
+        {
+            my $doc_url = $2;
+            get_oai_document($doc_url,$i_dir, $out);
+            my ($id_dir,$id_fname) = dir_file_split($doc_url);
+            $i_record =~ s/<metadata>(.*?)<(dc:)?identifier>$doc_url<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$doc_url<\/OrigURL>\n   <identifier>srcdocs\/$id_fname<\/identifier>$4<\/metadata>/s;
+            if ($doc_url =~ m/^(http|ftp):/) {
+            my $revised_doc_url = $doc_url;
+##          $revised_doc_url =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
+            my $srcdocs_dir = &util::filename_cat($i_dir,"srcdocs");
+            if (get_oai_document($revised_doc_url,$srcdocs_dir, $out)) {
+                $got_doc = 1;
+                my ($id_dir,$id_fname) = dir_file_split($revised_doc_url);
+                $i_record =~ s/<metadata>(.*?)<(dc:)?identifier>$doc_url<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$doc_url<\/OrigURL>\n   <identifier>srcdocs\/$id_fname<\/identifier>$4<\/metadata>/s;
+            }
+            }
+            if (!$got_doc) {
+            $i_record =~ s/<metadata>(.*?)<(dc:)?identifier>$doc_url<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigIdentifier>$doc_url<\/OrigIdentifier>$4<\/metadata>/s;
+            }
+        }
+        }
 …
 sub main {
     my ($verbosity, $importdir, $keepold,
+    $getdoc, $acquire_info, $acquire_set,
     $removeold, $gzip, $groupsize, $debug, $maxdocs, $collection,
     $configfilename, $collectcfg,
 …
              'verbosity/\d+/2', \$verbosity,
              'getdoc', \$getdoc,
+             'info', \$acquire_info,
              'importdir/.*/', \$importdir,
              'keepold', \$keepold,
 …
     print $out "Warning - removing current contents of the import directory\n";
     print $out "          in preparation for the acquire\n";
-    sleep(5); # just in case...
     &util::rm_r ($importdir);
+    }
 …
     if (!parsargv::parse($e,
+                 'getdoc', \$getdoc,
+                 'getdoc',  \$getdoc,
+                 'set/.*/', \$acquire_set,
+                 'format/.*/oai_dc', \$metadata_format,
                  'src/.*/', \$acquire_src)) {
         &print_usage();
 …
+    }
+    if (defined $acquire_info && ($acquire_info)) {
+        oai_info($acquire_src,$out,$verbosity);
+        next;
+    }
     print $out "$acquire_type Acquire: from $acquire_src\n";
+    my $li_record = get_oai_ids($acquire_src,$out);
+    my $ids = parse_oai_ids($li_record,$out);
+    get_oai_records($acquire_src,$ids,$importdir, $getdoc, $maxdocs, $out);
+    my $li_record = get_oai_ids($acquire_src,$acquire_set,$metadata_format,
+                    $out,$verbosity);
+    my $ids = parse_oai_ids($li_record,$out,$verbosity);
+    get_oai_records($acquire_src,$metadata_format, $ids,$importdir,
+            $getdoc, $maxdocs, $out);
     $getdoc = $store_getdoc;
+    }
+    print "\nNumber of documents processed: $num_processed\n";
     close OUT if $close_out;
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 9952

Legend:

trunk/gsdl/bin/script/importfrom.pl

Download in other formats: