Changeset 9952
- Timestamp:
- 2005-05-25T17:07:04+12:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/importfrom.pl
r9699 r9952 44 44 45 45 my $wgetopt = ""; 46 47 my $num_processed = 0; 46 48 47 49 sub print_usage { … … 67 69 68 70 69 70 sub print_usage_old 71 { 72 my ($prog_name) = @_; 73 74 print STDERR "Usage: $prog_name OAI-base-URL\n"; 75 exit 1; 76 } 77 78 sub get_oai_ids 79 { 80 my ($base_url, $out) = @_; 81 82 print $out "Requesting list of identifiers ...\n"; 83 84 open (OAIIN,"wget $wgetopt -q -O - \"$base_url?verb=ListIdentifiers&metadataPrefix=oai_dc\" |") 71 sub xml_pretty_print 72 { 73 my ($text,$out,$verbosity) = @_; 74 75 if (system("xmllint --version >/dev/null 2>&1")!=0) { 76 if ($verbosity>1) { 77 print STDERR "Warning: Unable to find xmllint for pretty printing.\n"; 78 print STDERR " XML will be shown verbatim.\n\n"; 79 } 80 print $out $text; 81 } 82 else { 83 84 if (!open (PPOUT,"|xmllint --format -")) { 85 print STDERR "Error running xmllint: $!\n\n"; 86 print $out $text; 87 return; 88 } 89 90 print PPOUT $text; 91 close(PPOUT); 92 } 93 } 94 95 sub wget_oai_url 96 { 97 my ($wget_cmd,$out,$verbosity) = @_; 98 99 if ($verbosity>2) { 100 print $out " $wget_cmd\n"; 101 } 102 103 open (OAIIN,"$wget_cmd |") 85 104 || die "wget request failed: $!\n"; 86 105 … … 93 112 # print $out $line; 94 113 } 114 115 close(OAIIN); 116 117 return $li_record; 118 } 119 120 sub oai_info 121 { 122 my ($base_url,$out,$verbosity) = @_; 123 124 my $base_wget_cmd = "wget $wgetopt -q -O - \"$base_url?_OPTS_\""; 125 126 my $identify = "verb=Identify"; 127 my $list_sets = "verb=ListSets"; 128 my $list_md_formats = "ListMetadataFormats"; # not currently used 129 130 my $identify_cmd = $base_wget_cmd; 131 $identify_cmd =~ s/_OPTS_/$identify/; 132 print $out "-------------------\n"; 133 print $out "General Information\n"; 134 print $out "-------------------\n"; 135 my $identify_text = wget_oai_url($identify_cmd,$out,$verbosity); 136 xml_pretty_print($identify_text,$out,$verbosity); 137 95 138 96 close(OAIIN); 139 my $list_sets_cmd = $base_wget_cmd; 140 $list_sets_cmd =~ s/_OPTS_/$list_sets/; 141 print $out "-------------------\n"; 142 print $out "Set Information\n"; 143 print $out "-------------------\n"; 144 my $list_sets_text = wget_oai_url($list_sets_cmd,$out,$verbosity); 145 xml_pretty_print($list_sets_text,$out,$verbosity); 146 } 147 148 149 sub get_oai_ids 150 { 151 my ($base_url, $set, $format, $out, $verbosity) = @_; 152 153 print $out "Requesting list of identifiers ...\n"; 154 155 my $base_wget_cmd = "wget $wgetopt -q -O - \"$base_url?_OPTS_\""; 156 my $identifiers_cmd = $base_wget_cmd; 157 158 my $identifiers_opts = "verb=ListIdentifiers&metadataPrefix=$format"; 159 160 if (defined $set && ($set ne "")) { 161 $identifiers_opts .= "&set=$set"; 162 } 163 164 $identifiers_cmd =~ s/_OPTS_/$identifiers_opts/; 165 166 my $li_record = wget_oai_url($identifiers_cmd,$out,$verbosity); 167 97 168 print $out "... Done.\n"; 98 169 … … 102 173 sub parse_oai_ids 103 174 { 104 my ($li_record, $out ) = @_;175 my ($li_record, $out, $verbosity) = @_; 105 176 106 177 # extract identifier list … … 139 210 print $out "Getting document $doc_url\n"; 140 211 141 my $srcdocs_dir = &util::filename_cat($output_dir,"srcdocs"); 142 &util::mk_dir($srcdocs_dir) if (!-e "$srcdocs_dir"); 143 144 my $full_id_fname = &util::filename_cat($srcdocs_dir,$id_fname); 145 146 my $wget_cmd = "wget $wgetopt -q -O \"$full_id_fname\" \"$doc_url\""; 147 148 (system($wget_cmd)==0) 149 || print STDERR "Error: failed to execute $wget_cmd\n"; 150 212 &util::mk_dir($output_dir) if (!-e "$output_dir"); 213 214 my $full_id_fname = &util::filename_cat($output_dir,$id_fname); 215 216 my $wget_cmd = "wget $wgetopt --quiet -O $full_id_fname \"$doc_url\""; 217 218 if (system($wget_cmd)!=0) { 219 print STDERR "Error: failed to execute $wget_cmd\n"; 220 return 0; 221 } 222 223 return 1; 151 224 } 152 225 153 226 sub get_oai_records 154 227 { 155 my ($base_url,$ ids,$output_dir, $get_id, $maxdocs, $out) = @_;228 my ($base_url,$format, $ids,$output_dir, $get_id, $maxdocs, $out) = @_; 156 229 157 230 my $doc_count = 0; … … 161 234 { 162 235 # wget it; 163 my $url = "$base_url?verb=GetRecord&metadataPrefix= oai_dc";236 my $url = "$base_url?verb=GetRecord&metadataPrefix=$format"; 164 237 $url .= "&identifier=$i"; 165 238 print $out "Downloading metadata record for $i\n"; … … 188 261 189 262 close(OAIIN); 263 264 $num_processed++; 190 265 191 266 # prepare subdirectory for record (if needed) … … 201 276 if ($get_id) 202 277 { 203 if ($m_record =~ m/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>/s) 278 my $got_doc = 0; 279 280 my @url_matches = ($m_record =~ m/<(?:dc:)?identifier>(.*?)<\/(?:dc:)?identifier>/gs); 281 foreach my $doc_url (@url_matches) 204 282 { 205 my $doc_url = $2; 206 get_oai_document($doc_url,$i_dir, $out); 207 208 209 my ($id_dir,$id_fname) = dir_file_split($doc_url); 210 211 $i_record =~ s/<metadata>(.*?)<(dc:)?identifier>$doc_url<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$doc_url<\/OrigURL>\n <identifier>srcdocs\/$id_fname<\/identifier>$4<\/metadata>/s; 283 if ($doc_url =~ m/^(http|ftp):/) { 284 285 my $revised_doc_url = $doc_url; 286 ## $revised_doc_url =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/; 287 288 my $srcdocs_dir = &util::filename_cat($i_dir,"srcdocs"); 289 290 if (get_oai_document($revised_doc_url,$srcdocs_dir, $out)) { 291 292 $got_doc = 1; 293 my ($id_dir,$id_fname) = dir_file_split($revised_doc_url); 294 295 $i_record =~ s/<metadata>(.*?)<(dc:)?identifier>$doc_url<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$doc_url<\/OrigURL>\n <identifier>srcdocs\/$id_fname<\/identifier>$4<\/metadata>/s; 296 297 } 298 } 299 300 if (!$got_doc) { 301 $i_record =~ s/<metadata>(.*?)<(dc:)?identifier>$doc_url<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigIdentifier>$doc_url<\/OrigIdentifier>$4<\/metadata>/s; 302 } 212 303 } 213 304 } … … 228 319 sub main { 229 320 my ($verbosity, $importdir, $keepold, 321 $getdoc, $acquire_info, $acquire_set, 230 322 $removeold, $gzip, $groupsize, $debug, $maxdocs, $collection, 231 323 $configfilename, $collectcfg, … … 235 327 'verbosity/\d+/2', \$verbosity, 236 328 'getdoc', \$getdoc, 329 'info', \$acquire_info, 237 330 'importdir/.*/', \$importdir, 238 331 'keepold', \$keepold, … … 298 391 print $out "Warning - removing current contents of the import directory\n"; 299 392 print $out " in preparation for the acquire\n"; 300 sleep(5); # just in case...301 393 &util::rm_r ($importdir); 302 394 } … … 316 408 317 409 if (!parsargv::parse($e, 318 'getdoc', \$getdoc, 410 'getdoc', \$getdoc, 411 'set/.*/', \$acquire_set, 412 'format/.*/oai_dc', \$metadata_format, 319 413 'src/.*/', \$acquire_src)) { 320 414 &print_usage(); … … 327 421 } 328 422 423 if (defined $acquire_info && ($acquire_info)) { 424 oai_info($acquire_src,$out,$verbosity); 425 next; 426 } 427 329 428 print $out "$acquire_type Acquire: from $acquire_src\n"; 330 429 331 my $li_record = get_oai_ids($acquire_src,$out); 332 my $ids = parse_oai_ids($li_record,$out); 333 334 get_oai_records($acquire_src,$ids,$importdir, $getdoc, $maxdocs, $out); 430 my $li_record = get_oai_ids($acquire_src,$acquire_set,$metadata_format, 431 $out,$verbosity); 432 my $ids = parse_oai_ids($li_record,$out,$verbosity); 433 434 get_oai_records($acquire_src,$metadata_format, $ids,$importdir, 435 $getdoc, $maxdocs, $out); 335 436 $getdoc = $store_getdoc; 336 437 } 337 438 439 print "\nNumber of documents processed: $num_processed\n"; 440 338 441 close OUT if $close_out; 339 442 }
Note:
See TracChangeset
for help on using the changeset viewer.