source: gsdl/trunk/bin/script/importfrom.pl@ 14178

Last change on this file since 14178 was 14178, checked in by xiao, 17 years ago

The directory arguments of wget command need to be quoted to work properly on Windows OS

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.9 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# importfrom.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will contact the named DL server
30# and export its metadata and (optionally) its documents.
31
32# Currently only designed for OAI exporting
33
# Compile-time setup: refuse to run unless the Greenstone environment
# variables are present, then put Greenstone's perllib directory at the
# front of @INC so the `use` lines that follow can be resolved.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
39
40use colcfg;
41use util;
42use parsargv;
43use FileHandle;
44
# Extra options spliced into every wget command line built in this file
# (empty by default).
my $wgetopt = '';

# Count of metadata records requested so far; incremented in
# get_oai_records and reported at the end of main.
my $num_processed = 0;
48
# Print the command-line synopsis and option summary to STDERR.
#
# Takes no arguments.  The default shown for -collectdir is built from
# $ENV{'GSDLHOME'} with util::filename_cat.
sub print_usage {
    my $default_collectdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect");

    print STDERR "\n usage: $0 [options] collection-name\n\n";
    print STDERR " options:\n";
    print STDERR " -verbosity number 0=none, 3=lots\n";
    print STDERR " -getdoc Also download the source document if present\n";
    # -info is parsed by main (it prints repository details and skips the
    # import) so it belongs in the option summary as well.
    print STDERR " -info Show repository and set information instead of importing\n";
    print STDERR " -importdir directory Where the original material lives\n";
    print STDERR " -keepold Will not destroy the current contents of the\n";
    print STDERR " import directory (the default)\n";
    print STDERR " -removeold Will remove the old contents of the import\n";
    print STDERR " directory -- use with care\n";
    print STDERR " -gzip Use gzip to compress exported documents\n";
    print STDERR " (don't forget to include ZIPPlug in your plugin list)\n";
    print STDERR " -maxdocs number Maximum number of documents to import\n";
    print STDERR " -debug Print imported text to STDOUT\n";
    print STDERR " -collectdir directory Collection directory (defaults to " .
        $default_collectdir . ")\n";
    print STDERR " -out Filename or handle to print output status to.\n";
    print STDERR " The default is STDERR\n\n";
}
68
69
70
# Show a chunk of XML, reformatted by xmllint when that tool is available.
#
#   $text      - the XML to display
#   $out       - handle that receives $text verbatim when xmllint cannot
#                be used
#   $verbosity - warnings about a missing or failing xmllint are only
#                printed when this is greater than 1
#
# When xmllint is used it is fed through a pipe and writes its formatted
# result to this process's standard output rather than to $out.
sub xml_pretty_print
{
    my ($text,$out,$verbosity) = @_;

    my $chatty = (defined $verbosity && $verbosity > 1);

    # Use the platform's null device ("/dev/null", "nul", ...) so that the
    # availability probe also works where /dev/null does not exist.
    require File::Spec;
    my $null_device = File::Spec->devnull();

    if (system("xmllint --version >$null_device 2>&1") != 0) {
        if ($chatty) {
            print STDERR "Warning: Unable to find xmllint for pretty printing.\n";
            print STDERR " XML will be shown verbatim.\n\n";
        }
        print $out $text;
        return;
    }

    # Lexical handle: closed automatically on every exit path and cannot
    # collide with a handle of the same name elsewhere.
    my $pp_out;
    if (!open ($pp_out, "|xmllint --format -")) {
        print STDERR "Error running xmllint: $!\n\n";
        print $out $text;
        return;
    }

    print {$pp_out} $text;

    # close() on a pipe waits for the child and fails when it exits non-zero
    # (for example when the XML could not be parsed).
    if (!close($pp_out) && $chatty) {
        print STDERR "Warning: xmllint did not finish cleanly (exit status " . ($? >> 8) . ").\n";
    }
}
94
# Run a ready-made command line (normally a wget invocation that writes to
# standard output) and return everything it printed.
#
#   $wget_cmd  - complete shell command to execute
#   $out       - handle used to echo the command when $verbosity > 2
#   $verbosity - verbosity level
#
# Returns the captured output as one string (empty if there was none).
# Dies if the command cannot be started; a command that starts but exits
# with an error is reported on STDERR and whatever it printed is returned.
sub wget_oai_url
{
    my ($wget_cmd,$out,$verbosity) = @_;

    if (defined $verbosity && $verbosity > 2) {
        print $out " $wget_cmd\n";
    }

    # Three-argument piped open with a lexical handle; the command is still
    # a single string, so it is interpreted exactly as before.
    open (my $oai_in, '-|', $wget_cmd)
        || die "wget request failed: $!\n";

    # Slurp the whole response in one read.
    my $li_record = do { local $/; <$oai_in> };
    $li_record = "" unless defined $li_record;

    # close() on a pipe reports the child's exit status; surface failures
    # instead of silently handing back an empty or truncated response.
    if (!close($oai_in)) {
        print STDERR "Warning: command failed (exit status " . ($? >> 8) . "): $wget_cmd\n";
    }

    return $li_record;
}
119
# Print descriptive information about an OAI repository: the response to
# the Identify request followed by the response to the ListSets request.
# Each response is fetched with wget_oai_url and displayed with
# xml_pretty_print under a banner written to $out.
#
#   $base_url  - base URL of the OAI server
#   $out       - handle for the banners (also passed to the helpers)
#   $verbosity - passed through to the helpers
sub oai_info
{
    my ($base_url,$out,$verbosity) = @_;

    # Command template; _OPTS_ is replaced by the query string of each request.
    my $cmd_template = "wget $wgetopt -q -O - \"$base_url?_OPTS_\"";
    my $rule = "-------------------\n";

    my @sections = (
        [ "General Information\n", "verb=Identify" ],
        [ "Set Information\n",     "verb=ListSets" ],
    );

    foreach my $section (@sections) {
        my ($heading, $query) = @$section;

        my $cmd = $cmd_template;
        $cmd =~ s/_OPTS_/$query/;

        print $out $rule;
        print $out $heading;
        print $out $rule;

        my $response = wget_oai_url($cmd,$out,$verbosity);
        xml_pretty_print($response,$out,$verbosity);
    }
}
147
148
# Ask an OAI server for its list of record identifiers.
#
#   $base_url  - base URL of the OAI server
#   $set       - optional set name; ignored when undefined or empty
#   $format    - metadata prefix to request
#   $out       - handle for progress messages
#   $verbosity - passed through to wget_oai_url
#
# Returns the raw ListIdentifiers response text as returned by wget_oai_url.
sub get_oai_ids
{
    my ($base_url, $set, $format, $out, $verbosity) = @_;

    print $out "Requesting list of identifiers ...\n";

    # Build the query string first, then drop it into the command template.
    my $query = "verb=ListIdentifiers&metadataPrefix=$format";
    $query .= "&set=$set" if (defined $set && $set ne "");

    my $request_cmd = "wget $wgetopt -q -O - \"$base_url?_OPTS_\"";
    $request_cmd =~ s/_OPTS_/$query/;

    my $response = wget_oai_url($request_cmd,$out,$verbosity);

    print $out "... Done.\n";

    return $response;
}
172
# Extract the identifiers from a ListIdentifiers response.
#
#   $li_record - response text to scan
#   $out       - unused; kept so existing callers need not change
#   $verbosity - unused; kept so existing callers need not change
#
# Returns a reference to an array holding the text found between each
# <identifier> ... </identifier> pair, in document order.  The array is
# empty when there are none or when $li_record is undefined.
sub parse_oai_ids
{
    my ($li_record, $out, $verbosity) = @_;

    return [] unless defined $li_record;

    # One global match collects every identifier in a single pass; the
    # earlier approach re-copied the remainder of the string after each
    # match, which is quadratic on long lists.  /s lets an identifier span
    # line breaks.
    my @ids = ($li_record =~ m/<identifier>(.*?)<\/identifier>/gs);

    return \@ids;
}
191
192
# Split a "/"-separated path or URL into its leading part and its final
# component.
#
# Returns the two-element list (everything before the last component
# re-joined with "/", the last component).
sub dir_file_split
{
    my ($file) = @_;

    my @segments = split(m{/}, $file);
    my $last = $#segments;

    # An empty segment list yields an undefined final component.
    my $leaf = ($last >= 0) ? $segments[$last] : undef;
    my $parent = join("/", @segments[0 .. $last - 1]);

    return ($parent, $leaf);
}
203
# Download one source document with wget.
#
#   $doc_url    - URL of the document
#   $output_dir - directory that receives the file (created if missing)
#   $out        - handle for the progress message
#
# The local file is named after the last "/"-separated component of
# $doc_url.  Returns 1 on success and 0 on failure.
sub get_oai_document
{
    my ($doc_url,$output_dir, $out) = @_;

    my (undef, $id_fname) = dir_file_split($doc_url);

    print $out "Getting document $doc_url\n";

    if (!defined $id_fname || $id_fname eq "") {
        print STDERR "Error: cannot derive a file name from $doc_url\n";
        return 0;
    }

    &util::mk_dir($output_dir) if (!-e "$output_dir");

    my $full_id_fname = &util::filename_cat($output_dir,$id_fname);

    # $doc_url is taken from a remote server's metadata, so it must never be
    # interpreted by a shell.  The list form of system() hands each argument
    # to wget as-is, which also copes with spaces in the output path.
    # $wgetopt is split on whitespace into separate options.
    my @wget_args = (split(' ', $wgetopt), "--quiet", "-O", $full_id_fname, $doc_url);

    if (system("wget", @wget_args) != 0) {
        print STDERR "Error: failed to execute wget " . join(" ", @wget_args) . "\n";
        # Remove the output file if the failed download left it empty.
        unlink($full_id_fname) if (-e $full_id_fname && -z $full_id_fname);
        return 0;
    }

    return 1;
}
225
# Download the metadata record for each identifier and save it under
# $output_dir, optionally fetching the documents the record points to.
#
#   $base_url   - base URL of the OAI server
#   $format     - metadata prefix to request
#   $ids        - reference to an array of record identifiers
#   $output_dir - directory that receives the .oai files
#   $get_id     - when true, identifier elements in the metadata that look
#                 like http/ftp URLs are downloaded into a "srcdocs"
#                 directory beside the record, and the record is rewritten
#                 to refer to the local copy
#   $maxdocs    - stop once this many records have been saved (a value that
#                 is never reached, such as -1, means no limit)
#   $out        - handle for progress messages
#
# Each ":" in an identifier becomes a directory separator in the saved
# file's path.  Dies if a record cannot be written.
sub get_oai_records
{
    my ($base_url,$format, $ids,$output_dir, $get_id, $maxdocs, $out) = @_;

    my $doc_count = 0;
    my $ds = &util::get_dirsep();

    foreach my $i ( @$ids )
    {
        my $i_url = $i; #convert OAI set separators (:) to directory sep
        $i_url =~ s/:/\//g;

        # Identifiers come from the remote server; refuse any that would
        # climb out of the output directory.
        if ($i_url =~ m{(?:\A|[\\/])\.\.(?:[\\/]|\z)}) {
            print STDERR "Warning: identifier $i contains a '..' path component. Skipping.\n";
            next;
        }
        my $file_i_url = "$output_dir/$i_url.oai";

        my $i_os = $i; #convert OAI set separators (:) to OS dir sep
        $i_os =~ s/:/$ds/g;
        my $file_i = &util::filename_cat($output_dir,"$i_os.oai");

        # Percent-encode everything in the identifier except unreserved
        # characters, ":" and "/".  This keeps the request URL well formed
        # and leaves nothing the shell treats specially inside the quoted
        # URL argument.
        my $escaped_id = $i;
        $escaped_id =~ s/([^A-Za-z0-9_.~:\/-])/sprintf("%%%02X", ord($1))/ge;

        my $url = "$base_url?verb=GetRecord&metadataPrefix=$format";
        $url .= "&identifier=$escaped_id";
        print $out "Downloading metadata record for $i\n";

        # obtain record (verbosity 0: the command line is not echoed)
        my $wget_cmd = "wget $wgetopt -q -O - \"$url\"";
        my $i_record = wget_oai_url($wget_cmd, $out, 0);

        $num_processed++;

        if (!defined $i_record || $i_record eq "") {
            print STDERR "Warning: no metadata record received for $i. Skipping.\n";
            next;
        }

        # prepare subdirectory for record (if needed)
        my ($i_dir,$unused) = dir_file_split($file_i_url);

        &util::mk_all_dir($i_dir);

        # look out for identifier tag in metadata section
        if ($get_id && $i_record =~ m/<metadata>(.*)<\/metadata>/s)
        {
            my $m_record = $1;
            my $got_doc = 0;

            my @url_matches = ($m_record =~ m/<(?:dc:)?identifier>(.*?)<\/(?:dc:)?identifier>/gs);
            foreach my $doc_url (@url_matches)
            {
                # \Q...\E makes the URL match literally; URLs routinely
                # contain regex metacharacters such as "?", "." and "+".
                my $id_element = qr/<(?:dc:)?identifier>\Q$doc_url\E<\/(?:dc:)?identifier>/;

                if ($doc_url =~ m/^(http|ftp):/) {

                    # The URL was lifted out of XML text, where "&" is
                    # written as "&amp;"; undo that before downloading.
                    my $revised_doc_url = $doc_url;
                    $revised_doc_url =~ s/&amp;/&/g;

                    my $srcdocs_dir = &util::filename_cat($i_dir,"srcdocs");

                    if (get_oai_document($revised_doc_url,$srcdocs_dir, $out)) {

                        $got_doc = 1;
                        my ($id_dir,$id_fname) = dir_file_split($revised_doc_url);

                        # Keep the original URL and point the identifier at
                        # the downloaded copy.
                        $i_record =~ s/<metadata>(.*?)$id_element(.*?)<\/metadata>/<metadata>$1<OrigURL>$doc_url<\/OrigURL>\n <identifier>srcdocs\/$id_fname<\/identifier>$2<\/metadata>/s;
                    }
                }

                # NOTE(review): $got_doc is never reset, so identifiers that
                # follow a successful download are left untouched - confirm
                # that this is intended.
                if (!$got_doc) {
                    $i_record =~ s/<metadata>(.*?)$id_element(.*?)<\/metadata>/<metadata>$1<OrigIdentifier>$doc_url<\/OrigIdentifier>$2<\/metadata>/s;
                }
            }
        }

        # save record
        open (my $oai_out, '>', $file_i)
            || die "Unable to save oai metadata record: $!\n";
        print {$oai_out} $i_record;
        close($oai_out)
            || die "Unable to save oai metadata record: $!\n";

        $doc_count++;
        last if (defined $maxdocs && $doc_count == $maxdocs);
    }
}
317
318
# Program entry point.
#
# Parses the command line, reads the collection's etc/collect.cfg, prepares
# the import directory and then works through every "acquire" entry of the
# configuration.  Only entries of type OAI are handled: with -info the
# repository details are printed, otherwise the identifiers are listed and
# each record is downloaded into the import directory.
#
# Dies (after printing the usage text where appropriate) on bad arguments,
# an unknown collection, an unopenable -out file or a missing collect.cfg.
sub main {
    my ($verbosity, $importdir, $keepold,
        $getdoc, $acquire_info, $acquire_set,
        $removeold, $gzip, $debug, $maxdocs, $collection,
        $configfilename, $collectcfg,
        $out, $collectdir, $metadata_format);

    # Option specs handed to parsargv::parse; presumably of the form
    # name/pattern/default - confirm against parsargv.
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'getdoc', \$getdoc,
                         'info', \$acquire_info,
                         'importdir/.*/', \$importdir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out)) {
        &print_usage();
        die "\n";
    }

    # Turn -out into a real filehandle.  A handle reference works for both
    # "print $out ..." and "$out->autoflush", whatever package this code is
    # compiled in; a hard-coded name such as 'import::OUT' does not refer to
    # any handle opened by this script.
    my $close_out = 0;
    if ($out =~ /^STDOUT$/i) {
        $out = \*STDOUT;
    }
    elsif ($out =~ /^STDERR$/i) {
        $out = \*STDERR;
    }
    else {
        my $out_filename = $out;
        open (my $out_fh, '>', $out_filename)
            || die "Couldn't open output file $out_filename\n";
        $out = $out_fh;
        $close_out = 1;
    }
    $out->autoflush(1);

    # set removeold to false if it has been defined
    $removeold = 0 if ($keepold);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &print_usage();
        die "\n";
    }


    # get acquire list
    my $acquire = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
        if (defined $collectcfg->{'acquire'}) {
            $acquire = $collectcfg->{'acquire'};
        }
        if (defined $collectcfg->{'importdir'} && $importdir eq "") {
            $importdir = $collectcfg->{'importdir'};
        }
        if (defined $collectcfg->{'removeold'}) {
            # command-line -keepold / -removeold win over the config file
            if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
                $removeold = 1;
            }
            if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
                $removeold = 0;
            }
        }
    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # fill in the default import directory if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;

    # remove the old contents of the import directory if needed
    if ($removeold && -e $importdir) {
        print $out "Warning - removing current contents of the import directory\n";
        print $out " in preparation for the acquire\n";
        &util::rm_r ($importdir);
    }

    foreach my $e ( @$acquire )
    {
        # the first word of an acquire entry is its type
        my $acquire_type = shift @$e;
        my $acquire_src = undef;

        if (!defined $acquire_type || $acquire_type ne "OAI") {
            my $shown_type = defined $acquire_type ? $acquire_type : "(empty entry)";
            print STDERR "Warning: $shown_type not currently supported. Skipping.\n";
            next;
        }

        # Parse the entry's own options into a per-entry copy of -getdoc so
        # the command-line value is intact for the next entry, including
        # when this entry is skipped below.
        my $entry_getdoc = $getdoc;

        if (!parsargv::parse($e,
                             'getdoc', \$entry_getdoc,
                             'set/.*/', \$acquire_set,
                             'format/.*/oai_dc', \$metadata_format,
                             'src/.*/', \$acquire_src)) {
            &print_usage();
            die "\n";
        }

        if (!defined $acquire_src || $acquire_src eq "") {
            print STDERR "Warning: No -src flag defined. Skipping.\n";
            next;
        }

        if (defined $acquire_info && ($acquire_info)) {
            oai_info($acquire_src,$out,$verbosity);
            next;
        }

        print $out "$acquire_type Acquire: from $acquire_src\n";

        my $li_record = get_oai_ids($acquire_src,$acquire_set,$metadata_format,
                                    $out,$verbosity);
        my $ids = parse_oai_ids($li_record,$out,$verbosity);

        get_oai_records($acquire_src,$metadata_format, $ids,$importdir,
                        $entry_getdoc, $maxdocs, $out);
    }

    print "\nNumber of documents processed: $num_processed\n";

    close($out) if $close_out;
}
443
444
# Run the importer.
main();
446
447
448
449
450
451
452
453
Note: See TracBrowser for help on using the repository browser.