source: gsdl/trunk/bin/script/importfrom.pl@ 17198

Last change on this file since 17198 was 17198, checked in by kjdon, 16 years ago

changed ZIPPlug to ZIPPlugin in print usage

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.9 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# importfrom.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will contact the named DL server
30# and export its metadata and (optionally) its documents.
31
32# Currently only designed for OAI exporting
33
34BEGIN {
35 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
36 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
37 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
38}
39
40use colcfg;
41use util;
42use parsargv;
43use FileHandle;
44
# $wgetopt       - extra options spliced into every wget command line built
#                  in this file (empty by default).
# $num_processed - running count of metadata records fetched; reported at
#                  the end of main.
my ($wgetopt, $num_processed) = ("", 0);
48
# Print the command-line synopsis and the option summary to STDERR.
#
# Takes no arguments.  Reads $0 for the program name and
# $ENV{'GSDLHOME'} to show the default collection directory.
# The list mirrors the switches that main hands to parsargv::parse.
sub print_usage {
    # Default shown for -collectdir.
    my $default_collectdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect");

    print STDERR "\n usage: $0 [options] collection-name\n\n";
    print STDERR " options:\n";
    print STDERR " -verbosity number 0=none, 3=lots\n";
    print STDERR " -getdoc Also download the source document if present\n";
    # -info is parsed by main: it prints the Identify and ListSets
    # responses for each acquire source and skips the download.
    print STDERR " -info Print server and set information instead of\n";
    print STDERR " downloading records\n";
    print STDERR " -importdir directory Where the original material lives\n";
    print STDERR " -keepold Will not destroy the current contents of the\n";
    print STDERR " import directory (the default)\n";
    print STDERR " -removeold Will remove the old contents of the import\n";
    print STDERR " directory -- use with care\n";
    print STDERR " -gzip Use gzip to compress exported documents\n";
    print STDERR " (don't forget to include ZIPPlugin in your plugin\n";
    print STDERR " list)\n";
    print STDERR " -maxdocs number Maximum number of documents to import\n";
    print STDERR " -debug Print imported text to STDOUT\n";
    print STDERR " -collectdir directory Collection directory (defaults to " .
	$default_collectdir . ")\n";
    print STDERR " -out Filename or handle to print output status to.\n";
    print STDERR " The default is STDERR\n\n";
}
68
69
70
# Write an XML string to $out, re-indented by xmllint when that is possible.
#
# Parameters:
#   $text      - the XML document as a string
#   $out       - filehandle (or handle name) the result is printed to
#   $verbosity - when greater than 1, a missing xmllint is reported on STDERR
#
# The XML is printed verbatim when xmllint is not installed, cannot be
# started, or produces no output (for instance on malformed input).
# The formatted result is captured and printed to $out, so it ends up in
# the same place as the verbatim fallback rather than on the process's
# standard output.
sub xml_pretty_print
{
    my ($text,$out,$verbosity) = @_;

    if (system("xmllint --version >/dev/null 2>&1")!=0) {
	if ($verbosity>1) {
	    print STDERR "Warning: Unable to find xmllint for pretty printing.\n";
	    print STDERR " XML will be shown verbatim.\n\n";
	}
	print $out $text;
	return;
    }

    # Hand the document to xmllint through a temporary file: feeding a
    # child's stdin while also reading its stdout can deadlock on large
    # documents.
    require File::Temp;
    my ($tmp_fh, $tmp_name) = File::Temp::tempfile(UNLINK => 1);
    print $tmp_fh $text;
    close($tmp_fh);

    my $formatted;
    my $pp_in;
    # List-form pipe open: no shell is involved.
    if (open ($pp_in, "-|", "xmllint", "--format", $tmp_name)) {
	local $/ = undef;
	$formatted = <$pp_in>;
	close($pp_in);
    }
    else {
	print STDERR "Error running xmllint: $!\n\n";
    }
    unlink($tmp_name);

    # Fall back to the raw text rather than printing nothing.
    my $result = (defined $formatted && $formatted ne "") ? $formatted : $text;
    print $out $result;
}
94
# Run a ready-made wget command line and return everything it writes to
# its standard output.
#
# Parameters:
#   $wget_cmd  - complete command line (run as a single command string)
#   $out       - filehandle (or handle name) for progress output
#   $verbosity - when greater than 2, the command is echoed to $out
#
# Returns the captured output as one string ("" when there was none).
# Dies if the pipe cannot be opened.  A command that exits with a non-zero
# status is reported on STDERR, but whatever output it produced is still
# returned.
sub wget_oai_url
{
    my ($wget_cmd,$out,$verbosity) = @_;

    if ($verbosity>2) {
	print $out " $wget_cmd\n";
    }

    open (my $oai_in, "-|", $wget_cmd)
	or die "wget request failed: $!\n";

    # Slurp the whole response in one read.
    my $li_record = do { local $/ = undef; <$oai_in> };
    $li_record = "" unless defined $li_record;

    # close() on a pipe reports the child's exit status; without this
    # check a failed download cannot be told apart from an empty reply.
    if (!close($oai_in)) {
	print STDERR "Warning: command exited with status " . ($? >> 8) . ": $wget_cmd\n";
    }

    return $li_record;
}
119
# Show what an OAI server says about itself: the response to the Identify
# request followed by the response to the ListSets request, each under a
# banner and passed through xml_pretty_print.
#
# Parameters:
#   $base_url  - base URL of the OAI server
#   $out       - filehandle (or handle name) the banners are printed to
#   $verbosity - forwarded to wget_oai_url and xml_pretty_print
sub oai_info
{
    my ($base_url,$out,$verbosity) = @_;

    # _OPTS_ is the placeholder for the query string of each request.
    my $cmd_template = "wget $wgetopt -q -O - \"$base_url?_OPTS_\"";
    my $rule = "-------------------\n";

    # One [banner title, query string] pair per report section, in order.
    my @sections = (
	["General Information\n", "verb=Identify"],
	["Set Information\n",     "verb=ListSets"],
    );

    foreach my $section (@sections) {
	my ($title, $query) = @$section;

	my $cmd = $cmd_template;
	$cmd =~ s/_OPTS_/$query/;

	print $out $rule;
	print $out $title;
	print $out $rule;

	my $response = wget_oai_url($cmd,$out,$verbosity);
	xml_pretty_print($response,$out,$verbosity);
    }
}
147
148
# Ask an OAI server for its list of record identifiers (ListIdentifiers).
#
# Parameters:
#   $base_url  - base URL of the OAI server
#   $set       - optional set to restrict the list to (undef or "" for all)
#   $format    - metadataPrefix to request
#   $out       - filehandle (or handle name) for progress messages
#   $verbosity - forwarded to wget_oai_url
#
# Returns the raw XML of the response as one string.  When the server
# splits the list into pages by returning a non-empty <resumptionToken>,
# the remaining pages are requested too and all responses are concatenated,
# so the result may hold several XML documents back to back.
sub get_oai_ids
{
    my ($base_url, $set, $format, $out, $verbosity) = @_;

    print $out "Requesting list of identifiers ...\n";

    my $identifiers_opts = "verb=ListIdentifiers&metadataPrefix=$format";

    if (defined $set && ($set ne "")) {
	$identifiers_opts .= "&set=$set";
    }

    my $li_record = "";
    my %seen_tokens = ();

    while (defined $identifiers_opts) {
	my $identifiers_cmd = "wget $wgetopt -q -O - \"$base_url?$identifiers_opts\"";

	my $response = wget_oai_url($identifiers_cmd,$out,$verbosity);
	$response = "" unless defined $response;
	$li_record .= $response;

	# Stop unless this page names a follow-up page.
	$identifiers_opts = undef;

	# An empty resumptionToken element marks the last page.
	if ($response =~ m/<resumptionToken[^>]*>\s*([^<\s][^<]*?)\s*<\/resumptionToken>/s) {
	    my $token = $1;

	    # Undo XML escaping (ampersand last, so it is not decoded twice).
	    $token =~ s/&lt;/</g;
	    $token =~ s/&gt;/>/g;
	    $token =~ s/&quot;/"/g;
	    $token =~ s/&apos;/'/g;
	    $token =~ s/&amp;/&/g;

	    # A repeated token would mean fetching the same page forever.
	    last if $seen_tokens{$token}++;

	    # Percent-encode the token; this also keeps it inert inside the
	    # double-quoted shell argument.
	    $token =~ s/([^A-Za-z0-9\-_.~])/sprintf("%%%02X", ord($1))/ge;

	    # resumptionToken is an exclusive argument: it is sent with the
	    # verb only.
	    $identifiers_opts = "verb=ListIdentifiers&resumptionToken=$token";
	}
    }

    print $out "... Done.\n";

    return $li_record;
}
172
# Extract the text of every <identifier> element from a ListIdentifiers
# response.
#
# Parameters:
#   $li_record - the raw XML text
#   $out       - unused; kept for the existing call signature
#   $verbosity - unused; kept for the existing call signature
#
# Returns a reference to an array of identifier strings in document order
# (a reference to an empty array when there are none or the input is undef).
sub parse_oai_ids
{
    my ($li_record, $out, $verbosity) = @_;

    return [] unless defined $li_record;

    # A single global match walks the text once; /s lets an identifier
    # span line breaks.  This avoids re-copying the remainder of the
    # document for every identifier found.
    my @ids = ($li_record =~ m/<identifier>(.*?)<\/identifier>/gs);

    return \@ids;
}
191
192
# Split a "/"-separated path or URL into its leading part and its final
# component.
#
# Returns the list (leading part, final component).  The leading part is
# the remaining components joined by "/" again ("" when there are none).
# Because split drops trailing empty fields, a trailing "/" is ignored.
sub dir_file_split
{
    my ($file) = @_;

    my @segments = split("/",$file);
    my $last = $#segments;

    # No segments at all leaves the final component undefined.
    my $local_file = ($last >= 0) ? $segments[$last] : undef;
    my $sub_dirs = join("/", @segments[0 .. $last - 1]);

    return ($sub_dirs,$local_file);
}
203
# Download one source document with wget into $output_dir.
#
# Parameters:
#   $doc_url    - URL of the document; its last "/"-separated component
#                 becomes the local file name
#   $output_dir - directory to save into (created if it does not exist)
#   $out        - filehandle (or handle name) for progress messages
#
# Returns 1 on success, and 0 when no file name can be derived from the URL
# or when wget fails.
sub get_oai_document
{
    my ($doc_url,$output_dir, $out) = @_;

    my ($unused_dir,$id_fname) = dir_file_split($doc_url);

    print $out "Getting document $doc_url\n";

    if (!defined $id_fname || $id_fname eq "") {
	print STDERR "Error: cannot derive a file name from $doc_url\n";
	return 0;
    }

    &util::mk_dir($output_dir) if (!-e "$output_dir");

    my $full_id_fname = &util::filename_cat($output_dir,$id_fname);

    # Used for the error message only; see below.
    my $wget_cmd = "wget $wgetopt --quiet -O \"$full_id_fname\" \"$doc_url\"";

    # The URL comes out of metadata supplied by the remote server, so it
    # must never pass through a shell: use the list form of system().
    # $wgetopt is split on whitespace into separate arguments.
    my @wget_args = (split(' ', $wgetopt), "--quiet", "-O", $full_id_fname, $doc_url);

    if (system("wget", @wget_args)!=0) {
	print STDERR "Error: failed to execute $wget_cmd\n";
	return 0;
    }

    return 1;
}
225
# Fetch the metadata record for each identifier (GetRecord) and save it as
# an .oai file below $output_dir.
#
# Parameters:
#   $base_url   - base URL of the OAI server
#   $format     - metadataPrefix to request
#   $ids        - reference to an array of record identifiers
#   $output_dir - directory the records are written under; every ":" in an
#                 identifier becomes a directory separator
#   $get_id     - when true, identifier elements inside <metadata> that look
#                 like http/ftp URLs are downloaded into a "srcdocs"
#                 directory next to the record, and the record is rewritten
#                 to point at the local copy
#   $maxdocs    - stop once this many records have been saved (a value that
#                 is never reached, such as -1, means no limit)
#   $out        - filehandle (or handle name) for progress messages
#
# Increments the file-level $num_processed once per record fetched.
# Dies if wget cannot be started or a record cannot be written.
sub get_oai_records
{
    my ($base_url,$format, $ids,$output_dir, $get_id, $maxdocs, $out) = @_;

    my $doc_count = 0;

    foreach my $i ( @$ids )
    {
	# Identifiers are supplied by the server and may contain characters
	# that are special in a URL, so percent-encode them.
	my $i_encoded = $i;
	$i_encoded =~ s/([^A-Za-z0-9\-_.~])/sprintf("%%%02X", ord($1))/ge;

	my $url = "$base_url?verb=GetRecord&metadataPrefix=$format";
	$url .= "&identifier=$i_encoded";
	print $out "Downloading metadata record for $i\n";

	my $i_url = $i; #convert OAI set separators (:) to directory sep
	$i_url =~ s/:/\//g;
	my $file_i_url = "$output_dir/$i_url.oai";

	my $ds = &util::get_dirsep();
	my $i_os = $i; #convert OAI set separators (:) to OS dir sep
	$i_os =~ s/:/$ds/g;
	my $file_i = &util::filename_cat($output_dir,"$i_os.oai");

	# obtain record -- list-form pipe open, so the server-supplied
	# identifier never reaches a shell
	open (my $oai_in, "-|", "wget", split(' ', $wgetopt), "-q", "-O", "-", $url)
	    or die "wget request failed: $!\n";

	my $i_record = do { local $/ = undef; <$oai_in> };
	$i_record = "" unless defined $i_record;

	close($oai_in);

	$num_processed++;

	# prepare subdirectory for record (if needed)
	my ($i_dir,$unused) = dir_file_split($file_i_url);

	&util::mk_all_dir($i_dir);

	# look out for identifier tag in metadata section
	if ($get_id && $i_record =~ m/<metadata>(.*)<\/metadata>/s)
	{
	    my $m_record = $1;
	    my $got_doc = 0;

	    my @url_matches = ($m_record =~ m/<(?:dc:)?identifier>(.*?)<\/(?:dc:)?identifier>/gs);
	    foreach my $doc_url (@url_matches)
	    {
		if ($doc_url =~ m/^(http|ftp):/) {

		    my $srcdocs_dir = &util::filename_cat($i_dir,"srcdocs");

		    if (get_oai_document($doc_url,$srcdocs_dir, $out)) {

			$got_doc = 1;
			my ($id_dir,$id_fname) = dir_file_split($doc_url);

			# \Q..\E: the URL is matched literally; characters
			# such as "?" and "." would otherwise act as regex
			# operators and the rewrite would not happen.
			$i_record =~ s/<metadata>(.*?)<(dc:)?identifier>\Q$doc_url\E<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$doc_url<\/OrigURL>\n <identifier>srcdocs\/$id_fname<\/identifier>$4<\/metadata>/s;

		    }
		}

		# Until a document has been fetched, each identifier seen is
		# demoted to OrigIdentifier.
		if (!$got_doc) {
		    $i_record =~ s/<metadata>(.*?)<(dc:)?identifier>\Q$doc_url\E<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigIdentifier>$doc_url<\/OrigIdentifier>$4<\/metadata>/s;
		}
	    }
	}

	# save record
	open (my $oai_out, ">", $file_i)
	    or die "Unable to save oai metadata record: $!\n";
	print $oai_out $i_record;
	close($oai_out);

	$doc_count++;
	last if ($doc_count == $maxdocs);
    }
}
317
318
# Entry point: parse the command line, read the collection's collect.cfg,
# and process every "acquire" entry found there.
#
# Only OAI acquire entries are handled; other types are skipped with a
# warning.  With -info the server's Identify/ListSets responses are printed
# for each entry instead of downloading; otherwise the identifier list is
# fetched and each record is saved below the import directory.
#
# Dies (after printing the usage text where appropriate) when the
# arguments are invalid, the collection cannot be selected, the output file
# cannot be opened, or collect.cfg is missing.
sub main {
    my ($verbosity, $importdir, $keepold,
	$getdoc, $acquire_info, $acquire_set, $metadata_format,
	$removeold, $gzip, $debug, $maxdocs, $collection,
	$configfilename, $collectcfg,
	$out, $collectdir);

    # -gzip and -debug are still accepted so existing command lines keep
    # working; nothing below reads them.
    if (!parsargv::parse(\@ARGV,
			 'verbosity/\d+/2', \$verbosity,
			 'getdoc', \$getdoc,
			 'info', \$acquire_info,
			 'importdir/.*/', \$importdir,
			 'keepold', \$keepold,
			 'removeold', \$removeold,
			 'gzip', \$gzip,
			 'debug', \$debug,
			 'maxdocs/^\-?\d+/-1', \$maxdocs,
			 'collectdir/.*/', \$collectdir,
			 'out/.*/STDERR', \$out)) {
	&print_usage();
	die "\n";
    }

    # Turn -out into a real filehandle.  The standard streams are matched
    # case-insensitively; anything else is taken as a file name.  A handle
    # reference works from every sub and package, unlike a handle name.
    my $close_out = 0;
    if ($out =~ /^STDOUT$/i) {
	$out = \*STDOUT;
    }
    elsif ($out =~ /^STDERR$/i) {
	$out = \*STDERR;
    }
    else {
	open (my $out_fh, ">", $out) or die "Couldn't open output file $out\n";
	$out = $out_fh;
	$close_out = 1;
    }
    $out->autoflush(1);

    # set removeold to false if it has been defined
    $removeold = 0 if ($keepold);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
	&print_usage();
	die "\n";
    }


    # get acquire list
    my $acquire = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
	$collectcfg = &colcfg::read_collect_cfg ($configfilename);
	if (defined $collectcfg->{'acquire'}) {
	    $acquire = $collectcfg->{'acquire'};
	}
	if (defined $collectcfg->{'importdir'} && $importdir eq "") {
	    $importdir = $collectcfg->{'importdir'};
	}
	if (defined $collectcfg->{'removeold'}) {
	    if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
		$removeold = 1;
	    }
	    if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
		$removeold = 0;
	    }
	}
    } else {
	die "Couldn't find the configuration file $configfilename\n";
    }

    # fill in the default import directory if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;

    # remove the old contents of the import directory if needed
    if ($removeold && -e $importdir) {
	print $out "Warning - removing current contents of the import directory\n";
	print $out " in preparation for the acquire\n";
	&util::rm_r ($importdir);
    }

    foreach my $e ( @$acquire )
    {
	# The first word of an acquire entry is its type; the rest are
	# options for parsargv.
	my $acquire_type = shift @$e;
	my $acquire_src = undef;

	if ($acquire_type ne "OAI") {
	    print STDERR "Warning: $acquire_type not currently supported. Skipping.\n";
	    next;
	}

	# An entry may set -getdoc for itself; remember the command-line
	# value so it can be put back before the next entry.
	my $store_getdoc = $getdoc;

	if (!parsargv::parse($e,
			     'getdoc', \$getdoc,
			     'set/.*/', \$acquire_set,
			     'format/.*/oai_dc', \$metadata_format,
			     'src/.*/', \$acquire_src)) {
	    &print_usage();
	    die "\n";
	}

	# Treat an empty -src the same as a missing one.
	if (!defined $acquire_src || $acquire_src eq "") {
	    print STDERR "Warning: No -src flag defined. Skipping.\n";
	    $getdoc = $store_getdoc;
	    next;
	}

	if (defined $acquire_info && ($acquire_info)) {
	    oai_info($acquire_src,$out,$verbosity);
	    $getdoc = $store_getdoc;
	    next;
	}

	print $out "$acquire_type Acquire: from $acquire_src\n";

	my $li_record = get_oai_ids($acquire_src,$acquire_set,$metadata_format,
				    $out,$verbosity);
	my $ids = parse_oai_ids($li_record,$out,$verbosity);

	get_oai_records($acquire_src,$metadata_format, $ids,$importdir,
			$getdoc, $maxdocs, $out);
	$getdoc = $store_getdoc;
    }

    print "\nNumber of documents processed: $num_processed\n";

    close($out) if $close_out;
}


main();
446
447
448
449
450
451
452
453
Note: See TracBrowser for help on using the repository browser.