source: main/trunk/greenstone2/bin/script/importfrom.pl@ 28560

Last change on this file since 28560 was 28560, checked in by ak19, 10 years ago
  1. New subroutine util::set_gnomelib_env that sets the environment for gnomelib needed for running hashfile, suffix and wget which are dependent on the libiconv dll in ext/gnome-lib(-minimal). It's particularly the Mac Lions that need libiconv.2.dylib. 2. Updated the call to hashfile in doc.pm, the call to suffix in Phind.pm and the calls to wget in several perl scripts and modules to call util::set_gnomelib_env, though this will only set the environment once for each subshell.
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.8 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# importfrom.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will contact the named DL server
30# and export its metadata and (optionally) its documents.
31
32# Currently only designed for OAI exporting
33
# Compile-time setup: refuse to run unless the Greenstone environment
# variables are present, then put Greenstone's perllib directory at the
# front of @INC so the modules loaded below can be found.
BEGIN {
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
39
40use colcfg;
41use util;
42use FileUtils;
43use parsargv;
44use FileHandle;
45
# Extra command-line options interpolated into every wget command built in
# this file (empty by default).
my $wgetopt = '';

# Running count of metadata records fetched by get_oai_records(); reported
# by main() when the run finishes.
my $num_processed = 0;
49
# Print the command-line usage summary to STDERR.
#
# Takes no arguments and returns nothing useful.  The default collection
# directory shown in the text is derived from $ENV{'GSDLHOME'}.
sub print_usage {
    my $default_collectdir =
        FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");

    print STDERR
        "\n usage: $0 [options] collection-name\n\n",
        " options:\n",
        " -verbosity number 0=none, 3=lots\n",
        # -info is parsed by main() but was missing from the usage text.
        " -info Print information about the OAI server (Identify and\n",
        " ListSets responses) instead of importing\n",
        " -getdoc Also download the source document if present\n",
        " -importdir directory Where the original material lives\n",
        " -keepold Will not destroy the current contents of the\n",
        " import directory (the default)\n",
        " -removeold Will remove the old contents of the import\n",
        " directory -- use with care\n",
        " -gzip Use gzip to compress exported documents\n",
        " (don't forget to include ZIPPlugin in your plugin list)\n",
        " -maxdocs number Maximum number of documents to import\n",
        " -debug Print imported text to STDOUT\n",
        " -collectdir directory Collection directory (defaults to "
            . $default_collectdir . ")\n",
        " -out Filename or handle to print output status to.\n",
        " The default is STDERR\n\n";
}
69
70
71
# Display a chunk of XML, pretty-printed through xmllint when available.
#
# Parameters:
#   $text      - the XML text to display
#   $out       - handle (or handle name) used for the verbatim fallback
#   $verbosity - when greater than 1, warn if xmllint cannot be found
#
# When xmllint is usable the text is piped to "xmllint --format -"; the
# formatted XML is written by xmllint itself to this process's standard
# output, not to $out.  When xmllint is missing or cannot be started the
# text is printed unchanged to $out.
sub xml_pretty_print
{
    my ($text,$out,$verbosity) = @_;

    # Use the platform's null device rather than a hard-coded /dev/null so
    # that the xmllint probe also works on Windows.
    require File::Spec;
    my $null_device = File::Spec->devnull();

    if (system("xmllint --version >$null_device 2>&1") != 0) {
        if ($verbosity>1) {
            print STDERR "Warning: Unable to find xmllint for pretty printing.\n";
            print STDERR " XML will be shown verbatim.\n\n";
        }
        print $out $text;
        return;
    }

    # Lexical handle: nothing outside this sub can clobber it.
    my $pp_fh;
    if (!open($pp_fh, "|-", "xmllint --format -")) {
        print STDERR "Error running xmllint: $!\n\n";
        print $out $text;
        return;
    }

    print $pp_fh $text;

    # Closing a pipe waits for the child; a false result means xmllint
    # exited with a non-zero status (or the pipe could not be flushed).
    if (!close($pp_fh)) {
        print STDERR "Warning: xmllint did not complete successfully (status $?)\n";
    }
}
95
# Run a prepared wget command and return everything it writes to stdout.
#
# Parameters:
#   $wget_cmd  - complete shell command line (expected to write the
#                response to standard output, e.g. "wget -q -O - ...")
#   $out       - handle (or handle name) for progress output
#   $verbosity - when greater than 2, the command line is echoed to $out
#
# Returns the captured output as one string ("" when nothing was read).
# Dies if the command cannot be started; a non-zero exit status from the
# command is reported on STDERR but whatever was read is still returned.
sub wget_oai_url
{
    my ($wget_cmd,$out,$verbosity) = @_;

    # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
    &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set

    if ($verbosity>2) {
        print $out " $wget_cmd\n";
    }

    # Lexical handle instead of the global OAIIN, which other subs in this
    # file also use.
    open(my $oai_fh, "-|", $wget_cmd)
        or die "wget request failed: $!\n";

    # Slurp the whole response in one read.
    my $li_record = do { local $/; <$oai_fh> };
    $li_record = "" unless defined $li_record;

    # close() on a pipe returns false when the child exited non-zero;
    # surface that instead of silently returning a partial/empty response.
    if (!close($oai_fh)) {
        print STDERR "Warning: wget did not complete successfully (exit status "
            . ($? >> 8) . "): $wget_cmd\n";
    }

    return $li_record;
}
123
# Print descriptive information about an OAI server.
#
# Issues the Identify and ListSets requests against $base_url (in that
# order), printing a banner for each to $out and handing each response to
# xml_pretty_print().  ListMetadataFormats is not queried.
#
# Parameters:
#   $base_url  - base URL of the OAI server
#   $out       - handle (or handle name) for the banners
#   $verbosity - passed through to wget_oai_url() and xml_pretty_print()
sub oai_info
{
    my ($base_url,$out,$verbosity) = @_;

    # _OPTS_ is a placeholder that is replaced by each request's query.
    my $cmd_template = "wget $wgetopt -q -O - \"$base_url?_OPTS_\"";

    my @sections = (
        [ "General Information", "verb=Identify" ],
        [ "Set Information",     "verb=ListSets" ],
    );

    my $rule = "-------------------\n";

    foreach my $section (@sections) {
        my ($heading, $query) = @$section;

        my $cmd = $cmd_template;
        $cmd =~ s/_OPTS_/$query/;

        print $out $rule;
        print $out "$heading\n";
        print $out $rule;

        my $response = wget_oai_url($cmd,$out,$verbosity);
        xml_pretty_print($response,$out,$verbosity);
    }
}
151
152
# Ask an OAI server for its list of record identifiers.
#
# Parameters:
#   $base_url  - base URL of the OAI server
#   $set       - optional set name; added to the request only when it is
#                defined and non-empty
#   $format    - metadataPrefix to request
#   $out       - handle (or handle name) for progress messages
#   $verbosity - passed through to wget_oai_url()
#
# Returns the raw text of the ListIdentifiers response.  Exactly one
# request is made here.
sub get_oai_ids
{
    my ($base_url, $set, $format, $out, $verbosity) = @_;

    print $out "Requesting list of identifiers ...\n";

    my $query = "verb=ListIdentifiers&metadataPrefix=$format";
    $query .= "&set=$set" if (defined $set && ($set ne ""));

    # Build the command from a template whose _OPTS_ marker is replaced by
    # the query string.
    my $request_cmd = "wget $wgetopt -q -O - \"$base_url?_OPTS_\"";
    $request_cmd =~ s/_OPTS_/$query/;

    my $response = wget_oai_url($request_cmd,$out,$verbosity);

    print $out "... Done.\n";

    return $response;
}
176
# Extract the record identifiers from a ListIdentifiers response.
#
# Parameters:
#   $li_record - raw response text (may be undef or empty)
#   $out       - unused; kept for interface compatibility
#   $verbosity - unused; kept for interface compatibility
#
# Returns a reference to an array holding the text found between each
# <identifier> ... </identifier> pair, in document order.  The array is
# empty when the text contains no such elements.
sub parse_oai_ids
{
    my ($li_record, $out, $verbosity) = @_;

    return [] unless defined $li_record;

    # One global, non-greedy match collects every identifier in a single
    # pass; /s lets an identifier span line breaks.
    my @ids = ($li_record =~ m/<identifier>(.*?)<\/identifier>/gs);

    return \@ids;
}
195
196
# Split a "/"-separated path or URL into its directory part and its final
# component.
#
# Returns the two-element list (directories, last component).  The
# directory part is "" when the input has no "/"; trailing "/" characters
# are ignored because split() drops trailing empty fields.
sub dir_file_split
{
    my ($file) = @_;

    my @components = split(m{/}, $file);
    my $leaf = pop(@components);

    return (join("/", @components), $leaf);
}
207
# Download one source document with wget.
#
# Parameters:
#   $doc_url    - URL of the document to fetch
#   $output_dir - directory to save into (created if it does not exist)
#   $out        - handle (or handle name) for progress messages
#
# The file is saved in $output_dir under the last "/"-separated component
# of $doc_url.  Returns 1 on success, 0 (after reporting on STDERR) when
# wget fails.
sub get_oai_document
{
    my ($doc_url,$output_dir, $out) = @_;

    my (undef, $id_fname) = dir_file_split($doc_url);

    print $out "Getting document $doc_url\n";

    &FileUtils::makeDirectory($output_dir) if (!-e "$output_dir");

    my $full_id_fname = &FileUtils::filenameConcatenate($output_dir,$id_fname);

    # Only used for the error message below.
    my $wget_cmd = "wget $wgetopt --quiet -O \"$full_id_fname\" \"$doc_url\"";

    # $doc_url comes from harvested metadata, so it must never be seen by
    # a shell: pass the arguments as a list, which runs wget directly.
    my @wget_args = (split(' ', $wgetopt),
                     "--quiet", "-O", $full_id_fname, $doc_url);

    # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
    &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set

    if (system("wget", @wget_args) != 0) {
        print STDERR "Error: failed to execute $wget_cmd\n";
        return 0;
    }

    return 1;
}
232
# Download the metadata record for each identifier and save it to disk.
#
# Parameters:
#   $base_url   - base URL of the OAI server
#   $format     - metadataPrefix to request
#   $ids        - reference to an array of record identifiers
#   $output_dir - directory under which records are saved
#   $get_id     - when true, also try to download the documents named by
#                 http/ftp <identifier> (or <dc:identifier>) elements in
#                 the record's <metadata> section
#   $maxdocs    - stop once this many records have been saved (a value
#                 that is never reached, such as -1, means no limit)
#   $out        - handle (or handle name) for progress messages
#
# Each record is saved as <identifier>.oai below $output_dir, with every
# ":" in the identifier turned into a directory separator.  When a source
# document is downloaded, the record is rewritten so that the original URL
# moves to <OrigURL> and <identifier> points at the local copy under
# "srcdocs"; identifiers seen before any successful download are renamed
# to <OrigIdentifier>.
#
# Side effects: increments the file-level $num_processed once per record
# requested.  Dies if wget cannot be started or a record cannot be saved.
sub get_oai_records
{
    my ($base_url,$format, $ids,$output_dir, $get_id, $maxdocs, $out) = @_;

    my $doc_count = 0;

    foreach my $id ( @$ids )
    {
        # Identifiers come from the remote server and end up both inside a
        # double-quoted shell argument and in a file path.  Skip any that
        # contain shell-active/control characters or a ".." path component.
        if ($id =~ m/["`\$\\\x00-\x1f]/
            || $id =~ m/(?:^|[:\/])\.\.(?:[:\/]|$)/) {
            print STDERR "Warning: skipping unsafe OAI identifier: $id\n";
            next;
        }

        my $url = "$base_url?verb=GetRecord&metadataPrefix=$format";
        $url .= "&identifier=$id";
        print $out "Downloading metadata record for $id\n";

        my $i_url = $id; #convert OAI set separators (:) to directory sep
        $i_url =~ s/:/\//g;
        my $file_i_url = "$output_dir/$i_url.oai";

        my $ds = &util::get_dirsep();
        my $i_os = $id; #convert OAI set separators (:) to OS dir sep
        $i_os =~ s/:/$ds/g;
        my $file_i = &FileUtils::filenameConcatenate($output_dir,"$i_os.oai");

        # obtain record
        my $wget_cmd = "wget $wgetopt -q -O - \"$url\"";

        # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
        &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set

        open(my $oai_in, "-|", $wget_cmd)
            or die "wget request failed: $!\n";

        my $i_record = do { local $/; <$oai_in> };
        $i_record = "" unless defined $i_record;

        close($oai_in);

        $num_processed++;

        # prepare subdirectory for record (if needed)
        my ($i_dir,$unused) = dir_file_split($file_i_url);

        &FileUtils::makeAllDirectories($i_dir);

        # look out for identifier tag in metadata section
        if ($get_id && $i_record =~ m/<metadata>(.*)<\/metadata>/s)
        {
            my $m_record = $1;
            my $got_doc = 0;

            my @url_matches = ($m_record =~ m/<(?:dc:)?identifier>(.*?)<\/(?:dc:)?identifier>/gs);
            foreach my $doc_url (@url_matches)
            {
                if ($doc_url =~ m/^(http|ftp):/) {

                    my $srcdocs_dir = &FileUtils::filenameConcatenate($i_dir,"srcdocs");

                    if (get_oai_document($doc_url,$srcdocs_dir, $out)) {

                        $got_doc = 1;
                        my ($id_dir,$id_fname) = dir_file_split($doc_url);

                        # \Q...\E: the URL must match literally; URLs
                        # routinely contain regex metacharacters (. ? +).
                        $i_record =~ s/<metadata>(.*?)<(dc:)?identifier>\Q$doc_url\E<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$doc_url<\/OrigURL>\n <identifier>srcdocs\/$id_fname<\/identifier>$4<\/metadata>/s;
                    }
                }

                if (!$got_doc) {
                    $i_record =~ s/<metadata>(.*?)<(dc:)?identifier>\Q$doc_url\E<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigIdentifier>$doc_url<\/OrigIdentifier>$4<\/metadata>/s;
                }
            }
        }

        # save record (three-argument open: the path is never parsed for a
        # mode or pipe character)
        open(my $oai_out, ">", $file_i)
            or die "Unable to save oai metadata record: $!\n";
        print $oai_out $i_record;
        # Buffered write errors only show up at close time.
        close($oai_out)
            or print STDERR "Warning: problem closing $file_i: $!\n";

        $doc_count++;
        last if ($doc_count == $maxdocs);
    }
}
327
328
# Program entry point.
#
# Parses the command-line options from @ARGV, selects the collection,
# reads the collection's etc/collect.cfg, and processes each "acquire"
# entry found there.  Only entries of type "OAI" are handled: with -info
# the server description is printed, otherwise the identifier list is
# fetched and each record is downloaded into the import directory.
#
# Status messages go to the handle chosen with -out (STDERR by default,
# STDOUT, or a file that is created/truncated).  Dies on bad options, an
# unknown collection, a missing collect.cfg, or an unwritable -out file.
sub main {
    my ($verbosity, $importdir, $keepold,
        $getdoc, $acquire_info, $acquire_set,
        $removeold, $gzip, $debug, $maxdocs, $collection,
        $configfilename, $collectcfg,
        $out, $collectdir);

    # Previously an undeclared package global.
    my $metadata_format;

    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'getdoc', \$getdoc,
                         'info', \$acquire_info,
                         'importdir/.*/', \$importdir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out)) {
        &print_usage();
        die "\n";
    }

    # Turn the -out value into a real filehandle.  The names STDERR and
    # STDOUT are accepted in any letter case; anything else is a filename.
    # ($out used to be set to the handle name 'import::OUT', which does
    # not exist in this package, so autoflush failed for file output.)
    my $close_out = 0;
    if ($out =~ /^STDERR$/i) {
        $out = \*STDERR;
    }
    elsif ($out =~ /^STDOUT$/i) {
        $out = \*STDOUT;
    }
    else {
        open(my $out_fh, ">", $out) || die "Couldn't open output file $out\n";
        $out = $out_fh;
        $close_out = 1;
    }
    $out->autoflush(1);

    # set removeold to false if it has been defined
    $removeold = 0 if ($keepold);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &print_usage();
        die "\n";
    }

    # get acquire list
    my $acquire = [];
    $configfilename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (!-e $configfilename) {
        die "Couldn't find the configuration file $configfilename\n";
    }

    $collectcfg = &colcfg::read_collect_cfg ($configfilename);
    if (defined $collectcfg->{'acquire'}) {
        $acquire = $collectcfg->{'acquire'};
    }
    # Command-line settings win over collect.cfg.
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'removeold'}) {
        if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
            $removeold = 1;
        }
        if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
            $removeold = 0;
        }
    }

    # fill in the default import directory if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;

    # remove the old contents of the import directory if needed
    if ($removeold && -e $importdir) {
        print $out "Warning - removing current contents of the import directory\n";
        print $out " in preparation for the acquire\n";
        &FileUtils::removeFilesRecursive($importdir);
    }

    foreach my $e ( @$acquire )
    {
        # The first element of each acquire entry is its type; the rest
        # are that entry's own options.
        my $acquire_type = shift @$e;
        my $acquire_src = undef;

        if ($acquire_type ne "OAI") {
            print STDERR "Warning: $acquire_type not currently supported. Skipping.\n";
            next;
        }

        # Parsing the entry's options may overwrite $getdoc; remember the
        # command-line value so it can be restored afterwards.
        my $store_getdoc = $getdoc;

        if (!parsargv::parse($e,
                             'getdoc', \$getdoc,
                             'set/.*/', \$acquire_set,
                             'format/.*/oai_dc', \$metadata_format,
                             'src/.*/', \$acquire_src)) {
            &print_usage();
            die "\n";
        }

        if (!defined $acquire_src) {
            print STDERR "Warning: No -src flag defined. Skipping.\n";
            next;
        }

        if (defined $acquire_info && ($acquire_info)) {
            oai_info($acquire_src,$out,$verbosity);
            next;
        }

        print $out "$acquire_type Acquire: from $acquire_src\n";

        my $li_record = get_oai_ids($acquire_src,$acquire_set,$metadata_format,
                                    $out,$verbosity);
        my $ids = parse_oai_ids($li_record,$out,$verbosity);

        get_oai_records($acquire_src,$metadata_format, $ids,$importdir,
                        $getdoc, $maxdocs, $out);
        $getdoc = $store_getdoc;
    }

    print "\nNumber of documents processed: $num_processed\n";

    close($out) if $close_out;
}
453
454
# Script entry: run the import.
main();
456
457
458
459
460
461
462
463
Note: See TracBrowser for help on using the repository browser.