source: main/tags/2.60/gsdl/bin/script/importfrom.pl@ 25196

Last change on this file since 25196 was 9699, checked in by mdewsnip, 19 years ago

Added quotes around -O argument to wget, so it works on Windows with spaces in the filename.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# importfrom.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will contact the named DL server
30# and export its metadata and (optionally) its documents.
31
32# Currently only designed for OAI exporting
33
# Compile-time setup.  Aborts unless both GSDLHOME and GSDLOS are present in
# the environment, then puts $GSDLHOME/perllib at the front of @INC
# (presumably where the colcfg, util and parsargv modules loaded below live --
# verify against the installation).
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'}   or die "GSDLOS not set\n";
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
39
40use colcfg;
41use util;
42use parsargv;
43use FileHandle;
44
# Extra command-line options spliced into every wget invocation made by this
# script.  Empty by default.
my $wgetopt = q{};
46
# Print the command-line help for this script to STDERR.
#
# Takes no arguments.  The default collection directory shown for the
# -collectdir option is built from $ENV{'GSDLHOME'} with util::filename_cat.
# Callers in this file follow the call with die() when option parsing fails.
sub print_usage {
    my $default_collectdir = &util::filename_cat($ENV{'GSDLHOME'}, "collect");

    my @usage_lines = (
        "\n usage: $0 [options] collection-name\n\n",
        " options:\n",
        " -verbosity number 0=none, 3=lots\n",
        # Was "Also download if source document if present".
        " -getdoc Also download the source document if present\n",
        " -importdir directory Where the original material lives\n",
        " -keepold Will not destroy the current contents of the\n",
        " import directory (the default)\n",
        " -removeold Will remove the old contents of the import\n",
        " directory -- use with care\n",
        " -gzip Use gzip to compress exported documents\n",
        " (don't forget to include ZIPPlug in your plugin\n",
        # This continuation line was missing, leaving the parenthesis open.
        " list when building from compressed documents)\n",
        " -maxdocs number Maximum number of documents to import\n",
        " -debug Print imported text to STDOUT\n",
        " -collectdir directory Collection directory (defaults to "
            . $default_collectdir . ")\n",
        " -out Filename or handle to print output status to.\n",
        " The default is STDERR\n\n",
    );

    # join() rather than a bare list so a non-empty $, cannot alter the text.
    print STDERR join("", @usage_lines);
}
66
67
68
69
# Older one-line usage message.  Writes it to STDERR using the program name
# passed as the first argument, then terminates the process with exit
# status 1.  Nothing in this file calls it.
sub print_usage_old {
    my $prog_name = shift(@_);

    print STDERR "Usage: $prog_name OAI-base-URL\n";

    exit 1;
}
77
# Ask an OAI server for its list of record identifiers.
#
#   $base_url - base URL of the OAI repository
#   $out      - file handle (or handle name) that progress messages go to
#
# Runs wget on "<base_url>?verb=ListIdentifiers&metadataPrefix=oai_dc" and
# returns the complete response body as one string.  Dies if the wget pipe
# cannot be started.  Exactly one request is made, so any further pages a
# server might offer are not fetched.
#
# NOTE(review): $base_url is interpolated into a shell command line; it is
# expected to come from the collection configuration -- confirm it is trusted.
sub get_oai_ids {
    my ($base_url, $out) = @_;

    print $out "Requesting list of identifiers ...\n";

    my $wget_cmd = "wget $wgetopt -q -O - "
        . "\"$base_url?verb=ListIdentifiers&metadataPrefix=oai_dc\"";

    # Lexical handle instead of the global OAIIN bareword.
    open(my $oai_in, "$wget_cmd |")
        || die "wget request failed: $!\n";

    my $li_record = join("", <$oai_in>);

    # close() on a pipe returns false when the child exited non-zero.
    # Previously that went unnoticed and an empty list was returned silently.
    close($oai_in)
        || print $out "Warning: wget did not finish cleanly (status $?)\n";

    print $out "... Done.\n";

    return $li_record;
}
101
# Extract the identifiers from an OAI ListIdentifiers response.
#
#   $li_record - response text, as returned by get_oai_ids
#   $out       - accepted for symmetry with the other routines; unused here
#
# Returns a reference to an array holding the text found between each
# <identifier> ... </identifier> pair, in document order.  The array is
# empty when there are no such pairs or when $li_record is undefined.
sub parse_oai_ids {
    my ($li_record, $out) = @_;

    return [] unless defined $li_record;

    # A single global match collects every capture in one pass.  The earlier
    # loop re-copied the remainder of the text after each match, which is
    # quadratic in the size of the response.  /s lets an identifier span
    # lines, as before.
    my @ids = ($li_record =~ m/<identifier>(.*?)<\/identifier>/sg);

    return \@ids;
}
120
121
# Split a '/'-separated path or URL into its directory part and its final
# component.
#
# Returns the two-element list (directories, file).  Trailing slashes are
# ignored, so "a/b/" yields ("a", "b").  A name without any slash yields an
# empty directory part.  Input made up only of slashes (or empty input)
# yields ("", undef).
sub dir_file_split {
    my ($file) = @_;

    # Drop trailing separators first; they never contribute a component.
    (my $trimmed = $file) =~ s{/+\z}{};
    return ("", undef) if $trimmed eq "";

    my $last_slash = rindex($trimmed, "/");

    my $sub_dirs   = ($last_slash < 0) ? "" : substr($trimmed, 0, $last_slash);
    my $local_file = substr($trimmed, $last_slash + 1);

    return ($sub_dirs, $local_file);
}
132
# Download the source document that a metadata record points at.
#
#   $doc_url    - URL of the document (taken from the record's metadata)
#   $output_dir - directory of the record; the document is stored in a
#                 "srcdocs" subdirectory of it, created if it does not exist
#   $out        - handle that the progress message is printed to
#
# The local file name is the last '/'-separated component of $doc_url.
# Failures are reported on STDERR and are not fatal.  Returns 1 when wget
# succeeded and 0 otherwise.
sub get_oai_document {
    my ($doc_url, $output_dir, $out) = @_;

    my (undef, $id_fname) = dir_file_split($doc_url);

    print $out "Getting document $doc_url\n";

    # A URL with no final component gives nothing to name the file after.
    if (!defined $id_fname || $id_fname eq "") {
        print STDERR "Error: cannot derive a file name from $doc_url\n";
        return 0;
    }

    my $srcdocs_dir = &util::filename_cat($output_dir, "srcdocs");
    &util::mk_dir($srcdocs_dir) if (!-e $srcdocs_dir);

    my $full_id_fname = &util::filename_cat($srcdocs_dir, $id_fname);

    # $doc_url comes from a remote server, so it must never reach a shell.
    # The list form of system() passes every argument to wget verbatim.
    # This also keeps file names with spaces intact without manual quoting.
    my @wget_args = (split(' ', $wgetopt), "-q", "-O", $full_id_fname, $doc_url);

    if (system("wget", @wget_args) != 0) {
        # Shown for diagnostics only; this string is not executed.
        my $wget_cmd = "wget $wgetopt -q -O \"$full_id_fname\" \"$doc_url\"";
        print STDERR "Error: failed to execute $wget_cmd\n";
        return 0;
    }

    return 1;
}
152
# Download the oai_dc metadata record for each identifier and save it
# beneath $output_dir.
#
#   $base_url   - base URL of the OAI repository
#   $ids        - reference to an array of OAI identifiers
#   $output_dir - directory that the records are written under
#   $get_id     - when true, also fetch the document named by the first
#                 (dc:)identifier element in the record's metadata and
#                 rewrite the record to point at the local copy
#   $maxdocs    - stop once this many records have been saved; a value the
#                 counter never reaches (such as -1) means no limit
#   $out        - handle that progress messages are printed to
#
# Every ':' in an identifier becomes a directory separator, so identifier
# "a:b:c" is stored as <output_dir>/a/b/c.oai.  Dies when the wget pipe
# cannot be started or when a record cannot be written.
#
# NOTE(review): the identifier is still interpolated into a shell command
# line for the wget pipe.  Identifiers come from the remote server, so this
# deserves hardening -- TODO confirm which quoting works on all supported
# platforms.
sub get_oai_records {
    my ($base_url, $ids, $output_dir, $get_id, $maxdocs, $out) = @_;

    my $doc_count = 0;

    foreach my $i (@$ids) {
        my $url = "$base_url?verb=GetRecord&metadataPrefix=oai_dc&identifier=$i";
        print $out "Downloading metadata record for $i\n";

        # '/'-separated form, used only to work out which directory to create.
        (my $i_url = $i) =~ s/:/\//g;
        my $file_i_url = "$output_dir/$i_url.oai";

        # OS-separator form, used as the name of the file actually written.
        my $ds = &util::get_dirsep();
        (my $i_os = $i) =~ s/:/$ds/g;
        my $file_i = &util::filename_cat($output_dir, "$i_os.oai");

        # Obtain the record.
        my $wget_cmd = "wget $wgetopt -q -O - \"$url\"";
        open(my $oai_in, "$wget_cmd |")
            || die "wget request failed: $!\n";
        my $i_record = join("", <$oai_in>);
        # close() on a pipe is false when wget exited non-zero.
        close($oai_in)
            || print $out "Warning: wget did not finish cleanly for $i (status $?)\n";

        # Prepare the subdirectory for the record (if needed).
        my ($i_dir) = dir_file_split($file_i_url);
        &util::mk_all_dir($i_dir);

        # Look for an identifier tag inside the metadata section.
        if ($get_id && $i_record =~ m/<metadata>(.*)<\/metadata>/s) {
            my $m_record = $1;

            if ($m_record =~ m/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>/s) {
                my $doc_url = $2;
                get_oai_document($doc_url, $i_dir, $out);

                my (undef, $id_fname) = dir_file_split($doc_url);

                # \Q...\E makes the URL match literally.  Without it a URL
                # containing regex metacharacters (for example the '?' of a
                # query string) never matched and the record was left
                # pointing at the remote document.
                $i_record =~ s{<metadata>(.*?)<(dc:)?identifier>\Q$doc_url\E</(dc:)?identifier>(.*?)</metadata>}{<metadata>$1<OrigURL>$doc_url</OrigURL>\n <identifier>srcdocs/$id_fname</identifier>$4</metadata>}s;
            }
        }

        # Save the record.  Three-argument open keeps characters in the
        # remotely derived file name from being read as an open mode.
        open(my $oai_out, ">", $file_i)
            || die "Unable to save oai metadata record: $!\n";
        print $oai_out $i_record;
        # Buffered write errors only surface at close time.
        close($oai_out)
            || die "Unable to finish writing $file_i: $!\n";

        $doc_count++;
        last if ($doc_count == $maxdocs);
    }
}
226
227
# Entry point: parse the command line, read the collection's configuration,
# and run every OAI "acquire" entry found in it.
#
# Steps, in order:
#   1. Parse the options in @ARGV with parsargv::parse; print usage and die
#      on failure.
#   2. Resolve -out to a file handle (STDERR, STDOUT, or a newly created
#      file) and turn on autoflush for it.
#   3. Select the collection with util::use_collection.
#   4. Read etc/collect.cfg under $ENV{'GSDLCOLLECTDIR'}; it may supply the
#      acquire list, importdir and removeold.  Dies if the file is missing.
#   5. Optionally empty the import directory (after a 5 second pause).
#   6. For each acquire entry of type OAI: list the identifiers at its -src
#      URL and download the records (and the documents, with -getdoc).
#      Entries of any other type are skipped with a warning.
sub main {
    my ($verbosity, $getdoc, $importdir, $keepold,
        $removeold, $gzip, $debug, $maxdocs, $collection,
        $configfilename, $collectcfg,
        $out, $collectdir);

    # $verbosity, $gzip and $debug are accepted but not used further in
    # this script.
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'getdoc', \$getdoc,
                         'importdir/.*/', \$importdir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out)) {
        &print_usage();
        die "\n";
    }

    # Turn -out into a real handle.  The earlier code stored the name
    # 'import::OUT', but this file is in package main and no such handle
    # exists, so autoflush died as soon as -out named a file.
    my $close_out = 0;
    if ($out =~ /^STDOUT$/i) {
        $out = \*STDOUT;
    }
    elsif ($out =~ /^STDERR$/i) {
        $out = \*STDERR;
    }
    else {
        open(my $out_fh, ">", $out) || die "Couldn't open output file $out\n";
        $out = $out_fh;
        $close_out = 1;
    }
    $out->autoflush(1);

    # set removeold to false if keepold has been given
    $removeold = 0 if ($keepold);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &print_usage();
        die "\n";
    }

    # get acquire list
    my $acquire = [];
    $configfilename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg($configfilename);
        if (defined $collectcfg->{'acquire'}) {
            $acquire = $collectcfg->{'acquire'};
        }
        # The command line wins over the configuration file.
        if (defined $collectcfg->{'importdir'} && $importdir eq "") {
            $importdir = $collectcfg->{'importdir'};
        }
        if (defined $collectcfg->{'removeold'}) {
            if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
                $removeold = 1;
            }
            if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
                $removeold = 0;
            }
        }
    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # fill in the default import directory if none
    # was supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;

    # remove the old contents of the import directory if needed
    if ($removeold && -e $importdir) {
        print $out "Warning - removing current contents of the import directory\n";
        print $out " in preparation for the acquire\n";
        sleep(5); # just in case...
        &util::rm_r($importdir);
    }

    foreach my $e (@$acquire) {
        my $acquire_type = shift @$e;
        my $acquire_src = undef;

        if ($acquire_type ne "OAI") {
            print STDERR "Warning: $acquire_type not currently supported. Skipping.\n";
            next;
        }

        # Each entry gets its own copy of the getdoc flag.  An entry's
        # setting can therefore never carry over to later entries, including
        # when this entry is skipped below.
        my $entry_getdoc = $getdoc;

        if (!parsargv::parse($e,
                             'getdoc', \$entry_getdoc,
                             'src/.*/', \$acquire_src)) {
            &print_usage();
            die "\n";
        }

        # An empty source is as unusable as a missing one.
        if (!defined $acquire_src || $acquire_src eq "") {
            print STDERR "Warning: No -src flag defined. Skipping.\n";
            next;
        }

        print $out "$acquire_type Acquire: from $acquire_src\n";

        my $li_record = get_oai_ids($acquire_src, $out);
        my $ids = parse_oai_ids($li_record, $out);

        get_oai_records($acquire_src, $ids, $importdir, $entry_getdoc, $maxdocs, $out);
    }

    close($out) if $close_out;
}
340
341
# Run the script.
main();
343
344
345
346
347
348
349
350
Note: See TracBrowser for help on using the repository browser.