source: trunk/gsdl/bin/script/importfrom.pl@ 4819

Last change on this file since 4819 was 4819, checked in by davidb, 21 years ago

Additional script that downloads metadata records (and documents, if
available and requested) from an external site into the import directory.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.0 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# importfrom.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will contact the named DL server
30# and export its metadata and (optionally) its documents.
31
32# Currently only designed for OAI exporting
33
34
# Compile-time setup: refuse to run unless the GSDLHOME and GSDLOS
# environment variables are set, then put $GSDLHOME/perllib at the front
# of @INC so modules stored there can be loaded.
BEGIN {
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
40
41use colcfg;
42use util;
43use parsargv;
44use FileHandle;
45
46
# Print the command-line usage summary to STDERR.
#
# Takes no arguments and returns nothing useful.  The default shown for
# -collectdir is built from $ENV{'GSDLHOME'} with util::filename_cat.
sub print_usage {
    my $default_collectdir = util::filename_cat($ENV{'GSDLHOME'}, "collect");

    print STDERR "\n  usage: $0 [options] collection-name\n\n";
    print STDERR "  options:\n";
    print STDERR "   -verbosity number      0=none, 3=lots\n";
    print STDERR "   -getdoc                Also download the source document if present\n";
    print STDERR "   -importdir directory   Where the original material lives\n";
    print STDERR "   -keepold               Will not destroy the current contents of the\n";
    print STDERR "                          import directory (the default)\n";
    print STDERR "   -removeold             Will remove the old contents of the import\n";
    print STDERR "                          directory -- use with care\n";
    print STDERR "   -gzip                  Use gzip to compress exported documents\n";
    print STDERR "                          (don't forget to include ZIPPlug in your plugin\n";
    print STDERR "                          list)\n";
    print STDERR "   -maxdocs number        Maximum number of documents to import\n";
    print STDERR "   -debug                 Print imported text to STDOUT\n";
    print STDERR "   -collectdir directory  Collection directory (defaults to " .
        $default_collectdir . ")\n";
    print STDERR "   -out                   Filename or handle to print output status to.\n";
    print STDERR "                          The default is STDERR\n\n";
}
66
67
68
69
# Legacy usage message: prints a one-line synopsis naming the program and
# the expected OAI base URL argument to STDERR, then terminates the script
# with exit status 1.
#
#   $prog_name - program name to show in the synopsis
sub print_usage_old
{
    my $prog_name = shift;

    my $synopsis = "Usage: $prog_name OAI-base-URL\n";
    print STDERR $synopsis;

    exit(1);
}
77
# Ask an OAI repository for its list of record identifiers.
#
# Parameters:
#   $base_url - base URL of the OAI repository; the ListIdentifiers verb and
#               the oai_dc metadataPrefix are appended as a query string
#   $out      - filehandle that progress messages are printed to
#
# Returns the raw, unparsed response text as a single string (empty when
# wget produced no output).  Dies if wget cannot be started.
#
# NOTE: exactly one request is issued, so a repository that splits its
# answer over several responses only contributes the first one.
sub get_oai_ids
{
    my ($base_url, $out) = @_;

    print $out "Requesting list of identifiers ...\n";

    my $request_url = "$base_url?verb=ListIdentifiers&metadataPrefix=oai_dc";

    # List-form pipe open runs wget directly, without a shell, so characters
    # in the URL can never be interpreted as shell syntax.
    open(my $oai_in, '-|', 'wget', '-q', '-O', '-', $request_url)
        or die "wget request failed: $!\n";

    my $li_record = "";
    while (defined(my $line = <$oai_in>)) {
        $li_record .= $line;
    }

    # close() on a pipe returns false when the child exited with an error;
    # report it but still hand back whatever was received.
    if (!close($oai_in)) {
        print STDERR "Error: wget failed for $request_url\n";
    }

    print $out "... Done.\n";

    return $li_record;
}
101
# Extract the record identifiers from a ListIdentifiers response.
#
# Parameters:
#   $li_record - response text, as returned by get_oai_ids
#   $out       - filehandle for status output (accepted for interface
#                compatibility; nothing is printed here)
#
# Returns a reference to an array holding the text of every
# <identifier>...</identifier> element, in document order.  Surrounding
# whitespace is stripped and empty identifiers are dropped, because each
# identifier is later used to build a URL and a filename.
sub parse_oai_ids
{
    my ($li_record, $out) = @_;

    my @ids = ();
    return \@ids unless defined $li_record;

    # A single global scan visits each element once; there is no need to
    # trim the document or re-copy the remainder after every match.
    while ($li_record =~ m/<identifier>(.*?)<\/identifier>/gs) {
        my $id = $1;
        $id =~ s/^\s+//;
        $id =~ s/\s+$//;
        push(@ids, $id) if $id ne "";
    }

    return \@ids;
}
120
121
# Split a "/"-separated path or URL into its leading part and its last
# component.
#
#   $path - the string to split
#
# Returns a two-element list: everything before the final component
# (re-joined with "/", empty when there is only one component) and the
# final component itself.
sub dir_file_split
{
    my $path = shift;

    my @segments   = split(m{/}, $path);
    my $local_file = pop(@segments);

    return (join("/", @segments), $local_file);
}
132
# Download a source document next to its metadata record.
#
# Parameters:
#   $doc_url    - URL of the document to fetch
#   $output_dir - directory of the metadata record; the document is stored
#                 in its ".orig" subdirectory under the last component of
#                 $doc_url
#   $out        - filehandle that progress messages are printed to
#
# Failures (no usable filename, directory cannot be created, wget error)
# are reported on STDERR and are not fatal.
sub get_oai_document
{
    my ($doc_url, $output_dir, $out) = @_;

    # only the final component of the URL is needed for the local name
    my (undef, $id_fname) = dir_file_split($doc_url);

    print $out "Getting document $doc_url\n";

    if (!defined $id_fname || $id_fname eq "") {
        print STDERR "Error: cannot derive a local filename from $doc_url\n";
        return;
    }

    # Built-in mkdir instead of a shell command: the directory name is
    # derived from data supplied by the remote server.
    my $orig_dir = "$output_dir/.orig";
    if (!-e $orig_dir && !mkdir($orig_dir, 0777)) {
        print STDERR "Error: failed to create $orig_dir: $!\n";
        return;
    }

    # List-form system() bypasses the shell, so the URL and the filename
    # are passed to wget verbatim.
    my @wget_cmd = ("wget", "-q", "-O", "$orig_dir/$id_fname", $doc_url);

    (system(@wget_cmd) == 0)
        || print STDERR "Error: failed to execute @wget_cmd\n";
}
149
# Download the oai_dc metadata record for each identifier and save it below
# the import directory.
#
# Parameters:
#   $base_url   - base URL of the OAI repository
#   $ids        - reference to an array of record identifiers
#   $output_dir - directory the records are written under
#   $get_id     - when true, also download the document named by the first
#                 identifier element in the metadata and rewrite the record
#                 to point at the local copy
#   $maxdocs    - stop after this many records have been saved (a value
#                 that is never reached, such as -1, means no limit)
#   $out        - filehandle that progress messages are printed to
#
# Each record is stored as "<identifier>.oai", with every ":" in the
# identifier turned into a directory separator.  Dies when wget cannot be
# started or a record cannot be written; a record that comes back empty is
# reported on STDERR and skipped.
sub get_oai_records
{
    my ($base_url, $ids, $output_dir, $get_id, $maxdocs, $out) = @_;

    # loaded here so that the directory creation below needs no shell
    require File::Path;

    my $doc_count = 0;

    foreach my $i (@$ids) {
        my $url = "$base_url?verb=GetRecord&metadataPrefix=oai_dc";
        $url .= "&identifier=$i";
        print $out "Downloading metadata record for $i\n";

        # Map ':' to '/' in the identifier only; the import directory
        # itself may legitimately contain a colon (e.g. a drive letter).
        my $rel_path = "$i.oai";
        $rel_path =~ s/:/\//g;

        # The identifier comes from the remote server: never let it climb
        # out of the import directory.
        if ($rel_path =~ m{(?:^|/)\.\.(?:/|$)}) {
            print STDERR "Warning: identifier $i contains '..'. Skipping.\n";
            next;
        }
        my $file_i = "$output_dir/$rel_path";

        # obtain record (list-form pipe open: no shell is involved)
        open(my $oai_in, '-|', 'wget', '-q', '-O', '-', $url)
            or die "wget request failed: $!\n";

        my $i_record = "";
        while (defined(my $line = <$oai_in>)) {
            $i_record .= $line;
        }
        close($oai_in);

        if ($i_record eq "") {
            print STDERR "Error: no record received for $i. Skipping.\n";
            next;
        }

        # prepare subdirectory for record (if needed)
        my ($i_dir, $unused) = dir_file_split($file_i);
        if ($i_dir ne "" && !-d $i_dir) {
            File::Path::mkpath($i_dir);
        }

        # look out for identifier tag in metadata section
        if ($get_id && $i_record =~ m/<metadata>(.*)<\/metadata>/s) {
            my $m_record = $1;

            if ($m_record =~ m/<(?:dc:)?identifier>(.*?)<\/(?:dc:)?identifier>/s) {
                my $doc_url = $1;
                get_oai_document($doc_url, $i_dir, $out);

                my ($id_dir, $id_fname) = dir_file_split($doc_url);

                # \Q...\E: the URL is literal text, not a pattern -- it
                # routinely contains '.', '?' and '+'.
                $i_record =~ s{<metadata>(.*?)<(?:dc:)?identifier>\Q$doc_url\E</(?:dc:)?identifier>(.*?)</metadata>}
                              {<metadata>$1<OrigURL>$doc_url</OrigURL>\n <identifier>.orig/$id_fname</identifier>$2</metadata>}s;
            }
        }

        # save record
        open(my $oai_out, '>', $file_i)
            or die "Unable to save oai metadata record: $!\n";
        print $oai_out $i_record;
        # buffered write errors only surface at close
        close($oai_out)
            or die "Unable to save oai metadata record: $!\n";

        $doc_count++;
        last if ($doc_count == $maxdocs);
    }
}
216
217
# Script driver: parse the command line, read the collection's collect.cfg
# and process every "acquire" entry found there, downloading OAI metadata
# records (and source documents when requested) into the import directory.
#
# Options are taken from @ARGV (see print_usage).  Prints the usage text
# and dies when the options or the collection name are invalid; dies when
# collect.cfg cannot be found.
sub main {
    my ($verbosity, $getdoc, $importdir, $keepold,
        $removeold, $gzip, $debug, $maxdocs, $collection,
        $configfilename, $collectcfg,
        $out, $collectdir);

    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'getdoc', \$getdoc,
                         'importdir/.*/', \$importdir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out)) {
        print_usage();
        die "\n";
    }

    # Turn the -out value into a real filehandle.  A lexical handle is used
    # for files: the previous symbolic name referred to a package this
    # script never declares, so writing to a file could not work.
    my $close_out = 0;
    if ($out =~ /^STDERR$/i) {
        $out = \*STDERR;
    }
    elsif ($out =~ /^STDOUT$/i) {
        $out = \*STDOUT;
    }
    else {
        my $out_name = $out;
        open(my $out_fh, '>', $out_name)
            or die "Couldn't open output file $out_name\n";
        $out = $out_fh;
        $close_out = 1;
    }
    $out->autoflush(1);

    # set removeold to false if it has been defined
    $removeold = 0 if ($keepold);

    # get and check the collection name; pass exactly one name so that a
    # missing or extra argument cannot shift $collectdir out of place
    my $collection_arg = @ARGV ? $ARGV[0] : "";
    if ($collection_arg eq "" ||
        ($collection = util::use_collection($collection_arg, $collectdir)) eq "") {
        print_usage();
        die "\n";
    }

    # get acquire list
    my $acquire = [];
    $configfilename = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = colcfg::read_collect_cfg($configfilename);
        if (defined $collectcfg->{'acquire'}) {
            $acquire = $collectcfg->{'acquire'};
        }
        if (defined $collectcfg->{'importdir'} && $importdir eq "") {
            $importdir = $collectcfg->{'importdir'};
        }
        if (defined $collectcfg->{'removeold'}) {
            if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
                $removeold = 1;
            }
            if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
                $removeold = 0;
            }
        }
    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # fill in the default import directory if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;

    # remove the old contents of the import directory if needed
    if ($removeold && -e $importdir) {
        print $out "Warning - removing current contents of the import directory\n";
        print $out " in preparation for the acquire\n";
        sleep(5); # just in case...
        util::rm_r($importdir);
    }

    foreach my $e (@$acquire) {
        # each acquire entry is a list: the type followed by its options
        my $acquire_type = shift @$e;

        if ($acquire_type ne "OAI") {
            print STDERR "Warning: $acquire_type not currently supported. Skipping.\n";
            next;
        }

        # Parse the entry's options into their own variables so that one
        # entry's -getdoc can never carry over to the entries after it.
        my $acquire_getdoc;
        my $acquire_src;
        if (!parsargv::parse($e,
                             'getdoc', \$acquire_getdoc,
                             'src/.*/', \$acquire_src)) {
            print_usage();
            die "\n";
        }

        if (!defined $acquire_src || $acquire_src eq "") {
            print STDERR "Warning: No -src flag defined. Skipping.\n";
            next;
        }

        print $out "$acquire_type Acquire: from $acquire_src\n";

        # documents are fetched when requested either on the command line
        # or by this particular acquire entry
        my $want_docs = ($getdoc || $acquire_getdoc) ? 1 : 0;

        my $li_record = get_oai_ids($acquire_src, $out);
        my $ids = parse_oai_ids($li_record, $out);

        get_oai_records($acquire_src, $ids, $importdir, $want_docs, $maxdocs, $out);
    }

    close($out) if $close_out;
}
330
331
# Run the script; main() takes its options from @ARGV.
main();
333
334
335
336
337
338
339
340
Note: See TracBrowser for help on using the repository browser.