source: documented-example-collections/trunk/oai-e/exportfrom.pl@ 18738

Last change on this file since 18738 was 18738, checked in by oranfry, 15 years ago

the rest of the documented example collections

  • Property svn:executable set to *
File size: 8.5 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# exportfrom.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will contact the named DL server
30# and export its metadata and (optionally) its documents.
31
32# Currently only designed for OAI exporting
33
34
# Compile-time setup: abort unless the Greenstone environment variables
# are defined, then put the Greenstone Perl library directory at the
# front of @INC so the `use` statements below can be resolved.
BEGIN {
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
40
41use colcfg;
42use util;
43use parsargv;
44use FileHandle;
45
46
# Print the command-line synopsis and the option summary to STDERR.
#
# Takes no arguments.  The default shown for -collectdir is built from
# $ENV{'GSDLHOME'} with util::filename_cat.  The caller decides whether
# to terminate afterwards.
sub print_usage {
    my $default_collectdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect");

    print STDERR "\n usage: $0 [options] collection-name\n\n";
    print STDERR " options:\n";
    print STDERR " -verbosity number 0=none, 3=lots\n";
    # Wording fixed: the original read "Also download if source document if present".
    print STDERR " -getid Also download the source document if present\n";
    print STDERR " -importdir directory Where the original material lives\n";
    print STDERR " -keepold Will not destroy the current contents of the\n";
    print STDERR " import directory (the default)\n";
    print STDERR " -removeold Will remove the old contents of the import\n";
    print STDERR " directory -- use with care\n";
    print STDERR " -gzip Use gzip to compress exported documents\n";
    # The parenthesised note was left unterminated in the original text.
    print STDERR " (don't forget to include ZIPPlug in your plugin\n";
    print STDERR " list)\n";
    print STDERR " -maxdocs number Maximum number of documents to import\n";
    print STDERR " -debug Print imported text to STDOUT\n";
    print STDERR " -collectdir directory Collection directory (defaults to " .
        $default_collectdir . ")\n";
    print STDERR " -out Filename or handle to print output status to.\n";
    print STDERR " The default is STDERR\n\n";
}
66
67
68
69
# Legacy usage message: report the expected single argument on STDERR
# and terminate the program with exit status 1.
#
#   $prog_name - name to show as the program in the message
sub print_usage_old
{
    my $prog_name = shift;

    print STDERR "Usage: " . $prog_name . " OAI-base-URL\n";
    exit 1;
}
77
# Fetch the raw response of an OAI ListIdentifiers request.
#
# Arguments:
#   $base_url - base URL of the OAI server; "?verb=ListIdentifiers" is
#               appended to it
#   $out      - filehandle that progress messages are printed to
#
# Returns the complete response as one string (empty when wget produced
# no output).  Dies when the wget pipe cannot be started; a non-zero
# wget exit status is reported on STDERR but does not abort.
sub get_oai_ids
{
    my ($base_url, $out) = @_;

    print $out "Requesting list of identifiers ...\n";

    # Quote the URL for the shell exactly as get_oai_records does, so that
    # characters such as '?' and '&' reach wget instead of being
    # interpreted by the shell.
    my $wget_cmd = "wget -q -O - \"$base_url?verb=ListIdentifiers\"";

    open (my $oai_in, '-|', $wget_cmd)
        || die "wget request failed: $!\n";

    # List-context read yields every line; join gives "" for empty output.
    my $li_record = join("", <$oai_in>);

    # close() on a pipe returns false when the child exited non-zero.
    close($oai_in)
        || print STDERR "Error: failed to execute $wget_cmd\n";

    print $out "... Done.\n";

    return $li_record;
}
101
# Extract every <identifier>...</identifier> value from a ListIdentifiers
# response.
#
# Arguments:
#   $li_record - response text as returned by get_oai_ids
#   $out       - status filehandle (accepted for interface symmetry; unused)
#
# Returns a reference to an array of the identifier strings in document
# order; the array is empty when the text contains no identifier element.
sub parse_oai_ids
{
    my ($li_record, $out) = @_;

    return [] unless defined $li_record;

    # One global non-greedy match collects all identifiers in a single pass.
    # The previous loop re-captured and copied the remainder of the string
    # for every identifier, which is quadratic in the response size.
    my @ids = ($li_record =~ m/<identifier>(.*?)<\/identifier>/sg);

    return \@ids;
}
120
121
# Split a "/"-separated path into its directory part and its final
# component.
#
#   $file - path or URL to split
#
# Returns the two-element list (directory, last component).  The directory
# is "" when the path has no "/"; because split discards trailing empty
# fields, a trailing "/" is ignored.
sub dir_file_split
{
    my ($file) = @_;

    my @components = split(m{/}, $file);
    my $leaf = pop(@components);

    return (join("/", @components), $leaf);
}
132
# Download the source document referenced by a metadata record into the
# "srcdocs" subdirectory of the record's directory.
#
# Arguments:
#   $doc_url    - URL of the document (taken from the remote metadata)
#   $output_dir - directory of the record; "$output_dir/srcdocs" is created
#                 when it does not exist yet
#   $out        - filehandle that progress messages are printed to
#
# Failures are reported on STDERR and are not fatal.
sub get_oai_document
{
    my ($doc_url,$output_dir, $out) = @_;

    # Only the last path component of the URL is used as the local name.
    my ($unused_dir,$id_fname) = dir_file_split($doc_url);

    print $out "Getting document $doc_url\n";

    if (!defined $id_fname || $id_fname eq "") {
        print STDERR "Error: cannot derive a file name from $doc_url\n";
        return;
    }

    my $srcdocs_dir = "$output_dir/srcdocs";
    if (!-e $srcdocs_dir && !mkdir($srcdocs_dir, 0777)) {
        print STDERR "Error: failed to create $srcdocs_dir: $!\n";
        return;
    }

    # The URL comes from a remote server, so wget is run with list-form
    # system(): no shell is involved and quotes, backticks or "$(...)"
    # inside the URL cannot be executed.
    my @wget_cmd = ("wget", "-q", "-O", "$srcdocs_dir/$id_fname", $doc_url);

    (system(@wget_cmd)==0)
        || print STDERR "Error: failed to execute @wget_cmd\n";
}
149
# Download the oai_dc metadata record of every identifier and store each
# one as a ".oai" file below the output directory.
#
# Arguments:
#   $base_url   - base URL of the OAI server
#   $ids        - reference to the array of identifiers to fetch
#   $output_dir - directory the records are written under; every ":" in an
#                 identifier becomes a "/" in the record's path
#   $get_id     - when true, also download the document named by the first
#                 <identifier> in the record's <metadata> section and
#                 rewrite the record to point at the local copy
#   $maxdocs    - stop after this many records (a negative value never
#                 matches the counter, so nothing is cut off)
#   $out        - filehandle that progress messages are printed to
#
# Dies when wget cannot be started or a record file cannot be written.
sub get_oai_records
{
    my ($base_url,$ids,$output_dir, $get_id, $maxdocs, $out) = @_;

    # Core module, loaded here so this routine does not depend on the
    # file's import list.
    require File::Path;

    my $doc_count = 0;

    foreach my $id ( @$ids )
    {
        # Map the identifier onto a relative path.  Only the identifier is
        # rewritten: applying s/:/\// to the whole path would also mangle a
        # colon inside $output_dir (e.g. a Windows drive letter).
        (my $id_path = $id) =~ s/:/\//g;

        # Identifiers come from the remote server; never let one climb out
        # of the output directory.
        if (grep { $_ eq ".." } split("/", $id_path)) {
            print $out "Skipping record with unsafe identifier $id\n";
            next;
        }
        my $file_i = "$output_dir/$id_path.oai";

        # Percent-encode the identifier for use as a query argument (OAI-PMH
        # requires reserved characters such as ":" "/" "&" to be encoded).
        # This also keeps shell-active characters out of the command line.
        (my $id_param = $id) =~ s/([^A-Za-z0-9\-_.~])/sprintf("%%%02X", ord($1))/ge;

        my $url = "$base_url?verb=GetRecord&metadataPrefix=oai_dc";
        $url .= "&identifier=$id_param";
        print $out "Downloading metadata record for $id\n";

        # obtain record
        my $wget_cmd = "wget -q -O - \"$url\"";

        open (my $oai_in, '-|', $wget_cmd)
            || die "wget request failed: $!\n";
        my $i_record = join("", <$oai_in>);

        # close() on a pipe returns false when wget exited non-zero.
        close($oai_in)
            || print STDERR "Error: failed to execute $wget_cmd\n";

        # prepare subdirectory for record (if needed)
        my ($i_dir,$unused) = dir_file_split($file_i);
        if ($i_dir ne "" && !-d $i_dir) {
            # mkpath croaks on failure; report it and let the open below
            # produce the fatal error.
            eval { File::Path::mkpath($i_dir); };
            print STDERR "Error: failed to create $i_dir: $@" if $@;
        }

        # look out for identifier tag in metadata section
        if ($get_id && $i_record =~ m/<metadata>(.*)<\/metadata>/s)
        {
            my $m_record = $1;

            if ($m_record =~ m/<identifier>(.*?)<\/identifier>/s)
            {
                my $doc_url = $1;
                get_oai_document($doc_url,$i_dir, $out);

                my ($id_dir,$id_fname) = dir_file_split($doc_url);

                if (defined $id_fname && $id_fname ne "") {
                    # \Q..\E: the URL is literal text, not a pattern; without
                    # it a "?" or "+" in the URL prevents the match.
                    $i_record =~ s/<metadata>(.*?)<identifier>\Q$doc_url\E<\/identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$doc_url<\/OrigURL><identifier>srcdocs\/$id_fname<\/identifier>$2<\/metadata>/s;
                }
            }
        }

        # save record
        open (my $oai_out, '>', $file_i)
            || die "Unable to save oai metadata record: $!\n";
        print $oai_out $i_record;
        close($oai_out);

        $doc_count++;
        last if ($doc_count == $maxdocs);
    }
}
216
217
# Program entry point: parse the command line, read the collection's
# collect.cfg, optionally clear the import directory, and export the
# records of every server listed under "export" in the configuration.
#
# Reads @ARGV; dies (after printing usage) on bad options or an unknown
# collection, and when the configuration file is missing.
sub main {
    # $getid was previously an undeclared package global.
    my ($verbosity, $getid, $importdir, $keepold,
        $removeold, $gzip, $debug, $maxdocs, $collection,
        $configfilename, $collectcfg,
        $out, $collectdir);

    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'getid', \$getid,
                         'importdir/.*/', \$importdir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out)) {
        &print_usage();
        die "\n";
    }

    # Turn -out into a real filehandle.  The previous code stored the name
    # 'import::OUT', but OUT was opened in this file's own package, so all
    # status output went to an unopened handle.  Glob references are used
    # for the standard streams so that "stderr"/"stdout" in any letter
    # case work as well.
    my $close_out = 0;
    if ($out =~ /^STDOUT$/i) {
        $out = \*STDOUT;
    }
    elsif ($out =~ /^STDERR$/i) {
        $out = \*STDERR;
    }
    else {
        my $out_name = $out;
        open (my $out_fh, '>', $out_name) || die "Couldn't open output file $out_name\n";
        $out = $out_fh;
        $close_out = 1;
    }
    $out->autoflush(1);

    # set removeold to false if it has been defined
    $removeold = 0 if ($keepold);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &print_usage();
        die "\n";
    }

    # get the list of export servers for this collection
    my $export = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
        if (defined $collectcfg->{'export'}) {
            $export = $collectcfg->{'export'};
        }
        # command-line -importdir wins over the configured value
        if (defined $collectcfg->{'importdir'} && $importdir eq "") {
            $importdir = $collectcfg->{'importdir'};
        }
        if (defined $collectcfg->{'removeold'}) {
            if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
                $removeold = 1;
            }
            if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
                $removeold = 0;
            }
        }
    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # fill in the default import directory if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;

    # remove the old contents of the import directory if needed
    if ($removeold && -e $importdir) {
        print $out "Warning - removing current contents of the import directory\n";
        print $out " in preparation for the export\n";
        sleep(5); # just in case...
        &util::rm_r ($importdir);
    }

    # each export entry is an array reference whose first element is the
    # server's base URL
    foreach my $e ( @$export )
    {
        print $out "Exporting from $e->[0]\n";

        my $base_url = $e->[0];

        my $li_record = get_oai_ids($base_url,$out);
        my $ids = parse_oai_ids($li_record,$out);

        get_oai_records($base_url,$ids,$importdir, $getid, $maxdocs, $out);
    }

    close($out) if $close_out;
}
309
310
# Run the exporter.
main();
312
313
314
315
316
317
318
319
Note: See TracBrowser for help on using the repository browser.