Context Navigation

source: gsdl/trunk/bin/script/importfrom.pl@ 17198

Last change on this file since 17198 was 17198, checked in by kjdon, 16 years ago
changed ZIPPlug to ZIPPlugin in print usage
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 11.9 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# importfrom.pl --
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# This program will contact the named DL server
30	# and export its metadata and (optionally) its documents.
31
32	# Currently only designed for OAI exporting
33
# Refuse to start unless the Greenstone environment variables are present,
# then put Greenstone's perllib directory at the front of the module search
# path so the 'use' lines that follow can be resolved.
BEGIN {
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
39
40	use colcfg;
41	use util;
42	use parsargv;
43	use FileHandle;
44
# File-level state shared by the subroutines below:
#   $wgetopt        extra options spliced into every wget command line
#                   (empty here)
#   $num_processed  count of metadata records fetched so far; printed at the
#                   end of main()
my ($wgetopt, $num_processed) = ("", 0);
48
# Print the command-line synopsis and the list of supported options to STDERR.
#
# Takes no arguments and returns nothing useful. The default shown for
# -collectdir is built from $ENV{'GSDLHOME'} with util::filename_cat.
sub print_usage {
    my $default_collectdir = &util::filename_cat($ENV{'GSDLHOME'}, "collect");

    print STDERR "\n  usage: $0 [options] collection-name\n\n";
    print STDERR "  options:\n";
    print STDERR "   -verbosity number      0=none, 3=lots\n";
    print STDERR "   -getdoc                Also download the source document if present\n";
    # -info is accepted by main() but was missing from this summary.
    print STDERR "   -info                  Print general and set information for each\n";
    print STDERR "                          OAI source instead of importing from it\n";
    print STDERR "   -importdir directory   Where the original material lives\n";
    print STDERR "   -keepold               Will not destroy the current contents of the\n";
    print STDERR "                          import directory (the default)\n";
    print STDERR "   -removeold             Will remove the old contents of the import\n";
    print STDERR "                          directory -- use with care\n";
    print STDERR "   -gzip                  Use gzip to compress exported documents\n";
    print STDERR "                          (don't forget to include ZIPPlugin in your plugin\n";
    print STDERR "                          list)\n";
    print STDERR "   -maxdocs number        Maximum number of documents to import\n";
    print STDERR "   -debug                 Print imported text to STDOUT\n";
    print STDERR "   -collectdir directory  Collection directory (defaults to " .
        $default_collectdir . ")\n";
    print STDERR "   -out                   Filename or handle to print output status to.\n";
    print STDERR "                          The default is STDERR\n\n";
}
68
69
70
# Show a chunk of XML, reformatted by xmllint when that tool is available.
#
# Parameters:
#   $text      - the XML text to display
#   $out       - filehandle used for the verbatim fallback
#   $verbosity - when greater than 1, a warning is printed to STDERR if
#                xmllint cannot be found
#
# When xmllint is missing, or the pipe to it cannot be opened, $text is
# printed unchanged to $out. Otherwise $text is piped into
# "xmllint --format -", whose output goes to this process's STDOUT (the
# child inherits it), not to $out.
sub xml_pretty_print
{
    my ($text, $out, $verbosity) = @_;

    # Use the platform's null device rather than assuming /dev/null exists.
    require File::Spec;
    my $devnull = File::Spec->devnull();

    if (system("xmllint --version >$devnull 2>&1") != 0) {
        if ($verbosity > 1) {
            print STDERR "Warning: Unable to find xmllint for pretty printing.\n";
            print STDERR " XML will be shown verbatim.\n\n";
        }
        print $out $text;
        return;
    }

    my $pp_out;
    if (!open($pp_out, '|-', 'xmllint --format -')) {
        print STDERR "Error running xmllint: $!\n\n";
        print $out $text;
        return;
    }

    print $pp_out $text;

    # close() on a pipe waits for the child; a false result means xmllint
    # exited non-zero (for example on malformed XML). Report it instead of
    # silently dropping the failure.
    if (!close($pp_out)) {
        print STDERR "Warning: xmllint exited abnormally (status $?)\n";
    }
    return;
}
94
# Run a ready-made wget command line and return everything it writes to
# its standard output.
#
# Parameters:
#   $wget_cmd  - complete shell command to run (built by the caller)
#   $out       - filehandle that receives the command echo
#   $verbosity - when greater than 2, the command is echoed to $out first
#
# Returns the captured output as one string ("" if there was none).
# Dies if the pipe cannot be started. A non-zero exit status from the
# command only produces a warning on STDERR; whatever was captured is
# still returned.
sub wget_oai_url
{
    my ($wget_cmd, $out, $verbosity) = @_;

    if ($verbosity > 2) {
        print $out " $wget_cmd\n";
    }

    open(my $oai_in, '-|', $wget_cmd)
        or die "wget request failed: $!\n";

    # Slurp the whole response in one read.
    my $li_record = do { local $/; <$oai_in> };
    $li_record = "" unless defined $li_record;

    # For a pipe, close() returns false when the child exited non-zero.
    if (!close($oai_in)) {
        print STDERR "Warning: command exited abnormally (status $?): $wget_cmd\n";
    }

    return $li_record;
}
119
# Print two reports about an OAI server: the response to verb=Identify
# under the heading "General Information", then the response to
# verb=ListSets under "Set Information".
#
# Parameters:
#   $base_url  - base URL of the OAI server
#   $out       - filehandle for the headings (also handed to the helpers)
#   $verbosity - passed through to wget_oai_url and xml_pretty_print
sub oai_info
{
    my ($base_url, $out, $verbosity) = @_;

    # _OPTS_ is a placeholder swapped for the query string of each request.
    my $cmd_template = "wget $wgetopt -q -O - \"$base_url?_OPTS_\"";

    my @reports = (
        [ "General Information", "verb=Identify" ],
        [ "Set Information",     "verb=ListSets" ],
    );

    foreach my $report (@reports) {
        my ($heading, $query) = @$report;

        (my $request_cmd = $cmd_template) =~ s/_OPTS_/$query/;

        print $out "-------------------\n";
        print $out "$heading\n";
        print $out "-------------------\n";

        my $response_text = wget_oai_url($request_cmd, $out, $verbosity);
        xml_pretty_print($response_text, $out, $verbosity);
    }
}
147
148
# Ask an OAI server for its identifier list (verb=ListIdentifiers) and
# return the raw response text.
#
# Parameters:
#   $base_url  - base URL of the OAI server
#   $set       - set to restrict the listing to; ignored when undefined
#                or empty
#   $format    - value sent as metadataPrefix
#   $out       - filehandle for progress messages
#   $verbosity - passed through to wget_oai_url
sub get_oai_ids
{
    my ($base_url, $set, $format, $out, $verbosity) = @_;

    print $out "Requesting list of identifiers ...\n";

    my $query = "verb=ListIdentifiers&metadataPrefix=$format";
    $query .= "&set=$set" unless (!defined $set || $set eq "");

    # Build the command from a template and drop the query string into
    # its _OPTS_ placeholder.
    my $request_cmd = "wget $wgetopt -q -O - \"$base_url?_OPTS_\"";
    $request_cmd =~ s/_OPTS_/$query/;

    my $id_listing = wget_oai_url($request_cmd, $out, $verbosity);

    print $out "... Done.\n";

    return $id_listing;
}
172
# Pull every identifier out of a ListIdentifiers response.
#
# Parameters:
#   $li_record - response text to scan (may be undefined or empty)
#   $out       - unused; kept so existing callers need not change
#   $verbosity - unused; kept so existing callers need not change
#
# Returns a reference to an array holding the text found between each
# <identifier> and </identifier> pair, in document order. The array is
# empty when nothing matches.
sub parse_oai_ids
{
    my ($li_record, $out, $verbosity) = @_;

    return [] unless defined $li_record;

    # Non-greedy capture so each pair is matched separately; /s lets an
    # identifier span a line break, /g collects all of them.
    my @ids = ($li_record =~ m{<identifier>(.*?)</identifier>}gs);

    return \@ids;
}
191
192
# Split a "/"-separated path or URL into its leading part and its final
# component.
#
# Returns a two-element list: everything before the last component
# (joined back together with "/"), then the last component itself. Because
# split discards trailing empty fields, a trailing "/" is ignored; for an
# empty string the second element is undef.
sub dir_file_split
{
    my ($file) = @_;

    my @segments   = split(m{/}, $file);
    my $last_index = $#segments;

    my $leaf   = ($last_index >= 0) ? $segments[$last_index] : undef;
    my $parent = join("/", @segments[0 .. $last_index - 1]);

    return ($parent, $leaf);
}
203
# Download one source document with wget into $output_dir.
#
# Parameters:
#   $doc_url    - URL of the document; its last "/"-separated component
#                 becomes the local file name
#   $output_dir - directory to save into; created with util::mk_dir if it
#                 does not exist yet
#   $out        - filehandle for the progress message
#
# Returns 1 when wget exits with status 0, otherwise 0 (after printing an
# error to STDERR).
sub get_oai_document
{
    my ($doc_url, $output_dir, $out) = @_;

    my (undef, $id_fname) = dir_file_split($doc_url);

    # Without a final path component there is nothing to name the file after.
    if (!defined $id_fname || $id_fname eq "") {
        print STDERR "Error: cannot derive a local file name from $doc_url\n";
        return 0;
    }

    print $out "Getting document $doc_url\n";

    &util::mk_dir($output_dir) if (!-e "$output_dir");

    my $full_id_fname = &util::filename_cat($output_dir, $id_fname);

    # $doc_url is taken from a remote server's metadata, so it must never
    # reach a shell. The list form of system() runs wget directly with each
    # argument passed verbatim. $wgetopt is split on whitespace into
    # separate options.
    my @wget_args = (split(' ', $wgetopt), '--quiet', '-O', $full_id_fname, $doc_url);

    if (system('wget', @wget_args) != 0) {
        print STDERR "Error: failed to execute wget " . join(' ', @wget_args) . "\n";
        return 0;
    }

    return 1;
}
225
# Fetch the metadata record for each identifier (verb=GetRecord) and save
# it as "<identifier>.oai" under $output_dir, turning every ":" in the
# identifier into a directory level.
#
# Parameters:
#   $base_url   - base URL of the OAI server
#   $format     - value sent as metadataPrefix
#   $ids        - reference to the array of identifiers to fetch
#   $output_dir - root directory for the saved records
#   $get_id     - when true, also try to download the documents named by
#                 <identifier>/<dc:identifier> elements inside <metadata>
#   $maxdocs    - stop after this many records; a value the counter never
#                 reaches (such as -1) means no limit
#   $out        - filehandle for progress messages
#
# Side effects: increments the file-level $num_processed once per record
# fetched. Dies if wget cannot be started or a record file cannot be
# written.
#
# When $get_id is true and a document is downloaded, its identifier element
# in the saved record is rewritten to point at the local copy under
# "srcdocs/" and the original URL is kept in an <OrigURL> element. Until a
# document has been downloaded for this record, each identifier examined is
# renamed to <OrigIdentifier> instead.
#
# NOTE(review): identifiers come from the remote server and are used to
# build local paths without validation - confirm whether sanitising (for
# example rejecting "..") is wanted here.
sub get_oai_records
{
    my ($base_url, $format, $ids, $output_dir, $get_id, $maxdocs, $out) = @_;

    my $doc_count = 0;
    my $ds = &util::get_dirsep();

    foreach my $i (@$ids)
    {
        my $url = "$base_url?verb=GetRecord&metadataPrefix=$format";
        $url .= "&identifier=$i";
        print $out "Downloading metadata record for $i\n";

        # "/"-separated form of the identifier, used to work out which
        # directories need creating.
        (my $i_url = $i) =~ s/:/\//g;
        my $file_i_url = "$output_dir/$i_url.oai";

        # Same path using the OS directory separator, used for writing.
        (my $i_os = $i) =~ s/:/$ds/g;
        my $file_i = &util::filename_cat($output_dir, "$i_os.oai");

        # Obtain the record. The list form of the pipe open runs wget
        # without a shell, so characters in the remote-supplied identifier
        # cannot be interpreted as shell syntax.
        open(my $oai_in, '-|', 'wget', split(' ', $wgetopt), '-q', '-O', '-', $url)
            or die "wget request failed: $!\n";

        my $i_record = do { local $/; <$oai_in> };
        $i_record = "" unless defined $i_record;

        # A failed download is reported but, as before, whatever was
        # received is still saved.
        if (!close($oai_in)) {
            print STDERR "Warning: wget exited abnormally (status $?) for $url\n";
        }

        $num_processed++;

        # Prepare the subdirectory for the record (if needed).
        my ($i_dir, undef) = dir_file_split($file_i_url);
        &util::mk_all_dir($i_dir);

        # Look for identifier tags in the metadata section.
        if ($get_id && $i_record =~ m/<metadata>(.*)<\/metadata>/s)
        {
            my $m_record = $1;
            my $got_doc  = 0;

            my @url_matches =
                ($m_record =~ m/<(?:dc:)?identifier>(.*?)<\/(?:dc:)?identifier>/gs);

            foreach my $doc_url (@url_matches)
            {
                # \Q...\E makes the URL match literally; URLs routinely
                # contain regex metacharacters such as "." and "?".
                my $id_elem = qr{<(?:dc:)?identifier>\Q$doc_url\E</(?:dc:)?identifier>};

                # Only identifiers that look like fetchable URLs are
                # downloaded (https is accepted alongside http and ftp).
                if ($doc_url =~ m/^(?:https?|ftp):/) {
                    my $srcdocs_dir = &util::filename_cat($i_dir, "srcdocs");

                    if (get_oai_document($doc_url, $srcdocs_dir, $out)) {
                        $got_doc = 1;
                        my (undef, $id_fname) = dir_file_split($doc_url);

                        $i_record =~ s{<metadata>(.*?)$id_elem(.*?)</metadata>}{<metadata>$1<OrigURL>$doc_url</OrigURL>\n <identifier>srcdocs/$id_fname</identifier>$2</metadata>}s;
                    }
                }

                if (!$got_doc) {
                    $i_record =~ s{<metadata>(.*?)$id_elem(.*?)</metadata>}{<metadata>$1<OrigIdentifier>$doc_url</OrigIdentifier>$2</metadata>}s;
                }
            }
        }

        # Save the record.
        open(my $oai_out, '>', $file_i)
            or die "Unable to save oai metadata record: $!\n";
        print $oai_out $i_record;
        close($oai_out)
            or die "Unable to save oai metadata record: $!\n";

        $doc_count++;
        last if ($doc_count == $maxdocs);
    }
}
317
318
# Program entry point: parse the command line, read the collection's
# etc/collect.cfg, and process each "acquire" entry found there.
#
# Only acquire entries of type "OAI" are handled; others are skipped with
# a warning. With -info, each source's details are printed and nothing is
# imported. Otherwise the identifier list is fetched and parsed, and the
# records (plus source documents when -getdoc is in effect) are stored
# under the import directory.
#
# Dies on invalid arguments, an unknown collection, a missing
# configuration file, or an output file that cannot be opened.
sub main {
    my ($verbosity, $importdir, $keepold,
        $getdoc, $acquire_info, $acquire_set, $metadata_format,
        $removeold, $gzip, $debug, $maxdocs, $collection,
        $configfilename, $collectcfg,
        $out, $collectdir);

    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'getdoc', \$getdoc,
                         'info', \$acquire_info,
                         'importdir/.*/', \$importdir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out)) {
        &print_usage();
        die "\n";
    }

    # Turn -out into a real filehandle. STDERR/STDOUT are matched in any
    # letter case and mapped to the actual handles; anything else is taken
    # as a file name. A lexical handle is used because the script has no
    # "import" package, so the former 'import::OUT' name referred to a
    # handle that was never opened.
    my $close_out = 0;
    if ($out =~ /^STDERR$/i) {
        $out = \*STDERR;
    }
    elsif ($out =~ /^STDOUT$/i) {
        $out = \*STDOUT;
    }
    else {
        open(my $out_fh, '>', $out)
            or die "Couldn't open output file $out: $!\n";
        $out = $out_fh;
        $close_out = 1;
    }
    $out->autoflush(1);

    # -keepold on the command line overrides -removeold.
    $removeold = 0 if ($keepold);

    # Get and check the collection name.
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &print_usage();
        die "\n";
    }

    # Get the acquire list (and related defaults) from collect.cfg.
    my $acquire = [];
    $configfilename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg($configfilename);
        if (defined $collectcfg->{'acquire'}) {
            $acquire = $collectcfg->{'acquire'};
        }
        # Command-line values take precedence over the configuration file.
        if (defined $collectcfg->{'importdir'} && $importdir eq "") {
            $importdir = $collectcfg->{'importdir'};
        }
        if (defined $collectcfg->{'removeold'}) {
            if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
                $removeold = 1;
            }
            if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
                $removeold = 0;
            }
        }
    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # Fill in the default import directory if none was supplied, turn all
    # \ into / and remove a trailing /.
    $importdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;

    # Remove the old contents of the import directory if needed.
    if ($removeold && -e $importdir) {
        print $out "Warning - removing current contents of the import directory\n";
        print $out " in preparation for the acquire\n";
        &util::rm_r($importdir);
    }

    foreach my $e (@$acquire)
    {
        # The first word of an acquire entry is its type; the remainder is
        # parsed as options below.
        my $acquire_type = shift @$e;
        my $acquire_src = undef;

        if ($acquire_type ne "OAI") {
            print STDERR "Warning: $acquire_type not currently supported. Skipping.\n";
            next;
        }

        # A -getdoc inside the entry applies to that entry only; the
        # command-line value is restored at the end of the iteration.
        my $store_getdoc = $getdoc;

        if (!parsargv::parse($e,
                             'getdoc', \$getdoc,
                             'set/.*/', \$acquire_set,
                             'format/.*/oai_dc', \$metadata_format,
                             'src/.*/', \$acquire_src)) {
            &print_usage();
            die "\n";
        }

        if (!defined $acquire_src) {
            print STDERR "Warning: No -src flag defined. Skipping.\n";
            next;
        }

        if (defined $acquire_info && ($acquire_info)) {
            oai_info($acquire_src, $out, $verbosity);
            next;
        }

        print $out "$acquire_type Acquire: from $acquire_src\n";

        my $li_record = get_oai_ids($acquire_src, $acquire_set, $metadata_format,
                                    $out, $verbosity);
        my $ids = parse_oai_ids($li_record, $out, $verbosity);

        get_oai_records($acquire_src, $metadata_format, $ids, $importdir,
                        $getdoc, $maxdocs, $out);
        $getdoc = $store_getdoc;
    }

    # The summary goes to STDOUT rather than $out, as it did originally.
    print "\nNumber of documents processed: $num_processed\n";

    close($out) if $close_out;
}
443
444
# Run the importer.
main();
446
447
448
449
450
451
452
453

Note: See TracBrowser for help on using the repository browser.

Download in other formats: