source: trunk/gsdl/bin/script/import.pl@ 5882

Last change on this file since 5882 was 5882, checked in by davidb, 20 years ago

'cpan' added to perllib path to help plugins and classifiers find supporting
files

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.1 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
33BEGIN {
34 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
35 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
36 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
37 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
38 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
39 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/classify");
40}
41
42use arcinfo;
43use colcfg;
44use plugin;
45use docprint;
46use util;
47use parsargv;
48use FileHandle;
49use gsprintf;
50use printusage;
51
# Values accepted by the -OIDtype option. Each entry pairs the option value
# with the lookup key of its description string.
my $oidtype_list = [
    { name => 'hash',        desc => '{import.OIDtype.hash}' },
    { name => 'incremental', desc => '{import.OIDtype.incremental}' },
];
57
# Declarative description of every command-line option this script accepts.
# Each entry gives the option name, the lookup key of its description string,
# its type, whether it is required and (where one exists) its default value.
# The structure is handed to the PrintUsage routines to generate usage text.
my $arguments = [
    {
        name => 'archivedir',
        # NOTE(review): the key below is spelled "achivedir", not
        # "archivedir". It is kept as-is because it is a lookup key --
        # confirm against the string resource file before changing it.
        desc => '{import.achivedir}',
        type => 'string',
        reqd => 'no',
    },
    {
        name => 'collectdir',
        desc => '{import.collectdir}',
        type => 'string',
        deft => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        reqd => 'no',
    },
    {
        name => 'debug',
        desc => '{import.debug}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'faillog',
        desc => '{import.faillog}',
        type => 'string',
        # Placeholder path shown in the usage text only.
        deft => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
        reqd => 'no',
    },
    {
        name => 'groupsize',
        desc => '{import.groupsize}',
        type => 'int',
        deft => '1',
        reqd => 'no',
    },
    {
        name => 'gzip',
        desc => '{import.gzip}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'importdir',
        desc => '{import.importdir}',
        type => 'string',
        reqd => 'no',
    },
    {
        name => 'keepold',
        desc => '{import.keepold}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'maxdocs',
        desc => '{import.maxdocs}',
        type => 'int',
        reqd => 'no',
    },
    {
        name => 'OIDtype',
        desc => '{import.OIDtype}',
        type => 'enum',
        list => $oidtype_list,
        deft => 'hash',
        reqd => 'no',
    },
    {
        name => 'out',
        desc => '{import.out}',
        type => 'string',
        deft => 'STDERR',
        reqd => 'no',
    },
    {
        name => 'removeold',
        desc => '{import.removeold}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'sortmeta',
        desc => '{import.sortmeta}',
        type => 'string',
        reqd => 'no',
    },
    {
        name => 'statsfile',
        desc => '{import.statsfile}',
        type => 'string',
        deft => 'STDERR',
        reqd => 'no',
    },
    {
        name => 'verbosity',
        desc => '{import.verbosity}',
        type => 'int',
        deft => '2',
        reqd => 'no',
    },
    {
        name => 'language',
        desc => '{scripts.language}',
        type => 'string',
        reqd => 'no',
    },
];
131
# Top-level description of this script: its name, the lookup key of its
# description string and the list of options it accepts.
my $options = {
    name => 'import.pl',
    desc => '{import.desc}',
    args => $arguments,
};
135
136
# print_xml_usage ($language)
#
# Writes an XML description of this script (name, description and
# arguments) to STDERR.
#
# $language is deliberately localised rather than made lexical:
# lookup_string() reads the package variable $language, so the value must
# be visible to it for the duration of this call.
sub print_xml_usage
{
    my ($requested_language) = @_;
    local $language = $requested_language;

    &PrintUsage::print_xml_header();

    my $err = \*STDERR;

    print $err "<Info>\n";
    print $err " <Name>" . $options->{'name'} . "</Name>\n";

    my $description = &lookup_string($options->{'desc'});
    print $err " <Desc>" . $description . "</Desc>\n";

    print $err " <Arguments>\n";
    my $arg_specs = $options->{'args'};
    if (defined($arg_specs)) {
        &PrintUsage::print_options_xml($language, $arg_specs);
    }
    print $err " </Arguments>\n";

    print $err "</Info>\n";
}
153
154
# print_txt_usage ($language)
#
# Writes the plain-text usage message for this script to STDERR: a usage
# line followed, when options are declared, by the formatted option list.
#
# $language stays localised (not lexical) because lookup_string() reads the
# package variable $language. Every other working value is passed explicitly
# to the routines that need it, so those are plain lexicals and no longer
# create package globals.
sub print_txt_usage
{
    local $language = shift(@_);

    my $programname = $options->{'name'};
    my $programargs = $options->{'args'};

    # Find the length of the longest option string
    my $descoffset = 0;
    if (defined($programargs)) {
        $descoffset = &PrintUsage::find_longest_option_string($programargs);
    }

    # Produce the usage information using the data structure above
    print STDERR " " . &lookup_string("{common.usage}") . ": $programname";
    print STDERR " " . &lookup_string("{import.params}") . "\n\n";

    # Display the program options, if there are some
    if (defined($programargs)) {
        # Column at which option descriptions start: leave 2 spaces
        # between the longest option string and its description
        my $optiondescoffset = $descoffset + 2;

        print STDERR " " . &lookup_string("{common.options}") . ":\n";

        # Display the program options
        &PrintUsage::print_options_txt($language, $programargs, $optiondescoffset);
    }
}
183
184
# lookup_string ($key)
#
# Returns whatever gsprintf::lookup_string yields for $key in the language
# held in the package variable $language (set by the usage routines and by
# command-line parsing in main).
sub lookup_string
{
    my ($string_key) = @_;
    return &gsprintf::lookup_string($language, $string_key);
}
189
190
191# sub print_usage {
192# print STDOUT "\n";
193# print STDOUT "import.pl: Converts documents in collections -importdir directory into\n";
194# print STDOUT " xml documents which are written to the -archivedir directory.\n\n";
195# print STDOUT " usage: $0 [options] collection-name\n\n";
196# print STDOUT " options:\n";
197# print STDOUT " -verbosity number 0=none, 3=lots\n";
198# print STDOUT " -importdir directory Where the original material lives\n";
199# print STDOUT " -archivedir directory Where the converted material ends up\n";
200# print STDOUT " -keepold Will not destroy the current contents of the\n";
201# print STDOUT " archives directory (the default)\n";
202# print STDOUT " -removeold Will remove the old contents of the archives\n";
203# print STDOUT " directory -- use with care\n";
204# print STDOUT " -gzip Use gzip to compress resulting xml documents\n";
205# print STDOUT " (don't forget to include ZIPPlug in your plugin\n";
206# print STDOUT " list when building from compressed documents)\n";
207# print STDOUT " -maxdocs number Maximum number of documents to import\n";
208# print STDOUT " -groupsize number Number of import documents to group into one XML file\n";
209# print STDOUT " -OIDtype hash|incremental The method to use when generating unique\n";
210# print STDOUT " identifiers for each document. \"hash\" (the\n";
211# print STDOUT " default) hashes the contents of the file and so\n";
212# print STDOUT " will be the same every time the collection is\n";
213# print STDOUT " imported. \"incremental\" is a simple document\n";
214# print STDOUT " count and so will be significantly faster than\n";
215# print STDOUT " \"hash\". It is not guaranteed to always assign\n";
216# print STDOUT " the same identifier to a given document though\n";
217# print STDOUT " and does not allow further documents to be added\n";
218# print STDOUT " to existing xml archives\n";
219# print STDOUT " -sortmeta metadata Sort documents alphabetically by metadata for\n";
220# print STDOUT " building. This will be disabled if groupsize > 1\n";
221# print STDOUT " -debug Print imported text to STDOUT\n";
222# print STDOUT " -collectdir directory Collection directory (defaults to " .
223# &util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n";
224# print STDOUT " -out name Filename or handle to print output status to.\n";
225# print STDOUT " The default is STDERR\n";
226# print STDOUT " -statsfile name Filename or handle to print import statistics to.\n";
227# print STDOUT " The default is STDERR\n";
228# print STDOUT " -faillog name Fail log filename. This log receives the filenames\n";
229# print STDOUT " of any files which fail to be processed (defaults\n";
230# print STDOUT " to " .
231# &util::filename_cat("<collectdir>", "colname", "etc", "fail.log") . ")\n";
232# print STDOUT " [Type \"perl -S import.pl | more\" if this help text scrolled off your screen]";
233# print STDOUT "\n" unless $ENV{'GSDLOS'} =~ /^windows$/i;
234# }
235
&main();

# main ()
#
# Entry point of the script. In order, it:
#   1. parses the command line (printing usage and dying on error or -xml);
#   2. opens the status output and fail log;
#   3. selects the collection and reads its etc/collect.cfg, filling in any
#      option not given on the command line;
#   4. loads the collection's plugins;
#   5. optionally removes the old archives directory;
#   6. runs the plugins over the import directory, saving documents via
#      docsave (or printing them via docprint when -debug is set);
#   7. writes the archive information file and the import statistics.
#
# Dies (with a message, or a bare newline after usage has been printed) on
# bad arguments, an unknown collection, an unopenable output or fail log
# file, a missing collect.cfg, or when no plugins could be loaded.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
        $removeold, $gzip, $groupsize, $OIDtype, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $archive_info_filename, $statsfile,
        $archive_info, $processor, $out, $faillog, $collectdir);

    # Set when -xml is given: print the XML usage description and stop.
    my $xml = 0;

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    if (!parsargv::parse(\@ARGV,
                         'language/.*/', \$language,
                         'verbosity/\d+/', \$verbosity,
                         'importdir/.*/', \$importdir,
                         'archivedir/.*/', \$archivedir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'groupsize/\d+/1', \$groupsize,
                         'OIDtype/^(hash|incremental)$/', \$OIDtype,
                         'sortmeta/.*/', \$sortmeta,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out,
                         'statsfile/.*/STDERR', \$statsfile,
                         'faillog/.*/', \$faillog,
                         'xml', \$xml)) {
        &print_txt_usage($language);
        die "\n";
    }

    if ($xml) {
        &print_xml_usage($language);
        die "\n";
    }

    # Status output goes either to a standard handle or to a named file.
    # $out is kept as a handle *name* because it is passed on to the
    # plugin and docsave code in that form.
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
        # The match is case-insensitive but the handle names are not:
        # normalise so that e.g. "-out stderr" refers to the real STDERR.
        $out = uc($out);
    } else {
        # Three-argument open: the filename is never parsed for a mode.
        open (OUT, '>', $out) ||
            die &lookup_string("{common.cannot_open_output_file}") . " $out\n";
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # -keepold takes precedence over -removeold
    $removeold = 0 if ($keepold);

    # get and check the collection name. Only the first remaining argument
    # is the collection; passing @ARGV directly would let $collectdir slide
    # into the collection slot when no collection was given (and be
    # displaced when extra arguments were given).
    my $collection_arg = @ARGV ? $ARGV[0] : "";
    if (($collection = &util::use_collection($collection_arg, $collectdir)) eq "") {
        &print_txt_usage($language);
        die "\n";
    }

    # The fail log defaults to etc/fail.log inside the collection.
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    open (FAILLOG, '>', $faillog) ||
        die &lookup_string("{import.cannot_open_fail_log}") . " $faillog\n";
    my $faillogname = $faillog;   # remembered for the statistics report
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # treat a blank -sortmeta as "not given" so collect.cfg may supply it
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;

    # dynamically load the 'docsave' module so that a collection-specific
    # docsave.pm, if one exists, is picked up in preference to the default.
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins for this collection and set any options that
    # were specified in the collect.cfg (all import.pl options except
    # -collectdir, -out and -faillog may be specified in the collect.cfg (these
    # options must be known before we read the collect.cfg))
    my $plugins = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
        if (defined $collectcfg->{'plugin'}) {
            $plugins = $collectcfg->{'plugin'};
        }

        if ($verbosity !~ /\d+/) {
            if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                $verbosity = $collectcfg->{'verbosity'};
            } else {
                $verbosity = 2; # the default
            }
        }
        if (defined $collectcfg->{'importdir'} && $importdir eq "") {
            $importdir = $collectcfg->{'importdir'};
        }
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        if (defined $collectcfg->{'removeold'}) {
            if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
                $removeold = 1;
            }
            if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
                $removeold = 0;
            }
        }
        if (defined $collectcfg->{'keepold'}) {
            if ($collectcfg->{'keepold'} =~ /^false$/i && !$keepold) {
                $removeold = 1;
            }
        }
        if (defined $collectcfg->{'gzip'} && !$gzip) {
            if ($collectcfg->{'gzip'} =~ /^true$/i) {
                $gzip = 1;
            }
        }
        if ($maxdocs !~ /\-?\d+/) {
            if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                $maxdocs = $collectcfg->{'maxdocs'};
            } else {
                $maxdocs = -1; # the default
            }
        }
        # a command-line groupsize of 1 is indistinguishable from the
        # default, so in that case the collect.cfg value wins
        if ($groupsize == 1) {
            if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
                $groupsize = $collectcfg->{'groupsize'};
            }
        }
        if ($OIDtype !~ /^(hash|incremental)$/) {
            if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
                $OIDtype = $collectcfg->{'OIDtype'};
            } else {
                $OIDtype = "hash"; # the default
            }
        }
        if (defined $collectcfg->{'sortmeta'} && !defined $sortmeta) {
            $sortmeta = $collectcfg->{'sortmeta'};
        }
        if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
            $debug = 1;
        }

    } else {
        die &lookup_string("{import.cannot_find_cfg_file}") . " $configfilename\n";
    }

    # check sortmeta. This must happen after collect.cfg has been merged:
    # either sortmeta or groupsize may have come from the config file, and
    # sorting is not possible when documents are grouped.
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        print $out &lookup_string("{import.cannot_sort}") . "\n\n";
        $sortmeta = undef;
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog);
    if (scalar(@$pluginfo) == 0) {
        print $out &lookup_string("{import.no_plugins_loaded}") . "\n";
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
        print $out &lookup_string("{import.removing_archives}") . "\n";
        sleep(3); # just in case...
        &util::rm_r ($archivedir);
    }

    # choose the document processor: docsave writes to the archives
    # directory, docprint is used instead when -debug is set
    if (!$debug) {
        # read the archive information file
        $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new();
        $archive_info->load_info ($archive_info_filename);

        # create a docsave object to process the documents
        $processor = docsave->new($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
        $processor->setarchivedir ($archivedir);
        $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
        $processor->set_OIDtype ($OIDtype);
    } else {
        $processor = docprint->new();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs);

    &plugin::end($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info($archive_info_filename);
    }

    # write out import stats: to a standard handle, or to a named file,
    # falling back to STDERR when that file cannot be opened
    my $close_stats = 0;
    if ($statsfile =~ /^(STDERR|STDOUT)$/i) {
        # normalise case so the name refers to the real standard handle
        $statsfile = uc($statsfile);
    } elsif (open (STATS, '>', $statsfile)) {
        $statsfile = 'import::STATS';
        $close_stats = 1;
    } else {
        print $out &lookup_string("{import.cannot_open_stats_file}") . " $statsfile.";
        print $out &lookup_string("{import.stats_backup}") . "\n";
        $statsfile = 'STDERR';
    }

    print $out "\n";
    print $out "*********************************************\n";
    print $out &lookup_string("{import.complete}") . "\n";
    print $out "*********************************************\n";

    &plugin::write_stats($pluginfo, $statsfile, $faillogname);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}
Note: See TracBrowser for help on using the repository browser.