source: trunk/gsdl/bin/script/import.pl@ 12373

Last change on this file since 12373 was 12370, checked in by kjdon, 18 years ago

now create the archives directory here rather than expecting plugouts to create it. If no docs were output, then the directory wasn't created and arcinfo crapped out cos it couldn't save

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
# Compile-time setup: insist that the Greenstone environment variables are
# present, then put the Greenstone perl library directories at the front of
# the module search path so the 'use' lines that follow can be resolved.
BEGIN {
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    # Each directory is unshifted in turn, so the last one listed here
    # ends up first in @INC.
    my $perllib = "$ENV{'GSDLHOME'}/perllib";
    foreach my $subdir ("", "/cpan", "/plugins", "/plugouts", "/classify") {
        unshift (@INC, $perllib . $subdir);
    }
}
42
43use arcinfo;
44use colcfg;
45use plugin;
46use plugout;
47use docprint;
48use manifest;
49use util;
50use scriptutil;
51use FileHandle;
52use gsprintf 'gsprintf';
53use printusage;
54use parse2;
55
56
57
58use strict;
59no strict 'refs'; # allow filehandles to be variables and vice versa
60no strict 'subs'; # allow barewords (eg STDERR) as function arguments
61
# Values accepted by the -OIDtype option.  Every entry pairs the option
# value ('name') with a '{import.OIDtype.<name>}' description reference.
my $oidtype_list = [
    map { +{ 'name' => $_, 'desc' => "{import.OIDtype.$_}" } }
        qw(hash incremental assigned dirname)
];
71
# Values accepted by the -saveas option: the original GA format or the
# METS format.  Every entry pairs the option value ('name') with a
# '{import.saveas.<name>}' description reference.
my $saveas_list = [
    map { +{ 'name' => $_, 'desc' => "{import.saveas.$_}" } }
        qw(GA METS)
];
78
79
# Command line arguments understood by import.pl, one hash per argument.
#
# Attributes that may appear in an entry:
#   name      - the name of the argument
#   desc      - a description (or, more likely, a reference to a
#               description) for this argument
#   type      - the kind of control used to represent the argument, e.g.
#               string, int, flag, regexp, metadata, language, enum
#   deft      - default value
#   range     - permitted range for numeric arguments
#   list      - the permitted values of an enum argument
#   reqd      - is this argument required?
#   hiddengli - is this argument hidden in GLI?
#   modegli   - the lowest detail mode this argument is visible at in GLI
#
# Several arguments deliberately have an empty default (or none at all);
# the value that eventually applies is noted beside each of them.
my $arguments = [
    { 'name'      => 'archivedir',
      'type'      => 'string',
      'desc'      => '{import.archivedir}',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    # default left as "": was &util::filename_cat ($ENV{'GSDLHOME'}, "collect")
    { 'name'      => 'collectdir',
      'type'      => 'string',
      'desc'      => '{import.collectdir}',
      'deft'      => '',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    { 'name'      => 'manifest',
      'type'      => 'string',
      'desc'      => '{import.manifest}',
      'deft'      => '',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    { 'name'      => 'debug',
      'type'      => 'flag',
      'desc'      => '{import.debug}',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    # default left as "": was
    # &util::filename_cat("<collectdir>", "colname", "etc", "fail.log")
    { 'name'    => 'faillog',
      'type'    => 'string',
      'desc'    => '{import.faillog}',
      'deft'    => '',
      'reqd'    => 'no',
      'modegli' => '4' },

    { 'name'    => 'groupsize',
      'type'    => 'int',
      'desc'    => '{import.groupsize}',
      'deft'    => '1',
      'reqd'    => 'no',
      'modegli' => '3' },

    { 'name'    => 'gzip',
      'type'    => 'flag',
      'desc'    => '{import.gzip}',
      'reqd'    => 'no',
      'modegli' => '4' },

    { 'name'      => 'importdir',
      'type'      => 'string',
      'desc'      => '{import.importdir}',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    { 'name'    => 'keepold',
      'type'    => 'flag',
      'desc'    => '{import.keepold}',
      'reqd'    => 'no',
      'modegli' => '3' },

    { 'name'    => 'removeold',
      'type'    => 'flag',
      'desc'    => '{import.removeold}',
      'reqd'    => 'no',
      'modegli' => '3' },

    { 'name'    => 'language',
      'type'    => 'string',
      'desc'    => '{scripts.language}',
      'reqd'    => 'no',
      'modegli' => '4' },

    # no default declared here: was "-1"
    { 'name'    => 'maxdocs',
      'type'    => 'int',
      'desc'    => '{import.maxdocs}',
      'range'   => '1,',
      'reqd'    => 'no',
      'modegli' => '1' },

    # no default declared here: was "hash"
    { 'name'    => 'OIDtype',
      'type'    => 'enum',
      'desc'    => '{import.OIDtype}',
      'list'    => $oidtype_list,
      'reqd'    => 'no',
      'modegli' => '3' },

    { 'name'    => 'OIDmetadata',
      'type'    => 'metadata',
      'desc'    => '{import.OIDmetadata}',
      'deft'    => 'dc.Identifier',
      'reqd'    => 'no',
      'modegli' => '3' },

    { 'name'      => 'out',
      'type'      => 'string',
      'desc'      => '{import.out}',
      'deft'      => 'STDERR',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    { 'name'    => 'saveas',
      'type'    => 'enum',
      'desc'    => '{import.saveas}',
      'list'    => $saveas_list,
      'deft'    => 'GA',
      'reqd'    => 'no',
      'modegli' => '3' },

    # type was previously "string"
    { 'name'    => 'sortmeta',
      'type'    => 'metadata',
      'desc'    => '{import.sortmeta}',
      'reqd'    => 'no',
      'modegli' => '3' },

    { 'name'    => 'removeprefix',
      'type'    => 'regexp',
      'desc'    => '{BasClas.removeprefix}',
      'deft'    => '',
      'reqd'    => 'no',
      'modegli' => '3' },

    { 'name'    => 'removesuffix',
      'type'    => 'regexp',
      'desc'    => '{BasClas.removesuffix}',
      'deft'    => '',
      'reqd'    => 'no',
      'modegli' => '3' },

    { 'name'      => 'statsfile',
      'type'      => 'string',
      'desc'      => '{import.statsfile}',
      'deft'      => 'STDERR',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    # no default declared here: was "2"
    { 'name'    => 'verbosity',
      'type'    => 'int',
      'desc'    => '{import.verbosity}',
      'range'   => '0,',
      'reqd'    => 'no',
      'modegli' => '4' },

    { 'name'      => 'gli',
      'type'      => 'flag',
      'desc'      => '',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    { 'name'      => 'xml',
      'type'      => 'flag',
      'desc'      => '{scripts.xml}',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },
];
229
# Top-level description of this script (its name, a description reference
# and its argument table), handed to the PrintUsage routines.
my $options = {
    'name' => 'import.pl',
    'desc' => '{import.desc}',
    'args' => $arguments,
};
233
234
&main();

# main -- entry point of import.pl.
#
# Steps, in order:
#   1. parse the command line into local option variables;
#   2. handle -language, -xml (print usage as XML and return) and -gli;
#   3. open the output stream and the fail log;
#   4. read the collection's etc/collect.cfg and let it supply any option
#      the command line left unset;
#   5. load the plugins, optionally clear out old archives, and make sure
#      the archives directory exists;
#   6. run the plugins over the import directory, or over the files named
#      in the manifest;
#   7. save the archive information file (skipped with -debug) and write
#      the import statistics.
#
# Takes no parameters (the command line is read from @ARGV) and returns
# nothing useful.  Dies on a bad command line, an unknown collection, an
# unopenable output or fail-log file, a missing collect.cfg, or when no
# plugins could be loaded.
sub main {
    my ($verbosity, $importdir, $archivedir, $manifest, $keepold,
        $removeold, $saveas,
        $gzip, $groupsize, $OIDtype, $OIDmetadata, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $removeprefix, $removesuffix,
        $archive_info_filename, $statsfile,
        $archive_info, $processor, $out, $faillog, $collectdir, $gli, $language);

    my $xml = 0;

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");
    # If more than one argument is left after parsing, the user supplied
    # too many arguments.
    if ($intArgLeftinAfterParsing > 1) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Copy each parsed option into the local variable of the same name.
    # An explicit table of references is used instead of a string eval;
    # result keys that do not name one of these options are ignored.
    my %option_target = (
        'archivedir'   => \$archivedir,
        'collectdir'   => \$collectdir,
        'manifest'     => \$manifest,
        'debug'        => \$debug,
        'faillog'      => \$faillog,
        'groupsize'    => \$groupsize,
        'gzip'         => \$gzip,
        'importdir'    => \$importdir,
        'keepold'      => \$keepold,
        'removeold'    => \$removeold,
        'language'     => \$language,
        'maxdocs'      => \$maxdocs,
        'OIDtype'      => \$OIDtype,
        'OIDmetadata'  => \$OIDmetadata,
        'out'          => \$out,
        'saveas'       => \$saveas,
        'sortmeta'     => \$sortmeta,
        'removeprefix' => \$removeprefix,
        'removesuffix' => \$removesuffix,
        'statsfile'    => \$statsfile,
        'verbosity'    => \$verbosity,
        'gli'          => \$gli,
        'xml'          => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult) {
        my $target = $option_target{$strVariable};
        next unless defined $target;
        $$target = $hashParsingResult->{$strVariable};
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    # $out is used from here on as the NAME of a filehandle.
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
        # The match is case-insensitive, but only the upper-case names
        # resolve to the standard handles from inside this package.
        $out = uc($out);
    }
    else {
        # Three-argument open: the file name is never parsed for a mode.
        unless (open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        }
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }
    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    unless (open (FAILLOG, '>', $faillog)) {
        &gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }

    # Keep the file name for the statistics report; $faillog itself becomes
    # the name of the open handle.
    my $faillogname = $faillog;
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # check sortmeta
    # NOTE(review): this check runs before collect.cfg is read, so a
    # sortmeta or groupsize taken from collect.cfg is not checked here.
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }

    # get the list of plugins for this collection and set any options that
    # were specified in the collect.cfg (all import.pl options except
    # -collectdir, -out and -faillog may be specified in the collect.cfg (these
    # options must be known before we read the collect.cfg))
    my $plugins = [];
    my @global_opts = ();

    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (!-e $configfilename) {
        # Abort regardless of what gsprintf returns.
        &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
        die;
    }

    $collectcfg = &colcfg::read_collect_cfg ($configfilename);
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    # For each option below the command line value wins; collect.cfg is
    # consulted only when the command line left the option unset.
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        }
        else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'manifest'} && $manifest eq "") {
        $manifest = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        }
        else {
            $maxdocs = -1; # the default
        }
    }
    # A groupsize of 1 is the declared default, so it is treated as unset.
    if ($groupsize == 1) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        }
    }
    if ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        }
        else {
            $OIDtype = "hash"; # the default
        }
    }

    # Here $saveas may end up as a full plugout name (e.g. "GAPlugout")
    # rather than the short form; both are handled when the plugout is
    # loaded further down.
    if ($saveas !~ /^(GA|METS)$/) {
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'}[0] =~ /^(GAPlugout|METSPlugout)$/) {
            $saveas = $collectcfg->{'plugout'}[0];
        }
        else {
            $saveas ="GAPlugout";
        }
    }

    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }

    if (defined $collectcfg->{'removeprefix'} && $removeprefix eq "") {
        $removeprefix = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $removesuffix eq "") {
        $removesuffix = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    # global plugin stuff
    if (defined $collectcfg->{'separate_cjk'} && $collectcfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # check keepold and removeold
    ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($removeold, $keepold, "archives", $collectcfg);

    $gli = 0 unless defined $gli;

    print STDERR "<Import>\n" if $gli;

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    my $manifest_lookup = manifest->new();
    if ($manifest ne "") {
        my $manifest_filename = $manifest;

        # A name that does not start with a slash or backslash is taken to
        # be relative to the collection directory.
        if ($manifest_filename !~ m/^[\\\/]/) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # NOTE(review): these substitutions act on $manifest, not on the
        # $manifest_filename that is parsed below -- possibly unintended.
        # Left as is to preserve behaviour; confirm before changing.
        $manifest =~ s/[\\\/]+/\//g;
        $manifest =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }
    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file
    if (!$debug) {
        $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new();
        $archive_info->load_info ($archive_info_filename);

        #### Use Plugout ####
        # Accept either the short form ("GA") or the full plugout name
        # ("GAPlugout") in $saveas.
        my $plugout_name;
        if ($saveas !~ /^(GA|METS)Plugout$/) {
            $plugout_name = $saveas."Plugout";
        }
        else {
            $plugout_name = $saveas;
        }

        my $opts = [];

        push @$opts,("-output_info",$archive_info) if (defined $archive_info);

        push @$opts,("-verbosity",$verbosity) if (defined $verbosity);
        push @$opts,("-gzip_output",$gzip) if (defined $gzip);
        push @$opts,("-group_size",$groupsize) if (defined $groupsize);
        push @$opts,("-output_handle",$out) if (defined $out);

        $processor = &plugout::load_plugout($plugout_name,$opts);
        $processor->setoutputdir ($archivedir);
        $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
        $processor->set_OIDtype ($OIDtype);
    }
    else {
        # With -debug a docprint object is used as the processor and no
        # archive information is loaded or saved.
        $processor = docprint->new();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($manifest eq "") {
        # process the import directory
        &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs, 0, $gli);
    }
    else {
        # process any new files
        foreach my $file (keys %{$manifest_lookup->{'index'}}) {
            &plugin::read ($pluginfo, $importdir, $file, {}, $processor, $maxdocs, 0, $gli);
        }

        # record files marked for deletion in arcinfo
        foreach my $file (keys %{$manifest_lookup->{'delete'}}) {
            # Not implemented yet:
            # consider finding it?
            # $archive_info->add_info($OID,$doc_xml_file,"D");
        }
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $processor->close_group_output() if $processor->is_group();
        $archive_info->save_info($archive_info_filename);
    }

    # write out import stats
    # $statsfile, like $out, ends up holding the NAME of a filehandle.
    my $close_stats = 0;
    if ($statsfile =~ /^(STDERR|STDOUT)$/i) {
        # See the note on $out above: only the upper-case names resolve.
        $statsfile = uc($statsfile);
    }
    elsif (open (STATS, '>', $statsfile)) {
        $statsfile = 'import::STATS';
        $close_stats = 1;
    }
    else {
        # Could not open the requested file: report it and fall back to
        # STDERR.
        &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
        &gsprintf($out, "{import.stats_backup}\n");
        $statsfile = 'STDERR';
    }

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "{import.complete}\n");
    &gsprintf($out, "*********************************************\n");

    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}
Note: See TracBrowser for help on using the repository browser.