source: trunk/gsdl/bin/script/import.pl@ 13169

Last change on this file since 13169 was 13169, checked in by kjdon, 18 years ago

debug mode now passes debug flag to plugout rather than using docprint, which is no longer a docproc.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
BEGIN {
    # Refuse to compile unless both Greenstone environment variables exist.
    foreach my $required ('GSDLHOME', 'GSDLOS') {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # Prepend the Greenstone library directories to @INC one at a time.
    # Because each unshift goes to the front, the directory listed last
    # here ends up being searched first.
    my $perllib = "$ENV{'GSDLHOME'}/perllib";
    foreach my $dir ($perllib,
                     map { "$perllib/$_" } qw(cpan plugins plugouts classify)) {
        unshift(@INC, $dir);
    }
}
42
43use arcinfo;
44use colcfg;
45use plugin;
46use plugout;
47use manifest;
48use util;
49use scriptutil;
50use FileHandle;
51use gsprintf 'gsprintf';
52use printusage;
53use parse2;
54
55
56
57use strict;
58no strict 'refs'; # allow filehandles to be variables and vice versa
59no strict 'subs'; # allow barewords (eg STDERR) as function arguments
60
# Values accepted by the -OIDtype option. Each entry pairs the value's
# name with a reference to its description string.
my $oidtype_list = [
    map { +{ 'name' => $_, 'desc' => "{import.OIDtype.$_}" } }
        qw(hash assigned incremental dirname)
];

# Values accepted by the -saveas option: the original GA format or METS.
my $saveas_list = [
    map { +{ 'name' => $_, 'desc' => "{import.saveas.$_}" } }
        qw(GA METS)
];
77
78
# Attributes that may appear on each argument description below:
#   name      - the name of the argument
#   desc      - a description (or, more likely, a reference to one)
#   type      - the kind of control used to represent the argument, e.g.
#               string, int, flag, regexp, metadata, language, enum
#   deft      - the default value
#   range     - the permitted range for int arguments
#   list      - the permitted values for enum arguments
#   reqd      - is this argument required?
#   hiddengli - is this argument hidden in GLI?
#   modegli   - the lowest detail mode this argument is visible at in GLI

my $arguments = [
    { name => 'archivedir', desc => '{import.archivedir}', type => 'string',
      reqd => 'no', hiddengli => 'yes' },

    # parsearg left "" as default
    #   deft => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
    { name => 'collectdir', desc => '{import.collectdir}', type => 'string',
      deft => '',
      reqd => 'no', hiddengli => 'yes' },

    { name => 'manifest', desc => '{import.manifest}', type => 'string',
      deft => '',
      reqd => 'no', hiddengli => 'yes' },

    { name => 'debug', desc => '{import.debug}', type => 'flag',
      reqd => 'no', hiddengli => 'yes' },

    # parsearg left "" as default
    #   deft => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
    { name => 'faillog', desc => '{import.faillog}', type => 'string',
      deft => '',
      reqd => 'no', modegli => '4' },

    { name => 'importdir', desc => '{import.importdir}', type => 'string',
      reqd => 'no', hiddengli => 'yes' },

    # (this entry carries no 'reqd' attribute)
    { name => 'incremental', desc => '{import.incremental}', type => 'flag',
      hiddengli => 'yes' },

    { name => 'keepold', desc => '{import.keepold}', type => 'flag',
      reqd => 'no', hiddengli => 'yes' },

    { name => 'removeold', desc => '{import.removeold}', type => 'flag',
      reqd => 'no', hiddengli => 'yes' },

    { name => 'language', desc => '{scripts.language}', type => 'string',
      reqd => 'no', hiddengli => 'yes' },

    # parsearg left "" as default
    #   deft => "-1",
    { name => 'maxdocs', desc => '{import.maxdocs}', type => 'int',
      range => '1,',
      reqd => 'no', modegli => '1' },

    # parsearg left "" as default
    #   deft => "hash",
    { name => 'OIDtype', desc => '{import.OIDtype}', type => 'enum',
      list => $oidtype_list,
      reqd => 'no', modegli => '2' },

    { name => 'OIDmetadata', desc => '{import.OIDmetadata}', type => 'metadata',
      deft => 'dc.Identifier',
      reqd => 'no', modegli => '2' },

    { name => 'out', desc => '{import.out}', type => 'string',
      deft => 'STDERR',
      reqd => 'no', hiddengli => 'yes' },

    { name => 'saveas', desc => '{import.saveas}', type => 'enum',
      list => $saveas_list,
      deft => 'GA',
      reqd => 'no', modegli => '3' },

    # the type was previously "string"
    { name => 'sortmeta', desc => '{import.sortmeta}', type => 'metadata',
      reqd => 'no', modegli => '3' },

    { name => 'removeprefix', desc => '{BasClas.removeprefix}', type => 'regexp',
      deft => '',
      reqd => 'no', modegli => '3' },

    { name => 'removesuffix', desc => '{BasClas.removesuffix}', type => 'regexp',
      deft => '',
      reqd => 'no', modegli => '3' },

    { name => 'groupsize', desc => '{import.groupsize}', type => 'int',
      deft => '1',
      reqd => 'no', modegli => '3' },

    { name => 'gzip', desc => '{import.gzip}', type => 'flag',
      reqd => 'no', modegli => '4' },

    { name => 'statsfile', desc => '{import.statsfile}', type => 'string',
      deft => 'STDERR',
      reqd => 'no', hiddengli => 'yes' },

    # parsearg left "" as default
    #   deft => "2",
    { name => 'verbosity', desc => '{import.verbosity}', type => 'int',
      range => '0,',
      reqd => 'no', modegli => '4' },

    { name => 'gli', desc => '', type => 'flag',
      reqd => 'no', hiddengli => 'yes' },

    { name => 'xml', desc => '{scripts.xml}', type => 'flag',
      reqd => 'no', hiddengli => 'yes' },
];

# Top-level description of this script, handed to the usage printers.
my $options = {
    name => 'import.pl',
    desc => '{import.desc}',
    args => $arguments,
};
236
237
&main();

# main -- import the files of one collection into its archives directory.
#
# Takes no parameters; everything is read from @ARGV, %ENV and the
# collection's etc/collect.cfg. In order, it:
#   1. parses the command line against $arguments;
#   2. handles -language, -xml (print usage as XML and return) and -gli;
#   3. requires exactly one leftover argument (the collection name);
#   4. opens the output and fail-log handles;
#   5. fills unset options from collect.cfg, then from built-in defaults;
#   6. loads the plugins and the plugout, optionally clearing old archives;
#   7. reads the import directory (or only the files named in a manifest);
#   8. saves the archive information file and writes the import statistics.
#
# Dies (after printing usage or an error message) on bad arguments, when
# an output file or the fail log cannot be opened, when collect.cfg or the
# import directory is missing, or when no plugins could be loaded.
sub main {
    my ($verbosity, $importdir, $archivedir, $manifest, $incremental, $keepold,
        $removeold, $saveas,
        $gzip, $groupsize, $OIDtype, $OIDmetadata, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $removeprefix, $removesuffix,
        $archive_info_filename, $statsfile,
        $archive_info, $processor, $out, $faillog, $collectdir, $gli, $language);

    my $xml = 0;

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV, $arguments, $hashParsingResult, "allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Copy each parsed option into the lexical variable of the same name.
    # An explicit table replaces a string eval; keys that do not name one
    # of the declared arguments are ignored.
    my %variable_for = (
        'archivedir'   => \$archivedir,
        'collectdir'   => \$collectdir,
        'manifest'     => \$manifest,
        'debug'        => \$debug,
        'faillog'      => \$faillog,
        'importdir'    => \$importdir,
        'incremental'  => \$incremental,
        'keepold'      => \$keepold,
        'removeold'    => \$removeold,
        'language'     => \$language,
        'maxdocs'      => \$maxdocs,
        'OIDtype'      => \$OIDtype,
        'OIDmetadata'  => \$OIDmetadata,
        'out'          => \$out,
        'saveas'       => \$saveas,
        'sortmeta'     => \$sortmeta,
        'removeprefix' => \$removeprefix,
        'removesuffix' => \$removesuffix,
        'groupsize'    => \$groupsize,
        'gzip'         => \$gzip,
        'statsfile'    => \$statsfile,
        'verbosity'    => \$verbosity,
        'gli'          => \$gli,
        'xml'          => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult) {
        my $target = $variable_for{$strVariable};
        next unless defined $target;
        $$target = $hashParsingResult->{$strVariable};
    }

    # Options declared without a default may come back undefined; treat
    # them as "not supplied" so the string tests below behave the same
    # without tripping uninitialized-value warnings under -w.
    foreach my $optional (\$importdir, \$archivedir, \$verbosity, \$maxdocs, \$OIDtype) {
        $$optional = "" unless defined $$optional;
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        # explicit () so the caller's @_ is not silently forwarded
        &gsprintf::output_strings_in_UTF8();
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        # this is import.pl, so show the import parameters (the original
        # printed the buildcol ones here)
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # three-argument open: the filename can never be read as a mode
        unless (open(OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out);
            die;
        }
        $out = 'import::OUT';
        $close_out = 1;
    }
    else {
        # the match above is case-insensitive, but the value is used as a
        # handle name below, so normalise e.g. "stderr" to "STDERR"
        $out = uc($out);
    }
    $out->autoflush(1);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift(@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    unless (open(FAILLOG, '>', $faillog)) {
        &gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }

    # keep the file name for the statistics report; from here on $faillog
    # holds the name of the open handle
    my $faillogname = $faillog;
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # check that there is a collect.cfg file
    $configfilename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (!-e $configfilename) {
        # always stop here, whatever gsprintf returns
        &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
        die;
    }
    $collectcfg = &colcfg::read_collect_cfg($configfilename);

    # command-line values win; collect.cfg fills in whatever was left empty
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }

    $archivedir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }
    # some global options for the plugins
    my @global_opts = ();

    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        }
        else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'manifest'} && $manifest eq "") {
        $manifest = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }

    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        }
        else {
            $maxdocs = -1; # the default
        }
    }
    # a group size of 1 is the command-line default, so only then may
    # collect.cfg override it
    if ($groupsize == 1) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        }
    }

    if ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        }
        else {
            $OIDtype = "hash"; # the default
        }
    }

    if ($saveas !~ /^(GA|METS)$/) {
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'}[0] =~ /^(GAPlugout|METSPlugout)$/) {
            $saveas = $collectcfg->{'plugout'}[0];
        }
        else {
            $saveas = "GAPlugout";
        }
    }

    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }

    if (defined $collectcfg->{'removeprefix'} && $removeprefix eq "") {
        $removeprefix = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $removesuffix eq "") {
        $removesuffix = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    # global plugin stuff
    if (defined $collectcfg->{'separate_cjk'} && $collectcfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # check keepold and removeold
    ($removeold, $keepold, $incremental) = &scriptutil::check_removeold_and_keepold($removeold, $keepold, $incremental, "archives", $collectcfg);

    $gli = 0 unless defined $gli;

    print STDERR "<Import>\n" if $gli;

    my $manifest_lookup = manifest->new();
    if ($manifest ne "") {
        my $manifest_filename = $manifest;

        # a manifest name that does not start with a slash is taken to be
        # relative to the collection directory
        if ($manifest_filename !~ m/^[\\\/]/) {
            $manifest_filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # NOTE(review): this tidies $manifest, not $manifest_filename, and
        # $manifest is only tested for emptiness afterwards -- confirm
        # which variable was meant to be normalised.
        $manifest =~ s/[\\\/]+/\//g;
        $manifest =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    # load all the plugins
    $pluginfo = &plugin::load_plugins($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r($archivedir);
        }
        my $tmpdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r($tmpdir);
        }
    }
    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file
    $archive_info_filename = &util::filename_cat($archivedir, "archives.inf");
    $archive_info = arcinfo->new();
    $archive_info->load_info($archive_info_filename);

    #### Use Plugout ####
    # $saveas is either a short name (GA, METS) or already a plugout name
    my $plugout_name;
    if ($saveas !~ /^(GA|METS)Plugout$/) {
        $plugout_name = $saveas . "Plugout";
    }
    else {
        $plugout_name = $saveas;
    }

    my $opts = [];
    push @$opts, ("-output_info", $archive_info) if (defined $archive_info);

    push @$opts, ("-verbosity", $verbosity) if (defined $verbosity);
    push @$opts, ("-gzip_output") if ($gzip);
    push @$opts, ("-group_size", $groupsize) if (defined $groupsize);
    push @$opts, ("-output_handle", $out) if (defined $out);

    push @$opts, ("-debug") if ($debug);

    $processor = &plugout::load_plugout($plugout_name, $opts);
    $processor->setoutputdir($archivedir);
    $processor->set_sortmeta($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($manifest eq "") {
        # process the import directory
        &plugin::read($pluginfo, $importdir, "", {}, $processor, $maxdocs, 0, $gli);
    }
    else {
        # process any new files
        foreach my $file (keys %{$manifest_lookup->{'index'}}) {
            &plugin::read($pluginfo, $importdir, $file, {}, $processor, $maxdocs, 0, $gli);
        }

        # record files marked for deletion in arcinfo
        # (not implemented yet: the loop body is intentionally empty)
        foreach my $file (keys %{$manifest_lookup->{'delete'}}) {
            # consider finding it?
            # $archive_info->add_info($OID,$doc_xml_file,"D");
        }
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # write out the archive information file
    $processor->close_file_output() if $groupsize > 1;
    $processor->close_group_output() if $processor->is_group();
    # should we still do this in debug mode??
    $archive_info->save_info($archive_info_filename);

    # write out import stats
    my $close_stats = 0;
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
        if (open(STATS, '>', $statsfile)) {
            $statsfile = 'import::STATS';
            $close_stats = 1;
        }
        else {
            # fall back to STDERR rather than losing the statistics
            &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
            &gsprintf($out, "{import.stats_backup}\n");
            $statsfile = 'STDERR';
        }
    }
    else {
        # matched case-insensitively above; pass on the real handle name
        $statsfile = uc($statsfile);
    }

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "{import.complete}\n");
    &gsprintf($out, "*********************************************\n");

    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}
Note: See TracBrowser for help on using the repository browser.