source: trunk/gsdl/bin/script/import.pl@ 13948

Last change on this file since 13948 was 13169, checked in by kjdon, 18 years ago

debug mode now passes debug flag to plugout rather than using docprint, which is no longer a docproc.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.7 KB
RevLine 
[1031]1#!/usr/bin/perl -w
[4]2
[538]3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
[4]29# This program will import a number of files into a particular collection
30
[1424]31package import;
32
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone perl library directories at the
# front of @INC so the "use" statements that follow can be resolved.
BEGIN {
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    # Each directory is unshifted in turn, so the last one listed here
    # ends up first in @INC.
    my @library_dirs = ("perllib",
                        "perllib/cpan",
                        "perllib/plugins",
                        "perllib/plugouts",
                        "perllib/classify");
    foreach my $library_dir (@library_dirs) {
        unshift (@INC, "$ENV{'GSDLHOME'}/$library_dir");
    }
}
42
43use arcinfo;
44use colcfg;
45use plugin;
[12335]46use plugout;
[12003]47use manifest;
[130]48use util;
[10417]49use scriptutil;
[1424]50use FileHandle;
[9970]51use gsprintf 'gsprintf';
[4776]52use printusage;
[10215]53use parse2;
[4]54
[12003]55
56
[10255]57use strict;
58no strict 'refs'; # allow filehandles to be variables and vice versa
59no strict 'subs'; # allow barewords (eg STDERR) as function arguments
60
# Permitted values for the -OIDtype option. Each entry pairs the value
# name with the resource-bundle key of its description.
my $oidtype_list = [
    map { +{ 'name' => $_, 'desc' => "{import.OIDtype." . $_ . "}" } }
        ("hash", "assigned", "incremental", "dirname")
];

# Permitted values for the -saveas option: the original GA format or
# the METS format.
my $saveas_list = [
    map { +{ 'name' => $_, 'desc' => "{import.saveas." . $_ . "}" } }
        ("GA", "METS")
];

77
78
# Command-line options understood by import.pl.
#
# Attributes an option entry may carry:
#   name:      the name of the argument
#   desc:      a description (or, more likely, a reference to a
#              description in the resource bundle) for this argument
#   type:      the type of control used to represent the argument, e.g.
#              string, int, flag, regexp, metadata, language, enum
#   list:      for enum arguments, the list of permitted values
#   deft:      the default value
#   range:     for int arguments, the permitted range
#   reqd:      is this argument required?
#   hiddengli: is this argument hidden in GLI?
#   modegli:   the lowest detail mode this argument is visible at in GLI
my $arguments = [
    { name      => "archivedir",
      desc      => "{import.archivedir}",
      type      => "string",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "collectdir",
      desc      => "{import.collectdir}",
      type      => "string",
      # the default is left as "" for the argument parser; it used to be
      #   &util::filename_cat ($ENV{'GSDLHOME'}, "collect")
      deft      => "",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "manifest",
      desc      => "{import.manifest}",
      type      => "string",
      deft      => "",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "debug",
      desc      => "{import.debug}",
      type      => "flag",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "faillog",
      desc      => "{import.faillog}",
      type      => "string",
      # the default is left as "" for the argument parser; it used to be
      #   &util::filename_cat("<collectdir>", "colname", "etc", "fail.log")
      deft      => "",
      reqd      => "no",
      modegli   => "4" },

    { name      => "importdir",
      desc      => "{import.importdir}",
      type      => "string",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "incremental",
      desc      => "{import.incremental}",
      type      => "flag",
      hiddengli => "yes" },

    { name      => "keepold",
      desc      => "{import.keepold}",
      type      => "flag",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "removeold",
      desc      => "{import.removeold}",
      type      => "flag",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "language",
      desc      => "{scripts.language}",
      type      => "string",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "maxdocs",
      desc      => "{import.maxdocs}",
      type      => "int",
      reqd      => "no",
      # no default is declared here (it used to be "-1")
      range     => "1,",
      modegli   => "1" },

    { name      => "OIDtype",
      desc      => "{import.OIDtype}",
      type      => "enum",
      list      => $oidtype_list,
      # no default is declared here (it used to be "hash")
      reqd      => "no",
      modegli   => "2" },

    { name      => "OIDmetadata",
      desc      => "{import.OIDmetadata}",
      type      => "metadata",
      deft      => "dc.Identifier",
      reqd      => "no",
      modegli   => "2" },

    { name      => "out",
      desc      => "{import.out}",
      type      => "string",
      deft      => "STDERR",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "saveas",
      desc      => "{import.saveas}",
      type      => "enum",
      list      => $saveas_list,
      deft      => "GA",
      reqd      => "no",
      modegli   => "3" },

    { name      => "sortmeta",
      desc      => "{import.sortmeta}",
      type      => "metadata",    # previously "string"
      reqd      => "no",
      modegli   => "3" },

    { name      => "removeprefix",
      desc      => "{BasClas.removeprefix}",
      type      => "regexp",
      deft      => "",
      reqd      => "no",
      modegli   => "3" },

    { name      => "removesuffix",
      desc      => "{BasClas.removesuffix}",
      type      => "regexp",
      deft      => "",
      reqd      => "no",
      modegli   => "3" },

    { name      => "groupsize",
      desc      => "{import.groupsize}",
      type      => "int",
      deft      => "1",
      reqd      => "no",
      modegli   => "3" },

    { name      => "gzip",
      desc      => "{import.gzip}",
      type      => "flag",
      reqd      => "no",
      modegli   => "4" },

    { name      => "statsfile",
      desc      => "{import.statsfile}",
      type      => "string",
      deft      => "STDERR",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "verbosity",
      desc      => "{import.verbosity}",
      type      => "int",
      range     => "0,",
      # no default is declared here (it used to be "2")
      reqd      => "no",
      modegli   => "4" },

    { name      => "gli",
      desc      => "",
      type      => "flag",
      reqd      => "no",
      hiddengli => "yes" },

    { name      => "xml",
      desc      => "{scripts.xml}",
      type      => "flag",
      reqd      => "no",
      hiddengli => "yes" },
];
[4776]232
# Top-level description of this script (its name, the resource-bundle
# key of its description and its option table), as handed to the
# PrintUsage routines.
my $options = {
    'name' => "import.pl",
    'desc' => "{import.desc}",
    'args' => $arguments,
};


# Run the import.
&main();
[4]239
# main --
# Entry point of import.pl. In order, it:
#   1. parses the command line against $arguments and copies each parsed
#      option into the local variable of the same name;
#   2. prints usage and stops if the options are bad, if -h was given, or
#      if there is not exactly one leftover argument (the collection name);
#      with -xml it prints the XML usage and returns instead;
#   3. opens the output and fail-log handles;
#   4. reads the collection's etc/collect.cfg and uses it to fill in any
#      option that was not given on the command line;
#   5. loads the plugins and the plugout, then runs the plugins over the
#      import directory, or over just the files named in the manifest;
#   6. saves the archive information file and writes the import statistics.
# Takes no arguments and returns nothing useful; fatal problems are
# reported and then raised with die.
sub main {
    # NOTE: the names of these variables matter. The string eval below
    # assigns each parsed option to the lexical variable that has the
    # same name as the option, so they must not be renamed or removed.
    my ($verbosity, $importdir, $archivedir, $manifest, $incremental, $keepold,
        $removeold, $saveas, $version,
        $gzip, $groupsize, $OIDtype, $OIDmetadata, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $removeprefix, $removesuffix,
        $archive_info_filename, $statsfile,
        $archive_info, $processor, $out, $faillog, $collectdir, $gli, $language);

    my $xml = 0;

    my $service = "import";

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV, $arguments, $hashParsingResult, "allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Copy every parsed option into the local variable of the same name.
    # NOTE(review): this is a string eval; a key with no matching variable
    # makes the eval fail and the failure is not checked here, so such
    # keys are silently skipped.
    foreach my $strVariable (keys %$hashParsingResult) {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        # explicit empty argument list, rather than implicitly passing @_
        &gsprintf::output_strings_in_UTF8();
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Anything other than STDERR/STDOUT is treated as the name of a file
    # to send output to; $out then holds the name of the opened handle.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        unless (open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out);
            die;
        }
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    unless (open (FAILLOG, '>', $faillog)) {
        &gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }

    # keep the file name for the stats report; $faillog itself becomes
    # the name of the opened handle
    my $faillogname = $faillog;
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # check that there is a collect.cfg file
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (!-e $configfilename) {
        &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
        die;
    }
    $collectcfg = &colcfg::read_collect_cfg ($configfilename);

    # From here on, values given on the command line win; collect.cfg is
    # only consulted for options that were left unset.
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }

    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }
    # some global options for the plugins
    my @global_opts = ();

    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'manifest'} && $manifest eq "") {
        $manifest = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }

    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    # 1 is the declared default for -groupsize, so a value of 1 is
    # treated as "not set" and collect.cfg may override it
    if ($groupsize == 1) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        }
    }

    if ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        } else {
            $OIDtype = "hash"; # the default
        }
    }

    # -saveas takes the short names GA/METS, whereas collect.cfg names a
    # plugout (GAPlugout/METSPlugout); both forms are accepted further
    # down when the plugout name is worked out
    if ($saveas !~ /^(GA|METS)$/) {
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'}[0] =~ /^(GAPlugout|METSPlugout)$/) {
            $saveas = $collectcfg->{'plugout'}[0];
        } else {
            $saveas = "GAPlugout";
        }
    }

    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }

    if (defined $collectcfg->{'removeprefix'} && $removeprefix eq "") {
        $removeprefix = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $removesuffix eq "") {
        $removesuffix = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    # global plugin stuff
    if (defined $collectcfg->{'separate_cjk'} && $collectcfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # check keepold and removeold
    ($removeold, $keepold, $incremental) = &scriptutil::check_removeold_and_keepold($removeold, $keepold, $incremental, "archives", $collectcfg);

    $gli = 0 unless defined $gli;

    print STDERR "<Import>\n" if $gli;

    my $manifest_lookup = manifest->new();
    if ($manifest ne "") {
        my $manifest_filename = $manifest;

        # a name that does not start with a slash or backslash is taken
        # to be relative to the collection directory
        if ($manifest_filename !~ m/^[\\\/]/) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # NOTE(review): these two substitutions tidy $manifest, but it is
        # $manifest_filename that gets parsed; after this point $manifest
        # is only compared against "". Confirm which variable was meant.
        $manifest =~ s/[\\\/]+/\//g;
        $manifest =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }
    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file
    $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
    $archive_info = arcinfo->new();
    $archive_info->load_info ($archive_info_filename);

    #### Use Plugout ####
    # $saveas may be a short name (GA/METS) or already a full plugout
    # name; append "Plugout" only when it is not already there
    my $plugout_name;
    if ($saveas !~ /^(GA|METS)Plugout$/) {
        $plugout_name = $saveas . "Plugout";
    }
    else {
        $plugout_name = $saveas;
    }

    # options handed to the plugout
    my $opts = [];
    push @$opts, ("-output_info", $archive_info) if (defined $archive_info);

    push @$opts, ("-verbosity", $verbosity) if (defined $verbosity);
    push @$opts, ("-gzip_output") if ($gzip);
    push @$opts, ("-group_size", $groupsize) if (defined $groupsize);
    push @$opts, ("-output_handle", $out) if (defined $out);

    push @$opts, ("-debug") if ($debug);

    $processor = &plugout::load_plugout($plugout_name, $opts);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($manifest eq "") {
        # process the import directory
        &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs, 0, $gli);
    }
    else {
        # process any new files
        foreach my $file (keys %{$manifest_lookup->{'index'}}) {
            &plugin::read ($pluginfo, $importdir, $file, {}, $processor, $maxdocs, 0, $gli);
        }

        # record files marked for deletion in arcinfo
        # (not implemented yet: this loop currently does nothing)
        foreach my $file (keys %{$manifest_lookup->{'delete'}}) {
            # consider finding it?
            # $archive_info->add_info($OID,$doc_xml_file,"D");
        }
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # write out the archive information file
    $processor->close_file_output() if $groupsize > 1;
    $processor->close_group_output() if $processor->is_group();
    # should we still do this in debug mode??
    $archive_info->save_info($archive_info_filename);

    # write out import stats
    # If the stats file cannot be opened, say so and fall back to STDERR
    # rather than aborting the whole import.
    my $close_stats = 0;
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
        if (open (STATS, '>', $statsfile)) {
            $statsfile = 'import::STATS';
            $close_stats = 1;
        } else {
            &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
            &gsprintf($out, "{import.stats_backup}\n");
            $statsfile = 'STDERR';
        }
    }

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "{import.complete}\n");
    &gsprintf($out, "*********************************************\n");

    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}
Note: See TracBrowser for help on using the repository browser.