source: trunk/gsdl/bin/script/import.pl@ 14031

Last change on this file since 14031 was 14031, checked in by xiao, 17 years ago

Changes made to look for collectionConfig.xml in gs3 mode and collect.cfg in gs2 mode, rather than presumably only for the file collect.cfg.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.6 KB
Line 
#!/usr/bin/perl -w

###########################################################################
#
# import.pl --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# This program will import a number of files into a particular collection.

package import;

# Compile-time setup: refuse to start unless the Greenstone environment
# variables are present, then put the Greenstone Perl library directories
# at the front of @INC so the 'use' statements below can find them.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'}   or die "GSDLOS not set\n";

    my $perllib = "$ENV{'GSDLHOME'}/perllib";

    # Each directory is unshifted in turn, so the last one listed here
    # ends up first in @INC.
    foreach my $libdir ($perllib,
                        "$perllib/cpan",
                        "$perllib/plugins",
                        "$perllib/plugouts",
                        "$perllib/classify") {
        unshift (@INC, $libdir);
    }
}
42
43use arcinfo;
44use colcfg;
45use plugin;
46use plugout;
47use manifest;
48use util;
49use scriptutil;
50use FileHandle;
51use gsprintf 'gsprintf';
52use printusage;
53use parse2;
54
55
56
57use strict;
58no strict 'refs'; # allow filehandles to be variables and vice versa
59no strict 'subs'; # allow barewords (eg STDERR) as function arguments
60
# Permitted values for the -OIDtype option. Each entry pairs the value's
# name with its description string "{import.OIDtype.<name>}".
my $oidtype_list = [
    map { +{ 'name' => $_, 'desc' => '{import.OIDtype.' . $_ . '}' } }
        qw(hash assigned incremental dirname)
];

# Permitted values for the -saveas option: the original GA format or the
# METS format. Each entry pairs the value's name with "{import.saveas.<name>}".
my $saveas_list = [
    map { +{ 'name' => $_, 'desc' => '{import.saveas.' . $_ . '}' } }
        qw(GA METS)
];
77
78
# Command-line argument table for import.pl.
#
# Attributes an entry may carry:
#   name      - the name of the argument
#   desc      - a description (or, more likely, a reference to a description)
#   type      - the kind of control used to represent the argument, e.g.
#               string, int, flag, regexp, metadata, language, enum
#   deft      - the default value
#   range     - the permitted range for a numeric argument
#   list      - the table of permitted values for an enum argument
#   reqd      - is this argument required?
#   hiddengli - is this argument hidden in GLI?
#   modegli   - the lowest detail mode this argument is visible at in GLI
#
# Several entries deliberately carry an empty default (or none at all); the
# value that the earlier parsearg-based code used is kept in a comment.
my $arguments = [
    {   'name'      => 'archivedir',
        'desc'      => '{import.archivedir}',
        'type'      => 'string',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'collectdir',
        'desc'      => '{import.collectdir}',
        'type'      => 'string',
        # parsearg left "" as default; was:
        #   &util::filename_cat ($ENV{'GSDLHOME'}, "collect")
        'deft'      => '',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'manifest',
        'desc'      => '{import.manifest}',
        'type'      => 'string',
        'deft'      => '',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'debug',
        'desc'      => '{import.debug}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'faillog',
        'desc'      => '{import.faillog}',
        'type'      => 'string',
        # parsearg left "" as default; was:
        #   &util::filename_cat("<collectdir>", "colname", "etc", "fail.log")
        'deft'      => '',
        'reqd'      => 'no',
        'modegli'   => '4',
    },
    {   'name'      => 'importdir',
        'desc'      => '{import.importdir}',
        'type'      => 'string',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'incremental',
        'desc'      => '{import.incremental}',
        'type'      => 'flag',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'keepold',
        'desc'      => '{import.keepold}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'removeold',
        'desc'      => '{import.removeold}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'language',
        'desc'      => '{scripts.language}',
        'type'      => 'string',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'maxdocs',
        'desc'      => '{import.maxdocs}',
        'type'      => 'int',
        'reqd'      => 'no',
        # parsearg left "" as default; was "-1"
        'range'     => '1,',
        'modegli'   => '1',
    },
    {   'name'      => 'OIDtype',
        'desc'      => '{import.OIDtype}',
        'type'      => 'enum',
        'list'      => $oidtype_list,
        # parsearg left "" as default; was "hash"
        'reqd'      => 'no',
        'modegli'   => '2',
    },
    {   'name'      => 'OIDmetadata',
        'desc'      => '{import.OIDmetadata}',
        'type'      => 'metadata',
        'deft'      => 'dc.Identifier',
        'reqd'      => 'no',
        'modegli'   => '2',
    },
    {   'name'      => 'out',
        'desc'      => '{import.out}',
        'type'      => 'string',
        'deft'      => 'STDERR',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'saveas',
        'desc'      => '{import.saveas}',
        'type'      => 'enum',
        'list'      => $saveas_list,
        'deft'      => 'GA',
        'reqd'      => 'no',
        'modegli'   => '3',
    },
    {   'name'      => 'sortmeta',
        'desc'      => '{import.sortmeta}',
        'type'      => 'metadata',   # previously "string"
        'reqd'      => 'no',
        'modegli'   => '3',
    },
    {   'name'      => 'removeprefix',
        'desc'      => '{BasClas.removeprefix}',
        'type'      => 'regexp',
        'deft'      => '',
        'reqd'      => 'no',
        'modegli'   => '3',
    },
    {   'name'      => 'removesuffix',
        'desc'      => '{BasClas.removesuffix}',
        'type'      => 'regexp',
        'deft'      => '',
        'reqd'      => 'no',
        'modegli'   => '3',
    },
    {   'name'      => 'groupsize',
        'desc'      => '{import.groupsize}',
        'type'      => 'int',
        'deft'      => '1',
        'reqd'      => 'no',
        'modegli'   => '3',
    },
    {   'name'      => 'gzip',
        'desc'      => '{import.gzip}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'modegli'   => '4',
    },
    {   'name'      => 'statsfile',
        'desc'      => '{import.statsfile}',
        'type'      => 'string',
        'deft'      => 'STDERR',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'verbosity',
        'desc'      => '{import.verbosity}',
        'type'      => 'int',
        'range'     => '0,',
        # parsearg left "" as default; was "2"
        'reqd'      => 'no',
        'modegli'   => '4',
    },
    {   'name'      => 'gli',
        'desc'      => '',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'xml',
        'desc'      => '{scripts.xml}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
];

# Top-level description of this script, handed to the usage printers.
my $options = {
    'name' => 'import.pl',
    'desc' => '{import.desc}',
    'args' => $arguments,
};
236
237
&main();

# main -- entry point for import.pl.
#
# Takes no arguments; reads its options from @ARGV.  In outline it:
#   1. parses @ARGV against the $arguments table and copies every parsed
#      option into the lexical variable of the same name;
#   2. prints usage (text or XML) and stops when asked to, or when the
#      single expected leftover argument (the collection name) is missing;
#   3. opens the output handle and the fail log;
#   4. reads etc/collect.cfg or, failing that, etc/collectionConfig.xml and
#      uses it to fill in any option not given on the command line;
#   5. loads the plugins and the plugout, clears old archives if requested,
#      and runs the plugins over the import directory or, when a manifest
#      is given, over the files the manifest lists;
#   6. saves the archive information file and writes the import statistics.
#
# Dies (after printing a message) on bad arguments, on an unopenable output
# file or fail log, on a missing configuration file or import directory,
# and when no plugins could be loaded.
sub main {
    # NOTE: these lexicals are assigned *by name* by the string eval below,
    # so every option in $arguments needs a matching variable here.  Do not
    # remove one just because no later statement mentions it.
    my ($verbosity, $importdir, $archivedir, $manifest, $incremental, $keepold,
        $removeold, $saveas, $version,
        $gzip, $groupsize, $OIDtype, $OIDmetadata, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $removeprefix, $removesuffix,
        $archive_info_filename, $statsfile,
        $archive_info, $processor, $out, $faillog, $collectdir, $gli, $language);

    my $xml = 0;

    my $service = "import";

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Copy each parsed option into the lexical of the same name.  A key with
    # no matching lexical makes the eval fail under 'use strict'; that
    # failure is ignored here, as it always has been.
    foreach my $strVariable (keys %$hashParsingResult) {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        # Use import's own parameter string, like the other usage calls in
        # this script (this one used to print buildcol's).
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Anything other than STDERR/STDOUT names a file to send output to.
    # The handle is then referred to by its package-qualified name.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # Three-argument open: the file name can never alter the open mode.
        unless (open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out);
            # die regardless of what gsprintf returned
            die;
        }
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    unless (open (FAILLOG, '>', $faillog)) {
        &gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }

    # Remember the file name for the stats report; from here on $faillog
    # is the name of the open handle.
    my $faillogname = $faillog;
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # check that there is a collect.cfg file, i.e. it is gs2.
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
    }
    else {
        # check that there is a collectionConfig.xml file, i.e. it is gs3.
        $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collectionConfig.xml");
        if (!-e $configfilename) {
            &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
            # die regardless of what gsprintf returned
            die;
        }
        $collectcfg = &colcfg::read_collection_cfg_xml ($configfilename);
    }

    # Command-line values win; the collection configuration only fills in
    # options that were left empty.
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }

    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }
    # some global options for the plugins
    my @global_opts = ();

    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        }
        else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'manifest'} && $manifest eq "") {
        $manifest = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }

    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        }
        else {
            $maxdocs = -1; # the default
        }
    }
    # A groupsize of 1 is the argument default, so treat it as "not set".
    if ($groupsize == 1) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        }
    }

    if ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        }
        else {
            $OIDtype = "hash"; # the default
        }
    }

    # When -saveas is not one of the short names, take the full plugout
    # name from the configuration, or fall back to GAPlugout.
    if ($saveas !~ /^(GA|METS)$/) {
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'}[0] =~ /^(GAPlugout|METSPlugout)$/) {
            $saveas = $collectcfg->{'plugout'}[0];
        }
        else {
            $saveas = "GAPlugout";
        }
    }

    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }

    if (defined $collectcfg->{'removeprefix'} && $removeprefix eq "") {
        $removeprefix = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $removesuffix eq "") {
        $removesuffix = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    # global plugin stuff
    if (defined $collectcfg->{'separate_cjk'} && $collectcfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # check keepold and removeold
    ($removeold, $keepold, $incremental) = &scriptutil::check_removeold_and_keepold($removeold, $keepold, $incremental, "archives", $collectcfg);

    $gli = 0 unless defined $gli;

    print STDERR "<Import>\n" if $gli;

    my $manifest_lookup = manifest->new();
    if ($manifest ne "") {
        my $manifest_filename = $manifest;

        # A name that does not start with a slash or backslash is taken
        # relative to the collection directory.
        if ($manifest_filename !~ m/^[\\\/]/) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # NOTE(review): this normalises $manifest, not $manifest_filename,
        # which is what actually gets parsed -- confirm that is intended.
        $manifest =~ s/[\\\/]+/\//g;
        $manifest =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }
    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file
    $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
    $archive_info = arcinfo->new();
    $archive_info->load_info ($archive_info_filename);

    #### Use Plugout ####
    # $saveas may hold either the short name (GA, METS) or the full
    # plugout name; make sure we end up with the latter.
    my $plugout_name;
    if ($saveas !~ /^(GA|METS)Plugout$/) {
        $plugout_name = $saveas . "Plugout";
    }
    else {
        $plugout_name = $saveas;
    }

    my $opts = [];
    push @$opts, ("-output_info", $archive_info) if (defined $archive_info);

    push @$opts, ("-verbosity", $verbosity) if (defined $verbosity);
    push @$opts, ("-gzip_output") if ($gzip);
    push @$opts, ("-group_size", $groupsize) if (defined $groupsize);
    push @$opts, ("-output_handle", $out) if (defined $out);

    push @$opts, ("-debug") if ($debug);

    $processor = &plugout::load_plugout($plugout_name, $opts);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($manifest eq "") {
        # process the import directory
        &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs, 0, $gli);
    }
    else {
        # process any new files
        foreach my $file (keys %{$manifest_lookup->{'index'}}) {
            &plugin::read ($pluginfo, $importdir, $file, {}, $processor, $maxdocs, 0, $gli);
        }

        # record files marked for deletion in arcinfo
        # TODO: not implemented yet -- the loop body is intentionally empty.
        foreach my $file (keys %{$manifest_lookup->{'delete'}}) {
            # consider finding it?
            # $archive_info->add_info($OID,$doc_xml_file,"D");
        }
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # write out the archive information file
    $processor->close_file_output() if $groupsize > 1;
    $processor->close_group_output() if $processor->is_group();
    # should we still do this in debug mode??
    $archive_info->save_info($archive_info_filename);

    # write out import stats
    # If the stats file cannot be opened, report it and fall back to STDERR.
    my $close_stats = 0;
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
        if (open (STATS, '>', $statsfile)) {
            $statsfile = 'import::STATS';
            $close_stats = 1;
        }
        else {
            &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
            &gsprintf($out, "{import.stats_backup}\n");
            $statsfile = 'STDERR';
        }
    }

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "{import.complete}\n");
    &gsprintf($out, "*********************************************\n");

    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}
Note: See TracBrowser for help on using the repository browser.