source: main/tags/2.60/gsdl/bin/script/import.pl@ 25196

Last change on this file since 25196 was 9546, checked in by kjdon, 19 years ago

The saveas, removeprefix and removesuffix options can now be specified in the config file.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.2 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
package import;

# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone library directories at the front of
# @INC so the 'use' statements that follow can find their modules.
BEGIN {
    for my $required (qw(GSDLHOME GSDLOS)) {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # A single unshift, listed in the final search order (classify first,
    # perllib last), is equivalent to unshifting the four directories one
    # at a time in the reverse order.
    unshift @INC,
        map { "$ENV{'GSDLHOME'}/$_" }
            qw(perllib/classify perllib/plugins perllib/cpan perllib);
}
41
42use arcinfo;
43use colcfg;
44use plugin;
45use docprint;
46use util;
47use parsargv;
48use FileHandle;
49use gsprintf;
50use printusage;
51
# Allowed values for the -OIDtype option.  Each entry pairs the value's
# name with the resource-bundle key that holds its description.
my $oidtype_list = [
    map { +{ 'name' => $_, 'desc' => '{import.OIDtype.' . $_ . '}' } }
        qw(hash incremental assigned dirname)
];
61
# Allowed values for the -saveas option: the original GA format or the
# METS format.  Each entry pairs the value's name with the resource-bundle
# key that holds its description.
my $saveas_list = [
    map { +{ 'name' => $_, 'desc' => '{import.saveas.' . $_ . '}' } }
        qw(GA METS)
];
68
69
# Description of every option import.pl accepts, in the order they are
# listed here.
#
# Attributes used by the entries below:
#   name      - the name of the argument
#   desc      - a description (or, more likely, a reference to a
#               description) for this argument
#   type      - the type of control used to represent the argument, e.g.
#               string, int, flag, regexp, metadata, metadatum, language,
#               enum
#   deft      - the default value, where one is given
#   range     - the permitted range for an int argument, where given
#   list      - the list of permitted values for an enum argument
#   reqd      - is this argument required?
#   hiddengli - is this argument hidden in GLI?
#   modegli   - the lowest detail mode this argument is visible at in GLI
my $arguments = [
    {
        'name'      => 'archivedir',
        'desc'      => '{import.archivedir}',
        'type'      => 'string',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'collectdir',
        'desc'      => '{import.collectdir}',
        'type'      => 'string',
        'deft'      => &util::filename_cat($ENV{'GSDLHOME'}, 'collect'),
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'debug',
        'desc'      => '{import.debug}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {
        'name'    => 'faillog',
        'desc'    => '{import.faillog}',
        'type'    => 'string',
        'deft'    => &util::filename_cat('<collectdir>', 'colname', 'etc', 'fail.log'),
        'reqd'    => 'no',
        'modegli' => '4',
    },
    {
        'name'    => 'groupsize',
        'desc'    => '{import.groupsize}',
        'type'    => 'int',
        'deft'    => '1',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {
        'name'    => 'gzip',
        'desc'    => '{import.gzip}',
        'type'    => 'flag',
        'reqd'    => 'no',
        'modegli' => '4',
    },
    {
        'name'      => 'importdir',
        'desc'      => '{import.importdir}',
        'type'      => 'string',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'keepold',
        'desc'      => '{import.keepold}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {
        'name'    => 'language',
        'desc'    => '{scripts.language}',
        'type'    => 'string',
        'reqd'    => 'no',
        'modegli' => '4',
    },
    {
        'name'    => 'maxdocs',
        'desc'    => '{import.maxdocs}',
        'type'    => 'int',
        'reqd'    => 'no',
        'range'   => '1,',
        'modegli' => '1',
    },
    {
        'name'    => 'OIDtype',
        'desc'    => '{import.OIDtype}',
        'type'    => 'enum',
        'list'    => $oidtype_list,
        'deft'    => 'hash',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {
        'name'      => 'out',
        'desc'      => '{import.out}',
        'type'      => 'string',
        'deft'      => 'STDERR',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {
        'name'    => 'removeold',
        'desc'    => '{import.removeold}',
        'type'    => 'flag',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {
        'name'    => 'saveas',
        'desc'    => '{import.saveas}',
        'type'    => 'enum',
        'list'    => $saveas_list,
        'deft'    => 'GA',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {
        'name'    => 'sortmeta',
        'desc'    => '{import.sortmeta}',
        'type'    => 'metadatum',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {
        'name'    => 'removeprefix',
        'desc'    => '{BasClas.removeprefix}',
        'type'    => 'regexp',
        'deft'    => '',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {
        'name'    => 'removesuffix',
        'desc'    => '{BasClas.removesuffix}',
        'type'    => 'regexp',
        'deft'    => '',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {
        'name'      => 'statsfile',
        'desc'      => '{import.statsfile}',
        'type'      => 'string',
        'deft'      => 'STDERR',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {
        'name'    => 'verbosity',
        'desc'    => '{import.verbosity}',
        'type'    => 'int',
        'range'   => '0,3',
        'deft'    => '2',
        'reqd'    => 'no',
        'modegli' => '4',
    },
];
188
# Top-level description of this script, handed to the PrintUsage routines:
# the script name, its description key, and its argument table.
my $options = {
    'name' => 'import.pl',
    'desc' => '{import.desc}',
    'args' => $arguments,
};
192
# Package-local shorthand for gsprintf::gsprintf: passes its argument list
# through untouched and hands back whatever that routine returns.
sub gsprintf
{
    my $delegate = \&gsprintf::gsprintf;
    return $delegate->(@_);
}
197
198
# Script entry point: run the import with no arguments (options are read
# from @ARGV inside main).
main();
200
# main -- drive a complete import run for one collection.
#
# Steps, in order:
#   1. parse the command line with parsargv::parse (on failure the text
#      usage is printed and the script dies);
#   2. load a language-specific resource bundle if -language was given;
#      print the XML usage and return early if -xml was given;
#   3. open the -out stream and the fail log;
#   4. fill in every option that was not given on the command line from
#      the collection's etc/collect.cfg (the script dies if that file
#      does not exist);
#   5. load the plugins, remove the old archives directory if requested,
#      run the plugins over the import directory and save archives.inf
#      (in -debug mode a docprint processor is used and nothing is saved);
#   6. write the import statistics and close the handles opened here.
#
# Takes no arguments; all options come from @ARGV.
#
# The OUT, FAILLOG and STATS handles are deliberately package handles:
# they are handed to other routines by name ('import::OUT' etc.).
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
        $removeold, $saveas, $gzip, $groupsize, $OIDtype, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $removeprefix, $removesuffix,
        $archive_info_filename, $statsfile,
        $archive_info, $processor, $out, $faillog, $collectdir, $gli);

    # Declared here so it is no longer an accidental package global.
    my $language;

    my $xml     = 0;
    my $service = "import";

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    my $parsed_ok = parsargv::parse(
        \@ARGV,
        'language/.*/',                                   \$language,
        'verbosity/\d+/',                                 \$verbosity,
        'importdir/.*/',                                  \$importdir,
        'archivedir/.*/',                                 \$archivedir,
        'keepold',                                        \$keepold,
        'removeold',                                      \$removeold,
        'saveas/^(GA|METS)$/',                            \$saveas,
        'gzip',                                           \$gzip,
        'groupsize/\d+/1',                                \$groupsize,
        'OIDtype/^(hash|incremental|assigned|dirname)$/', \$OIDtype,
        'sortmeta/.*/',                                   \$sortmeta,
        'removeprefix/.*/',                               \$removeprefix,
        'removesuffix/.*/',                               \$removesuffix,
        'debug',                                          \$debug,
        'maxdocs/^\-?\d+/',                               \$maxdocs,
        'collectdir/.*/',                                 \$collectdir,
        'out/.*/STDERR',                                  \$out,
        'statsfile/.*/STDERR',                            \$statsfile,
        'faillog/.*/',                                    \$faillog,
        'gli',                                            \$gli,
        'xml',                                            \$xml);
    if (!$parsed_ok) {
        PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language) {
        gsprintf::load_language_specific_resource_bundle($language);
    }

    # -xml only asks for a machine-readable usage description.
    if ($xml) {
        PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        gsprintf::output_strings_in_UTF8();
    }

    # Anything other than STDERR/STDOUT is treated as a file to write to.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # Three-argument open: the filename is never parsed for a mode.
        if (!open(OUT, '>', $out)) {
            # Always die after reporting, whatever gsprintf returns.
            gsprintf('STDERR', "{common.cannot_open_output_file}\n", $out);
            die;
        }
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # -keepold on the command line overrides -removeold
    $removeold = 0 if ($keepold);

    # get and check the collection name
    $collection = util::use_collection(@ARGV, $collectdir);
    if ($collection eq "") {
        PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    if ($faillog eq "") {
        $faillog = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    if (!open(FAILLOG, '>', $faillog)) {
        gsprintf('STDERR', "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }

    # Keep the real filename for the stats report; from here on $faillog
    # is the name of the open handle.
    my $faillogname = $faillog;
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # check sortmeta: blank counts as "not given", and sorting cannot be
    # combined with grouping several documents per file
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }

    # dynamically load the 'docsave' module so that a collection-specific
    # docsave.pm is picked up if one has been provided
    unshift(@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins for this collection and set any options that
    # were specified in the collect.cfg (all import.pl options except
    # -collectdir, -out and -faillog may be specified in the collect.cfg (these
    # options must be known before we read the collect.cfg))
    my $plugins     = [];
    my @global_opts = ();

    $configfilename = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (!-e $configfilename) {
        # Without a collect.cfg there is nothing sensible to do; always die
        # after reporting, whatever gsprintf returns.
        gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
        die;
    }

    $collectcfg = colcfg::read_collect_cfg($configfilename);
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    # For each option below, a command-line value wins; otherwise the
    # collect.cfg value is used, otherwise the built-in default.
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        }
        else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'removeold'}) {
        if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
            $removeold = 1;
        }
        if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
            $removeold = 0;
        }
    }
    if (defined $collectcfg->{'keepold'}) {
        if ($collectcfg->{'keepold'} =~ /^false$/i && !$keepold) {
            $removeold = 1;
        }
    }
    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        }
        else {
            $maxdocs = -1; # the default
        }
    }
    # 1 is the parse default, so treat it as "not given on the command line"
    if ($groupsize == 1) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        }
    }
    if ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        }
        else {
            $OIDtype = "hash"; # the default
        }
    }
    if ($saveas !~ /^(GA|METS)$/) {
        if (defined $collectcfg->{'saveas'} && $collectcfg->{'saveas'} =~ /^(GA|METS)$/) {
            $saveas = $collectcfg->{'saveas'};
        }
        else {
            $saveas = "GA"; # the default
        }
    }
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    if (defined $collectcfg->{'removeprefix'} && $removeprefix eq "") {
        $removeprefix = $collectcfg->{'removeprefix'};
    }
    if (defined $collectcfg->{'removesuffix'} && $removesuffix eq "") {
        $removesuffix = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    # global plugin stuff
    if (defined $collectcfg->{'separate_cjk'} && $collectcfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    $gli = 0 unless defined $gli;

    print STDERR "<Import>\n" if $gli;

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s{[\\/]+}{/}g;
    $importdir =~ s{/$}{};
    $archivedir = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s{[\\/]+}{/}g;
    $archivedir =~ s{/$}{};

    # load all the plugins
    $pluginfo = plugin::load_plugins($plugins, $verbosity, $out, $faillog, \@global_opts);
    if (scalar(@$pluginfo) == 0) {
        gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
        gsprintf($out, "{import.removing_archives}\n");
        util::rm_r($archivedir);
    }

    if (!$debug) {
        # read the archive information file
        $archive_info_filename = util::filename_cat($archivedir, "archives.inf");
        $archive_info = arcinfo->new();
        $archive_info->load_info($archive_info_filename);

        # create a docsave object to process the documents
        $processor = docsave->new($collection, $archive_info, $verbosity, $gzip,
                                  $groupsize, $out, $service, $saveas);
        $processor->setarchivedir($archivedir);
        $processor->set_sortmeta($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
        $processor->set_OIDtype($OIDtype);
        $processor->set_saveas($saveas);
    }
    else {
        # in debug mode a docprint processor is used instead of docsave
        $processor = docprint->new();
    }

    plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    plugin::read($pluginfo, $importdir, "", {}, $processor, $maxdocs, $gli);

    plugin::end($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info($archive_info_filename);
    }

    # write out import stats; fall back to STDERR if the stats file
    # cannot be opened
    my $close_stats = 0;
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
        if (open(STATS, '>', $statsfile)) {
            $statsfile = 'import::STATS';
            $close_stats = 1;
        }
        else {
            gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
            gsprintf($out, "{import.stats_backup}\n");
            $statsfile = 'STDERR';
        }
    }

    gsprintf($out, "\n");
    gsprintf($out, "*********************************************\n");
    gsprintf($out, "{import.complete}\n");
    gsprintf($out, "*********************************************\n");

    plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}
Note: See TracBrowser for help on using the repository browser.