source: trunk/gsdl/bin/script/import.pl@ 7063

Last change on this file since 7063 was 7063, checked in by kjdon, 20 years ago

added a range option to maxdocs so that it has to be greater than 1

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.1 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone library directories at the front of
# @INC so the 'use' statements that follow can be resolved.
BEGIN {
    for my $required ('GSDLHOME', 'GSDLOS') {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # Each unshift prepends, so the directory listed last is searched first.
    for my $subdir ("perllib", "perllib/cpan", "perllib/plugins", "perllib/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/$subdir");
    }
}
41
42use arcinfo;
43use colcfg;
44use plugin;
45use docprint;
46use util;
47use parsargv;
48use FileHandle;
49use gsprintf;
50use printusage;
51
# Permitted values for the -OIDtype option. Each entry pairs the value's
# name with a description string in "{...}" form.
my $oidtype_list = [
    { 'name' => 'hash',        'desc' => '{import.OIDtype.hash}' },
    { 'name' => 'incremental', 'desc' => '{import.OIDtype.incremental}' },
];
57
# Description of every command-line option of import.pl, one hash per
# option. Attributes used below:
#   name      - the name of the argument
#   desc      - a description (or, more likely, a reference to a
#               description) for this argument
#   type      - the type of control used to represent the argument, e.g.
#               string, int, flag, regexp, metadata, metadatum, language,
#               enum
#   range     - limits for an int argument (presumably "min,max" with an
#               empty side meaning unbounded -- TODO confirm)
#   list      - the permitted values of an enum argument
#   deft      - the default value
#   reqd      - is this argument required?
#   hiddengli - is this argument hidden in GLI?
#   modegli   - the lowest detail mode this argument is visible at in GLI
my $arguments = [
    { 'name' => 'archivedir', 'desc' => '{import.archivedir}',
      'type' => 'string',
      'reqd' => 'no', 'hiddengli' => 'yes' },

    { 'name' => 'collectdir', 'desc' => '{import.collectdir}',
      'type' => 'string',
      'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
      'reqd' => 'no', 'hiddengli' => 'yes' },

    { 'name' => 'debug', 'desc' => '{import.debug}',
      'type' => 'flag',
      'reqd' => 'no', 'hiddengli' => 'yes' },

    # The default shown here is a placeholder path for the usage text.
    { 'name' => 'faillog', 'desc' => '{import.faillog}',
      'type' => 'string',
      'deft' => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
      'reqd' => 'no', 'modegli' => '4' },

    { 'name' => 'groupsize', 'desc' => '{import.groupsize}',
      'type' => 'int',
      'deft' => '1',
      'reqd' => 'no', 'modegli' => '3' },

    { 'name' => 'gzip', 'desc' => '{import.gzip}',
      'type' => 'flag',
      'reqd' => 'no', 'modegli' => '4' },

    { 'name' => 'importdir', 'desc' => '{import.importdir}',
      'type' => 'string',
      'reqd' => 'no', 'hiddengli' => 'yes' },

    { 'name' => 'keepold', 'desc' => '{import.keepold}',
      'type' => 'flag',
      'reqd' => 'no', 'hiddengli' => 'yes' },

    { 'name' => 'language', 'desc' => '{scripts.language}',
      'type' => 'string',
      'reqd' => 'no', 'modegli' => '4' },

    { 'name' => 'maxdocs', 'desc' => '{import.maxdocs}',
      'type' => 'int',
      'range' => '1,',
      'reqd' => 'no', 'modegli' => '1' },

    { 'name' => 'OIDtype', 'desc' => '{import.OIDtype}',
      'type' => 'enum',
      'list' => $oidtype_list,
      'deft' => 'hash',
      'reqd' => 'no', 'modegli' => '3' },

    { 'name' => 'out', 'desc' => '{import.out}',
      'type' => 'string',
      'deft' => 'STDERR',
      'reqd' => 'no', 'hiddengli' => 'yes' },

    { 'name' => 'removeold', 'desc' => '{import.removeold}',
      'type' => 'flag',
      'reqd' => 'no', 'modegli' => '3' },

    { 'name' => 'sortmeta', 'desc' => '{import.sortmeta}',
      'type' => 'metadata',
      'reqd' => 'no', 'modegli' => '2' },

    { 'name' => 'statsfile', 'desc' => '{import.statsfile}',
      'type' => 'string',
      'deft' => 'STDERR',
      'reqd' => 'no', 'hiddengli' => 'yes' },

    { 'name' => 'verbosity', 'desc' => '{import.verbosity}',
      'type' => 'int',
      'range' => '0,3',
      'deft' => '2',
      'reqd' => 'no', 'modegli' => '4' },
];
157
# Top-level descriptor of this script: its name, a description reference
# and the option table. It is what gets passed to the PrintUsage routines.
my $options = {
    'name' => 'import.pl',
    'desc' => '{import.desc}',
    'args' => $arguments,
};
161
# gsprintf(ARGS...)
# Package-local shorthand: hands its argument list, unchanged, to
# gsprintf::gsprintf and returns whatever that call returns.
sub gsprintf {
    return gsprintf::gsprintf(@_);
}
166
167
168# sub print_usage {
169# print STDOUT "\n";
170# print STDOUT "import.pl: Converts documents in collections -importdir directory into\n";
171# print STDOUT " xml documents which are written to the -archivedir directory.\n\n";
172# print STDOUT " usage: $0 [options] collection-name\n\n";
173# print STDOUT " options:\n";
174# print STDOUT " -verbosity number 0=none, 3=lots\n";
175# print STDOUT " -importdir directory Where the original material lives\n";
176# print STDOUT " -archivedir directory Where the converted material ends up\n";
177# print STDOUT " -keepold Will not destroy the current contents of the\n";
178# print STDOUT " archives directory (the default)\n";
179# print STDOUT " -removeold Will remove the old contents of the archives\n";
180# print STDOUT " directory -- use with care\n";
181# print STDOUT " -gzip Use gzip to compress resulting xml documents\n";
182# print STDOUT " (don't forget to include ZIPPlug in your plugin\n";
183# print STDOUT " list when building from compressed documents)\n";
184# print STDOUT " -maxdocs number Maximum number of documents to import\n";
185# print STDOUT " -groupsize number Number of import documents to group into one XML file\n";
186# print STDOUT " -OIDtype hash|incremental The method to use when generating unique\n";
187# print STDOUT " identifiers for each document. \"hash\" (the\n";
188# print STDOUT " default) hashes the contents of the file and so\n";
189# print STDOUT " will be the same every time the collection is\n";
190# print STDOUT " imported. \"incremental\" is a simple document\n";
191# print STDOUT " count and so will be significantly faster than\n";
192# print STDOUT " \"hash\". It is not guaranteed to always assign\n";
193# print STDOUT " the same identifier to a given document though\n";
194# print STDOUT " and does not allow further documents to be added\n";
195# print STDOUT " to existing xml archives\n";
196# print STDOUT " -sortmeta metadata Sort documents alphabetically by metadata for\n";
197# print STDOUT " building. This will be disabled if groupsize > 1\n";
198# print STDOUT " -debug Print imported text to STDOUT\n";
199# print STDOUT " -collectdir directory Collection directory (defaults to " .
200# &util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n";
201# print STDOUT " -out name Filename or handle to print output status to.\n";
202# print STDOUT " The default is STDERR\n";
203# print STDOUT " -statsfile name Filename or handle to print import statistics to.\n";
204# print STDOUT " The default is STDERR\n";
205# print STDOUT " -faillog name Fail log filename. This log receives the filenames\n";
206# print STDOUT " of any files which fail to be processed (defaults\n";
207# print STDOUT " to " .
208# &util::filename_cat("<collectdir>", "colname", "etc", "fail.log") . ")\n";
209# print STDOUT " [Type \"perl -S import.pl | more\" if this help text scrolled off your screen]";
210# print STDOUT "\n" unless $ENV{'GSDLOS'} =~ /^windows$/i;
211# }
212
&main();

# main --
# Entry point of import.pl. In order, it:
#   1. parses the command line (printing usage and dying on bad input);
#   2. opens the status output and the fail log;
#   3. reads the collection's etc/collect.cfg and uses its values for any
#      option that was not given on the command line;
#   4. loads the collection's plugins;
#   5. optionally removes the old archives directory;
#   6. runs the plugins over the import directory, handing documents to a
#      docsave object (or to a docprint object when -debug is set);
#   7. saves the archive information file and writes the import statistics.
#
# Takes no arguments (it reads @ARGV and %ENV) and returns nothing useful.
# Dies on unusable arguments, unopenable output files, a missing
# collect.cfg, or when no plugins could be loaded.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
        $removeold, $gzip, $groupsize, $OIDtype, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $archive_info_filename, $statsfile,
        $archive_info, $processor, $out, $faillog, $collectdir, $gli);

    # Declared lexically here; it used to be an accidental package global.
    my $language;

    # ***** 11-04-03 - John Thompson *****
    my $xml = 0;
    # ************************************

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    # NOTE(review): the tests further down compare unset options with "",
    # so parsargv presumably stores "" for an option that was not given
    # -- confirm against parsargv.
    my $parsed_ok = parsargv::parse(\@ARGV,
        'language/.*/', \$language,
        'verbosity/\d+/', \$verbosity,
        'importdir/.*/', \$importdir,
        'archivedir/.*/', \$archivedir,
        'keepold', \$keepold,
        'removeold', \$removeold,
        'gzip', \$gzip,
        'groupsize/\d+/1', \$groupsize,
        'OIDtype/^(hash|incremental)$/', \$OIDtype,
        'sortmeta/.*/', \$sortmeta,
        'debug', \$debug,
        'maxdocs/^\-?\d+/', \$maxdocs,
        'collectdir/.*/', \$collectdir,
        'out/.*/STDERR', \$out,
        'statsfile/.*/STDERR', \$statsfile,
        'faillog/.*/', \$faillog,
        'gli', \$gli,
        'xml', \$xml);
    if (!$parsed_ok) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    # -xml only prints the option description as XML and then stops.
    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        die "\n";
    }

    # Open the status output. Bareword handles are kept on purpose: from
    # here on $out holds the handle's *name*, which is what gets passed to
    # gsprintf and the plugin routines.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # Three-argument open: the file name is never parsed for a mode.
        if (!open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            # Abort whatever gsprintf returned.
            die;
        }
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # -keepold on the command line overrides -removeold
    $removeold = 0 if ($keepold);

    # get and check the collection name
    # NOTE(review): @ARGV is flattened into the argument list, so with no
    # collection argument $collectdir lands in the first position -- verify
    # against util::use_collection.
    $collection = &util::use_collection(@ARGV, $collectdir);
    if ($collection eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Open the fail log, defaulting to <collection>/etc/fail.log.
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    if (!open (FAILLOG, '>', $faillog)) {
        &gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }
    my $faillogname = $faillog;    # the file name is still needed for the stats
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # A blank -sortmeta counts as "not given".
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;

    # Load 'docsave' at run time so that a collection-specific docsave.pm
    # in the collection's perllib directory takes precedence.
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins for this collection and set any options that
    # were specified in the collect.cfg (all import.pl options except
    # -collectdir, -out and -faillog may be specified in the collect.cfg (these
    # options must be known before we read the collect.cfg))
    my $plugins = [];
    my @global_opts = ();

    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (!-e $configfilename) {
        &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
        # Without a collect.cfg none of the defaults below would be set.
        die;
    }

    $collectcfg = &colcfg::read_collect_cfg ($configfilename);
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    # For each option the command line wins; collect.cfg is consulted only
    # when the option was left unset.
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'removeold'}) {
        if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
            $removeold = 1;
        }
        if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
            $removeold = 0;
        }
    }
    if (defined $collectcfg->{'keepold'}) {
        if ($collectcfg->{'keepold'} =~ /^false$/i && !$keepold) {
            $removeold = 1;
        }
    }
    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    if ($groupsize == 1) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        }
    }
    if ($OIDtype !~ /^(hash|incremental)$/) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        } else {
            $OIDtype = "hash"; # the default
        }
    }
    # $sortmeta is undef here when it was not given (see above), so test
    # definedness first instead of comparing undef with "".
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    # global plugin stuff
    if (defined $collectcfg->{'separate_cjk'} && $collectcfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # Sorting is not possible when documents are grouped. This is checked
    # only now, after collect.cfg has been merged, so that a sortmeta or
    # groupsize coming from the configuration file is covered as well.
    if (defined $sortmeta && $groupsize > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }

    $gli = 0 unless defined $gli;

    print STDERR "<Import>\n" if $gli;

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
        &gsprintf($out, "{import.removing_archives}\n");
        sleep(3); # just in case...
        &util::rm_r ($archivedir);
    }

    # Choose the document processor: -debug goes to a docprint object,
    # otherwise documents are saved through docsave and tracked in the
    # archive information file.
    if (!$debug) {
        # read the archive information file
        $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new();
        $archive_info->load_info ($archive_info_filename);

        # create a docsave object to process the documents
        $processor = docsave->new($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
        $processor->setarchivedir ($archivedir);
        $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
        $processor->set_OIDtype ($OIDtype);
    } else {
        $processor = docprint->new();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs, $gli);

    &plugin::end($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info($archive_info_filename);
    }

    # write out import stats; fall back to STDERR if the stats file
    # cannot be opened
    my $close_stats = 0;
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
        if (open (STATS, '>', $statsfile)) {
            $statsfile = 'import::STATS';
            $close_stats = 1;
        } else {
            &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
            &gsprintf($out, "{import.stats_backup}\n");
            $statsfile = 'STDERR';
        }
    }

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "{import.complete}\n");
    &gsprintf($out, "*********************************************\n");

    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}
Note: See TracBrowser for help on using the repository browser.