source: trunk/gsdl/bin/script/import.pl@ 7101

Last change on this file since 7101 was 7101, checked in by kjdon, 20 years ago

Removed the old commented-out print-usage code; added the gli argument where it was missing; when the gli argument is set, strings are output in UTF-8.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone Perl library directories at the front
# of @INC so that the 'use' statements below can find their modules.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set\n";
    }

    my $perllib = "$ENV{'GSDLHOME'}/perllib";

    # Every unshift lands in front of the previous one, so the resulting
    # search order is: classify, plugins, cpan, perllib.
    foreach my $libdir ($perllib,
                        "$perllib/cpan",
                        "$perllib/plugins",
                        "$perllib/classify") {
        unshift(@INC, $libdir);
    }
}
41
42use arcinfo;
43use colcfg;
44use plugin;
45use docprint;
46use util;
47use parsargv;
48use FileHandle;
49use gsprintf;
50use printusage;
51
# Permitted values for the -OIDtype option; referenced as the 'list' of
# the OIDtype entry in the argument table.  Each 'desc' is a resource
# bundle key rather than literal text.
my $oidtype_list = [];
push(@$oidtype_list, { 'name' => "hash",
                       'desc' => "{import.OIDtype.hash}" });
push(@$oidtype_list, { 'name' => "incremental",
                       'desc' => "{import.OIDtype.incremental}" });
57
# Description of every command-line argument, used when printing usage.
#
# Keys that may appear in each entry:
#   name:      The name of the argument
#   desc:      A description (or, more likely, a reference to a description
#              held in the resource bundle) for this argument
#   type:      The type of control used to represent the argument. Options
#              include: string, int, flag, regexp, metadata, metadatum,
#              language, enum etc
#   list:      For enum arguments, the list of permitted values
#   range:     Value range for numeric arguments, written "min,max"
#              (presumably an omitted max, as in "1,", means unbounded)
#   deft:      The default value of the argument
#   reqd:      Is this argument required?
#   hiddengli: Is this argument hidden in GLI?
#   modegli:   The lowest detail mode this argument is visible at in GLI

my $arguments = [
    {
        'name'      => "archivedir",
        'desc'      => "{import.archivedir}",
        'type'      => "string",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "collectdir",
        'desc'      => "{import.collectdir}",
        'type'      => "string",
        'deft'      => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "debug",
        'desc'      => "{import.debug}",
        'type'      => "flag",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "faillog",
        'desc'      => "{import.faillog}",
        'type'      => "string",
        # Placeholder path for display only; the real default is worked
        # out once the collection is known.
        'deft'      => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
        'reqd'      => "no",
        'modegli'   => "4",
    },
    {
        'name'      => "groupsize",
        'desc'      => "{import.groupsize}",
        'type'      => "int",
        'deft'      => "1",
        'reqd'      => "no",
        'modegli'   => "3",
    },
    {
        'name'      => "gzip",
        'desc'      => "{import.gzip}",
        'type'      => "flag",
        'reqd'      => "no",
        'modegli'   => "4",
    },
    {
        'name'      => "importdir",
        'desc'      => "{import.importdir}",
        'type'      => "string",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "keepold",
        'desc'      => "{import.keepold}",
        'type'      => "flag",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "language",
        'desc'      => "{scripts.language}",
        'type'      => "string",
        'reqd'      => "no",
        'modegli'   => "4",
    },
    {
        'name'      => "maxdocs",
        'desc'      => "{import.maxdocs}",
        'type'      => "int",
        'range'     => "1,",
        'reqd'      => "no",
        'modegli'   => "1",
    },
    {
        'name'      => "OIDtype",
        'desc'      => "{import.OIDtype}",
        'type'      => "enum",
        'list'      => $oidtype_list,
        'deft'      => "hash",
        'reqd'      => "no",
        'modegli'   => "3",
    },
    {
        'name'      => "out",
        'desc'      => "{import.out}",
        'type'      => "string",
        'deft'      => "STDERR",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "removeold",
        'desc'      => "{import.removeold}",
        'type'      => "flag",
        'reqd'      => "no",
        'modegli'   => "3",
    },
    {
        'name'      => "sortmeta",
        'desc'      => "{import.sortmeta}",
        'type'      => "metadata",
        'reqd'      => "no",
        'modegli'   => "2",
    },
    {
        'name'      => "statsfile",
        'desc'      => "{import.statsfile}",
        'type'      => "string",
        'deft'      => "STDERR",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },
    {
        'name'      => "verbosity",
        'desc'      => "{import.verbosity}",
        'type'      => "int",
        'range'     => "0,3",
        'deft'      => "2",
        'reqd'      => "no",
        'modegli'   => "4",
    },
];
157
# Top-level description of this script, handed to the PrintUsage routines:
# the script name, a resource bundle key for its description, and the
# argument table.
my $options = {};
$options->{'name'} = "import.pl";
$options->{'desc'} = "{import.desc}";
$options->{'args'} = $arguments;
161
# Local shorthand for gsprintf::gsprintf so the rest of this script can
# simply call &gsprintf(...).  All arguments are handed over unchanged and
# whatever the library routine returns is passed back to the caller.
sub gsprintf
{
    my $library_gsprintf = \&gsprintf::gsprintf;
    return $library_gsprintf->(@_);
}
166
167
&main();

# main -- run a complete import for one collection.
#
# Steps, in order:
#   1. parse the command line (printing usage and dying on bad arguments);
#   2. open the output stream and the fail log;
#   3. read the collection's etc/collect.cfg and use it to fill in any
#      option that was not given on the command line;
#   4. load the collection's plugins;
#   5. optionally remove the old archives directory;
#   6. run the plugins over the import directory, handing documents to a
#      docsave processor (or to a docprint processor when -debug is set);
#   7. write the archive information file and the import statistics.
#
# Takes no arguments; works from @ARGV, %ENV and the file-level $options.
# Dies on any unrecoverable problem (bad arguments, unknown collection,
# unopenable output or fail log, missing collect.cfg, no plugins loaded).
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
        $removeold, $gzip, $groupsize, $OIDtype, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $archive_info_filename, $statsfile,
        $archive_info, $processor, $out, $faillog, $collectdir, $gli);

    # Kept lexical rather than being left as an undeclared package global.
    my $language;

    # ***** 11-04-03 - John Thompson *****
    my $xml = 0;
    # ************************************

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    if (!parsargv::parse(\@ARGV,
                         'language/.*/', \$language,
                         'verbosity/\d+/', \$verbosity,
                         'importdir/.*/', \$importdir,
                         'archivedir/.*/', \$archivedir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'groupsize/\d+/1', \$groupsize,
                         'OIDtype/^(hash|incremental)$/', \$OIDtype,
                         'sortmeta/.*/', \$sortmeta,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out,
                         'statsfile/.*/STDERR', \$statsfile,
                         'faillog/.*/', \$faillog,
                         'gli', \$gli,
                         'xml', \$xml)) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    # -xml only prints the usage information as XML and then stops.
    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        die "\n";
    }

    # NOTE(review): 'gli' may also be switched on from collect.cfg further
    # down, but that happens after this point, so it does not enable UTF-8
    # output -- confirm whether that is intended.
    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    # $out holds the *name* of a filehandle (it is handed to other modules
    # in that form), so a real file is opened on the package handle OUT.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        if (!open (OUT, '>', $out)) {
            # Report, then die regardless of what gsprintf returned.
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        }
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # -keepold overrides -removeold
    $removeold = 0 if ($keepold);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # The fail log defaults to etc/fail.log inside the collection.
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    if (!open (FAILLOG, '>', $faillog)) {
        &gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }
    # Remember the file name for the statistics report; from here on
    # $faillog is the name of the open handle.
    my $faillogname = $faillog;
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # A blank -sortmeta counts as "not given", so that collect.cfg may
    # supply one below.
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;

    # Load the 'docsave' module dynamically so that a collection-specific
    # docsave.pm, if one exists, is picked up in preference to the
    # standard one.
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins for this collection and set any options that
    # were specified in the collect.cfg (all import.pl options except
    # -collectdir, -out and -faillog may be specified in the collect.cfg (these
    # options must be known before we read the collect.cfg))
    my $plugins = [];
    my @global_opts = ();

    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
        if (defined $collectcfg->{'plugin'}) {
            $plugins = $collectcfg->{'plugin'};
        }

        # In each case below a value given on the command line wins over
        # the one in collect.cfg.
        if ($verbosity !~ /\d+/) {
            if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                $verbosity = $collectcfg->{'verbosity'};
            } else {
                $verbosity = 2; # the default
            }
        }
        if (defined $collectcfg->{'importdir'} && $importdir eq "") {
            $importdir = $collectcfg->{'importdir'};
        }
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        if (defined $collectcfg->{'removeold'}) {
            if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
                $removeold = 1;
            }
            if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
                $removeold = 0;
            }
        }
        if (defined $collectcfg->{'keepold'}) {
            if ($collectcfg->{'keepold'} =~ /^false$/i && !$keepold) {
                $removeold = 1;
            }
        }
        if (defined $collectcfg->{'gzip'} && !$gzip) {
            if ($collectcfg->{'gzip'} =~ /^true$/i) {
                $gzip = 1;
            }
        }
        if ($maxdocs !~ /\-?\d+/) {
            if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                $maxdocs = $collectcfg->{'maxdocs'};
            } else {
                $maxdocs = -1; # the default
            }
        }
        if ($groupsize == 1) {
            if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
                $groupsize = $collectcfg->{'groupsize'};
            }
        }
        if ($OIDtype !~ /^(hash|incremental)$/) {
            if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
                $OIDtype = $collectcfg->{'OIDtype'};
            } else {
                $OIDtype = "hash"; # the default
            }
        }
        # $sortmeta is undef here exactly when no usable value was given
        # on the command line.
        if (defined $collectcfg->{'sortmeta'} && !defined $sortmeta) {
            $sortmeta = $collectcfg->{'sortmeta'};
        }
        if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
            $debug = 1;
        }
        if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
            $gli = 1;
        }

        # global plugin stuff
        if (defined $collectcfg->{'separate_cjk'} && $collectcfg->{'separate_cjk'} =~ /^true$/i) {
            push @global_opts, "-separate_cjk";
        }

    } else {
        # A collection without a collect.cfg cannot be imported.
        &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
        die;
    }

    # check sortmeta -- done only now, once both the command line and
    # collect.cfg have had their say on sortmeta and groupsize, so that a
    # conflicting pair coming from collect.cfg is caught as well.
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }

    $gli = 0 unless defined $gli;

    print STDERR "<Import>\n" if $gli;

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
        &gsprintf($out, "{import.removing_archives}\n");
        sleep(3); # just in case...
        &util::rm_r ($archivedir);
    }

    # read the archive information file
    if (!$debug) {
        $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new();
        $archive_info->load_info ($archive_info_filename);

        # create a docsave object to process the documents
        $processor = docsave->new($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
        $processor->setarchivedir ($archivedir);
        $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
        $processor->set_OIDtype ($OIDtype);
    } else {
        # in debug mode documents go to a docprint processor and no
        # archive information is kept
        $processor = docprint->new();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs, $gli);

    &plugin::end($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info($archive_info_filename);
    }

    # write out import stats; if the stats file cannot be opened, say so
    # and fall back to STDERR rather than giving up
    my $close_stats = 0;
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
        if (open (STATS, '>', $statsfile)) {
            $statsfile = 'import::STATS';
            $close_stats = 1;
        } else {
            &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
            &gsprintf($out, "{import.stats_backup}\n");
            $statsfile = 'STDERR';
        }
    }

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "{import.complete}\n");
    &gsprintf($out, "*********************************************\n");

    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}
Note: See TracBrowser for help on using the repository browser.