source: trunk/gsdl/bin/script/export.pl@ 11449

Last change on this file since 11449 was 10417, checked in by kjdon, 19 years ago

Standardised -removeold and -keepold: if neither or both are specified, a warning is printed and -removeold is used.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.5 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# export.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will export a particular collection into a specific Format (e.g. METS or DSpace)
30# Author: Chi-Yu Huang Date: 08-10-2004
31
32package export;
33
# Compile-time setup: refuse to run unless the Greenstone environment
# variables are set, then put the Greenstone library directories at the
# front of @INC so the "use" lines below can be resolved.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'}   or die "GSDLOS not set\n";

    # listed in final search order (most specific directory first)
    unshift (@INC,
	     map { "$ENV{'GSDLHOME'}/$_" }
	     ("perllib/classify", "perllib/plugins", "perllib/cpan", "perllib"));
}
42
43use strict;
44no strict 'refs'; # allow filehandles to be variables and vice versa
45no strict 'subs'; # allow barewords (eg STDERR) as function arguments
46
47use arcinfo;
48use colcfg;
49use plugin;
50use docprint;
51use util;
52use scriptutil;
53use FileHandle;
54use gsprintf;
55use printusage;
56use parse2;
57
58
# Values accepted for an OIDtype option, each with the resource-bundle key
# of its description.
my $oidtype_list = [
    { name => "hash",        desc => "{export.OIDtype.hash}" },
    { name => "incremental", desc => "{export.OIDtype.incremental}" },
];

# Export formats that can be selected with -saveas (DSpace or METS).
my $saveas_list = [
    { name => "DSpace", desc => "{export.saveas.DSpace}" },
    { name => "METS",   desc => "{export.saveas.METS}" },
];


# Possible attributes for each argument
#   name:      the name of the argument
#   desc:      a description (or more likely a reference to a description)
#              for this argument
#   type:      the type of control used to represent the argument; options
#              include string, int, flag, regexp, metadata, metadatum,
#              language, enum etc
#   reqd:      is this argument required?
#   hiddengli: is this argument hidden in GLI?
#   modegli:   the lowest detail mode this argument is visible at in GLI

# The -saveas option is kept in its own variable because it is shared by the
# full argument list and by the reduced -listall usage.
my $saveas_argument = {
    name    => "saveas",
    desc    => "{export.saveas}",
    type    => "enum",
    list    => $saveas_list,
    deft    => "METS",
    reqd    => "no",
    modegli => "3",
};
89
90
# Every command line option understood by export.pl, in the order in which
# it is listed in the usage output. Each entry uses the attributes
# name/desc/type/reqd (plus deft, range, hiddengli and modegli where given).
my $arguments = [
    $saveas_argument,
    { name => "saveas_version",
      desc => "{export.saveas_version}",
      type => "string",
      reqd => "no" },
    { name => "exportdir",
      desc => "{export.exportdir}",
      type => "string",
      reqd => "no",
      hiddengli => "yes" },
    { name => "importdir",
      desc => "{import.importdir}",
      type => "string",
      reqd => "no",
      hiddengli => "yes" },
    # the default collect directory lives directly under GSDLHOME
    { name => "collectdir",
      desc => "{export.collectdir}",
      type => "string",
      deft => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
      reqd => "no",
      hiddengli => "yes" },
    { name => "listall",
      desc => "{scripts.listall}",
      type => "flag",
      reqd => "no" },
    { name => "debug",
      desc => "{export.debug}",
      type => "flag",
      reqd => "no",
      hiddengli => "yes" },
    { name => "faillog",
      desc => "{export.faillog}",
      type => "string",
      deft => "",
      reqd => "no",
      modegli => "4" },
    { name => "keepold",
      desc => "{export.keepold}",
      type => "flag",
      reqd => "no",
      hiddengli => "yes" },
    { name => "removeold",
      desc => "{export.removeold}",
      type => "flag",
      reqd => "no",
      modegli => "3" },
    { name => "language",
      desc => "{scripts.language}",
      type => "string",
      reqd => "no",
      modegli => "4" },
    { name => "maxdocs",
      desc => "{export.maxdocs}",
      type => "int",
      reqd => "no",
      range => "1,",
      modegli => "1" },
    { name => "out",
      desc => "{export.out}",
      type => "string",
      deft => "STDERR",
      reqd => "no",
      hiddengli => "yes" },
    { name => "statsfile",
      desc => "{export.statsfile}",
      type => "string",
      deft => "STDERR",
      reqd => "no",
      hiddengli => "yes" },
    { name => "verbosity",
      desc => "{export.verbosity}",
      type => "int",
      range => "0,3",
      deft => "2",
      reqd => "no",
      modegli => "4" },
    { name => "gli",
      desc => "",
      type => "flag",
      reqd => "no",
      hiddengli => "yes" },
    { name => "xml",
      desc => "{scripts.xml}",
      type => "flag",
      reqd => "no",
      hiddengli => "yes" },
];
180
# Full description of the script, as handed to the usage printers.
my $options = {
    name => "export.pl",
    desc => "{export.desc}",
    args => $arguments,
};

# Reduced description used for -listall: only the -saveas option is listed.
my $listall_options = {
    name => "export.pl",
    desc => "{export.desc}",
    args => [ $saveas_argument ],
};
188
# gsprintf --
# Local shorthand for gsprintf::gsprintf: hands the argument list over
# unchanged and returns whatever gsprintf::gsprintf returns.
sub gsprintf {
    return &gsprintf::gsprintf(@_);
}
193
194
&main();

# main --
# Parses the command line and then exports every collection named on it.
# For each collection it reads etc/collect.cfg, loads the collection's
# plugins, runs them over the import directory with a docsave processor
# (or a docprint processor when -debug is set) and finally writes the export
# information file and the plugin statistics.
#
# Takes no arguments (options are read from @ARGV) and returns nothing useful.
# A failure while processing one collection is caught, printed to STDERR and
# the next collection is tried. Usage requests and an unopenable -out file
# terminate the script via die.
sub main {
    # params (filled in from the parsed command line below)
    my ($language, $verbosity, $importdir, $exportdir, $keepold, $listall,
	$removeold, $saveas, $saveas_version, $debug,
	$maxdocs, $statsfile, $out, $faillog, $collectdir, $gli);
    my $xml = 0;

    # other vars
    my ($configfilename, $collection, $export_info_filename, $export_info, $processor, $pluginfo);
    my $service = "export";

    my $hashParsingResult = {};
    # general options available to all plugins
    # can have more than one collection name, so don't check num args left
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # Copy each parsed option into the variable of the same name. A table of
    # references does this without building and eval-ing Perl source text;
    # option names without a matching variable are ignored.
    my %option_variable = (
	'language'       => \$language,
	'verbosity'      => \$verbosity,
	'importdir'      => \$importdir,
	'exportdir'      => \$exportdir,
	'keepold'        => \$keepold,
	'listall'        => \$listall,
	'removeold'      => \$removeold,
	'saveas'         => \$saveas,
	'saveas_version' => \$saveas_version,
	'debug'          => \$debug,
	'maxdocs'        => \$maxdocs,
	'statsfile'      => \$statsfile,
	'out'            => \$out,
	'faillog'        => \$faillog,
	'collectdir'     => \$collectdir,
	'gli'            => \$gli,
	'xml'            => \$xml,
	);
    foreach my $strVariable (keys %$hashParsingResult) {
	my $variable_ref = $option_variable{$strVariable};
	next unless defined $variable_ref;
	$$variable_ref = $hashParsingResult->{$strVariable};
    }


    # these are options used by other things - we just set default values
    # undef means will be set from config file if there
    my $gzip = undef;
    my $groupsize = 1;
    my $OIDtype = undef;
    my $sortmeta = undef;

    # an export directory counts as explicit only if one was really given
    my $explicit_exportdir = (defined $exportdir && $exportdir ne "") ? 1 : 0;
    # set once an explicit export directory has been used for a collection,
    # so that later collections sharing it do not wipe the earlier output
    my $exportdir_in_use = 0;

    # save these command line settings. don't want config file settings in one
    # coll used for other colls
    # does this apply to other vars???
    my $global_removeold = $removeold;
    my $global_keepold = $keepold;
    # $faillog is turned into a filehandle name further down, so remember
    # the file name the user asked for
    my $global_faillog = $faillog;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($listall) {
	if ($xml) {
	    &PrintUsage::print_xml_usage($listall_options);
	}
	else
	{
	    &PrintUsage::print_txt_usage($listall_options,"{export.params}");
	}
	die "\n";
    }
    elsif ($xml) {
	&PrintUsage::print_xml_usage($options);
	die "\n";
    }

    if (scalar(@ARGV) == 0) {
	&PrintUsage::print_txt_usage($options, "{export.params}");
	die "\n";
    }

    if ($gli) { # the gli wants strings to be in UTF-8
	# explicit () so that main's own @_ is not passed along implicitly
	&gsprintf::output_strings_in_UTF8();
    }

    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	open (OUT, '>', $out) ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out) && die);
	$out = 'export::OUT';
	$close_out = 1;
    }
    $out->autoflush(1);

    while (scalar(@ARGV)>0) {
	my $collect_name = shift @ARGV;
	$ENV{'GSDLCOLLECTION'} = $collect_name;

	eval {
	    # get and check the collection name
	    if (($collection = &util::use_collection($collect_name, $collectdir)) eq "") {
		&PrintUsage::print_txt_usage($options, "{export.params}");
		die "\n";
	    }

	    # work out the fail log file for this collection; always start
	    # from the command line value, never from the handle name left
	    # behind by the previous collection
	    my $faillogname = $global_faillog;
	    if (!defined $faillogname || $faillogname eq "") {
		$faillogname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
	    }
	    open (FAILLOG, '>', $faillogname) ||
		(&gsprintf(STDERR, "{export.cannot_open_fail_log}\n", $faillogname) && die);
	    $faillog = 'export::FAILLOG';
	    $faillog->autoflush(1);

	    # check sortmeta
	    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
	    if (defined $sortmeta && $groupsize > 1) {
		&gsprintf($out, "{export.cannot_sort}\n\n");
		$sortmeta = undef;
	    }

	    # dynamically load 'docsave' module so it can pick up on a collection
	    # specific docsave.pm if one is specified.

	    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
	    require docsave;

	    # get the list of plugins for this collection and set any options that
	    # were specified in the collect.cfg (all export.pl options except
	    # -collectdir, -out and -faillog may be specified in the collect.cfg (these
	    # options must be known before we read the collect.cfg))
	    my $plugins = [];
	    my @global_opts = ();

	    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
	    if (!-e $configfilename) {
		(&gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename) && die);
	    }

	    my $collectcfg = &colcfg::read_collect_cfg ($configfilename);
	    if (defined $collectcfg->{'plugin'}) {
		$plugins = $collectcfg->{'plugin'};
	    }

	    if (!defined $verbosity || $verbosity !~ /\d+/) {
		if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
		    $verbosity = $collectcfg->{'verbosity'};
		} else {
		    $verbosity = 2; # the default
		}
	    }
	    if (defined $collectcfg->{'importdir'} && (!defined $importdir || $importdir eq "")) {
		$importdir = $collectcfg->{'importdir'};
	    }
	    if (defined $collectcfg->{'exportdir'} && (!defined $exportdir || $exportdir eq "")) {
		$exportdir = $collectcfg->{'exportdir'};
	    }

	    if (defined $collectcfg->{'gzip'} && !$gzip) {
		if ($collectcfg->{'gzip'} =~ /^true$/i) {
		    $gzip = 1;
		}
	    }
	    if (!defined $maxdocs || $maxdocs !~ /\-?\d+/) {
		if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
		    $maxdocs = $collectcfg->{'maxdocs'};
		} else {
		    $maxdocs = -1; # the default
		}
	    }
	    if ($groupsize == 1) {
		if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
		    $groupsize = $collectcfg->{'groupsize'};
		}
	    }
	    if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental)$/)) {
		if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
		    $OIDtype = $collectcfg->{'OIDtype'};
		} else {
		    $OIDtype = "hash"; # the default
		}
	    }
	    # $sortmeta is either undef or non-blank here (see the check above)
	    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
		$sortmeta = $collectcfg->{'sortmeta'};
	    }
	    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
		$debug = 1;
	    }
	    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
		$gli = 1;
	    }

	    # global plugin stuff
	    if (defined $collectcfg->{'separate_cjk'}&& $collectcfg->{'separate_cjk'} =~ /^true$/i) {
		push @global_opts, "-separate_cjk";
	    }

	    ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($global_removeold, $global_keepold, "export", $collectcfg);
	    # an explicitly given export directory is shared by all the
	    # collections on the command line: only the first one may clear it
	    $removeold = 0 if $exportdir_in_use;

	    $gli = 0 unless defined $gli;

	    print STDERR "<export>\n" if $gli;

	    # fill in the default import and export directories if none
	    # were supplied, turn all \ into / and remove trailing /
	    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import")
		if (!defined $importdir || $importdir eq "");
	    $importdir =~ s/[\\\/]+/\//g;
	    $importdir =~ s/\/$//;
	    $exportdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export")
		if (!defined $exportdir || $exportdir eq "");
	    $exportdir =~ s/[\\\/]+/\//g;
	    $exportdir =~ s/\/$//;

	    # load all the plugins
	    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts);

	    if (scalar(@$pluginfo) == 0) {
		&gsprintf($out, "{import.no_plugins_loaded}\n");
		die "\n";
	    }

	    # remove the old contents of the export directory if needed
	    if ($removeold && -e $exportdir) {
		&gsprintf($out, "{export.removing_export}\n");
		&util::rm_r ($exportdir);
	    }

	    # read the export information file
	    if (!$debug) {
		# Export to DSpace Archive format or METS format
		# If saveas=DSpace, a "contents" file will be created, otherwise "export.inf"

		if ($saveas eq "DSpace"){
		    $export_info_filename = &util::filename_cat ($exportdir, "contents");
		} elsif ($saveas eq "METS") {
		    $export_info_filename = &util::filename_cat ($exportdir, "export.inf");
		}

		$export_info = arcinfo->new();
		$export_info->load_info ($export_info_filename);

		$processor = docsave->new($collection, $export_info, $verbosity, $gzip, $groupsize, $out, $service, $saveas);

		$processor->setoutputdir ($exportdir);

		$processor->set_sortmeta ($sortmeta) if defined $sortmeta;
		$processor->set_OIDtype ($OIDtype);
		$processor->set_saveas ($saveas);
		$processor->set_saveas_version ($saveas_version);
	    } else {
		$processor = docprint->new();
	    }

	    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

	    # process the import directory
	    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs,0, $gli);

	    &plugin::end($pluginfo, $processor);

	    &plugin::deinit($pluginfo, $processor);

	    # write out the export information file
	    if (!$debug) {
		$processor->close_file_output() if $groupsize > 1;
		if ($saveas eq "METS") {
		    $export_info->save_info($export_info_filename);
		}
	    }

	    # write out export stats
	    # $statsfile itself is left untouched so that the next collection
	    # still sees the file name (or STDERR/STDOUT) given by the user
	    my $close_stats = 0;
	    my $stats_handle = $statsfile;
	    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
		if (open (STATS, '>', $statsfile)) {
		    # STATS is opened in this package, i.e. package export
		    $stats_handle = 'export::STATS';
		    $close_stats = 1;
		} else {
		    &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
		    &gsprintf($out, "{import.stats_backup}\n");
		    $stats_handle = 'STDERR';
		}
	    }

	    &gsprintf($out, "\n");
	    &gsprintf($out, "*********************************************\n");

	    &plugin::write_stats($pluginfo, $stats_handle, $faillogname, $gli);
	    if ($close_stats) {
		close STATS;
	    }

	    &gsprintf($out, "*********************************************\n");

	    close FAILLOG;
	};

	if ($@) {
	    print STDERR $@;
	}

##	$ENV{'GSDLCOLLECTION'} = undef;
	$importdir = "";
	if ($explicit_exportdir) {
	    $exportdir_in_use = 1;
	} else {
	    # let the next collection fall back to its own export directory
	    $exportdir = "";
	}

    } # while processing ARGV

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "* {export.complete}\n");
    &gsprintf($out, "*********************************************\n");

    # only now is the output file finished with: it is shared by every
    # collection and by the closing banner above
    close OUT if $close_out;

}
Note: See TracBrowser for help on using the repository browser.