source: trunk/gsdl/bin/script/export.pl@ 12983

Last change on this file since 12983 was 12965, checked in by kjdon, 18 years ago

scriptutil::check_removeold_and_keepold now has an incremental argument - pass in 0 for this

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.6 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# export.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will export a particular collection into a specific Format (e.g. METS or DSpace)
30# Author: Chi-Yu Huang Date: 08-10-2004
31
32package export;
33
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone Perl library directories at the front
# of the module search path so that the "use" lines below can be resolved.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'}   or die "GSDLOS not set\n";

    # Each directory is unshifted in turn, so the last one listed here ends
    # up first in @INC.
    my $perllib_root = "$ENV{'GSDLHOME'}/perllib";
    foreach my $subdir ("", "/cpan", "/plugins", "/plugouts", "/classify") {
        unshift (@INC, $perllib_root . $subdir);
    }
}
43
44use strict;
45no strict 'refs'; # allow filehandles to be variables and vice versa
46no strict 'subs'; # allow barewords (eg STDERR) as function arguments
47
48use arcinfo;
49use colcfg;
50use plugin;
51use plugout;
52use docprint;
53use util;
54use scriptutil;
55use FileHandle;
56use gsprintf;
57use printusage;
58use parse2;
59
60
# Values accepted by the -OIDtype option.  Each entry pairs the option value
# ('name') with the resource key of its description ('desc').
my $oidtype_list = [];
foreach my $oidtype_entry (
    [ 'hash',        '{import.OIDtype.hash}' ],
    [ 'incremental', '{import.OIDtype.incremental}' ],
    [ 'assigned',    '{import.OIDtype.assigned}' ],
    [ 'dirname',     '{import.OIDtype.dirname}' ],
) {
    my ($oidtype_name, $oidtype_desc) = @$oidtype_entry;
    push @$oidtype_list, { 'name' => $oidtype_name, 'desc' => $oidtype_desc };
}
70
# Export formats selectable with the -saveas option.  Each entry pairs the
# option value ('name') with the resource key of its description ('desc').
my $saveas_list = [];
foreach my $saveas_entry (
    [ 'DSpace',  '{export.saveas.DSpace}' ],
    [ 'METS',    '{export.saveas.METS}' ],
    [ 'GA',      '{export.saveas.GA}' ],
    [ 'MARCXML', '{export.saveas.MARCXML}' ],
) {
    my ($format_name, $format_desc) = @$saveas_entry;
    push @$saveas_list, { 'name' => $format_name, 'desc' => $format_desc };
}
82
83
84# Possible attributes for each argument
85# name: The name of the argument
86# desc: A description (or more likely a reference to a description) for this argument
87# type: The type of control used to represent the argument. Options include: string, int, flag, regexp, metadata, language, enum etc
88# reqd: Is this argument required?
89# hiddengli: Is this argument hidden in GLI?
90# modegli: The lowest detail mode this argument is visible at in GLI
91
# The -saveas option.  It is kept in its own variable because it appears both
# in the full argument list and in the reduced option set used for -listall.
my $saveas_argument = {
    'name'    => 'saveas',
    'desc'    => '{export.saveas}',
    'type'    => 'enum',
    'list'    => $saveas_list,
    'deft'    => 'METS',
    'reqd'    => 'no',
    'modegli' => '3',
};
100
101
# Every command-line option understood by export.pl, in the order in which
# they are listed in the usage output.  See the attribute key (name, desc,
# type, reqd, hiddengli, modegli) described in the comment further up.
my $arguments = [
    $saveas_argument,

    # --- export format and locations ---
    { 'name' => 'saveas_version', 'type' => 'string', 'reqd' => 'no',
      'desc' => '{export.saveas_version}',
      'deft' => 'greenstone' },
    { 'name' => 'exportdir',      'type' => 'string', 'reqd' => 'no',
      'desc' => '{export.exportdir}',
      'hiddengli' => 'yes' },
    { 'name' => 'importdir',      'type' => 'string', 'reqd' => 'no',
      'desc' => '{import.importdir}',
      'hiddengli' => 'yes' },
    { 'name' => 'collectdir',     'type' => 'string', 'reqd' => 'no',
      'desc' => '{export.collectdir}',
      'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
      'hiddengli' => 'yes' },

    # --- run control ---
    { 'name' => 'listall',        'type' => 'flag',   'reqd' => 'no',
      'desc' => '{export.listall}' },
    { 'name' => 'debug',          'type' => 'flag',   'reqd' => 'no',
      'desc' => '{export.debug}',
      'hiddengli' => 'yes' },
    { 'name' => 'faillog',        'type' => 'string', 'reqd' => 'no',
      'desc' => '{export.faillog}',
      'deft' => '',
      'modegli' => '4' },
    { 'name' => 'keepold',        'type' => 'flag',   'reqd' => 'no',
      'desc' => '{export.keepold}',
      'hiddengli' => 'yes' },
    { 'name' => 'removeold',      'type' => 'flag',   'reqd' => 'no',
      'desc' => '{export.removeold}',
      'modegli' => '3' },
    { 'name' => 'language',       'type' => 'string', 'reqd' => 'no',
      'desc' => '{scripts.language}',
      'modegli' => '4' },
    { 'name' => 'maxdocs',        'type' => 'int',    'reqd' => 'no',
      'desc' => '{export.maxdocs}',
      'range' => '1,',
      'modegli' => '1' },

    # --- document identifiers ---
    # OIDtype deliberately has no 'deft' ('deft' => "hash" is left out):
    # parsearg left "" as default
    { 'name' => 'OIDtype',        'type' => 'enum',   'reqd' => 'no',
      'desc' => '{import.OIDtype}',
      'list' => $oidtype_list,
      'modegli' => '3' },
    { 'name' => 'OIDmetadata',    'type' => 'metadata', 'reqd' => 'no',
      'desc' => '{import.OIDmetadata}',
      'deft' => 'dc.Identifier',
      'modegli' => '3' },

    # --- output streams ---
    { 'name' => 'out',            'type' => 'string', 'reqd' => 'no',
      'desc' => '{export.out}',
      'deft' => 'STDERR',
      'hiddengli' => 'yes' },
    { 'name' => 'statsfile',      'type' => 'string', 'reqd' => 'no',
      'desc' => '{export.statsfile}',
      'deft' => 'STDERR',
      'hiddengli' => 'yes' },

    # --- options passed on to the plugouts ---
    { 'name' => 'xsltfile',       'type' => 'string', 'reqd' => 'no',
      'desc' => '{BasPlugout.xslt_file}',
      'hiddengli' => 'yes' },
    { 'name' => 'xslt_txt',       'type' => 'string', 'reqd' => 'no',
      'desc' => '{METSPlugout.xslt_txt}',
      'hiddengli' => 'no' },
    { 'name' => 'xslt_mets',      'type' => 'string', 'reqd' => 'no',
      'desc' => '{METSPlugout.xslt_mets}',
      'hiddengli' => 'no' },
    { 'name' => 'mapping_file',   'type' => 'string', 'reqd' => 'no',
      'desc' => '{MARCXMLPlugout.mapping_file}',
      'hiddengli' => 'no' },
    { 'name' => 'group_marc',     'type' => 'flag',   'reqd' => 'no',
      'desc' => '{MARCXMLPlugout.group}',
      'hiddengli' => 'no' },

    # --- reporting ---
    { 'name' => 'verbosity',      'type' => 'int',    'reqd' => 'no',
      'desc' => '{export.verbosity}',
      'range' => '0,3',
      'deft' => '2',
      'modegli' => '4' },
    { 'name' => 'gli',            'type' => 'flag',   'reqd' => 'no',
      'desc' => '',
      'hiddengli' => 'yes' },
    { 'name' => 'xml',            'type' => 'flag',   'reqd' => 'no',
      'desc' => '{scripts.xml}',
      'hiddengli' => 'yes' },
];
231
# Full description of the script, used for the normal usage output.
my $options = {
    'name' => 'export.pl',
    'desc' => '{export.desc}',
    'args' => $arguments,
};

# Reduced description used when -listall is given: only the -saveas option
# (and therefore the list of available export formats) is shown.
my $listall_options = {
    'name' => 'export.pl',
    'desc' => '{export.desc}',
    'args' => [ $saveas_argument ],
};
239
# Local shorthand so the rest of this file can write gsprintf(...) instead of
# gsprintf::gsprintf(...).  All arguments are passed through untouched and the
# result of the underlying call is returned.
sub gsprintf
{
    my $real_gsprintf = \&gsprintf::gsprintf;
    return $real_gsprintf->(@_);
}
244
245
# Run the script; main reads its arguments from @ARGV.
main();
247
# Entry point for export.pl.
#
# Parses the command line (the options are described by $arguments), prints
# usage and stops for -listall, -xml, -h or when no collection is named, and
# then exports each collection named on the command line in turn:
#   * the collection's etc/collect.cfg is read and used to fill in any option
#     that was not given on the command line,
#   * the collection's plugins are loaded and run over the import directory,
#   * the documents are handed to the plugout selected by -saveas (or to
#     docprint when -debug is set),
#   * import statistics are written to -statsfile.
# An error in one collection is printed on STDERR and does not prevent the
# remaining collections from being processed.
#
# Output handles are passed to the helper modules as handle names (strings
# such as 'STDERR' or 'export::OUT'), which is why bareword handles are used.
#
# Takes no arguments (it reads @ARGV and %ENV); the return value is not
# meaningful.  Dies if the command line cannot be parsed or the -out file
# cannot be opened.
sub main {
    # params
    my ($language, $verbosity, $importdir, $exportdir, $keepold, $listall,
        $removeold, $saveas, $saveas_version, $debug, $OIDtype, $OIDmetadata,
        $maxdocs, $statsfile, $xsltfile, $mapping_file, $out, $faillog,
        $collectdir, $gli, $xslt_mets, $xslt_txt, $group_marc);
    my $xml = 0;

    # other vars
    my ($configfilename, $collection, $export_info_filename, $export_info, $processor, $pluginfo);

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    # Copy every parsed option into the variable of the same name.  A table of
    # references is used instead of a string eval so that a typo or an
    # unexpected key cannot silently do something else.
    my %option_slot = (
        'language'       => \$language,
        'verbosity'      => \$verbosity,
        'importdir'      => \$importdir,
        'exportdir'      => \$exportdir,
        'keepold'        => \$keepold,
        'listall'        => \$listall,
        'removeold'      => \$removeold,
        'saveas'         => \$saveas,
        'saveas_version' => \$saveas_version,
        'debug'          => \$debug,
        'OIDtype'        => \$OIDtype,
        'OIDmetadata'    => \$OIDmetadata,
        'maxdocs'        => \$maxdocs,
        'statsfile'      => \$statsfile,
        'xsltfile'       => \$xsltfile,
        'mapping_file'   => \$mapping_file,
        'out'            => \$out,
        'faillog'        => \$faillog,
        'collectdir'     => \$collectdir,
        'gli'            => \$gli,
        'xslt_mets'      => \$xslt_mets,
        'xslt_txt'       => \$xslt_txt,
        'group_marc'     => \$group_marc,
        'xml'            => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult)
    {
        my $slot = $option_slot{$strVariable};
        $$slot = $hashParsingResult->{$strVariable} if defined $slot;
    }


    # these are options used by other things - we just set default values
    # undef means will be set from config file if there
    my $gzip = undef;
    my $groupsize = 1;
    #my $OIDtype = undef;
    my $sortmeta = undef;

    my $explicit_exportdir = (defined $exportdir) ? 1 : 0;

    # save these command line settings. don't want config file settings in one
    # coll used for other colls
    my $global_removeold = $removeold;
    my $global_keepold = $keepold;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($listall) {
        if ($xml) {
            &PrintUsage::print_xml_usage($listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($listall_options,"{export.params}");
        }
        die "\n";
    }
    elsif ($xml) {
        &PrintUsage::print_xml_usage($options);
        die "\n";
    }

    # can have more than one collection name,
    # if the first extra option is -h, then output the help
    if (scalar(@ARGV) == 0 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # Die even if the error message itself could not be printed.
        unless (open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        }
        $out = 'export::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # The same treatment that removeold/keepold get above, applied to every
    # other setting that collect.cfg may fill in: remember the command-line
    # value and start each collection from it, so that one collection's
    # configuration is never applied to the next one.
    my @cmdline_settings = ($exportdir, $verbosity, $maxdocs, $OIDtype, $debug,
                            $gli, $gzip, $groupsize, $sortmeta);

    # Log/stats files already opened during this run.  A file shared by
    # several collections (an explicit -faillog or -statsfile) is truncated
    # the first time and appended to afterwards, instead of being overwritten
    # by each collection.
    my %already_written = ();

    while (scalar(@ARGV)>0) {
        my $collect_name = shift @ARGV;
        $ENV{'GSDLCOLLECTION'} = $collect_name;

        ($exportdir, $verbosity, $maxdocs, $OIDtype, $debug,
         $gli, $gzip, $groupsize, $sortmeta) = @cmdline_settings;

        eval {
            # get and check the collection name
            if (($collection = &util::use_collection($collect_name, $collectdir)) eq "") {
                &PrintUsage::print_txt_usage($options, "{export.params}");
                die "\n";
            }
            # add collection's perllib dir into include path in
            # case we have collection specific modules
            unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

            # Work out this collection's fail log without touching $faillog,
            # so the next collection still sees what the user asked for
            # (previously $faillog was overwritten with the handle name and
            # the next collection opened a file literally called that).
            my $faillogname = $faillog;
            if (!defined $faillogname || $faillogname eq "") {
                $faillogname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
            }
            my $faillog_mode = $already_written{$faillogname}++ ? '>>' : '>';
            unless (open (FAILLOG, $faillog_mode, $faillogname)) {
                &gsprintf(STDERR, "{export.cannot_open_fail_log}\n", $faillogname);
                die;
            }
            my $faillog_handle = 'export::FAILLOG';
            $faillog_handle->autoflush(1);

            # check sortmeta
            $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
            if (defined $sortmeta && $groupsize > 1) {
                &gsprintf($out, "{export.cannot_sort}\n\n");
                $sortmeta = undef;
            }

            # get the list of plugins for this collection and set any options that
            # were specified in the collect.cfg (all export.pl options except
            # -collectdir, -out and -faillog may be specified in the collect.cfg (these
            # options must be known before we read the collect.cfg))
            my $plugins = [];
            my @global_opts = ();

            $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
            if (!-e $configfilename) {
                &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
                die;
            }

            my $collectcfg = &colcfg::read_collect_cfg ($configfilename);
            if (defined $collectcfg->{'plugin'}) {
                $plugins = $collectcfg->{'plugin'};
            }

            if ($verbosity !~ /\d+/) {
                if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                    $verbosity = $collectcfg->{'verbosity'};
                } else {
                    $verbosity = 2; # the default
                }
            }
            if (defined $collectcfg->{'importdir'} && $importdir eq "") {
                $importdir = $collectcfg->{'importdir'};
            }
            if (defined $collectcfg->{'exportdir'} && $exportdir eq "") {
                $exportdir = $collectcfg->{'exportdir'};
            }

            if (defined $collectcfg->{'gzip'} && !$gzip) {
                if ($collectcfg->{'gzip'} =~ /^true$/i) {
                    $gzip = 1;
                }
            }
            if ($maxdocs !~ /\-?\d+/) {
                if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                    $maxdocs = $collectcfg->{'maxdocs'};
                } else {
                    $maxdocs = -1; # the default
                }
            }
            if ($groupsize == 1) {
                if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
                    $groupsize = $collectcfg->{'groupsize'};
                }
            }
            if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/)) {
                # collect.cfg may name any of the types the command line
                # accepts (it used to be limited to hash and incremental).
                if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
                    $OIDtype = $collectcfg->{'OIDtype'};
                } else {
                    $OIDtype = "hash"; # the default
                }
            }
            # $sortmeta is either undef or non-blank at this point
            if (defined $collectcfg->{'sortmeta'} && !defined $sortmeta) {
                $sortmeta = $collectcfg->{'sortmeta'};
            }
            if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
                $debug = 1;
            }
            if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
                $gli = 1;
            }

            # global plugin stuff
            if (defined $collectcfg->{'separate_cjk'}&& $collectcfg->{'separate_cjk'} =~ /^true$/i) {
                push @global_opts, "-separate_cjk";
            }

            ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($global_removeold, $global_keepold, 0, "export", $collectcfg);

            $gli = 0 unless defined $gli;

            print STDERR "<export>\n" if $gli;

            # fill in the default import and export directories if none
            # were supplied, turn all \ into / and remove trailing /
            $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
            $importdir =~ s/[\\\/]+/\//g;
            $importdir =~ s/\/$//;
            $exportdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export") if $exportdir eq "";
            $exportdir =~ s/[\\\/]+/\//g;
            $exportdir =~ s/\/$//;

            # load all the plugins
            $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog_handle, \@global_opts);

            if (scalar(@$pluginfo) == 0) {
                &gsprintf($out, "{import.no_plugins_loaded}\n");
                die "\n";
            }

            # remove the old contents of the export directory if needed
            if ($removeold && -e $exportdir) {
                &gsprintf($out, "{export.removing_export}\n");
                &util::rm_r ($exportdir);
            }

            # read the export information file
            if (!$debug) {
                # Export to DSpace Archive format or METs format
                # If saveas=DSpace, a "contents" file will be created, otherwise "export.inf"
                # (the last alternative used to test for "MARC", which is
                # not a -saveas value, so MARCXML exports got no filename)
                if ($saveas eq "DSpace"){
                    $export_info_filename = &util::filename_cat ($exportdir, "contents");
                } elsif ($saveas eq "METS" || $saveas eq "GA" || $saveas eq "MARCXML" ) {
                    $export_info_filename = &util::filename_cat ($exportdir, "export.inf");
                }

                $export_info = arcinfo->new();
                $export_info->load_info ($export_info_filename);

                if ($saveas !~ /^(GA|METS|DSpace|MARCXML)$/) {
                    if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'}[0] =~ /^(GAPlugout|METSPlugout)$/) {
                        $saveas = $collectcfg->{'plugout'}[0];
                    } else {
                        $saveas ="GAPlugout";
                    }
                }


                ####Use Plugout####
                my ($plugout_name);
                if ($saveas !~ /^(GA|METS|DSpace|MARCXML)Plugout$/ ){
                    $plugout_name = $saveas."Plugout";
                }
                else{
                    $plugout_name = $saveas;
                }

                my $opts=[];

                push @$opts,("-output_info",$export_info) if (defined $export_info);

                push @$opts,("-verbosity",$verbosity) if (defined $verbosity);

                push @$opts,("-gzip_output",$gzip) if (defined $gzip);
                push @$opts,("-group_size",$groupsize) if (defined $groupsize);
                push @$opts,("-output_handle",$out) if (defined $out);
                push @$opts,("-xslt_file",$xsltfile) if (defined $xsltfile);
                push @$opts,("-group") if ($group_marc && $plugout_name =~ /^MARCXMLPlugout$/);
                push @$opts,("-mapping_file",$mapping_file) if (defined $mapping_file && $plugout_name =~ /^MARCXMLPlugout$/);
                push @$opts,("-saveas_version",$saveas_version) if (defined $saveas_version && $plugout_name =~ /^METSPlugout$/);
                push @$opts,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $plugout_name =~ /^METSPlugout$/);
                push @$opts,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $plugout_name =~ /^METSPlugout$/);
                $processor = &plugout::load_plugout($plugout_name,$opts);

                $processor->setoutputdir ($exportdir);

                $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
                $processor->set_OIDtype ($OIDtype, $OIDmetadata);

            } else {
                $processor = docprint->new();
            }

            &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

            # process the import directory
            &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs,0, $gli);

            &plugin::end($pluginfo, $processor);

            &plugin::deinit($pluginfo, $processor);

            # write out the export information file
            if (!$debug) {
                $processor->close_file_output() if $groupsize > 1;
                $processor->close_group_output() if $processor->is_group();
                if ($saveas eq "METS") {
                    $export_info->save_info($export_info_filename);
                }
            }

            # write out export stats
            # $statsfile itself is left alone so that the next collection
            # still sees the name the user gave.  The handle is opened in this
            # package, so its name is export::STATS (this used to say
            # import::STATS, a handle that is never opened here).
            my $stats_handle = $statsfile;
            my $close_stats = 0;
            if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
                my $stats_mode = $already_written{$statsfile}++ ? '>>' : '>';
                if (open (STATS, $stats_mode, $statsfile)) {
                    $stats_handle = 'export::STATS';
                    $close_stats = 1;
                } else {
                    &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                    &gsprintf($out, "{import.stats_backup}\n");
                    $stats_handle = 'STDERR';
                }
            }

            &gsprintf($out, "\n");
            &gsprintf($out, "*********************************************\n");

            &plugin::write_stats($pluginfo, $stats_handle, $faillogname, $gli);
            if ($close_stats) {
                close STATS;
            }

            &gsprintf($out, "*********************************************\n");

            close FAILLOG;
        };

        if ($@) {
            print STDERR $@;
        }

##      $ENV{'GSDLCOLLECTION'} = undef;
        $importdir = "";
        # NOTE(review): this assignment has no effect, because $removeold is
        # recomputed from $global_removeold for every collection above.
        # Presumably it was meant to stop a shared -exportdir being wiped by
        # later collections - confirm the intended behaviour.
        $removeold = 0 if ($explicit_exportdir);

    } # while processing ARGV

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "* {export.complete}\n");
    &gsprintf($out, "*********************************************\n");

    # Only now is the -out file finished with.  It used to be closed at the
    # end of the first collection, which lost everything written after that.
    close OUT if $close_out;
}
Note: See TracBrowser for help on using the repository browser.