source: trunk/gsdl/bin/script/export.pl@ 12625

Last change on this file since 12625 was 12598, checked in by shaoqun, 18 years ago

added mapping_file option for MARCXML plugout

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.1 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# export.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will export a particular collection into a specific Format (e.g. METS or DSpace)
30# Author: Chi-Yu Huang Date: 08-10-2004
31
32package export;
33
BEGIN {
    # Refuse to start unless the Greenstone environment has been set up:
    # both variables are checked, GSDLHOME first.
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    # Put the Greenstone library directories at the front of @INC.
    # Each directory is unshifted in turn, so the last one listed here
    # ends up being searched first.
    foreach my $subdir ("", "/cpan", "/plugins", "/plugouts", "/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/perllib$subdir");
    }
}
43
44use strict;
45no strict 'refs'; # allow filehandles to be variables and vice versa
46no strict 'subs'; # allow barewords (eg STDERR) as function arguments
47
48use arcinfo;
49use colcfg;
50use plugin;
51use plugout;
52use docprint;
53use util;
54use scriptutil;
55use FileHandle;
56use gsprintf;
57use printusage;
58use parse2;
59
60
# Values accepted for the object-identifier type. Each entry pairs the
# option value ('name') with the resource-bundle key of its description.
my $oidtype_list = [
    { name => 'hash',        desc => '{export.OIDtype.hash}' },
    { name => 'incremental', desc => '{export.OIDtype.incremental}' },
];

# Output formats that can be selected with -saveas (METS, DSpace, ...).
my $saveas_list = [
    { name => 'DSpace',  desc => '{export.saveas.DSpace}' },
    { name => 'METS',    desc => '{export.saveas.METS}' },
    { name => 'GA',      desc => '{export.saveas.GA}' },
    { name => 'MARCXML', desc => '{export.saveas.MARCXML}' },
];
78
79
# Attributes that may appear in an argument description:
#   name      - the name of the argument
#   desc      - a description (or, more likely, a reference to a description)
#   type      - the control used to represent the argument, e.g. string,
#               int, flag, regexp, metadata, language, enum
#   reqd      - whether the argument is required
#   hiddengli - whether the argument is hidden in GLI
#   modegli   - the lowest detail mode at which GLI shows the argument

# The -saveas option is kept in its own variable because it is used both
# in the full argument list and in the reduced -listall option set.
my $saveas_argument = {
    name    => 'saveas',
    desc    => '{export.saveas}',
    type    => 'enum',
    list    => $saveas_list,
    deft    => 'METS',
    reqd    => 'no',
    modegli => '3',
};
96
97
# Complete list of options understood by export.pl, in the form expected
# by parse2::parse() and the PrintUsage routines.
my $arguments = [
    $saveas_argument,
    {   name      => 'saveas_version',
        desc      => '{export.saveas_version}',
        type      => 'string',
        reqd      => 'no',
        deft      => 'greenstone',
    },
    {   name      => 'exportdir',
        desc      => '{export.exportdir}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'importdir',
        desc      => '{import.importdir}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'collectdir',
        desc      => '{export.collectdir}',
        type      => 'string',
        # default: the "collect" directory inside GSDLHOME
        deft      => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'listall',
        desc      => '{scripts.listall}',
        type      => 'flag',
        reqd      => 'no',
    },
    {   name      => 'debug',
        desc      => '{export.debug}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'faillog',
        desc      => '{export.faillog}',
        type      => 'string',
        deft      => '',
        reqd      => 'no',
        modegli   => '4',
    },
    {   name      => 'keepold',
        desc      => '{export.keepold}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'removeold',
        desc      => '{export.removeold}',
        type      => 'flag',
        reqd      => 'no',
        modegli   => '3',
    },
    {   name      => 'language',
        desc      => '{scripts.language}',
        type      => 'string',
        reqd      => 'no',
        modegli   => '4',
    },
    {   name      => 'maxdocs',
        desc      => '{export.maxdocs}',
        type      => 'int',
        reqd      => 'no',
        range     => '1,',
        modegli   => '1',
    },
    {   name      => 'out',
        desc      => '{export.out}',
        type      => 'string',
        deft      => 'STDERR',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'statsfile',
        desc      => '{export.statsfile}',
        type      => 'string',
        deft      => 'STDERR',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'xsltfile',
        desc      => '{BasPlugout.xslt_file}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'xslt_txt',
        desc      => '{METSPlugout.xslt_txt}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'no',
    },
    {   name      => 'xslt_mets',
        desc      => '{METSPlugout.xslt_mets}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'no',
    },
    {   name      => 'mapping_file',
        desc      => '{MARCXMLPlugout.mapping_file}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'no',
    },
    {   name      => 'group_marc',
        desc      => '{MARCXMLPlugout.group}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'no',
    },
    {   name      => 'verbosity',
        desc      => '{export.verbosity}',
        type      => 'int',
        range     => '0,3',
        deft      => '2',
        reqd      => 'no',
        modegli   => '4',
    },
    {   name      => 'gli',
        desc      => '',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'xml',
        desc      => '{scripts.xml}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
];
213
# Full description of the script, used for the normal usage message and
# for the XML option listing.
my $options = {
    name => 'export.pl',
    desc => '{export.desc}',
    args => $arguments,
};

# Reduced description printed when -listall is given: it exposes only the
# -saveas argument.
my $listall_options = {
    name => 'export.pl',
    desc => '{export.desc}',
    args => [ $saveas_argument ],
};
221
# Local shorthand so the rest of this file can write gsprintf(...)
# instead of the fully qualified gsprintf::gsprintf(...).
sub gsprintf {
    # Hand the caller's argument list on unchanged and return whatever
    # the library routine returns.
    return &gsprintf::gsprintf(@_);
}
226
227
&main();

# main -- entry point of export.pl.
#
# Parses the command line (options described by $arguments), prints usage
# or option listings when asked to, and then exports every collection named
# on the command line in turn.  For each collection it reads the
# collection's etc/collect.cfg, loads the configured plugins, loads the
# plugout selected with -saveas, runs the plugins over the import directory
# and finally writes the export statistics.
#
# Takes no arguments and returns nothing useful.  Dies (with a usage
# message) on bad arguments; an error while processing one collection is
# caught, printed on STDERR, and processing continues with the next one.
sub main {
    # Command-line parameters.  They are filled in from the hash produced
    # by parse2::parse() through the %param_for table below.
    my ($language, $verbosity, $importdir, $exportdir, $keepold, $listall,
        $removeold, $saveas, $saveas_version, $debug,
        $maxdocs, $statsfile, $xsltfile, $mapping_file, $out, $faillog,
        $collectdir, $gli, $xslt_mets, $xslt_txt, $group_marc);
    my $xml = 0;

    # other vars
    my ($configfilename, $collection, $export_info_filename, $export_info, $processor, $pluginfo);

    # Maps each option name to the variable that receives its value.  This
    # replaces a string eval that assigned to variables by name; option
    # names without an entry here are ignored, exactly as the failing eval
    # ignored them before.
    my %param_for = (
        'language'       => \$language,
        'verbosity'      => \$verbosity,
        'importdir'      => \$importdir,
        'exportdir'      => \$exportdir,
        'keepold'        => \$keepold,
        'listall'        => \$listall,
        'removeold'      => \$removeold,
        'saveas'         => \$saveas,
        'saveas_version' => \$saveas_version,
        'debug'          => \$debug,
        'maxdocs'        => \$maxdocs,
        'statsfile'      => \$statsfile,
        'xsltfile'       => \$xsltfile,
        'mapping_file'   => \$mapping_file,
        'out'            => \$out,
        'faillog'        => \$faillog,
        'collectdir'     => \$collectdir,
        'gli'            => \$gli,
        'xslt_mets'      => \$xslt_mets,
        'xslt_txt'       => \$xslt_txt,
        'group_marc'     => \$group_marc,
        'xml'            => \$xml,
    );

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV, $arguments, $hashParsingResult, "allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    foreach my $strVariable (keys %$hashParsingResult) {
        my $target = $param_for{$strVariable};
        next unless defined $target;
        $$target = $hashParsingResult->{$strVariable};
    }

    # these are options used by other things - we just set default values
    # undef means will be set from config file if there
    my $gzip = undef;
    my $groupsize = 1;
    my $OIDtype = undef;
    my $sortmeta = undef;

    # An export directory only counts as explicit when a non-empty value
    # was supplied on the command line.
    my $explicit_exportdir = (defined $exportdir && $exportdir ne "") ? 1 : 0;

    # save these command line settings. don't want config file settings in one
    # coll used for other colls
    # does this apply to other vars???
    my $global_removeold = $removeold;
    my $global_keepold = $keepold;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($listall) {
        if ($xml) {
            &PrintUsage::print_xml_usage($listall_options);
        }
        else {
            &PrintUsage::print_txt_usage($listall_options, "{export.params}");
        }
        die "\n";
    }
    elsif ($xml) {
        &PrintUsage::print_xml_usage($options);
        die "\n";
    }

    # can have more than one collection name,
    # if the first extra option is -h, then output the help
    if (scalar(@ARGV) == 0 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    # $out holds the *name* of the output handle; it is handed to other
    # modules as a string, which is why a package-qualified bareword
    # handle is used rather than a lexical one.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        unless (open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        }
        $out = 'export::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    while (scalar(@ARGV) > 0) {
        my $collect_name = shift @ARGV;
        $ENV{'GSDLCOLLECTION'} = $collect_name;

        eval {
            # get and check the collection name
            if (($collection = &util::use_collection($collect_name, $collectdir)) eq "") {
                &PrintUsage::print_txt_usage($options, "{export.params}");
                die "\n";
            }
            # add collection's perllib dir into include path in
            # case we have collection specific modules
            unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

            # Work out the fail log for THIS collection.  $faillog itself
            # is left untouched so the next collection does not mistake a
            # handle name for a file name.
            my $faillogname = $faillog;
            if (!defined $faillogname || $faillogname eq "") {
                $faillogname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
            }
            unless (open (FAILLOG, '>', $faillogname)) {
                &gsprintf(STDERR, "{export.cannot_open_fail_log}\n", $faillogname);
                die;
            }
            my $faillog_handle = 'export::FAILLOG';
            $faillog_handle->autoflush(1);

            # check sortmeta
            $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
            if (defined $sortmeta && $groupsize > 1) {
                &gsprintf($out, "{export.cannot_sort}\n\n");
                $sortmeta = undef;
            }

            # get the list of plugins for this collection and set any options that
            # were specified in the collect.cfg (all export.pl options except
            # -collectdir, -out and -faillog may be specified in the collect.cfg (these
            # options must be known before we read the collect.cfg))
            my $plugins = [];
            my @global_opts = ();

            $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
            if (!-e $configfilename) {
                &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
                die;
            }

            my $collectcfg = &colcfg::read_collect_cfg ($configfilename);
            if (defined $collectcfg->{'plugin'}) {
                $plugins = $collectcfg->{'plugin'};
            }

            if (!defined $verbosity || $verbosity !~ /\d+/) {
                if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                    $verbosity = $collectcfg->{'verbosity'};
                } else {
                    $verbosity = 2; # the default
                }
            }

            # Treat "not supplied" (undef) the same as the empty string.
            $importdir = "" unless defined $importdir;
            $exportdir = "" unless defined $exportdir;
            if (defined $collectcfg->{'importdir'} && $importdir eq "") {
                $importdir = $collectcfg->{'importdir'};
            }
            if (defined $collectcfg->{'exportdir'} && $exportdir eq "") {
                $exportdir = $collectcfg->{'exportdir'};
            }

            if (defined $collectcfg->{'gzip'} && !$gzip) {
                if ($collectcfg->{'gzip'} =~ /^true$/i) {
                    $gzip = 1;
                }
            }
            if (!defined $maxdocs || $maxdocs !~ /\-?\d+/) {
                if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                    $maxdocs = $collectcfg->{'maxdocs'};
                } else {
                    $maxdocs = -1; # the default
                }
            }
            if ($groupsize == 1) {
                if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
                    $groupsize = $collectcfg->{'groupsize'};
                }
            }
            if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental)$/)) {
                if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
                    $OIDtype = $collectcfg->{'OIDtype'};
                } else {
                    $OIDtype = "hash"; # the default
                }
            }
            # After the sortmeta check above, $sortmeta is either undef
            # or contains a non-space character, so "unset" means undef.
            if (defined $collectcfg->{'sortmeta'} && !defined $sortmeta) {
                $sortmeta = $collectcfg->{'sortmeta'};
            }
            if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
                $debug = 1;
            }
            if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
                $gli = 1;
            }

            # global plugin stuff
            if (defined $collectcfg->{'separate_cjk'} && $collectcfg->{'separate_cjk'} =~ /^true$/i) {
                push @global_opts, "-separate_cjk";
            }

            ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($global_removeold, $global_keepold, "export", $collectcfg);

            $gli = 0 unless defined $gli;

            print STDERR "<export>\n" if $gli;

            # fill in the default import and export directories if none
            # were supplied, turn all \ into / and remove trailing /
            $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
            $importdir =~ s/[\\\/]+/\//g;
            $importdir =~ s/\/$//;
            $exportdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export") if $exportdir eq "";
            $exportdir =~ s/[\\\/]+/\//g;
            $exportdir =~ s/\/$//;

            # load all the plugins
            $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog_handle, \@global_opts);

            if (scalar(@$pluginfo) == 0) {
                &gsprintf($out, "{import.no_plugins_loaded}\n");
                die "\n";
            }

            # remove the old contents of the export directory if needed
            if ($removeold && -e $exportdir) {
                &gsprintf($out, "{export.removing_export}\n");
                &util::rm_r ($exportdir);
            }

            # read the export information file
            if (!$debug) {
                # Export to DSpace Archive format or METS format.
                # If saveas=DSpace, a "contents" file will be created,
                # otherwise "export.inf".  (The test used to compare
                # against "MARC", which is not a valid -saveas value, so
                # MARCXML exports were left without a file name.)
                if ($saveas eq "DSpace") {
                    $export_info_filename = &util::filename_cat ($exportdir, "contents");
                } else {
                    $export_info_filename = &util::filename_cat ($exportdir, "export.inf");
                }

                $export_info = arcinfo->new();
                $export_info->load_info ($export_info_filename);

                # Fall back to a plugout named in collect.cfg (or to
                # GAPlugout) when -saveas is not one of the known formats.
                if ($saveas !~ /^(GA|METS|DSpace|MARCXML)$/) {
                    if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'}[0] =~ /^(GAPlugout|METSPlugout)$/) {
                        $saveas = $collectcfg->{'plugout'}[0];
                    } else {
                        $saveas = "GAPlugout";
                    }
                }

                ####Use Plugout####
                # The plugout module name is the format name followed by
                # "Plugout", unless $saveas already has that form.
                my $plugout_name;
                if ($saveas !~ /^(GA|METS|DSpace|MARCXML)Plugout$/) {
                    $plugout_name = $saveas . "Plugout";
                }
                else {
                    $plugout_name = $saveas;
                }

                my $opts = [];

                push @$opts, ("-output_info", $export_info) if (defined $export_info);
                push @$opts, ("-verbosity", $verbosity) if (defined $verbosity);
                push @$opts, ("-gzip_output", $gzip) if (defined $gzip);
                push @$opts, ("-group_size", $groupsize) if (defined $groupsize);
                push @$opts, ("-output_handle", $out) if (defined $out);
                push @$opts, ("-xslt_file", $xsltfile) if (defined $xsltfile);
                # options that only apply to one particular plugout
                push @$opts, ("-group") if ($group_marc && $plugout_name =~ /^MARCXMLPlugout$/);
                push @$opts, ("-mapping_file", $mapping_file) if (defined $mapping_file && $plugout_name =~ /^MARCXMLPlugout$/);
                push @$opts, ("-saveas_version", $saveas_version) if (defined $saveas_version && $plugout_name =~ /^METSPlugout$/);
                push @$opts, ("-xslt_mets", $xslt_mets) if (defined $xslt_mets && $plugout_name =~ /^METSPlugout$/);
                push @$opts, ("-xslt_txt", $xslt_txt) if (defined $xslt_txt && $plugout_name =~ /^METSPlugout$/);
                $processor = &plugout::load_plugout($plugout_name, $opts);

                $processor->setoutputdir ($exportdir);

                $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
                $processor->set_OIDtype ($OIDtype);

            } else {
                # In debug mode a docprint object is used as the processor
                # instead of a plugout.
                $processor = docprint->new();
            }

            &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

            # process the import directory
            &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs, 0, $gli);

            &plugin::end($pluginfo, $processor);

            &plugin::deinit($pluginfo, $processor);

            # write out the export information file
            if (!$debug) {
                $processor->close_file_output() if $groupsize > 1;
                $processor->close_group_output() if $processor->is_group();
                # NOTE(review): the information file is only saved for
                # METS exports; confirm whether other formats need it too.
                if ($saveas eq "METS") {
                    $export_info->save_info($export_info_filename);
                }
            }

            # write out export stats
            # $stats_handle is the handle name passed to write_stats;
            # $statsfile keeps the user's setting for the next collection.
            my $stats_handle = $statsfile;
            my $close_stats = 0;
            if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
                if (open (STATS, '>', $statsfile)) {
                    # STATS is opened in this package, so the handle is
                    # export::STATS.
                    $stats_handle = 'export::STATS';
                    $close_stats = 1;
                } else {
                    &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                    &gsprintf($out, "{import.stats_backup}\n");
                    $stats_handle = 'STDERR';
                }
            }

            &gsprintf($out, "\n");
            &gsprintf($out, "*********************************************\n");

            &plugin::write_stats($pluginfo, $stats_handle, $faillogname, $gli);
            if ($close_stats) {
                close STATS;
            }

            &gsprintf($out, "*********************************************\n");

            close FAILLOG;
        };

        # An error in one collection must not stop the remaining ones.
        if ($@) {
            print STDERR $@;
        }

##      $ENV{'GSDLCOLLECTION'} = undef;
        # Directories derived from this collection must not leak into the
        # next one; an explicitly given export directory is kept.
        $importdir = "";
        $exportdir = "" unless $explicit_exportdir;
        # NOTE(review): $removeold is recomputed from $global_removeold at
        # the top of each iteration, so this assignment looks ineffective.
        $removeold = 0 if ($explicit_exportdir);

    } # while processing ARGV

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "* {export.complete}\n");
    &gsprintf($out, "*********************************************\n");

    # Only close the output file once everything, including the final
    # banner, has been written to it.
    close OUT if $close_out;
}
Note: See TracBrowser for help on using the repository browser.