source: gsdl/trunk/bin/script/export.pl@ 14733

Last change on this file since 14733 was 14733, checked in by shaoqun, 17 years ago

make it use the new version of plugout.pm

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.4 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# export.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will export a particular collection into a specific Format (e.g. METS or DSpace)
30# Author: Chi-Yu Huang Date: 08-10-2004
31
32package export;
33
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone library directories at the front of
# @INC so the "use" statements that follow can be resolved.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set\n";
    }

    # Each directory is unshifted in turn, so the last one listed ends up
    # first in @INC.
    foreach my $lib_subdir ("perllib",
                            "perllib/cpan",
                            "perllib/plugins",
                            "perllib/plugouts",
                            "perllib/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/$lib_subdir");
    }
}
43
44use strict;
45no strict 'refs'; # allow filehandles to be variables and vice versa
46no strict 'subs'; # allow barewords (eg STDERR) as function arguments
47
48use arcinfo;
49use colcfg;
50use plugin;
51use plugout;
52use util;
53use scriptutil;
54use FileHandle;
55use gsprintf;
56use printusage;
57use parse2;
58
59
# Permitted values for the -OIDtype option, each paired with the key of its
# description string.
my $oidtype_list = [];
foreach my $oid_pair ([ 'hash',        '{import.OIDtype.hash}' ],
                      [ 'incremental', '{import.OIDtype.incremental}' ],
                      [ 'assigned',    '{import.OIDtype.assigned}' ],
                      [ 'dirname',     '{import.OIDtype.dirname}' ]) {
    my ($oid_name, $oid_desc) = @$oid_pair;
    push (@$oidtype_list, { 'name' => $oid_name, 'desc' => $oid_desc });
}
69
# Output formats that may be chosen with -saveas, each paired with the key
# of its description string.
my $saveas_list = [
    { 'desc' => '{export.saveas.DSpace}',  'name' => 'DSpace'  },
    { 'desc' => '{export.saveas.METS}',    'name' => 'METS'    },
    { 'desc' => '{export.saveas.GA}',      'name' => 'GA'      },
    { 'desc' => '{export.saveas.MARCXML}', 'name' => 'MARCXML' },
];
81
82
83# Possible attributes for each argument
84# name: The name of the argument
85# desc: A description (or more likely a reference to a description) for this argument
86# type: The type of control used to represent the argument. Options include: string, int, flag, regexp, metadata, language, enum etc
87# reqd: Is this argument required?
88# hiddengli: Is this argument hidden in GLI?
89# modegli: The lowest detail mode this argument is visible at in GLI
90
# Descriptor for the -saveas option.  It lives in its own variable because
# both the full option table and the -listall option table refer to it.
my $saveas_argument = do {
    my %saveas_spec =
        ( 'name'    => 'saveas',
          'type'    => 'enum',
          'list'    => $saveas_list,
          'desc'    => '{export.saveas}',
          'deft'    => 'METS',
          'reqd'    => 'no',
          'modegli' => '3' );
    \%saveas_spec;
};
99
100
# Full table of command-line options understood by export.pl.  Every entry
# is a hash using the attributes described above (name, desc, type, reqd,
# hiddengli, modegli), plus 'deft', 'list' and 'range' where applicable.
my $arguments = [
    $saveas_argument,

    { 'name' => 'saveas_version', 'type' => 'string', 'desc' => '{export.saveas_version}',
      'reqd' => 'no', 'deft' => 'greenstone' },

    # Directories
    { 'name' => 'exportdir', 'type' => 'string', 'desc' => '{export.exportdir}',
      'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'importdir', 'type' => 'string', 'desc' => '{import.importdir}',
      'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'collectdir', 'type' => 'string', 'desc' => '{export.collectdir}',
      'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
      'reqd' => 'no', 'hiddengli' => 'yes' },

    # General behaviour
    { 'name' => 'listall', 'type' => 'flag', 'desc' => '{export.listall}',
      'reqd' => 'no' },
    { 'name' => 'debug', 'type' => 'flag', 'desc' => '{export.debug}',
      'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'faillog', 'type' => 'string', 'desc' => '{export.faillog}',
      'deft' => '', 'reqd' => 'no', 'modegli' => '4' },
    { 'name' => 'keepold', 'type' => 'flag', 'desc' => '{export.keepold}',
      'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'removeold', 'type' => 'flag', 'desc' => '{export.removeold}',
      'reqd' => 'no', 'modegli' => '3' },
    { 'name' => 'language', 'type' => 'string', 'desc' => '{scripts.language}',
      'reqd' => 'no', 'modegli' => '4' },
    { 'name' => 'maxdocs', 'type' => 'int', 'desc' => '{export.maxdocs}',
      'reqd' => 'no', 'range' => '1,', 'modegli' => '1' },

    # Object identifiers.  No 'deft' is given for OIDtype: the parser
    # leaves "" as the default (previously 'deft' => "hash").
    { 'name' => 'OIDtype', 'type' => 'enum', 'desc' => '{import.OIDtype}',
      'list' => $oidtype_list,
      'reqd' => 'no', 'modegli' => '3' },
    { 'name' => 'OIDmetadata', 'type' => 'metadata', 'desc' => '{import.OIDmetadata}',
      'deft' => 'dc.Identifier', 'reqd' => 'no', 'modegli' => '3' },

    # Output streams
    { 'name' => 'out', 'type' => 'string', 'desc' => '{export.out}',
      'deft' => 'STDERR', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'statsfile', 'type' => 'string', 'desc' => '{export.statsfile}',
      'deft' => 'STDERR', 'reqd' => 'no', 'hiddengli' => 'yes' },

    # Options that are passed on to the plugouts
    { 'name' => 'xsltfile', 'type' => 'string', 'desc' => '{BasPlugout.xslt_file}',
      'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'xslt_txt', 'type' => 'string', 'desc' => '{METSPlugout.xslt_txt}',
      'reqd' => 'no', 'hiddengli' => 'no' },
    { 'name' => 'xslt_mets', 'type' => 'string', 'desc' => '{METSPlugout.xslt_mets}',
      'reqd' => 'no', 'hiddengli' => 'no' },
    { 'name' => 'mapping_file', 'type' => 'string', 'desc' => '{MARCXMLPlugout.mapping_file}',
      'reqd' => 'no', 'hiddengli' => 'no' },
    { 'name' => 'group_marc', 'type' => 'flag', 'desc' => '{MARCXMLPlugout.group}',
      'reqd' => 'no', 'hiddengli' => 'no' },

    # Reporting and GLI integration
    { 'name' => 'verbosity', 'type' => 'int', 'desc' => '{export.verbosity}',
      'range' => '0,3', 'deft' => '2', 'reqd' => 'no', 'modegli' => '4' },
    { 'name' => 'gli', 'type' => 'flag', 'desc' => '',
      'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'xml', 'type' => 'flag', 'desc' => '{scripts.xml}',
      'reqd' => 'no', 'hiddengli' => 'yes' },
];
230
# Usage descriptor covering every option of the script.
my $options =
    { 'args' => $arguments,
      'desc' => '{export.desc}',
      'name' => 'export.pl' };

# Reduced usage descriptor printed for -listall: it exposes only -saveas.
my $listall_options =
    { 'args' => [ $saveas_argument ],
      'desc' => '{export.desc}',
      'name' => 'export.pl' };
238
# Package-local shorthand: forwards the caller's arguments unchanged to
# gsprintf::gsprintf and yields whatever that call returns.
sub gsprintf {
    &gsprintf::gsprintf(@_);
}
243
244
&main();

# main --
# Entry point of the script.  Parses the command line, prints usage when
# asked (or when no collection is named), and then exports every collection
# named in the remaining arguments, one after the other.
#
# Settings that are specific to one collection (fail log, statistics file,
# import and export directories) are worked out afresh for each collection
# so that values derived for one collection are not carried into the next.
#
# A failure while processing one collection is caught, reported on STDERR,
# and processing continues with the next collection.
sub main {
    # Command-line options.  Each variable is named after the option that
    # fills it in (see the option table above).
    my ($language, $verbosity, $importdir, $exportdir, $keepold, $listall,
        $removeold, $saveas, $saveas_version, $debug, $OIDtype, $OIDmetadata,
        $maxdocs, $statsfile, $xsltfile, $mapping_file, $out, $faillog,
        $collectdir, $gli, $xslt_mets, $xslt_txt, $group_marc);
    my $xml = 0;

    # other vars
    my ($configfilename, $collection, $export_info_filename, $export_info, $processor, $pluginfo);

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    # Copy the parsed values into the matching variables.  An explicit map
    # of references is used instead of a string eval; result keys without a
    # matching variable are ignored.
    my %option_slot =
        ( 'language'       => \$language,
          'verbosity'      => \$verbosity,
          'importdir'      => \$importdir,
          'exportdir'      => \$exportdir,
          'keepold'        => \$keepold,
          'listall'        => \$listall,
          'removeold'      => \$removeold,
          'saveas'         => \$saveas,
          'saveas_version' => \$saveas_version,
          'debug'          => \$debug,
          'OIDtype'        => \$OIDtype,
          'OIDmetadata'    => \$OIDmetadata,
          'maxdocs'        => \$maxdocs,
          'statsfile'      => \$statsfile,
          'xsltfile'       => \$xsltfile,
          'mapping_file'   => \$mapping_file,
          'out'            => \$out,
          'faillog'        => \$faillog,
          'collectdir'     => \$collectdir,
          'gli'            => \$gli,
          'xslt_mets'      => \$xslt_mets,
          'xslt_txt'       => \$xslt_txt,
          'group_marc'     => \$group_marc,
          'xml'            => \$xml );
    foreach my $strVariable (keys %$hashParsingResult) {
        my $slot = $option_slot{$strVariable};
        next unless defined $slot;
        ${$slot} = $hashParsingResult->{$strVariable};
    }

    # these are options used by other things - we just set default values
    # undef means will be set from config file if there
    my $gzip = undef;
    my $groupsize = 1;
    my $sortmeta = undef;

    # An export directory counts as explicit only when a non-empty value
    # was supplied; otherwise each collection gets its own default.
    my $explicit_exportdir = (defined $exportdir && $exportdir ne "") ? 1 : 0;

    # save these command line settings. don't want config file settings in one
    # coll used for other colls
    my $global_removeold = $removeold;
    my $global_keepold = $keepold;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($listall) {
        if ($xml) {
            &PrintUsage::print_xml_usage($listall_options);
        }
        else {
            &PrintUsage::print_txt_usage($listall_options,"{export.params}");
        }
        die "\n";
    }
    elsif ($xml) {
        &PrintUsage::print_xml_usage($options);
        die "\n";
    }

    # can have more than one collection name,
    # if the first extra option is -h, then output the help
    if (scalar(@ARGV) == 0 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    # Redirect general output to a file when -out names something other
    # than STDERR/STDOUT.  From here on $out holds a handle name.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        unless (open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        }
        $out = 'export::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    while (scalar(@ARGV)>0) {
        my $collect_name = shift @ARGV;
        $ENV{'GSDLCOLLECTION'} = $collect_name;

        eval {
            # get and check the collection name (an empty result is
            # treated as failure)
            if (($collection = &util::use_collection($collect_name, $collectdir)) eq "") {
                &PrintUsage::print_txt_usage($options, "{export.params}");
                die "\n";
            }
            # add collection's perllib dir into include path in
            # case we have collection specific modules
            unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

            # Work out this collection's fail log.  $faillog itself is left
            # untouched so that the command-line value (or the lack of one)
            # is still available for the next collection.
            my $faillogname = (defined $faillog && $faillog ne "")
                ? $faillog
                : &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
            unless (open (FAILLOG, '>', $faillogname)) {
                &gsprintf(STDERR, "{export.cannot_open_fail_log}\n", $faillogname);
                die;
            }
            my $faillog_handle = 'export::FAILLOG';
            $faillog_handle->autoflush(1);

            # check sortmeta
            $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
            if (defined $sortmeta && $groupsize > 1) {
                &gsprintf($out, "{export.cannot_sort}\n\n");
                $sortmeta = undef;
            }

            # get the list of plugins for this collection and set any options that
            # were specified in the collect.cfg (all export.pl options except
            # -collectdir, -out and -faillog may be specified in the collect.cfg (these
            # options must be known before we read the collect.cfg))
            my $plugins = [];
            my @global_opts = ();

            $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
            if (!-e $configfilename) {
                &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
                die;
            }

            my $collectcfg = &colcfg::read_collect_cfg ($configfilename);
            if (defined $collectcfg->{'plugin'}) {
                $plugins = $collectcfg->{'plugin'};
            }

            # For each setting below, a usable command-line value wins;
            # otherwise collect.cfg is consulted, then a built-in default.
            if (!defined $verbosity || $verbosity !~ /\d+/) {
                if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                    $verbosity = $collectcfg->{'verbosity'};
                } else {
                    $verbosity = 2; # the default
                }
            }
            if (defined $collectcfg->{'importdir'} && (!defined $importdir || $importdir eq "")) {
                $importdir = $collectcfg->{'importdir'};
            }
            if (defined $collectcfg->{'exportdir'} && (!defined $exportdir || $exportdir eq "")) {
                $exportdir = $collectcfg->{'exportdir'};
            }

            if (defined $collectcfg->{'gzip'} && !$gzip) {
                if ($collectcfg->{'gzip'} =~ /^true$/i) {
                    $gzip = 1;
                }
            }
            if (!defined $maxdocs || $maxdocs !~ /\-?\d+/) {
                if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                    $maxdocs = $collectcfg->{'maxdocs'};
                } else {
                    $maxdocs = -1; # the default
                }
            }
            if ($groupsize == 1) {
                if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
                    $groupsize = $collectcfg->{'groupsize'};
                }
            }
            # collect.cfg may name any of the OID types that the -OIDtype
            # option itself accepts.
            if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/)) {
                if (defined $collectcfg->{'OIDtype'}
                    && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
                    $OIDtype = $collectcfg->{'OIDtype'};
                } else {
                    $OIDtype = "hash"; # the default
                }
            }
            if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
                $sortmeta = $collectcfg->{'sortmeta'};
            }
            if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
                $debug = 1;
            }
            if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
                $gli = 1;
            }

            # global plugin stuff
            if (defined $collectcfg->{'separate_cjk'} && $collectcfg->{'separate_cjk'} =~ /^true$/i) {
                push @global_opts, "-separate_cjk";
            }

            ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($global_removeold, $global_keepold, 0, "export", $collectcfg);

            $gli = 0 unless defined $gli;

            print STDERR "<export>\n" if $gli;

            # fill in the default import and export directories if none
            # were supplied, turn all \ into / and remove trailing /
            if (!defined $importdir || $importdir eq "") {
                $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import");
            }
            $importdir =~ s/[\\\/]+/\//g;
            $importdir =~ s/\/$//;
            if (!defined $exportdir || $exportdir eq "") {
                $exportdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
            }
            $exportdir =~ s/[\\\/]+/\//g;
            $exportdir =~ s/\/$//;

            # load all the plugins
            $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog_handle, \@global_opts);

            if (scalar(@$pluginfo) == 0) {
                &gsprintf($out, "{import.no_plugins_loaded}\n");
                die "\n";
            }

            # remove the old contents of the export directory if needed
            if ($removeold && -e $exportdir) {
                &gsprintf($out, "{export.removing_export}\n");
                &util::rm_r ($exportdir);
            }

            # read the export information file

            # If saveas=DSpace the information file is called "contents";
            # every other format (METS, GA, MARCXML, and anything that falls
            # back to the GA plugout below) uses "export.inf".
            # the plugouts should be doing this!!
            if (defined $saveas && $saveas eq "DSpace") {
                $export_info_filename = &util::filename_cat ($exportdir, "contents");
            } else {
                $export_info_filename = &util::filename_cat ($exportdir, "export.inf");
            }

            $export_info = arcinfo->new();
            $export_info->load_info ($export_info_filename);

            # Choose the plugout: collect.cfg wins, otherwise it is derived
            # from -saveas, falling back to GAPlugout for unknown values.
            my ($plugout);
            if (defined $collectcfg->{'plugout'}) {
                $plugout = $collectcfg->{'plugout'};
            }
            else {
                if (!defined $saveas || $saveas !~ /^(GA|METS|DSpace|MARCXML)$/) {
                    push @$plugout,"GAPlugout";
                }
                else {
                    push @$plugout,$saveas."Plugout";
                }
            }

            my $plugout_name = $plugout->[0];

            # Append the plugout options; the format-specific ones are only
            # added for the plugout they belong to.
            push @$plugout,("-output_info",$export_info) if (defined $export_info);
            push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
            push @$plugout,("-debug") if ($debug);
            push @$plugout,("-gzip_output",$gzip) if (defined $gzip);
            push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
            push @$plugout,("-output_handle",$out) if (defined $out);
            push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile);
            push @$plugout,("-group") if ($group_marc && $plugout_name =~ /^MARCXMLPlugout$/);
            push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $plugout_name =~ /^MARCXMLPlugout$/);
            push @$plugout,("-saveas_version",$saveas_version) if (defined $saveas_version && $plugout_name =~ /^METSPlugout$/);
            push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $plugout_name =~ /^METSPlugout$/);
            push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $plugout_name =~ /^METSPlugout$/);
            $processor = &plugout::load_plugout($plugout);

            $processor->setoutputdir ($exportdir);

            $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
            $processor->set_OIDtype ($OIDtype, $OIDmetadata);

            &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

            # process the import directory
            &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs,0, $gli);

            &plugin::end($pluginfo, $processor);

            &plugin::deinit($pluginfo, $processor);

            # write out the export information file
            $processor->close_file_output() if $groupsize > 1;
            $processor->close_group_output() if $processor->is_group();
            # NOTE(review): the information file is saved for METS only --
            # confirm whether the other formats should save it too.
            if (defined $saveas && $saveas eq "METS") {
                $export_info->save_info($export_info_filename);
            }

            # write out export stats.  $statsfile is left untouched; the
            # handle name for this collection is kept in $stats_target.
            my $close_stats = 0;
            my $stats_target = $statsfile;
            if ($stats_target !~ /^(STDERR|STDOUT)$/i) {
                if (open (STATS, '>', $stats_target)) {
                    # STATS is opened in this package, so it must be
                    # referred to as export::STATS.
                    $stats_target = 'export::STATS';
                    $close_stats = 1;
                } else {
                    &gsprintf($out, "{import.cannot_open_stats_file}", $stats_target);
                    &gsprintf($out, "{import.stats_backup}\n");
                    $stats_target = 'STDERR';
                }
            }

            &gsprintf($out, "\n");
            &gsprintf($out, "*********************************************\n");

            &plugin::write_stats($pluginfo, $stats_target, $faillogname, $gli);
            if ($close_stats) {
                close STATS;
            }

            &gsprintf($out, "*********************************************\n");

            close FAILLOG;
        };

        if ($@) {
            print STDERR $@;
        }

##      $ENV{'GSDLCOLLECTION'} = undef;

        # Reset the per-collection directories so the next collection works
        # out its own.  An explicitly given export directory is kept.
        $importdir = "";
        $exportdir = "" unless $explicit_exportdir;
        # NOTE(review): $removeold is recomputed from the saved command-line
        # values at the top of the next iteration, so this assignment looks
        # like it has no lasting effect -- confirm the intent.
        $removeold = 0 if ($explicit_exportdir);

    } # while processing ARGV

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "* {export.complete}\n");
    &gsprintf($out, "*********************************************\n");

    # Close the output file only once everything has been written to it.
    close OUT if $close_out;

}
Note: See TracBrowser for help on using the repository browser.