source: trunk/gsdl/bin/script/export.pl@ 13948

Last change on this file since 13948 was 13169, checked in by kjdon, 18 years ago

debug mode now passes debug flag to plugout rather than using docprint, which is no longer a docproc.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.5 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# export.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will export a particular collection into a specific Format (e.g. METS or DSpace)
30# Author: Chi-Yu Huang Date: 08-10-2004
31
32package export;
33
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone perllib directories at the front of @INC
# so that the "use" statements further down can be resolved.
BEGIN {
    foreach my $required ('GSDLHOME', 'GSDLOS') {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # Each directory is unshifted in turn, so the last one listed here ends
    # up first in @INC.
    foreach my $subdir ("", "/cpan", "/plugins", "/plugouts", "/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/perllib$subdir");
    }
}
43
44use strict;
45no strict 'refs'; # allow filehandles to be variables and vice versa
46no strict 'subs'; # allow barewords (eg STDERR) as function arguments
47
48use arcinfo;
49use colcfg;
50use plugin;
51use plugout;
52use util;
53use scriptutil;
54use FileHandle;
55use gsprintf;
56use printusage;
57use parse2;
58
59
# Allowed values for -OIDtype. Each entry's description is a key into the
# string resource bundle, built from the value's own name.
my $oidtype_list = [
    map { +{ 'name' => $_, 'desc' => "{import.OIDtype.$_}" } }
        qw(hash incremental assigned dirname)
];

# Allowed values for -saveas, i.e. the export formats that can be selected.
my $saveas_list = [
    map { +{ 'name' => $_, 'desc' => "{export.saveas.$_}" } }
        qw(DSpace METS GA MARCXML)
];
81
82
83# Possible attributes for each argument
84# name: The name of the argument
85# desc: A description (or more likely a reference to a description) for this argument
86# type: The type of control used to represent the argument. Options include: string, int, flag, regexp, metadata, language, enum etc
87# reqd: Is this argument required?
88# hiddengli: Is this argument hidden in GLI?
89# modegli: The lowest detail mode this argument is visible at in GLI
90
# Description of the -saveas option. Kept in its own variable because it is
# used both in the full argument list and in the -listall usage output.
my $saveas_argument = {
    name    => 'saveas',
    desc    => '{export.saveas}',
    type    => 'enum',
    list    => $saveas_list,
    deft    => 'METS',
    reqd    => 'no',
    modegli => '3',
};
99
100
# Every command-line option export.pl accepts. The attributes used in each
# entry are explained in the comment block above $saveas_argument.
my $arguments = [
    $saveas_argument,

    # what to export and where
    { name => 'saveas_version', desc => '{export.saveas_version}',
      type => 'string', reqd => 'no', deft => 'greenstone' },
    { name => 'exportdir', desc => '{export.exportdir}',
      type => 'string', reqd => 'no', hiddengli => 'yes' },
    { name => 'importdir', desc => '{import.importdir}',
      type => 'string', reqd => 'no', hiddengli => 'yes' },
    { name => 'collectdir', desc => '{export.collectdir}',
      type => 'string',
      deft => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
      reqd => 'no', hiddengli => 'yes' },

    # behaviour switches
    { name => 'listall', desc => '{export.listall}',
      type => 'flag', reqd => 'no' },
    { name => 'debug', desc => '{export.debug}',
      type => 'flag', reqd => 'no', hiddengli => 'yes' },
    { name => 'faillog', desc => '{export.faillog}',
      type => 'string', deft => '', reqd => 'no', modegli => '4' },
    { name => 'keepold', desc => '{export.keepold}',
      type => 'flag', reqd => 'no', hiddengli => 'yes' },
    { name => 'removeold', desc => '{export.removeold}',
      type => 'flag', reqd => 'no', modegli => '3' },
    { name => 'language', desc => '{scripts.language}',
      type => 'string', reqd => 'no', modegli => '4' },
    { name => 'maxdocs', desc => '{export.maxdocs}',
      type => 'int', reqd => 'no', range => '1,', modegli => '1' },

    # document identifiers
    { name => 'OIDtype', desc => '{import.OIDtype}',
      type => 'enum', list => $oidtype_list,
      # parsearg left "" as default
      #'deft' => "hash",
      reqd => 'no', modegli => '3' },
    { name => 'OIDmetadata', desc => '{import.OIDmetadata}',
      type => 'metadata', deft => 'dc.Identifier',
      reqd => 'no', modegli => '3' },

    # output streams
    { name => 'out', desc => '{export.out}',
      type => 'string', deft => 'STDERR', reqd => 'no', hiddengli => 'yes' },
    { name => 'statsfile', desc => '{export.statsfile}',
      type => 'string', deft => 'STDERR', reqd => 'no', hiddengli => 'yes' },

    # options handed on to the plugouts
    { name => 'xsltfile', desc => '{BasPlugout.xslt_file}',
      type => 'string', reqd => 'no', hiddengli => 'yes' },
    { name => 'xslt_txt', desc => '{METSPlugout.xslt_txt}',
      type => 'string', reqd => 'no', hiddengli => 'no' },
    { name => 'xslt_mets', desc => '{METSPlugout.xslt_mets}',
      type => 'string', reqd => 'no', hiddengli => 'no' },
    { name => 'mapping_file', desc => '{MARCXMLPlugout.mapping_file}',
      type => 'string', reqd => 'no', hiddengli => 'no' },
    { name => 'group_marc', desc => '{MARCXMLPlugout.group}',
      type => 'flag', reqd => 'no', hiddengli => 'no' },

    # reporting and tool integration
    { name => 'verbosity', desc => '{export.verbosity}',
      type => 'int', range => '0,3', deft => '2',
      reqd => 'no', modegli => '4' },
    { name => 'gli', desc => '',
      type => 'flag', reqd => 'no', hiddengli => 'yes' },
    { name => 'xml', desc => '{scripts.xml}',
      type => 'flag', reqd => 'no', hiddengli => 'yes' },
];
230
# Usage description covering every option of the script.
my $options = {
    name => 'export.pl',
    desc => '{export.desc}',
    args => $arguments,
};

# Reduced usage description printed for -listall: only the -saveas option.
my $listall_options = {
    name => 'export.pl',
    desc => '{export.desc}',
    args => [ $saveas_argument ],
};
238
# Package-local shorthand: lets the rest of this script write gsprintf(...)
# while the work is done by gsprintf::gsprintf. Arguments are forwarded
# untouched and the callee's result is handed back to the caller.
sub gsprintf {
    &gsprintf::gsprintf(@_);
}
243
244
245&main();
246
# main -- entry point of export.pl.
#
# Parses @ARGV against $arguments, answers the usage requests (-listall, -xml,
# -h, or no collection given), and then treats every remaining element of
# @ARGV as a collection name. For each collection it reads etc/collect.cfg,
# loads the collection's plugins and the plugout chosen by -saveas, pushes the
# import directory through them into the export directory and writes the
# statistics. A failure while processing one collection is caught, printed to
# STDERR, and the loop moves on to the next collection.
#
# Takes no arguments and returns nothing useful; the usage paths and a failure
# to open the -out file terminate the script via die.
sub main {
    # params -- these are filled in by name from the parsed command line
    # (see the string eval below), so the names must match the 'name'
    # fields in $arguments.
    my ($language, $verbosity, $importdir, $exportdir, $keepold, $listall,
        $removeold, $saveas, $saveas_version, $debug, $OIDtype, $OIDmetadata,
        $maxdocs, $statsfile, $xsltfile, $mapping_file, $out, $faillog,
        $collectdir, $gli, $xslt_mets, $xslt_txt, $group_marc);
    my $xml = 0;

    # other vars
    my ($configfilename, $collection, $export_info_filename, $export_info, $processor, $pluginfo);
    my $service = "export";

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    # Copy each parsed option into the lexical variable of the same name.
    # NOTE(review): this is a string eval driven by the keys parse2 returns;
    # a key with no matching "my" variable above fails inside the eval and is
    # silently ignored.
    foreach my $strVariable (keys %$hashParsingResult) {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # these are options used by other things - we just set default values
    # undef means will be set from config file if there
    my $gzip = undef;
    my $groupsize = 1;
    #my $OIDtype = undef;
    my $sortmeta = undef;

    my $explicit_exportdir = (defined $exportdir) ? 1 : 0;

    # save these command line settings. don't want config file settings in one
    # coll used for other colls
    # does this apply to other vars???
    my $global_removeold = $removeold;
    my $global_keepold = $keepold;
    # $faillog is overwritten with a filehandle name once the log is open, so
    # remember what was asked for on the command line for later collections.
    my $global_faillog = $faillog;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($listall) {
        if ($xml) {
            &PrintUsage::print_xml_usage($listall_options);
        }
        else {
            &PrintUsage::print_txt_usage($listall_options,"{export.params}");
        }
        die "\n";
    }
    elsif ($xml) {
        &PrintUsage::print_xml_usage($options);
        die "\n";
    }

    # can have more than one collection name,
    # if the first extra option is -h, then output the help
    if (scalar(@ARGV) == 0 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # $out ends up holding the *name* of a filehandle (STDERR, STDOUT or
    # export::OUT); it is used symbolically from here on.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # three-argument open: the file name is never parsed for a mode
        open (OUT, '>', $out) or do {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        };
        $out = 'export::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    while (scalar(@ARGV)>0) {
        my $collect_name = shift @ARGV;
        $ENV{'GSDLCOLLECTION'} = $collect_name;

        eval {
            # get and check the collection name
            if (($collection = &util::use_collection($collect_name, $collectdir)) eq "") {
                &PrintUsage::print_txt_usage($options, "{export.params}");
                die "\n";
            }
            # add collection's perllib dir into include path in
            # case we have collection specific modules
            unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

            # Work out the fail log for *this* collection: the file named on
            # the command line, or etc/fail.log inside the collection.
            my $faillogname = (defined $global_faillog && $global_faillog ne "")
                ? $global_faillog
                : &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
            open (FAILLOG, '>', $faillogname) or do {
                &gsprintf(STDERR, "{export.cannot_open_fail_log}\n", $faillogname);
                die;
            };
            $faillog = 'export::FAILLOG';
            $faillog->autoflush(1);

            # check sortmeta
            $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
            if (defined $sortmeta && $groupsize > 1) {
                &gsprintf($out, "{export.cannot_sort}\n\n");
                $sortmeta = undef;
            }

            # get the list of plugins for this collection and set any options that
            # were specified in the collect.cfg (all export.pl options except
            # -collectdir, -out and -faillog may be specified in the collect.cfg (these
            # options must be known before we read the collect.cfg))
            my $plugins = [];
            my @global_opts = ();

            $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
            if (!-e $configfilename) {
                &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
                die;
            }

            my $collectcfg = &colcfg::read_collect_cfg ($configfilename);
            if (defined $collectcfg->{'plugin'}) {
                $plugins = $collectcfg->{'plugin'};
            }

            # Command-line values win; collect.cfg only fills in what was not
            # given, and the hard-coded defaults come last.
            if ($verbosity !~ /\d+/) {
                if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                    $verbosity = $collectcfg->{'verbosity'};
                } else {
                    $verbosity = 2; # the default
                }
            }
            if (defined $collectcfg->{'importdir'} && $importdir eq "") {
                $importdir = $collectcfg->{'importdir'};
            }
            if (defined $collectcfg->{'exportdir'} && $exportdir eq "") {
                $exportdir = $collectcfg->{'exportdir'};
            }

            if (defined $collectcfg->{'gzip'} && !$gzip) {
                if ($collectcfg->{'gzip'} =~ /^true$/i) {
                    $gzip = 1;
                }
            }
            if ($maxdocs !~ /\-?\d+/) {
                if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                    $maxdocs = $collectcfg->{'maxdocs'};
                } else {
                    $maxdocs = -1; # the default
                }
            }
            if ($groupsize == 1) {
                if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
                    $groupsize = $collectcfg->{'groupsize'};
                }
            }
            if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/)) {
                if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
                    $OIDtype = $collectcfg->{'OIDtype'};
                } else {
                    $OIDtype = "hash"; # the default
                }
            }
            # $sortmeta is normally undef at this point; test for that first
            # so the string comparison does not warn under -w.
            if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
                $sortmeta = $collectcfg->{'sortmeta'};
            }
            if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
                $debug = 1;
            }
            if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
                $gli = 1;
            }

            # global plugin stuff
            if (defined $collectcfg->{'separate_cjk'} && $collectcfg->{'separate_cjk'} =~ /^true$/i) {
                push @global_opts, "-separate_cjk";
            }

            ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($global_removeold, $global_keepold, 0, "export", $collectcfg);

            $gli = 0 unless defined $gli;

            print STDERR "<export>\n" if $gli;

            # fill in the default import and export directories if none
            # were supplied, turn all \ into / and remove trailing /
            $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
            $importdir =~ s/[\\\/]+/\//g;
            $importdir =~ s/\/$//;
            $exportdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export") if $exportdir eq "";
            $exportdir =~ s/[\\\/]+/\//g;
            $exportdir =~ s/\/$//;

            # load all the plugins
            $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts);

            if (scalar(@$pluginfo) == 0) {
                &gsprintf($out, "{import.no_plugins_loaded}\n");
                die "\n";
            }

            # remove the old contents of the export directory if needed
            if ($removeold && -e $exportdir) {
                &gsprintf($out, "{export.removing_export}\n");
                &util::rm_r ($exportdir);
            }

            # read the export information file

            # Export to DSpace Archive format or METs format
            # If saveas=DSpace, a "contents" file will be created, otherwise "export.inf"

            # the plugouts should be doing this!!
            # Every non-DSpace format (including MARCXML, which the old test
            # for "MARC" never matched) uses export.inf, so the file name is
            # always defined and never left over from a previous collection.
            if ($saveas eq "DSpace") {
                $export_info_filename = &util::filename_cat ($exportdir, "contents");
            } else {
                $export_info_filename = &util::filename_cat ($exportdir, "export.inf");
            }

            $export_info = arcinfo->new();
            $export_info -> load_info ($export_info_filename);

            # An unrecognised -saveas falls back to the first plugout named
            # in collect.cfg, or to GAPlugout.
            if ($saveas !~ /^(GA|METS|DSpace|MARCXML)$/) {
                if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'}[0] =~ /^(GAPlugout|METSPlugout)$/) {
                    $saveas = $collectcfg->{'plugout'}[0];
                } else {
                    $saveas ="GAPlugout";
                }
            }

            ####Use Plugout####
            # The plugout module is the format name with "Plugout" appended,
            # unless $saveas already carries that suffix.
            my ($plugout_name);
            if ($saveas !~ /^(GA|METS|DSpace|MARCXML)Plugout$/ ) {
                $plugout_name = $saveas."Plugout";
            }
            else {
                $plugout_name = $saveas;
            }

            my $opts=[];

            push @$opts,("-output_info",$export_info) if (defined $export_info);
            push @$opts,("-verbosity",$verbosity) if (defined $verbosity);
            push @$opts,("-debug") if ($debug);
            push @$opts,("-gzip_output",$gzip) if (defined $gzip);
            push @$opts,("-group_size",$groupsize) if (defined $groupsize);
            push @$opts,("-output_handle",$out) if (defined $out);
            push @$opts,("-xslt_file",$xsltfile) if (defined $xsltfile);
            # format-specific options are only passed to the plugout they belong to
            push @$opts,("-group") if ($group_marc && $plugout_name =~ /^MARCXMLPlugout$/);
            push @$opts,("-mapping_file",$mapping_file) if (defined $mapping_file && $plugout_name =~ /^MARCXMLPlugout$/);
            push @$opts,("-saveas_version",$saveas_version) if (defined $saveas_version && $plugout_name =~ /^METSPlugout$/);
            push @$opts,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $plugout_name =~ /^METSPlugout$/);
            push @$opts,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $plugout_name =~ /^METSPlugout$/);
            $processor = &plugout::load_plugout($plugout_name,$opts);

            $processor->setoutputdir ($exportdir);

            $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
            $processor->set_OIDtype ($OIDtype, $OIDmetadata);

            &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

            # process the import directory
            &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs,0, $gli);

            &plugin::end($pluginfo, $processor);

            &plugin::deinit($pluginfo, $processor);

            # write out the export information file
            $processor->close_file_output() if $groupsize > 1;
            $processor->close_group_output() if $processor->is_group();
            if ($saveas eq "METS") {
                $export_info->save_info($export_info_filename);
            }

            # write out export stats
            # $stats_handle is local to this collection so that $statsfile
            # keeps the user's setting for the next one. The handle lives in
            # this package, hence export::STATS.
            my $stats_handle = $statsfile;
            my $close_stats = 0;
            if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
                if (open (STATS, '>', $statsfile)) {
                    $stats_handle = 'export::STATS';
                    $close_stats = 1;
                } else {
                    &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                    &gsprintf($out, "{import.stats_backup}\n");
                    $stats_handle = 'STDERR';
                }
            }

            &gsprintf($out, "\n");
            &gsprintf($out, "*********************************************\n");

            &plugin::write_stats($pluginfo, $stats_handle, $faillogname, $gli);
            if ($close_stats) {
                close STATS;
            }

            &gsprintf($out, "*********************************************\n");

            close FAILLOG;
        };

        # a failure in one collection must not stop the remaining ones
        if ($@) {
            print STDERR $@;
        }

##      $ENV{'GSDLCOLLECTION'} = undef;
        $importdir = "";
        $removeold = 0 if ($explicit_exportdir);

    } # while processing ARGV

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "* {export.complete}\n");
    &gsprintf($out, "*********************************************\n");

    # Only now is the -out file finished with: it is shared by every
    # collection and by the closing banner above.
    close OUT if $close_out;

}
Note: See TracBrowser for help on using the repository browser.