source: trunk/gsdl/bin/script/export.pl@ 12373

Last change on this file since 12373 was 12361, checked in by kjdon, 18 years ago

changed a comment

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# export.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will export a particular collection into a specific Format (e.g. METS or DSpace)
30# Author: Chi-Yu Huang Date: 08-10-2004
31
32package export;
33
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone perllib directories at the front of
# @INC so the "use" statements that follow can be resolved.
BEGIN {
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    # Each unshift goes in front of the previous one, so the last entry
    # listed here ends up being searched first.
    foreach my $subdir ("", "/cpan", "/plugins", "/plugouts", "/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/perllib$subdir");
    }
}
43
44use strict;
45no strict 'refs'; # allow filehandles to be variables and vice versa
46no strict 'subs'; # allow barewords (eg STDERR) as function arguments
47
48use arcinfo;
49use colcfg;
50use plugin;
51use plugout;
52use docprint;
53use util;
54use scriptutil;
55use FileHandle;
56use gsprintf;
57use printusage;
58use parse2;
59
60
# Name/description pairs for the two OIDtype values ("hash" and
# "incremental").  The descriptions are resource-bundle style keys.
# NOTE(review): this list is not referenced anywhere else in the visible
# part of this script.
my $oidtype_list = [
    { name => "hash",        desc => "{export.OIDtype.hash}" },
    { name => "incremental", desc => "{export.OIDtype.incremental}" },
];
66
#** The output formats that can be selected with -saveas
my $saveas_list = [
    { name => "DSpace",  desc => "{export.saveas.DSpace}" },
    { name => "METS",    desc => "{export.saveas.METS}" },
    { name => "GA",      desc => "{export.saveas.GA}" },
    { name => "MARCXML", desc => "{export.saveas.MARCXML}" },
];


# Possible attributes for each argument
# name: The name of the argument
# desc: A description (or more likely a reference to a description) for this argument
# type: The type of control used to represent the argument. Options include: string, int, flag, regexp, metadata, language, enum etc
# reqd: Is this argument required?
# hiddengli: Is this argument hidden in GLI?
# modegli: The lowest detail mode this argument is visible at in GLI
# (deft, list and range also appear below; presumably the default value,
# the permitted values of an enum, and the permitted numeric range --
# confirm against the option parser.)

# The -saveas option is kept in its own variable because it is shared by
# the full option list and by the reduced list printed for -listall.
my $saveas_argument = {
    name    => "saveas",
    desc    => "{export.saveas}",
    type    => "enum",
    list    => $saveas_list,
    deft    => "METS",
    reqd    => "no",
    modegli => "3",
};
96
97
# Every command line option export.pl accepts.  main() hands this list to
# parse2::parse() and copies each parsed value into the lexical variable
# that has the same name as the option.
my $arguments = [
    $saveas_argument,
    {   name      => "saveas_version",
        desc      => "{export.saveas_version}",
        type      => "string",
        reqd      => "no",
        deft      => "greenstone",
    },
    {   name      => "exportdir",
        desc      => "{export.exportdir}",
        type      => "string",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "importdir",
        desc      => "{import.importdir}",
        type      => "string",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "collectdir",
        desc      => "{export.collectdir}",
        type      => "string",
        deft      => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "listall",
        desc      => "{scripts.listall}",
        type      => "flag",
        reqd      => "no",
    },
    {   name      => "debug",
        desc      => "{export.debug}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "faillog",
        desc      => "{export.faillog}",
        type      => "string",
        deft      => "",
        reqd      => "no",
        modegli   => "4",
    },
    {   name      => "keepold",
        desc      => "{export.keepold}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "removeold",
        desc      => "{export.removeold}",
        type      => "flag",
        reqd      => "no",
        modegli   => "3",
    },
    {   name      => "language",
        desc      => "{scripts.language}",
        type      => "string",
        reqd      => "no",
        modegli   => "4",
    },
    {   name      => "maxdocs",
        desc      => "{export.maxdocs}",
        type      => "int",
        reqd      => "no",
        range     => "1,",
        modegli   => "1",
    },
    {   name      => "out",
        desc      => "{export.out}",
        type      => "string",
        deft      => "STDERR",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "statsfile",
        desc      => "{export.statsfile}",
        type      => "string",
        deft      => "STDERR",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "xsltfile",
        desc      => "{BasPlugout.xslt_file}",
        type      => "string",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "xslt_txt",
        desc      => "{METSPlugout.xslt_txt}",
        type      => "string",
        reqd      => "no",
        hiddengli => "no",
    },
    {   name      => "xslt_mets",
        desc      => "{METSPlugout.xslt_mets}",
        type      => "string",
        reqd      => "no",
        hiddengli => "no",
    },
    {   name      => "group_marc",
        desc      => "{MARCXMLPlugout.group}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "no",
    },
    {   name      => "verbosity",
        desc      => "{export.verbosity}",
        type      => "int",
        range     => "0,3",
        deft      => "2",
        reqd      => "no",
        modegli   => "4",
    },
    {   name      => "gli",
        desc      => "",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "xml",
        desc      => "{scripts.xml}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
];
208
# Usage description for the script with its complete option list; this is
# what the PrintUsage routines are given for normal and -xml usage output.
my $options = {
    name => "export.pl",
    desc => "{export.desc}",
    args => $arguments,
};

# Reduced usage description printed for -listall: only the -saveas option.
my $listall_options = {
    name => "export.pl",
    desc => "{export.desc}",
    args => [ $saveas_argument ],
};
216
# Local shorthand so the rest of this script can write &gsprintf(...)
# instead of &gsprintf::gsprintf(...).  All arguments are passed through
# untouched and whatever the wrapped routine returns is returned.
sub gsprintf {
    return &gsprintf::gsprintf(@_);
}
221
222
&main();

# main --
# Entry point of export.pl.
#
# Parses the command line with parse2::parse() using $arguments, prints
# usage and dies when -listall or -xml is given or when no collection name
# remains, and then processes every remaining argument as a collection
# name.  For each collection it reads etc/collect.cfg, loads the plugins
# listed there, builds a plugout processor selected by -saveas (or a
# docprint object when -debug is set), runs the plugins over the import
# directory and writes the plugin statistics.
#
# Takes no parameters; reads and consumes @ARGV, sets
# $ENV{'GSDLCOLLECTION'}, and prepends to @INC.  An error inside one
# collection is trapped by eval and printed to STDERR so that the
# remaining collections are still processed.
sub main {
    # params -- filled in by name from the parsed command line (see below)
    my ($language, $verbosity, $importdir, $exportdir, $keepold, $listall,
        $removeold, $saveas, $saveas_version, $debug,
        $maxdocs, $statsfile, $xsltfile, $out, $faillog, $collectdir, $gli,
        $xslt_mets, $xslt_txt, $group_marc);
    my $xml = 0;

    # other vars
    my ($configfilename, $collection, $export_info_filename, $export_info, $processor, $pluginfo);

    my $hashParsingResult = {};
    # general options available to all plugins
    # can have more than one collection name, so don't check num args left
    parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");
    foreach my $strVariable (keys %$hashParsingResult)
    {
        # Assigns to the lexical declared above whose name equals the
        # option name (e.g. $saveas = $hashParsingResult->{"saveas"}).
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }


    # these are options used by other things - we just set default values
    # undef means will be set from config file if there
    my $gzip = undef;
    my $groupsize = 1;
    my $OIDtype = undef;
    my $sortmeta = undef;

    my $explicit_exportdir = (defined $exportdir) ? 1 : 0;

    # save these command line settings. don't want config file settings in one
    # coll used for other colls
    # does this apply to other vars???
    my $global_removeold = $removeold;
    my $global_keepold = $keepold;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($listall) {
        if ($xml) {
            &PrintUsage::print_xml_usage($listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($listall_options,"{export.params}");
        }
        die "\n";
    }
    elsif ($xml) {
        &PrintUsage::print_xml_usage($options);
        die "\n";
    }

    if (scalar(@ARGV) == 0) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # $out holds the *name* of the output handle from here on; the
    # file-level "no strict 'refs'" lets that name be used as a filehandle.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        open (OUT, ">", $out) ||
            (&gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out) && die);
        $out = 'export::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    while (scalar(@ARGV)>0) {
        my $collect_name = shift @ARGV;
        $ENV{'GSDLCOLLECTION'} = $collect_name;

        eval {
            # get and check the collection name
            if (($collection = &util::use_collection($collect_name, $collectdir)) eq "") {
                &PrintUsage::print_txt_usage($options, "{export.params}");
                die "\n";
            }
            # add collection's perllib dir into include path in
            # case we have collection specific modules
            unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

            # Work out the fail log for THIS collection without modifying
            # $faillog, so the next collection does not inherit a handle
            # name in place of the file name.
            my $faillogname = $faillog;
            if ($faillogname eq "") {
                $faillogname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
            }
            open (FAILLOG, ">", $faillogname) ||
                (&gsprintf(STDERR, "{export.cannot_open_fail_log}\n", $faillogname) && die);
            my $faillog_handle = 'export::FAILLOG';
            $faillog_handle->autoflush(1);

            # check sortmeta
            $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
            if (defined $sortmeta && $groupsize > 1) {
                &gsprintf($out, "{export.cannot_sort}\n\n");
                $sortmeta = undef;
            }

            # get the list of plugins for this collection and set any options that
            # were specified in the collect.cfg (all export.pl options except
            # -collectdir, -out and -faillog may be specified in the collect.cfg (these
            # options must be known before we read the collect.cfg))
            my $plugins = [];
            my @global_opts = ();

            $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
            if (!-e $configfilename) {
                (&gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename) && die);
            }

            my $collectcfg = &colcfg::read_collect_cfg ($configfilename);
            if (defined $collectcfg->{'plugin'}) {
                $plugins = $collectcfg->{'plugin'};
            }

            if ($verbosity !~ /\d+/) {
                if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                    $verbosity = $collectcfg->{'verbosity'};
                } else {
                    $verbosity = 2; # the default
                }
            }
            # The "!defined ||" guards below only avoid uninitialized-value
            # warnings under -w; undef and "" are treated the same way.
            if (defined $collectcfg->{'importdir'} && (!defined $importdir || $importdir eq "")) {
                $importdir = $collectcfg->{'importdir'};
            }
            if (defined $collectcfg->{'exportdir'} && (!defined $exportdir || $exportdir eq "")) {
                $exportdir = $collectcfg->{'exportdir'};
            }

            if (defined $collectcfg->{'gzip'} && !$gzip) {
                if ($collectcfg->{'gzip'} =~ /^true$/i) {
                    $gzip = 1;
                }
            }
            if (!defined $maxdocs || $maxdocs !~ /\-?\d+/) {
                if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                    $maxdocs = $collectcfg->{'maxdocs'};
                } else {
                    $maxdocs = -1; # the default
                }
            }
            if ($groupsize == 1) {
                if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
                    $groupsize = $collectcfg->{'groupsize'};
                }
            }
            if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental)$/)) {
                if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
                    $OIDtype = $collectcfg->{'OIDtype'};
                } else {
                    $OIDtype = "hash"; # the default
                }
            }
            # After the sortmeta check above, $sortmeta is either undef or
            # contains a non-blank value, so "unset" means "not defined".
            if (defined $collectcfg->{'sortmeta'} && !defined $sortmeta) {
                $sortmeta = $collectcfg->{'sortmeta'};
            }
            if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
                $debug = 1;
            }
            if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
                $gli = 1;
            }

            # global plugin stuff
            if (defined $collectcfg->{'separate_cjk'}&& $collectcfg->{'separate_cjk'} =~ /^true$/i) {
                push @global_opts, "-separate_cjk";
            }

            ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($global_removeold, $global_keepold, "export", $collectcfg);

            $gli = 0 unless defined $gli;

            print STDERR "<export>\n" if $gli;

            # fill in the default import and export directories if none
            # were supplied, turn all \ into / and remove trailing /
            $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import")
                if (!defined $importdir || $importdir eq "");
            $importdir =~ s/[\\\/]+/\//g;
            $importdir =~ s/\/$//;
            $exportdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export")
                if (!defined $exportdir || $exportdir eq "");
            $exportdir =~ s/[\\\/]+/\//g;
            $exportdir =~ s/\/$//;

            # load all the plugins
            $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog_handle, \@global_opts);

            if (scalar(@$pluginfo) == 0) {
                &gsprintf($out, "{import.no_plugins_loaded}\n");
                die "\n";
            }

            # remove the old contents of the export directory if needed
            if ($removeold && -e $exportdir) {
                &gsprintf($out, "{export.removing_export}\n");
                &util::rm_r ($exportdir);
            }

            # read the export information file
            if (!$debug) {
                # Export to DSpace Archive format or METS format
                # If saveas=DSpace, a "contents" file will be created, otherwise "export.inf"

                if ($saveas eq "DSpace"){
                    $export_info_filename = &util::filename_cat ($exportdir, "contents");
                } elsif ($saveas eq "METS" || $saveas eq "GA" || $saveas eq "MARCXML" ) {
                    # "MARCXML" is the name offered by -saveas; the former
                    # test for "MARC" could never match.
                    $export_info_filename = &util::filename_cat ($exportdir, "export.inf");
                }

                $export_info = arcinfo->new();
                $export_info -> load_info ($export_info_filename);

                if ($saveas !~ /^(GA|METS|DSpace|MARCXML)$/) {
                    if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'}[0] =~ /^(GAPlugout|METSPlugout)$/) {
                        $saveas = $collectcfg->{'plugout'}[0];
                    } else {
                        $saveas ="GAPlugout";
                    }
                }


                ####Use Plugout####
                # The plugout module name is the format name with a
                # "Plugout" suffix, unless $saveas already carries it.
                my ($plugout_name);
                if ($saveas !~ /^(GA|METS|DSpace|MARCXML)Plugout$/ ){
                    $plugout_name = $saveas."Plugout";
                }
                else{
                    $plugout_name = $saveas;
                }

                my $opts=[];


                push @$opts,("-output_info",$export_info) if (defined $export_info);

                push @$opts,("-verbosity",$verbosity) if (defined $verbosity);

                push @$opts,("-gzip_output",$gzip) if (defined $gzip);
                push @$opts,("-group_size",$groupsize) if (defined $groupsize);
                push @$opts,("-output_handle",$out) if (defined $out);

                push @$opts,("-xslt_file",$xsltfile) if (defined $xsltfile);
                push @$opts,("-group") if ($group_marc && $plugout_name =~ /^MARCXMLPlugout$/);
                push @$opts,("-saveas_version",$saveas_version) if (defined $saveas_version && $plugout_name =~ /^METSPlugout$/);
                push @$opts,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $plugout_name =~ /^METSPlugout$/);
                push @$opts,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $plugout_name =~ /^METSPlugout$/);


                $processor = &plugout::load_plugout($plugout_name,$opts);

                $processor->setoutputdir ($exportdir);

                $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
                $processor->set_OIDtype ($OIDtype);
                #$processor->set_saveas ($saveas);
                #$processor->set_saveas_version ($saveas_version);
            } else {
                $processor = docprint->new();
            }

            &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

            # process the import directory
            &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs,0, $gli);

            &plugin::end($pluginfo, $processor);

            &plugin::deinit($pluginfo, $processor);

            # write out the export information file
            if (!$debug) {
                $processor->close_file_output() if $groupsize > 1;
                $processor->close_group_output() if $processor->is_group();
                # NOTE(review): the information file is only saved for
                # METS; confirm the other formats are meant to skip this.
                if ($saveas eq "METS") {
                    $export_info->save_info($export_info_filename);
                }
            }

            # write out export stats
            # $stats_handle is per collection so that $statsfile keeps the
            # file name the user asked for.
            my $close_stats = 0;
            my $stats_handle = $statsfile;
            if ($stats_handle !~ /^(STDERR|STDOUT)$/i) {
                if (open (STATS, ">", $statsfile)) {
                    # STATS is opened in package export, so that is the
                    # qualified name that must be handed on.
                    $stats_handle = 'export::STATS';
                    $close_stats = 1;
                } else {
                    &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                    &gsprintf($out, "{import.stats_backup}\n");
                    $stats_handle = 'STDERR';
                }
            }

            &gsprintf($out, "\n");
            &gsprintf($out, "*********************************************\n");

            &plugin::write_stats($pluginfo, $stats_handle, $faillogname, $gli);
            if ($close_stats) {
                close STATS;
            }

            &gsprintf($out, "*********************************************\n");

            close FAILLOG;
        };

        if ($@) {
            print STDERR $@;
        }

##      $ENV{'GSDLCOLLECTION'} = undef;
        $importdir = "";
        # NOTE(review): $removeold is reassigned from $global_removeold on
        # the next pass through the loop, so this reset appears to have no
        # effect -- confirm what was intended.
        $removeold = 0 if ($explicit_exportdir);

    } # while processing ARGV

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "* {export.complete}\n");
    &gsprintf($out, "*********************************************\n");

    # Close the -out file only after the last message has been written;
    # closing it inside the loop lost the output for later collections and
    # the completion banner above.
    close OUT if $close_out;

}
Note: See TracBrowser for help on using the repository browser.