source: gsdl/trunk/bin/script/export.pl@ 14925

Last change on this file since 14925 was 14925, checked in by dmn, 16 years ago

davidbs changes to update for gs3 building

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# export.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will export a particular collection into a specific Format (e.g. METS or DSpace)
30# Author: Chi-Yu Huang Date: 08-10-2004
31
32package export;
33
BEGIN {
    # Both environment variables must be present before any of the
    # Greenstone modules below can be located and loaded.
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    # Put the Greenstone library directories at the front of the module
    # search path.  The list is written in final search order (classify
    # first, perllib last among the additions).
    unshift (@INC,
             map { "$ENV{'GSDLHOME'}/perllib" . $_ }
             ('/classify', '/plugouts', '/plugins', '/cpan', ''));
}
43
44use strict;
45no strict 'refs'; # allow filehandles to be variables and vice versa
46no strict 'subs'; # allow barewords (eg STDERR) as function arguments
47
48use arcinfo;
49use colcfg;
50use plugin;
51use plugout;
52use util;
53use scriptutil;
54use FileHandle;
55use gsprintf;
56use printusage;
57use parse2;
58
59
# Values accepted by the -OIDtype option.  Each entry pairs the value's name
# with the resource-bundle key of its description.
my $oidtype_list =
    [ map { +{ 'name' => $_,
               'desc' => '{import.OIDtype.' . $_ . '}' } }
      qw(hash incremental assigned dirname) ];
69
# Export formats that can be chosen with the -saveas option.  Each entry
# pairs the format name with the resource-bundle key of its description.
my $saveas_list =
    [ map { +{ 'name' => $_,
               'desc' => '{export.saveas.' . $_ . '}' } }
      qw(DSpace METS GA MARCXML) ];
81
82
# Attributes that may appear in an argument description:
#   name      - the name of the argument
#   desc      - a description (more usually a reference to one) of the argument
#   type      - the kind of control representing the argument, e.g. string,
#               int, flag, regexp, metadata, language, enum
#   reqd      - whether the argument is required
#   hiddengli - whether the argument is hidden in GLI
#   modegli   - the lowest detail mode at which GLI shows the argument

# The -saveas option is described on its own because it is shared by the
# full argument list and by the reduced list used for -listall.
my $saveas_argument = {
    'name'    => 'saveas',
    'desc'    => '{export.saveas}',
    'type'    => 'enum',
    'list'    => $saveas_list,
    'deft'    => 'METS',
    'reqd'    => 'no',
    'modegli' => '3',
};
99
100
# Every command-line option understood by export.pl, in the order in which
# the options are listed in the usage output.
my $arguments = [
    $saveas_argument,

    # --- output format and locations ---
    { 'name' => 'saveas_version', 'desc' => '{export.saveas_version}',
      'type' => 'string', 'reqd' => 'no', 'deft' => 'greenstone' },
    { 'name' => 'exportdir', 'desc' => '{export.exportdir}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'importdir', 'desc' => '{import.importdir}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'site', 'desc' => '{import.site}',
      'type' => 'string', 'deft' => '', 'reqd' => 'no', 'hiddengli' => 'yes' },
    # collectdir: parsearg left "" as default, so no 'deft' is given here
    # (previously: &util::filename_cat ($ENV{'GSDLHOME'}, "collect"))
    { 'name' => 'collectdir', 'desc' => '{export.collectdir}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'yes' },

    # --- run control ---
    { 'name' => 'listall', 'desc' => '{export.listall}',
      'type' => 'flag', 'reqd' => 'no' },
    { 'name' => 'debug', 'desc' => '{export.debug}',
      'type' => 'flag', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'faillog', 'desc' => '{export.faillog}',
      'type' => 'string', 'deft' => '', 'reqd' => 'no', 'modegli' => '4' },
    { 'name' => 'keepold', 'desc' => '{export.keepold}',
      'type' => 'flag', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'removeold', 'desc' => '{export.removeold}',
      'type' => 'flag', 'reqd' => 'no', 'modegli' => '3' },
    { 'name' => 'language', 'desc' => '{scripts.language}',
      'type' => 'string', 'reqd' => 'no', 'modegli' => '4' },
    { 'name' => 'maxdocs', 'desc' => '{export.maxdocs}',
      'type' => 'int', 'reqd' => 'no', 'range' => '1,', 'modegli' => '1' },

    # --- document identifiers ---
    # OIDtype: parsearg left "" as default, so no 'deft' is given here
    # (previously: "hash")
    { 'name' => 'OIDtype', 'desc' => '{import.OIDtype}',
      'type' => 'enum', 'list' => $oidtype_list,
      'reqd' => 'no', 'modegli' => '3' },
    { 'name' => 'OIDmetadata', 'desc' => '{import.OIDmetadata}',
      'type' => 'metadata', 'deft' => 'dc.Identifier',
      'reqd' => 'no', 'modegli' => '3' },

    # --- message and statistics destinations ---
    { 'name' => 'out', 'desc' => '{export.out}',
      'type' => 'string', 'deft' => 'STDERR', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'statsfile', 'desc' => '{export.statsfile}',
      'type' => 'string', 'deft' => 'STDERR', 'reqd' => 'no', 'hiddengli' => 'yes' },

    # --- options handed on to the plugouts ---
    { 'name' => 'xsltfile', 'desc' => '{BasPlugout.xslt_file}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'xslt_txt', 'desc' => '{METSPlugout.xslt_txt}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'no' },
    { 'name' => 'xslt_mets', 'desc' => '{METSPlugout.xslt_mets}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'no' },
    { 'name' => 'mapping_file', 'desc' => '{MARCXMLPlugout.mapping_file}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'no' },
    { 'name' => 'group_marc', 'desc' => '{MARCXMLPlugout.group}',
      'type' => 'flag', 'reqd' => 'no', 'hiddengli' => 'no' },

    # --- reporting ---
    { 'name' => 'verbosity', 'desc' => '{export.verbosity}',
      'type' => 'int', 'range' => '0,3', 'deft' => '2',
      'reqd' => 'no', 'modegli' => '4' },
    { 'name' => 'gli', 'desc' => '',
      'type' => 'flag', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'xml', 'desc' => '{scripts.xml}',
      'type' => 'flag', 'reqd' => 'no', 'hiddengli' => 'yes' },
];
237
# Description of the script as a whole, used when printing the usage text.
my $options = {
    'name' => 'export.pl',
    'desc' => '{export.desc}',
    'args' => $arguments,
};

# Reduced description printed for -listall: only the -saveas option.
my $listall_options = {
    'name' => 'export.pl',
    'desc' => '{export.desc}',
    'args' => [ $saveas_argument ],
};
245
# Shorthand so the rest of this script can call gsprintf(...) without the
# package qualifier; every argument is handed on unchanged and the result
# of gsprintf::gsprintf is returned.
sub gsprintf {
    return gsprintf::gsprintf(@_);
}
250
251
&main();

# main --
# Parses the command line and then exports every collection named on it.
# For each collection it reads the collection configuration, loads the
# import plugins and the plugout selected by -saveas (or by the 'plugout'
# entry of the configuration), runs the plugins over the import directory
# and finally writes the export information file and the statistics.
#
# Takes no arguments (the options are read from @ARGV).
# Usage/listing requests and unparsable options end the script with
# die "\n".  A failure while processing one collection is caught, reported
# on STDERR, and processing continues with the next collection.
#
# File handles are handed to the other modules by name (as strings such as
# 'export::OUT'), which is why the script runs with "no strict 'refs'".
sub main {
    # command-line parameters
    my ($language, $verbosity, $importdir, $exportdir, $site, $keepold, $listall,
        $removeold, $saveas, $saveas_version, $debug, $OIDtype, $OIDmetadata,
        $maxdocs, $statsfile, $xsltfile, $mapping_file, $out, $faillog, $gs_mode, $collectcfg,
        $collectdir, $gli, $xslt_mets, $xslt_txt, $group_marc);
    my $xml = 0;

    # other vars
    my ($configfilename, $collection, $export_info_filename, $export_info, $processor, $pluginfo);

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV, $arguments, $hashParsingResult, "allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    # Copy the parsed options into the lexical variables above.  An explicit
    # table of references replaces a string eval; names that do not belong
    # to one of this script's options are ignored.
    my %option_target = (
        'saveas'         => \$saveas,
        'saveas_version' => \$saveas_version,
        'exportdir'      => \$exportdir,
        'importdir'      => \$importdir,
        'site'           => \$site,
        'collectdir'     => \$collectdir,
        'listall'        => \$listall,
        'debug'          => \$debug,
        'faillog'        => \$faillog,
        'keepold'        => \$keepold,
        'removeold'      => \$removeold,
        'language'       => \$language,
        'maxdocs'        => \$maxdocs,
        'OIDtype'        => \$OIDtype,
        'OIDmetadata'    => \$OIDmetadata,
        'out'            => \$out,
        'statsfile'      => \$statsfile,
        'xsltfile'       => \$xsltfile,
        'xslt_txt'       => \$xslt_txt,
        'xslt_mets'      => \$xslt_mets,
        'mapping_file'   => \$mapping_file,
        'group_marc'     => \$group_marc,
        'verbosity'      => \$verbosity,
        'gli'            => \$gli,
        'xml'            => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult) {
        my $target = $option_target{$strVariable};
        next unless defined $target;
        $$target = $hashParsingResult->{$strVariable};
    }

    # these are options used by other things - we just set default values
    # undef means will be set from config file if there
    my $gzip = undef;
    my $groupsize = 1;
    my $sortmeta = undef;

    my $explicit_exportdir = (defined $exportdir) ? 1 : 0;

    # save these command line settings. don't want config file settings in one
    # coll used for other colls
    my $global_removeold = $removeold;
    my $global_keepold = $keepold;
    # The export directory, failure log and statistics file are derived per
    # collection below, so remember what was actually asked for.
    my $cmdline_exportdir = (defined $exportdir) ? $exportdir : "";
    my $cmdline_faillog   = (defined $faillog)   ? $faillog   : "";

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($listall) {
        if ($xml) {
            &PrintUsage::print_xml_usage($listall_options);
        }
        else {
            &PrintUsage::print_txt_usage($listall_options, "{export.params}");
        }
        die "\n";
    }
    elsif ($xml) {
        &PrintUsage::print_xml_usage($options);
        die "\n";
    }

    # can have more than one collection name,
    # if the first extra option is -h, then output the help
    if (scalar(@ARGV) == 0 || $ARGV[0] =~ /^\-+h/) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # Always abort when the output file cannot be opened, whatever
        # gsprintf happens to return.
        unless (open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die "\n";
        }
        $out = 'export::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    while (scalar(@ARGV) > 0) {
        my $collect_name = shift @ARGV;
        $ENV{'GSDLCOLLECTION'} = $collect_name;

        eval {
            # get and check the collection name
            print STDERR "**** site = $site\n";
            if (($collection = &colcfg::use_collection($site, $collect_name, $collectdir)) eq "") {
                &PrintUsage::print_txt_usage($options, "{export.params}");
                die "\n";
            }
            # add collection's perllib dir into include path in
            # case we have collection specific modules
            unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

            # Work out the failure log for THIS collection.  The file name
            # and the handle name are kept in separate variables so that the
            # next collection does not mistake the handle name for a path.
            my $faillogname = $cmdline_faillog;
            if ($faillogname eq "") {
                $faillogname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
            }
            unless (open (FAILLOG, '>', $faillogname)) {
                &gsprintf(STDERR, "{export.cannot_open_fail_log}\n", $faillogname);
                die "\n";
            }
            my $faillog_handle = 'export::FAILLOG';
            $faillog_handle->autoflush(1);

            # check sortmeta
            $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
            if (defined $sortmeta && $groupsize > 1) {
                &gsprintf($out, "{export.cannot_sort}\n\n");
                $sortmeta = undef;
            }

            # get the list of plugins for this collection and set any options that
            # were specified in the collect.cfg (all export.pl options except
            # -collectdir, -out and -faillog may be specified in the collect.cfg (these
            # options must be known before we read the collect.cfg))
            my $plugins = [];
            my @global_opts = ();

            # Read in the collection configuration file.
            ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);

            if ($gs_mode eq "gs2") {
                $collectcfg = &colcfg::read_collect_cfg ($configfilename);
            }
            elsif ($gs_mode eq "gs3") {
                $collectcfg = &colcfg::read_collection_cfg_xml ($configfilename);
            }

            if (defined $collectcfg->{'plugin'}) {
                $plugins = $collectcfg->{'plugin'};
            }

            # Command-line values win; the configuration file only fills in
            # what the command line left unset.
            if (!defined $verbosity || $verbosity !~ /\d+/) {
                if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                    $verbosity = $collectcfg->{'verbosity'};
                }
                else {
                    $verbosity = 2; # the default
                }
            }
            $importdir = "" unless defined $importdir;
            $exportdir = "" unless defined $exportdir;
            if (defined $collectcfg->{'importdir'} && $importdir eq "") {
                $importdir = $collectcfg->{'importdir'};
            }
            if (defined $collectcfg->{'exportdir'} && $exportdir eq "") {
                $exportdir = $collectcfg->{'exportdir'};
            }

            if (defined $collectcfg->{'gzip'} && !$gzip) {
                if ($collectcfg->{'gzip'} =~ /^true$/i) {
                    $gzip = 1;
                }
            }
            if (!defined $maxdocs || $maxdocs !~ /\-?\d+/) {
                if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                    $maxdocs = $collectcfg->{'maxdocs'};
                }
                else {
                    $maxdocs = -1; # the default
                }
            }
            if ($groupsize == 1) {
                if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
                    $groupsize = $collectcfg->{'groupsize'};
                }
            }
            if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/)) {
                # Accept from the configuration file the same values that
                # the command line accepts.
                if (defined $collectcfg->{'OIDtype'}
                    && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
                    $OIDtype = $collectcfg->{'OIDtype'};
                }
                else {
                    $OIDtype = "hash"; # the default
                }
            }
            if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
                $sortmeta = $collectcfg->{'sortmeta'};
            }
            if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
                $debug = 1;
            }
            if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
                $gli = 1;
            }

            # global plugin stuff
            if (defined $collectcfg->{'separate_cjk'} && $collectcfg->{'separate_cjk'} =~ /^true$/i) {
                push @global_opts, "-separate_cjk";
            }

            ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($global_removeold, $global_keepold, 0, "export", $collectcfg);

            $gli = 0 unless defined $gli;

            print STDERR "<export>\n" if $gli;

            # fill in the default import and export directories if none
            # were supplied, turn all \ into / and remove trailing /
            $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
            $importdir =~ s/[\\\/]+/\//g;
            $importdir =~ s/\/$//;
            $exportdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export") if $exportdir eq "";
            $exportdir =~ s/[\\\/]+/\//g;
            $exportdir =~ s/\/$//;

            # load all the plugins
            $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog_handle, \@global_opts);

            if (scalar(@$pluginfo) == 0) {
                &gsprintf($out, "{import.no_plugins_loaded}\n");
                die "\n";
            }

            # remove the old contents of the export directory if needed
            if ($removeold && -e $exportdir) {
                &gsprintf($out, "{export.removing_export}\n");
                &util::rm_r ($exportdir);
            }

            # read the export information file

            # Export to DSpace Archive format or METs format
            # If saveas=DSpace, a "contents" file will be created, otherwise "export.inf"
            # (this covers METS, GA and MARCXML, and also the GAPlugout
            # fallback used below for an unrecognised -saveas value, so the
            # file name is never left undefined)

            # the plugouts should be doing this!!
            if (defined $saveas && $saveas eq "DSpace") {
                $export_info_filename = &util::filename_cat ($exportdir, "contents");
            }
            else {
                $export_info_filename = &util::filename_cat ($exportdir, "export.inf");
            }

            $export_info = arcinfo->new();
            $export_info->load_info ($export_info_filename);

            my ($plugout);
            if (defined $collectcfg->{'plugout'}) {
                $plugout = $collectcfg->{'plugout'};
            }
            else {
                if ($saveas !~ /^(GA|METS|DSpace|MARCXML)$/) {
                    push @$plugout, "GAPlugout";
                }
                else {
                    push @$plugout, $saveas."Plugout";
                }
            }

            my $plugout_name = $plugout->[0];

            # options understood by every plugout
            push @$plugout, ("-output_info", $export_info) if (defined $export_info);
            push @$plugout, ("-verbosity", $verbosity) if (defined $verbosity);
            push @$plugout, ("-debug") if ($debug);
            push @$plugout, ("-gzip_output", $gzip) if (defined $gzip);
            push @$plugout, ("-group_size", $groupsize) if (defined $groupsize);
            push @$plugout, ("-output_handle", $out) if (defined $out);
            push @$plugout, ("-xslt_file", $xsltfile) if (defined $xsltfile);
            # options that are only passed to one particular plugout
            push @$plugout, ("-group") if ($group_marc && $plugout_name =~ /^MARCXMLPlugout$/);
            push @$plugout, ("-mapping_file", $mapping_file) if (defined $mapping_file && $plugout_name =~ /^MARCXMLPlugout$/);
            push @$plugout, ("-saveas_version", $saveas_version) if (defined $saveas_version && $plugout_name =~ /^METSPlugout$/);
            push @$plugout, ("-xslt_mets", $xslt_mets) if (defined $xslt_mets && $plugout_name =~ /^METSPlugout$/);
            push @$plugout, ("-xslt_txt", $xslt_txt) if (defined $xslt_txt && $plugout_name =~ /^METSPlugout$/);
            $processor = &plugout::load_plugout($plugout);

            $processor->setoutputdir ($exportdir);

            $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
            $processor->set_OIDtype ($OIDtype, $OIDmetadata);

            &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

            # process the import directory
            &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs, 0, $gli);

            &plugin::end($pluginfo, $processor);

            &plugin::deinit($pluginfo, $processor);

            # write out the export information file
            $processor->close_file_output() if $groupsize > 1;
            $processor->close_group_output() if $processor->is_group();
            if ($saveas eq "METS") {
                $export_info->save_info($export_info_filename);
            }

            # write out export stats
            # The handle name is kept apart from $statsfile so that the file
            # is reopened by its real name for the next collection, and it
            # names the handle in THIS package (export), which is where the
            # bareword STATS below is opened.
            my $stats_handle = $statsfile;
            my $close_stats = 0;
            if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
                if (open (STATS, '>', $statsfile)) {
                    $stats_handle = 'export::STATS';
                    $close_stats = 1;
                }
                else {
                    &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                    &gsprintf($out, "{import.stats_backup}\n");
                    $stats_handle = 'STDERR';
                }
            }

            &gsprintf($out, "\n");
            &gsprintf($out, "*********************************************\n");

            &plugin::write_stats($pluginfo, $stats_handle, $faillogname, $gli);
            if ($close_stats) {
                close STATS;
            }

            &gsprintf($out, "*********************************************\n");

            close FAILLOG;
        };

        if ($@) {
            print STDERR $@;
        }

##      $ENV{'GSDLCOLLECTION'} = undef;
        # Forget the per-collection directories so that the next collection
        # gets its own defaults instead of inheriting this collection's.
        $importdir = "";
        $exportdir = $cmdline_exportdir;
        # NOTE(review): $removeold is recomputed from the saved command-line
        # values at the start of every iteration, so this assignment has no
        # visible effect; kept as in the original pending confirmation of
        # the intended behaviour.
        $removeold = 0 if ($explicit_exportdir);

    } # while processing ARGV

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "* {export.complete}\n");
    &gsprintf($out, "*********************************************\n");

    # Only close the output file once everything, including the banner
    # above, has been written to it.
    close OUT if $close_out;
}
Note: See TracBrowser for help on using the repository browser.