source: trunk/gsdl/bin/script/export.pl@ 12691

Last change on this file since 12691 was 12691, checked in by kjdon, 18 years ago

added OIDtype and OIDmetadata to the option list. it was using OIDtype anyway, just not allowing it as an option. added assigned and dirname to the options.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.6 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# export.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will export a particular collection into a specific Format (e.g. METS or DSpace)
30# Author: Chi-Yu Huang Date: 08-10-2004
31
32package export;
33
BEGIN {
    # The script cannot locate the Greenstone libraries without these two
    # environment variables, so refuse to start if either is missing.
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    # Put the Greenstone library directories at the front of @INC.  Each
    # unshift goes in front of the previous one, so the last directory
    # listed here ends up being searched first.
    foreach my $subdir ("", "/cpan", "/plugins", "/plugouts", "/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/perllib$subdir");
    }
}
43
44use strict;
45no strict 'refs'; # allow filehandles to be variables and vice versa
46no strict 'subs'; # allow barewords (eg STDERR) as function arguments
47
48use arcinfo;
49use colcfg;
50use plugin;
51use plugout;
52use docprint;
53use util;
54use scriptutil;
55use FileHandle;
56use gsprintf;
57use printusage;
58use parse2;
59
60
# Values accepted by the -OIDtype option.  Each entry pairs the option value
# with the resource-bundle key of its description.
my $oidtype_list =
    [ map { +{ 'name' => $_, 'desc' => "{import.OIDtype.$_}" } }
      qw(hash incremental assigned dirname) ];

# Values accepted by the -saveas option, i.e. the formats a collection can
# be exported to.
my $saveas_list =
    [ map { +{ 'name' => $_, 'desc' => "{export.saveas.$_}" } }
      qw(DSpace METS GA MARCXML) ];


# Attributes that may appear in an argument description:
#   name:      the name of the argument
#   desc:      a description (or, more likely, a reference to a description)
#              for this argument
#   type:      the type of control used to represent the argument, e.g.
#              string, int, flag, regexp, metadata, language, enum
#   reqd:      is this argument required?
#   hiddengli: is this argument hidden in GLI?
#   modegli:   the lowest detail mode this argument is visible at in GLI

# The -saveas option is described on its own because it is shared between
# the full option list and the reduced list printed for -listall.
my $saveas_argument = {
    'name'    => "saveas",
    'desc'    => "{export.saveas}",
    'type'    => "enum",
    'list'    => $saveas_list,
    'deft'    => "METS",
    'reqd'    => "no",
    'modegli' => "3",
};
100
101
# Every command line option understood by export.pl, in the order in which
# they are listed in the usage output.
my $arguments = [
    # --- output format ---
    $saveas_argument,
    { 'name' => 'saveas_version', 'desc' => '{export.saveas_version}',
      'type' => 'string', 'reqd' => 'no', 'deft' => 'greenstone' },

    # --- directories ---
    { 'name' => 'exportdir', 'desc' => '{export.exportdir}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'importdir', 'desc' => '{import.importdir}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'collectdir', 'desc' => '{export.collectdir}',
      'type' => 'string',
      'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
      'reqd' => 'no', 'hiddengli' => 'yes' },

    # --- general behaviour ---
    { 'name' => 'listall', 'desc' => '{export.listall}',
      'type' => 'flag', 'reqd' => 'no' },
    { 'name' => 'debug', 'desc' => '{export.debug}',
      'type' => 'flag', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'faillog', 'desc' => '{export.faillog}',
      'type' => 'string', 'deft' => '', 'reqd' => 'no', 'modegli' => '4' },
    { 'name' => 'keepold', 'desc' => '{export.keepold}',
      'type' => 'flag', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'removeold', 'desc' => '{export.removeold}',
      'type' => 'flag', 'reqd' => 'no', 'modegli' => '3' },
    { 'name' => 'language', 'desc' => '{scripts.language}',
      'type' => 'string', 'reqd' => 'no', 'modegli' => '4' },
    { 'name' => 'maxdocs', 'desc' => '{export.maxdocs}',
      'type' => 'int', 'reqd' => 'no', 'range' => '1,', 'modegli' => '1' },

    # --- document identifiers ---
    # No 'deft' for OIDtype: the parser leaves "" as the default and main()
    # falls back to the collection configuration, then to "hash".
    { 'name' => 'OIDtype', 'desc' => '{import.OIDtype}',
      'type' => 'enum', 'list' => $oidtype_list,
      'reqd' => 'no', 'modegli' => '3' },
    { 'name' => 'OIDmetadata', 'desc' => '{import.OIDmetadata}',
      'type' => 'metadata', 'deft' => 'dc.Identifier',
      'reqd' => 'no', 'modegli' => '3' },

    # --- where messages and statistics go ---
    { 'name' => 'out', 'desc' => '{export.out}',
      'type' => 'string', 'deft' => 'STDERR',
      'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'statsfile', 'desc' => '{export.statsfile}',
      'type' => 'string', 'deft' => 'STDERR',
      'reqd' => 'no', 'hiddengli' => 'yes' },

    # --- options handed on to the plugouts ---
    { 'name' => 'xsltfile', 'desc' => '{BasPlugout.xslt_file}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'xslt_txt', 'desc' => '{METSPlugout.xslt_txt}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'no' },
    { 'name' => 'xslt_mets', 'desc' => '{METSPlugout.xslt_mets}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'no' },
    { 'name' => 'mapping_file', 'desc' => '{MARCXMLPlugout.mapping_file}',
      'type' => 'string', 'reqd' => 'no', 'hiddengli' => 'no' },
    { 'name' => 'group_marc', 'desc' => '{MARCXMLPlugout.group}',
      'type' => 'flag', 'reqd' => 'no', 'hiddengli' => 'no' },

    # --- reporting ---
    { 'name' => 'verbosity', 'desc' => '{export.verbosity}',
      'type' => 'int', 'range' => '0,3', 'deft' => '2',
      'reqd' => 'no', 'modegli' => '4' },
    { 'name' => 'gli', 'desc' => '',
      'type' => 'flag', 'reqd' => 'no', 'hiddengli' => 'yes' },
    { 'name' => 'xml', 'desc' => '{scripts.xml}',
      'type' => 'flag', 'reqd' => 'no', 'hiddengli' => 'yes' },
];

# Full description of the script, used for the normal usage output.
my $options = {
    'name' => "export.pl",
    'desc' => "{export.desc}",
    'args' => $arguments,
};

# Reduced description printed for -listall: only the -saveas option.
my $listall_options = {
    'name' => "export.pl",
    'desc' => "{export.desc}",
    'args' => [ $saveas_argument ],
};
239
# Local shorthand so the rest of this file can write &gsprintf(...) instead
# of the fully qualified name.  All arguments are forwarded unchanged and
# the result of gsprintf::gsprintf is handed back to the caller.
sub gsprintf
{
    &gsprintf::gsprintf(@_);
}
244
245
&main();

# main -- entry point of export.pl.
#
# Parses the command line, then exports every collection named in the
# remaining arguments: the collection's collect.cfg is read, its plugins
# are loaded, the import directory is run through them and the resulting
# documents are written to the export directory by the plugout selected
# with -saveas (or by docprint when -debug is given).
#
# Takes no arguments and returns nothing useful.  Dies (after printing the
# usage text) when the command line is invalid, when usage output was
# requested, or when the -out file cannot be opened.  A failure while
# processing one collection is reported on STDERR and the next collection
# is still attempted.
sub main {
    # params -- these are filled in *by name* from the parsed command line
    # (see the string eval below), so the names must match $arguments.
    my ($language, $verbosity, $importdir, $exportdir, $keepold, $listall,
        $removeold, $saveas, $saveas_version, $debug, $OIDtype, $OIDmetadata,
        $maxdocs, $statsfile, $xsltfile, $mapping_file, $out, $faillog,
        $collectdir, $gli, $xslt_mets, $xslt_txt, $group_marc);
    my $xml = 0;

    # other vars
    my ($configfilename, $collection, $export_info_filename, $export_info, $processor, $pluginfo);
    my $service = "export";

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    # Copy each parsed option into the lexical variable of the same name.
    # A string eval is needed because the variable is selected by name at
    # run time; the names come from $arguments, not from user input.
    foreach my $strVariable (keys %$hashParsingResult)
    {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }


    # these are options used by other things - we just set default values
    # undef means will be set from config file if there
    my $gzip = undef;
    my $groupsize = 1;
    #my $OIDtype = undef;
    my $sortmeta = undef;

    my $explicit_exportdir = (defined $exportdir) ? 1 : 0;

    # save these command line settings. don't want config file settings in one
    # coll used for other colls
    # does this apply to other vars???
    my $global_removeold = $removeold;
    my $global_keepold = $keepold;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($listall) {
        if ($xml) {
            &PrintUsage::print_xml_usage($listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($listall_options,"{export.params}");
        }
        die "\n";
    }
    elsif ($xml) {
        &PrintUsage::print_xml_usage($options);
        die "\n";
    }

    # can have more than one collection name,
    # if the first extra option is -h, then output the help
    if (scalar(@ARGV) == 0 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    # Messages go to STDERR/STDOUT or to the file named by -out.  From here
    # on $out holds the *name* of the handle to write to.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        unless (open (OUT, '>', $out)) {
            # report and always die, whatever gsprintf returns
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        }
        $out = 'export::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    while (scalar(@ARGV)>0) {
        my $collect_name = shift @ARGV;
        $ENV{'GSDLCOLLECTION'} = $collect_name;

        eval {
            # get and check the collection name
            if (($collection = &util::use_collection($collect_name, $collectdir)) eq "") {
                &PrintUsage::print_txt_usage($options, "{export.params}");
                die "\n";
            }
            # add collection's perllib dir into include path in
            # case we have collection specific modules
            unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

            # Work out the fail log for *this* collection.  $faillog itself
            # is left untouched so that the next collection starts from the
            # command line value again rather than from a handle name.
            my $faillogname = $faillog;
            if (!defined $faillogname || $faillogname eq "") {
                $faillogname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
            }
            unless (open (FAILLOG, '>', $faillogname)) {
                &gsprintf(STDERR, "{export.cannot_open_fail_log}\n", $faillogname);
                die;
            }
            my $faillog_handle = 'export::FAILLOG';
            $faillog_handle->autoflush(1);

            # check sortmeta
            $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
            if (defined $sortmeta && $groupsize > 1) {
                &gsprintf($out, "{export.cannot_sort}\n\n");
                $sortmeta = undef;
            }

            # get the list of plugins for this collection and set any options that
            # were specified in the collect.cfg (all export.pl options except
            # -collectdir, -out and -faillog may be specified in the collect.cfg (these
            # options must be known before we read the collect.cfg))
            my $plugins = [];
            my @global_opts = ();

            $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
            if (!-e $configfilename) {
                &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
                die;
            }

            my $collectcfg = &colcfg::read_collect_cfg ($configfilename);
            if (defined $collectcfg->{'plugin'}) {
                $plugins = $collectcfg->{'plugin'};
            }

            # Command line values win; the collect.cfg only fills in what
            # was not given, and built-in defaults come last.
            if (!defined $verbosity || $verbosity !~ /\d+/) {
                if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                    $verbosity = $collectcfg->{'verbosity'};
                } else {
                    $verbosity = 2; # the default
                }
            }
            if (defined $collectcfg->{'importdir'} && (!defined $importdir || $importdir eq "")) {
                $importdir = $collectcfg->{'importdir'};
            }
            if (defined $collectcfg->{'exportdir'} && (!defined $exportdir || $exportdir eq "")) {
                $exportdir = $collectcfg->{'exportdir'};
            }

            if (defined $collectcfg->{'gzip'} && !$gzip) {
                if ($collectcfg->{'gzip'} =~ /^true$/i) {
                    $gzip = 1;
                }
            }
            if (!defined $maxdocs || $maxdocs !~ /\-?\d+/) {
                if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                    $maxdocs = $collectcfg->{'maxdocs'};
                } else {
                    $maxdocs = -1; # the default
                }
            }
            if ($groupsize == 1) {
                if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
                    $groupsize = $collectcfg->{'groupsize'};
                }
            }
            if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/)) {
                # accept the same set of values from collect.cfg as from
                # the command line
                if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
                    $OIDtype = $collectcfg->{'OIDtype'};
                } else {
                    $OIDtype = "hash"; # the default
                }
            }
            if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
                $sortmeta = $collectcfg->{'sortmeta'};
            }
            if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
                $debug = 1;
            }
            if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
                $gli = 1;
            }

            # global plugin stuff
            if (defined $collectcfg->{'separate_cjk'} && $collectcfg->{'separate_cjk'} =~ /^true$/i) {
                push @global_opts, "-separate_cjk";
            }

            ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($global_removeold, $global_keepold, "export", $collectcfg);

            $gli = 0 unless defined $gli;

            print STDERR "<export>\n" if $gli;

            # fill in the default import and export directories if none
            # were supplied, turn all \ into / and remove trailing /
            $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import")
                if (!defined $importdir || $importdir eq "");
            $importdir =~ s/[\\\/]+/\//g;
            $importdir =~ s/\/$//;
            $exportdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export")
                if (!defined $exportdir || $exportdir eq "");
            $exportdir =~ s/[\\\/]+/\//g;
            $exportdir =~ s/\/$//;

            # load all the plugins
            $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog_handle, \@global_opts);

            if (scalar(@$pluginfo) == 0) {
                &gsprintf($out, "{import.no_plugins_loaded}\n");
                die "\n";
            }

            # remove the old contents of the export directory if needed
            if ($removeold && -e $exportdir) {
                &gsprintf($out, "{export.removing_export}\n");
                &util::rm_r ($exportdir);
            }

            # read the export information file
            if (!$debug) {
                # Export to DSpace Archive format or METs format
                # If saveas=DSpace, a "contents" file will be created, otherwise "export.inf"
                # (every non-DSpace format, MARCXML included, uses
                # export.inf, so the file name is never left undefined)
                if ($saveas eq "DSpace") {
                    $export_info_filename = &util::filename_cat ($exportdir, "contents");
                } else {
                    $export_info_filename = &util::filename_cat ($exportdir, "export.inf");
                }

                $export_info = arcinfo->new();
                $export_info->load_info ($export_info_filename);

                # an unrecognised -saveas falls back to the plugout named
                # in collect.cfg, or to GAPlugout
                if ($saveas !~ /^(GA|METS|DSpace|MARCXML)$/) {
                    if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'}[0] =~ /^(GAPlugout|METSPlugout)$/) {
                        $saveas = $collectcfg->{'plugout'}[0];
                    } else {
                        $saveas ="GAPlugout";
                    }
                }


                ####Use Plugout####
                my ($plugout_name);
                if ($saveas !~ /^(GA|METS|DSpace|MARCXML)Plugout$/ ) {
                    $plugout_name = $saveas."Plugout";
                }
                else {
                    $plugout_name = $saveas;
                }

                # options common to all plugouts first, then the ones that
                # only make sense for a particular plugout
                my $opts = [];

                push @$opts,("-output_info",$export_info) if (defined $export_info);
                push @$opts,("-verbosity",$verbosity) if (defined $verbosity);
                push @$opts,("-gzip_output",$gzip) if (defined $gzip);
                push @$opts,("-group_size",$groupsize) if (defined $groupsize);
                push @$opts,("-output_handle",$out) if (defined $out);
                push @$opts,("-xslt_file",$xsltfile) if (defined $xsltfile);
                push @$opts,("-group") if ($group_marc && $plugout_name =~ /^MARCXMLPlugout$/);
                push @$opts,("-mapping_file",$mapping_file) if (defined $mapping_file && $plugout_name =~ /^MARCXMLPlugout$/);
                push @$opts,("-saveas_version",$saveas_version) if (defined $saveas_version && $plugout_name =~ /^METSPlugout$/);
                push @$opts,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $plugout_name =~ /^METSPlugout$/);
                push @$opts,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $plugout_name =~ /^METSPlugout$/);
                $processor = &plugout::load_plugout($plugout_name,$opts);

                $processor->setoutputdir ($exportdir);

                $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
                $processor->set_OIDtype ($OIDtype, $OIDmetadata);

            } else {
                # -debug: print the documents instead of exporting them
                $processor = docprint->new();
            }

            &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

            # process the import directory
            &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs,0, $gli);

            &plugin::end($pluginfo, $processor);

            &plugin::deinit($pluginfo, $processor);

            # write out the export information file
            if (!$debug) {
                $processor->close_file_output() if $groupsize > 1;
                $processor->close_group_output() if $processor->is_group();
                if ($saveas eq "METS") {
                    $export_info->save_info($export_info_filename);
                }
            }

            # write out export stats.  $statsfile keeps the command line
            # value; $stats_handle is the handle name used for this
            # collection only.
            my $close_stats = 0;
            my $stats_handle = $statsfile;
            if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
                if (open (STATS, '>', $statsfile)) {
                    # STATS is opened in this package, i.e. export::STATS
                    $stats_handle = 'export::STATS';
                    $close_stats = 1;
                } else {
                    &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                    &gsprintf($out, "{import.stats_backup}\n");
                    $stats_handle = 'STDERR';
                }
            }

            &gsprintf($out, "\n");
            &gsprintf($out, "*********************************************\n");

            &plugin::write_stats($pluginfo, $stats_handle, $faillogname, $gli);
            if ($close_stats) {
                close STATS;
            }

            &gsprintf($out, "*********************************************\n");

            close FAILLOG;
        };

        if ($@) {
            print STDERR $@;
        }

##      $ENV{'GSDLCOLLECTION'} = undef;
        # don't carry one collection's directories over to the next
        $importdir = "";
        $exportdir = "" unless $explicit_exportdir;
        $removeold = 0 if ($explicit_exportdir);

    } # while processing ARGV

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "* {export.complete}\n");
    &gsprintf($out, "*********************************************\n");

    # The -out file stays open across all collections and the closing
    # banner, so it is only closed here.
    close OUT if $close_out;
}
Note: See TracBrowser for help on using the repository browser.