source: main/trunk/greenstone2/bin/script/import.pl@ 32528

Last change on this file since 32528 was 32528, checked in by ak19, 6 years ago

Related to previous commit, part 2 of commit. Renamed new MySQLPlugout to GreenstoneSQLPlugout to indicate that it deals with the internal Greenstone doc format and to match with the in-progress GreenstoneSQLPlugin (whereas the extant DatabasePlugin, like most other plugins, appears to work with external document formats, database records in DatabasePlugin's case).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.2 KB
RevLine 
[14031]1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
BEGIN {
    # Both variables are required before any Greenstone library can be found.
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};

    # Prepend the four standard library sub-directories under $prefix to
    # @INC. Each unshift lands in front of the previous one, so within a
    # prefix 'perllib/plugouts' ends up first and 'perllib' last - exactly
    # the order the hand-unrolled version produced.
    my $add_lib_dirs = sub {
        my ($prefix) = @_;
        foreach my $subdir ('perllib', 'perllib/cpan', 'perllib/plugins', 'perllib/plugouts') {
            unshift (@INC, "$prefix/$subdir");
        }
    };

    # Core Greenstone libraries
    $add_lib_dirs->($ENV{'GSDLHOME'});

    # Greenstone 2 extensions (colon separated list of extension names)
    if (defined $ENV{'GSDLEXTS'}) {
        foreach my $e (split(/:/, $ENV{'GSDLEXTS'})) {
            $add_lib_dirs->("$ENV{'GSDLHOME'}/ext/$e");
        }
    }

    # Greenstone 3 extensions live under GSDL3SRCHOME rather than GSDLHOME
    if (defined $ENV{'GSDL3EXTS'}) {
        foreach my $e (split(/:/, $ENV{'GSDL3EXTS'})) {
            $add_lib_dirs->("$ENV{'GSDL3SRCHOME'}/ext/$e");
        }
    }

    # Only switch STDERR to utf8 when DEBUG_UNICODE is set to a true value.
    # (The condition used to test 'defined' twice, so DEBUG_UNICODE=0 or an
    # empty value also enabled it.)
    if ((defined $ENV{'DEBUG_UNICODE'}) && ($ENV{'DEBUG_UNICODE'})) {
        binmode(STDERR,":utf8");
    }
}
68
[27305]69# Pragma
[14957]70use strict;
[31132]71no strict 'subs'; # allow barewords (eg STDERR) as function arguments
[27305]72use warnings;
73
74# Modules
75use Symbol qw<qualify>; # Needed for runtime loading of modules [jmt12]
76
77# Greenstone Modules
78use FileUtils;
[18456]79use inexport;
[27305]80use util;
[28640]81use gsprintf 'gsprintf';
[14031]82
83
# Output formats that the saveas argument may take
my $saveas_list = [
    {
        'name' => 'GreenstoneXML',
        'desc' => '{export.saveas.GreenstoneXML}',
    },
    {
        'name' => 'GreenstoneMETS',
        'desc' => '{export.saveas.GreenstoneMETS}',
    },
    {
        'name' => 'GreenstoneSQL',
        'desc' => '{export.saveas.GreenstoneSQL}',
    },
    {
        'name' => 'GreenstoneSQL_metadata_only',
        'desc' => '{export.saveas.GreenstoneSQL.meta_only}',
    },
    {
        'name' => 'GreenstoneSQL_fulltext_only',
        'desc' => '{export.saveas.GreenstoneSQL.text_only}',
    },
];


# Attributes an argument description may carry:
#   name:      the name of the argument
#   desc:      a description (or, more likely, a reference to a description)
#   type:      the kind of control used to represent the argument, e.g.
#              string, int, flag, regexp, metadata, language, enum
#   reqd:      is this argument required?
#   hiddengli: is this argument hidden in GLI?
#   modegli:   the lowest detail mode at which GLI shows this argument

my $saveas_argument = {
    'name'    => 'saveas',
    'desc'    => '{import.saveas}',
    'type'    => 'enum',
    'list'    => $saveas_list,
    'deft'    => 'GreenstoneXML',
    'reqd'    => 'no',
    'modegli' => '3',
};


# Everything import.pl accepts: its own arguments, with the argument lists
# exported by inexport spliced in at their original positions
my $arguments = [
    $saveas_argument,
    {
        'name' => 'saveas_options',
        'desc' => '{import.saveas_options}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {
        'name'    => 'sortmeta',
        'desc'    => '{import.sortmeta}',
        'type'    => 'string',
        #'type'   => 'metadata', #doesn't work properly in GLI
        'reqd'    => 'no',
        'modegli' => '2',
    },
    {
        'name'    => 'removeprefix',
        'desc'    => '{BasClas.removeprefix}',
        'type'    => 'regexp',
        'deft'    => '',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {
        'name'    => 'removesuffix',
        'desc'    => '{BasClas.removesuffix}',
        'type'    => 'regexp',
        'deft'    => '',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {
        'name'    => 'groupsize',
        'desc'    => '{import.groupsize}',
        'type'    => 'int',
        'deft'    => '1',
        'reqd'    => 'no',
        'modegli' => '2',
    },
    {
        'name'      => 'archivedir',
        'desc'      => '{import.archivedir}',
        'type'      => 'string',
        'reqd'      => 'no',
        'deft'      => 'archives',
        'hiddengli' => 'yes',
    },
    @$inexport::directory_arguments,
    {
        'name'    => 'gzip',
        'desc'    => '{import.gzip}',
        'type'    => 'flag',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    @$inexport::arguments,
    {
        'name'    => 'NO_IMPORT',
        'desc'    => '{import.NO_IMPORT}',
        'type'    => 'flag',
        'reqd'    => 'no',
        'modegli' => '3',
    },
];

# Top-level description of this script handed to the inexport constructor
my $options = {
    'name' => 'import.pl',
    'desc' => '{import.desc}',
    'args' => $arguments,
};

# Maps '-argname' to the name of the inexport subclass that declared it;
# filled in by _scanForSubclasses()
my $function_to_inexport_subclass_mappings = {};
[14031]173
# @function main()
# Entry point of import.pl: registers arguments contributed by extension
# specific inexport subclasses, picks the inexport implementation to use
# based on the command line, and then runs the import for the collection.
# Exits with status 0 when importing is disabled via NO_IMPORT, and with
# status 1 when arguments belonging to two different inexport subclasses
# are mixed on the command line.
sub main
{
    # Dynamically include arguments from any subclasses of inexport we find
    # in the extensions directory
    if (defined $ENV{'GSDLEXTS'})
    {
        &_scanForSubclasses($ENV{'GSDLHOME'}, $ENV{'GSDLEXTS'});
    }
    if (defined $ENV{'GSDL3EXTS'})
    {
        &_scanForSubclasses($ENV{'GSDL3SRCHOME'}, $ENV{'GSDL3EXTS'});
    }

    # Loop through arguments, checking to see if any depend on a specific
    # subclass of InExport. Note that we load the first subclass we encounter
    # so only support a single 'override' ATM.
    my $inexport_subclass;
    foreach my $argument (@ARGV)
    {
        # -NO_IMPORT on the command line is a deliberate no-op, not an error
        if ($argument eq "-NO_IMPORT") {
            &gsprintf(STDERR, "{import.NO_IMPORT_set}\n\n");
            exit 0;
        }

        # proper arguments start with a hyphen
        next unless ($argument =~ /^-/ && defined $function_to_inexport_subclass_mappings->{$argument});

        my $required_inexport_subclass = $function_to_inexport_subclass_mappings->{$argument};
        if (!defined $inexport_subclass)
        {
            $inexport_subclass = $required_inexport_subclass;
        }
        # The user has included specific arguments from two different
        # inexport subclasses... this isn't supported
        elsif ($inexport_subclass ne $required_inexport_subclass)
        {
            print STDERR "Error! You cannot specify arguments from two different extention specific inexport modules: " . $inexport_subclass . " != " . $required_inexport_subclass . "\n";
            # This is a failure, so report it to the caller with a non-zero
            # status (a bare 'exit' would signal success)
            exit 1;
        }
    }

    my $inexport;
    if (defined $inexport_subclass)
    {
        print "* Loading Overriding InExport Module: " . $inexport_subclass . "\n";
        require $inexport_subclass . '.pm';
        $inexport = $inexport_subclass->new("import",\@ARGV,$options);
    }

    # We don't have an overridden inexport, or its constructor returned
    # nothing, so fall back to the base inexport class
    if (!defined $inexport)
    {
        $inexport = inexport->new("import",\@ARGV,$options);
    }

    my $collection = $inexport->get_collection();

    if (defined $collection)
    {
        my ($config_filename,$collect_cfg) = $inexport->read_collection_cfg($collection,$options);

        # The collection configuration can also switch importing off
        if ($collect_cfg->{'NO_IMPORT'}) {
            &gsprintf(STDERR, "{import.NO_IMPORT_set}\n\n");
            exit 0;
        }

        # Use the import specific version defined in this file, which in
        # turn calls $inexport->set_collection_options()
        &set_collection_options($inexport, $collect_cfg);

        my $pluginfo = $inexport->process_files($config_filename,$collect_cfg);

        $inexport->generate_statistics($pluginfo);
    }

    $inexport->deinit();
}
# main()
251
# @function _scanForSubclasses()
# Looks in each listed extension for a module named <ext>inexport.pm (with
# any non-letter characters removed from the name). Each one found is loaded
# and asked, via its getSupportedArguments() function, for the extra
# arguments it supports. Those arguments are appended to the arguments
# accepted by import.pl and recorded against the subclass that supplied them.
# @param $dir The extension directory to look within
# @param $exts A list of the available extensions (as a colon separated string)
# @return The number of subclasses of InExport found as an Integer
sub _scanForSubclasses
{
    my ($dir, $exts) = @_;
    my $inexport_class_count = 0;
    my $ext_prefix = &FileUtils::filenameConcatenate($dir, "ext");
    my @extensions = split(/:/, $exts);
    foreach my $e (@extensions)
    {
        # - any subclass of InExport must be prefixed with the name of the ext
        my $package_name = $e . 'inexport';
        $package_name =~ s/[^a-z]//gi; # package names have limited characters
        my $inexport_filename = $package_name . '.pm';
        my $inexport_path = &FileUtils::filenameConcatenate($ext_prefix, $e, 'perllib', $inexport_filename);

        # see if we have a subclass of InExport lurking in that extension folder
        next unless (-f $inexport_path);

        # - note we load the filename (with pm) unlike normal modules
        require $inexport_filename;

        # - look up getSupportedArguments() in the newly loaded package.
        #   Resolving a function from its name is a symbolic reference, so
        #   relax only 'refs', and only for the lookup itself
        my $symbol = qualify('getSupportedArguments', $package_name);
        my $get_supported_arguments;
        {
            no strict 'refs';
            # - check the function actually exists before taking a
            #   reference to it
            if ( defined &{$symbol} )
            {
                $get_supported_arguments = \&{$symbol};
            }
        }

        if (!defined $get_supported_arguments)
        {
            print "Warning! A subclass of InExport module (named '" . $inexport_filename . "') does not implement the required getSupportedArguments() function - ignoring. Found in: " . $inexport_path . "\n";
            next;
        }

        my $extra_arguments = $get_supported_arguments->();
        foreach my $argument (@{$extra_arguments})
        {
            # - record a mapping from each extra argument to the inexport
            #   class that supports it. We put the hyphen on here to make
            #   comparing with command line arguments even easier
            $function_to_inexport_subclass_mappings->{'-' . $argument->{'name'}} = $package_name;
            # - and then add them as acceptable arguments to import.pl
            push(@{$options->{'args'}}, $argument);
        }
        $inexport_class_count++;
    }
    return $inexport_class_count;
}
# _scanForSubclasses()
[14031]303
# Look up collect.cfg for the import specific options, then call the
# inexport version for the common ones.
# @param $inexport The inexport object whose option fields are updated in place
# @param $collectcfg Hash reference holding the collection configuration settings
sub set_collection_options
{
    my ($inexport, $collectcfg) = @_;
    my $out = $inexport->{'out'};

    # check all options for default_optname - this will be set if the parsing
    # code has just set the value based on the arg default. In this case,
    # check in collect.cfg for the option

    # groupsize can only be defined for import, not export, and actually only
    # applies to GreenstoneXML format.
    if (defined $inexport->{'default_groupsize'}) {
        # Only accept a value that is entirely digits; an unanchored match
        # would also let through things like "x5y"
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /^\s*(\d+)\s*$/) {
            $inexport->{'groupsize'} = $1;
        }
    }

    if (defined $inexport->{'default_saveas'}) {
        # NOTE(review): only the XML and METS formats are honoured from
        # collect.cfg here; the GreenstoneSQL formats offered on the command
        # line fall back to the default - confirm this is intended
        if (defined $collectcfg->{'saveas'}
            && $collectcfg->{'saveas'} =~ /^(GreenstoneXML|GreenstoneMETS)$/) {
            $inexport->{'saveas'} = $collectcfg->{'saveas'};
        } else {
            $inexport->{'saveas'} = "GreenstoneXML"; # the default
        }
    }

    if (!defined $inexport->{'saveas_options'} || $inexport->{'saveas_options'} eq "") {
        if (defined $collectcfg->{'saveas_options'}) {
            $inexport->{'saveas_options'} = $collectcfg->{'saveas_options'};
        }
    }

    # An unset option counts as empty; test for that explicitly rather than
    # comparing undef with "" (which warns)
    my $sortmeta = $inexport->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && defined $inexport->{'groupsize'} && $inexport->{'groupsize'} > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }
    if (defined $sortmeta) {
        &gsprintf($out, "{import.sortmeta_paired_with_ArchivesInfPlugin}\n\n");
    }
    $inexport->{'sortmeta'} = $sortmeta;

    # removeprefix/removesuffix: collect.cfg only applies when nothing was
    # given on the command line
    foreach my $optname ('removeprefix', 'removesuffix') {
        next unless defined $collectcfg->{$optname};
        if (!defined $inexport->{$optname} || $inexport->{$optname} eq "") {
            $inexport->{$optname} = $collectcfg->{$optname};
        }
    }

    # Let inexport deal with the options shared by import and export
    $inexport->set_collection_options($collectcfg);
}
# Kick off the import
main();
Note: See TracBrowser for help on using the repository browser.