root/main/trunk/greenstone2/bin/script/import.pl @ 32518

Revision 32518, 12.1 KB (checked in by ak19, 13 months ago)

Untested first attempt at MySQLPlugout, which is meant to write metadata and/or full text to a MySQL database rather than to doc.xml. We are switching over to the Perl MySQL libraries (DBD::mysql for the driver and DBI for database access). However, I still want to commit this initial attempt at doing the database operations manually in Perl.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
BEGIN {
    # Both variables are mandatory: GSDLHOME locates the Greenstone perl
    # libraries, and the script refuses to start without GSDLOS as well.
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};

    # Library sub-directories added to @INC beneath every root. Each one is
    # unshifted in turn, so the LAST entry listed here ends up being searched
    # FIRST (this matches the original one-unshift-per-line ordering).
    my @lib_subdirs = ('perllib', 'perllib/cpan', 'perllib/plugins', 'perllib/plugouts');

    foreach my $subdir (@lib_subdirs) {
        unshift (@INC, "$ENV{'GSDLHOME'}/$subdir");
    }

    # Extensions are listed as colon separated names in GSDLEXTS (rooted at
    # GSDLHOME) and GSDL3EXTS (rooted at GSDL3SRCHOME).
    foreach my $ext_source (['GSDLEXTS', 'GSDLHOME'], ['GSDL3EXTS', 'GSDL3SRCHOME']) {
        my ($exts_var, $home_var) = @{$ext_source};
        next unless defined $ENV{$exts_var};
        # Without a root directory the paths would degrade to "/ext/<name>/..."
        # at the filesystem root (plus an uninitialized-value warning), so
        # skip the extension list entirely in that case.
        next unless defined $ENV{$home_var};

        foreach my $e (split(/:/, $ENV{$exts_var})) {
            my $ext_prefix = "$ENV{$home_var}/ext/$e";
            foreach my $subdir (@lib_subdirs) {
                unshift (@INC, "$ext_prefix/$subdir");
            }
        }
    }

    # Any defined value of DEBUG_UNICODE (even an empty one) switches STDERR
    # to UTF-8 output.
    if (defined $ENV{'DEBUG_UNICODE'}) {
        binmode(STDERR,":utf8");
    }
}
68
69# Pragma
70use strict;
71no strict 'subs'; # allow barewords (eg STDERR) as function arguments
72use warnings;
73
74# Modules
75use Symbol qw<qualify>; # Needed for runtime loading of modules [jmt12]
76
77# Greenstone Modules
78use FileUtils;
79use inexport;
80use util;
81use gsprintf 'gsprintf';
82
83
# The output file formats that can be chosen with -saveas. Each entry pairs
# the enum value ('name') with the resource key of its description ('desc').
my $saveas_list = [
    { 'name' => 'GreenstoneXML',       'desc' => '{export.saveas.GreenstoneXML}' },
    { 'name' => 'GreenstoneMETS',      'desc' => '{export.saveas.GreenstoneMETS}' },
    { 'name' => 'MySQL',               'desc' => '{export.saveas.MySQL}' },
    { 'name' => 'MySQL_metadata_only', 'desc' => '{export.saveas.MySQL.meta_only}' },
    { 'name' => 'MySQL_fulltext_only', 'desc' => '{export.saveas.MySQL.text_only}' },
];
97
98
# Attributes that an argument description may carry:
#   name      - the name of the argument
#   desc      - a description (more likely a reference to a description)
#   type      - the type of control used to represent the argument, e.g.
#               string, int, flag, regexp, metadata, language, enum
#   reqd      - whether the argument is required
#   hiddengli - whether the argument is hidden in GLI
#   modegli   - the lowest detail mode at which GLI shows the argument

# The -saveas argument: an enum over $saveas_list, defaulting to GreenstoneXML.
my $saveas_argument = {
    'name'    => 'saveas',
    'desc'    => '{import.saveas}',
    'type'    => 'enum',
    'list'    => $saveas_list,
    'deft'    => 'GreenstoneXML',
    'reqd'    => 'no',
    'modegli' => '3',
};
115
116
# Every argument import.pl accepts, in the order they are declared. The two
# lists borrowed from the inexport package are spliced in at fixed positions.
my $arguments = [
    $saveas_argument,
    {
        'name' => 'saveas_options',
        'desc' => '{import.saveas_options}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {
        'name'    => 'sortmeta',
        'desc'    => '{import.sortmeta}',
        # 'type' => "metadata" doesn't work properly in GLI, so use a string
        'type'    => 'string',
        'reqd'    => 'no',
        'modegli' => '2',
    },
    {
        'name'    => 'removeprefix',
        'desc'    => '{BasClas.removeprefix}',
        'type'    => 'regexp',
        'deft'    => '',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {
        'name'    => 'removesuffix',
        'desc'    => '{BasClas.removesuffix}',
        'type'    => 'regexp',
        'deft'    => '',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {
        'name'    => 'groupsize',
        'desc'    => '{import.groupsize}',
        'type'    => 'int',
        'deft'    => '1',
        'reqd'    => 'no',
        'modegli' => '2',
    },
    {
        'name'      => 'archivedir',
        'desc'      => '{import.archivedir}',
        'type'      => 'string',
        'reqd'      => 'no',
        'deft'      => 'archives',
        'hiddengli' => 'yes',
    },
    @$inexport::directory_arguments,
    {
        'name'    => 'gzip',
        'desc'    => '{import.gzip}',
        'type'    => 'flag',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    @$inexport::arguments,
    {
        'name'    => 'NO_IMPORT',
        'desc'    => '{import.NO_IMPORT}',
        'type'    => 'flag',
        'reqd'    => 'no',
        'modegli' => '3',
    },
];

# Top-level description of this script, handed to the inexport constructor.
my $options = {
    'name' => 'import.pl',
    'desc' => '{import.desc}',
    'args' => $arguments,
};

# Maps a hyphen-prefixed argument name to the inexport subclass (package
# name) that declared it; filled in by _scanForSubclasses().
my $function_to_inexport_subclass_mappings = {};
173
# @function main()
# Entry point of import.pl. Registers the extra arguments declared by any
# extension-specific inexport subclasses, decides which inexport class to
# instantiate from the command line arguments, then (when a collection is
# known) reads its configuration and imports its files.
#
# Exits with status 0 when importing is disabled (the -NO_IMPORT argument or
# a NO_IMPORT entry in the collection configuration) and with status 1 when
# the command line mixes arguments belonging to two different inexport
# subclasses.
sub main
{
  # Dynamically include arguments from any subclasses of inexport we find
  # in the extensions directory
  if (defined $ENV{'GSDLEXTS'})
  {
    &_scanForSubclasses($ENV{'GSDLHOME'}, $ENV{'GSDLEXTS'});
  }
  if (defined $ENV{'GSDL3EXTS'})
  {
    &_scanForSubclasses($ENV{'GSDL3SRCHOME'}, $ENV{'GSDL3EXTS'});
  }

  # Loop through arguments, checking to see if any depend on a specific
  # subclass of InExport. Note that we load the first subclass we encounter
  # so only support a single 'override' ATM.
  my $inexport_subclass;
  foreach my $argument (@ARGV)
  {
    # Importing has been switched off on the command line: report and stop
    if ($argument eq "-NO_IMPORT")
    {
      &gsprintf(STDERR, "{import.NO_IMPORT_set}\n\n");
      exit 0;
    }

    # proper arguments start with a hyphen
    next unless ($argument =~ /^-/ && defined $function_to_inexport_subclass_mappings->{$argument});

    my $required_inexport_subclass = $function_to_inexport_subclass_mappings->{$argument};
    if (!defined $inexport_subclass)
    {
      $inexport_subclass = $required_inexport_subclass;
    }
    # The user has included specific arguments from two different inexport
    # subclasses... this isn't supported
    elsif ($inexport_subclass ne $required_inexport_subclass)
    {
      print STDERR "Error! You cannot specify arguments from two different extention specific inexport modules: " . $inexport_subclass . " != " . $required_inexport_subclass . "\n";
      # This is a failure, so don't report success to the calling process
      exit 1;
    }
  }

  my $inexport;
  if (defined $inexport_subclass)
  {
    print "* Loading Overriding InExport Module: " . $inexport_subclass . "\n";
    require $inexport_subclass . '.pm';
    $inexport = $inexport_subclass->new("import", \@ARGV, $options);
  }

  # We don't have an overriding inexport, or its constructor returned nothing,
  # so fall back to the base inexport class
  if (!defined $inexport)
  {
    # - quoting the class name guarantees a method call on the package even
    #   if a function called 'inexport' were ever visible here
    $inexport = 'inexport'->new("import", \@ARGV, $options);
  }

  my $collection = $inexport->get_collection();

  if (defined $collection)
  {
    my ($config_filename, $collect_cfg) = $inexport->read_collection_cfg($collection, $options);

    # Importing has been switched off in the collection configuration
    if ($collect_cfg->{'NO_IMPORT'})
    {
      &gsprintf(STDERR, "{import.NO_IMPORT_set}\n\n");
      exit 0;
    }

    # Apply the import specific options first; that function hands over to
    # the inexport object's own set_collection_options() when it is done
    &set_collection_options($inexport, $collect_cfg);

    my $pluginfo = $inexport->process_files($config_filename, $collect_cfg);

    $inexport->generate_statistics($pluginfo);
  }

  $inexport->deinit();
}
# main()
251
# @function _scanForSubclasses()
# Looks in each listed extension for a perl module named
# "<extension name>inexport.pm" and, when one is found, loads it and
# registers the arguments returned by its getSupportedArguments() function:
# each argument is appended to the arguments accepted by import.pl and is
# mapped (with a leading hyphen) to the package that declared it.
# @param $dir The directory containing the 'ext' extension directory
# @param $exts A list of the available extensions (as a colon separated string)
# @return The number of subclasses of InExport found as an Integer
sub _scanForSubclasses
{
  my ($dir, $exts) = @_;
  my $inexport_class_count = 0;
  my $ext_prefix = &FileUtils::filenameConcatenate($dir, "ext");
  my @extensions = split(/:/, $exts);
  foreach my $e (@extensions)
  {
    # - any subclass of InExport must be prefixed with the name of the ext
    my $package_name = $e . 'inexport';
    $package_name =~ s/[^a-z]//gi; # package names have limited characters
    my $inexport_filename = $package_name . '.pm';
    my $inexport_path = &FileUtils::filenameConcatenate($ext_prefix, $e, 'perllib', $inexport_filename);
    # see if we have a subclass of InExport lurking in that extension folder
    next unless (-f $inexport_path);

    # - note we load the filename (with pm) unlike normal modules, so the
    #   module is located through @INC rather than through $inexport_path
    require $inexport_filename;
    # - build the fully qualified name of the function in the new package
    my $symbol = qualify('getSupportedArguments', $package_name);
    # - strict prevents strings being used as function names, so disable
    #   only that one stricture and only for the lookup itself
    my $get_supported_arguments;
    {
      no strict 'refs';
      # - lets check that the function we are about to call actually exists
      if (defined &{$symbol})
      {
        $get_supported_arguments = \&{$symbol};
      }
    }

    if (!defined $get_supported_arguments)
    {
      print "Warning! A subclass of InExport module (named '" . $inexport_filename . "') does not implement the required getSupportedArguments() function - ignoring. Found in: " . $inexport_path . "\n";
      next;
    }

    my $extra_arguments = $get_supported_arguments->();
    # - anything other than an array reference is treated as "no arguments"
    my @extra_argument_list = (ref($extra_arguments) eq 'ARRAY') ? @{$extra_arguments} : ();
    foreach my $argument (@extra_argument_list)
    {
      # - record a mapping from each extra argument to the inexport class
      #   that supports it. We put the hyphen on here to make comparing
      #   with command line arguments even easier
      $function_to_inexport_subclass_mappings->{'-' . $argument->{'name'}} = $package_name;
      # - and then add it as an acceptable argument to import.pl
      push(@{$options->{'args'}}, $argument);
    }
    $inexport_class_count++;
  }
  return $inexport_class_count;
}
# _scanForSubclasses()
303
# @function set_collection_options()
# Fills in the import specific options (groupsize, saveas, saveas_options,
# sortmeta, removeprefix, removesuffix) from collect.cfg where the command
# line left them unset, then calls the inexport object's own
# set_collection_options() for the common ones.
# @param $inexport The inexport object whose option fields are updated
# @param $collectcfg The collection configuration as a hash reference
sub set_collection_options
{
    my ($inexport, $collectcfg) = @_;
    my $out = $inexport->{'out'};

    # check all options for default_optname - this will be set if the parsing
    # code has just set the value based on the arg default. In this case,
    # check in collect.cfg for the option

    # groupsize can only be defined for import, not export, and actually only
    # applies to GreenstoneXML format.
    if (defined $inexport->{'default_groupsize'}) {
        # - the whole value must be digits; a value that merely contains a
        #   digit somewhere (e.g. "12abc") is ignored
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /^\d+$/) {
            $inexport->{'groupsize'} = $collectcfg->{'groupsize'};
        }
    }

    if (defined $inexport->{'default_saveas'}) {
        # - accept every format offered by the -saveas argument, including
        #   the MySQL ones; anything else falls back to the default
        if (defined $collectcfg->{'saveas'}
            && $collectcfg->{'saveas'} =~ /^(?:GreenstoneXML|GreenstoneMETS|MySQL|MySQL_metadata_only|MySQL_fulltext_only)$/) {
            $inexport->{'saveas'} = $collectcfg->{'saveas'};
        } else {
            $inexport->{'saveas'} = "GreenstoneXML"; # the default
        }
    }

    if (!defined $inexport->{'saveas_options'} || $inexport->{'saveas_options'} eq "") {
        if (defined $collectcfg->{'saveas_options'}) {
            $inexport->{'saveas_options'} = $collectcfg->{'saveas_options'};
        }
    }

    # An unset option counts as empty, so collect.cfg may supply the value
    my $sortmeta = $inexport->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # a sortmeta without any non-whitespace character means "no sorting"
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    # sortmeta cannot be used with group size
    if (defined $sortmeta && defined $inexport->{'groupsize'} && $inexport->{'groupsize'} > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }
    if (defined $sortmeta) {
        &gsprintf($out, "{import.sortmeta_paired_with_ArchivesInfPlugin}\n\n");
    }
    $inexport->{'sortmeta'} = $sortmeta;

    # removeprefix and removesuffix follow the same rule: collect.cfg is only
    # consulted when the option is unset or empty
    foreach my $optname ('removeprefix', 'removesuffix') {
        next unless defined $collectcfg->{$optname};
        if (!defined $inexport->{$optname} || $inexport->{$optname} eq "") {
            $inexport->{$optname} = $collectcfg->{$optname};
        }
    }

    $inexport->set_collection_options($collectcfg);
}
# Run the import as soon as the script has been loaded.
main();
Note: See TracBrowser for help on using the browser.