source: trunk/gsdl/bin/script/import.pl@ 2766

Last change on this file since 2766 was 2766, checked in by sjboddie, 23 years ago

minor bug fix

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.3 KB
RevLine 
[1031]1#!/usr/bin/perl -w
[4]2
[538]3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
[4]29# This program will import a number of files into a particular collection
30
[1424]31package import;
32
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone library directories at the front of
# @INC so that the "use" statements below can find their modules.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set\n";
    }

    # Each directory is unshifted in turn, so the last one listed here
    # ends up first in @INC.
    foreach my $libdir ("perllib", "perllib/plugins", "perllib/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/$libdir");
    }
}
40
41use arcinfo;
42use colcfg;
43use plugin;
[783]44use docprint;
[130]45use util;
46use parsargv;
[1424]47use FileHandle;
[4]48
# Print the command-line help text for import.pl to STDOUT.
#
# Takes no arguments.  The text ends with a newline on every platform
# except when $ENV{'GSDLOS'} is "windows" (case-insensitive).
sub print_usage {
    # default shown for -collectdir
    my $default_collectdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect");

    my @help_text = (
        "\n",
        "import.pl: Converts documents in collections -importdir directory into\n",
        " gml documents which are written to the -archivedir directory.\n\n",
        " usage: $0 [options] collection-name\n\n",
        " options:\n",
        " -verbosity number 0=none, 3=lots\n",
        " -importdir directory Where the original material lives\n",
        " -archivedir directory Where the converted material ends up\n",
        " -keepold Will not destroy the current contents of the\n",
        " archives directory (the default)\n",
        " -removeold Will remove the old contents of the archives\n",
        " directory -- use with care\n",
        " -gzip Use gzip to compress resulting gml documents\n",
        " (don't forget to include ZIPPlug in your plugin\n",
        " list when building from compressed documents)\n",
        " -maxdocs number Maximum number of documents to import\n",
        " -groupsize number Number of GML documents to group into one file\n",
        " -OIDtype hash|incremental The method to use when generating unique\n",
        " identifiers for each document. \"hash\" (the\n",
        " default) hashes the contents of the file and so\n",
        " will be the same every time the collection is\n",
        " imported. \"incremental\" is a simple document\n",
        " count and so will be significantly faster than\n",
        " \"hash\". It is not guaranteed to always assign\n",
        " the same identifier to a given document though\n",
        " and does not allow further documents to be added\n",
        " to existing gml archives\n",
        " -sortmeta metadata Sort documents alphabetically by metadata for\n",
        " building. This will be disabled if groupsize > 1\n",
        " -debug Print imported text to STDOUT\n",
        " -collectdir directory Collection directory (defaults to " .
            $default_collectdir . ")\n",
        " -out name Filename or handle to print output status to.\n",
        " -faillog name Filename to log import failures to.\n",
        " The default is <collectdir>/colname/etc/fail.log\n\n",
        " [Type \"perl -S import.pl | more\" if this help text scrolled off your screen]",
    );

    # one print call per entry, exactly as many writes as there are lines
    foreach my $help_line (@help_text) {
        print STDOUT $help_line;
    }

    # the final newline is suppressed on windows
    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        print STDOUT "\n";
    }
}
88
# Script entry point: run the import with an empty argument list
# (main reads its options from @ARGV).
main();
[4]90
# Entry point of import.pl.
#
# Parses the command-line options from @ARGV, selects the collection,
# merges in any option values given in the collection's etc/collect.cfg,
# loads the plugins and runs them over the import directory.  Unless
# -debug is in effect the results are saved through a docsave processor
# and the archive information file (archives.inf) is rewritten.
#
# Takes no arguments and returns nothing useful.  Dies (after printing the
# usage text where appropriate) if the options cannot be parsed, no
# collection can be determined, the -out file cannot be opened,
# collect.cfg is missing, or no plugins were loaded.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
        $removeold, $gzip, $groupsize, $OIDtype, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $archive_info_filename,
        $archive_info, $processor, $out, $faillogname, $collectdir);

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/', \$verbosity,
                         'importdir/.*/', \$importdir,
                         'archivedir/.*/', \$archivedir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'groupsize/\d+/1', \$groupsize,
                         'OIDtype/^(hash|incremental)$/', \$OIDtype,
                         'sortmeta/.*/', \$sortmeta,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out,
                         'faillog/.*/', \$faillogname)) {
        &print_usage();
        die "\n";
    }

    # $out is used as a file handle *name* from here on (it is printed to
    # with "print $out" and handed to the plugin and docsave code).
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
        # The option is accepted case-insensitively, so normalise it to the
        # upper-case name of the standard handle before it is used as one.
        $out = uc ($out);
    } else {
        # three-argument open: the file name is never parsed for a mode
        open (OUT, '>', $out) || die "Couldn't open output file $out\n";
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # -keepold on the command line always wins over -removeold
    $removeold = 0 if ($keepold);

    # get and check the collection name.  Only the first remaining
    # argument is passed: interpolating the whole of @ARGV would shift
    # $collectdir into the collection-name position whenever no collection
    # name was given (and push it out of position when extra arguments
    # were given).
    # NOTE(review): assumes util::use_collection treats an empty collection
    # name as "none supplied" -- confirm against util.pm.
    my $collection_arg = (scalar(@ARGV) > 0) ? $ARGV[0] : "";
    if (($collection = &util::use_collection($collection_arg, $collectdir)) eq "") {
        &print_usage();
        die "\n";
    }

    # check and/or set fail log file
    my $default_faillog = "$ENV{GSDLCOLLECTDIR}/etc/fail.log";
    if ($faillogname eq "") {
        $faillogname = $default_faillog;
    } elsif (!&_can_write_file ($faillogname)) {
        warn "fail.log - cannot write to \"$faillogname\", using default\n \"$default_faillog\" instead.\n";
        $faillogname = $default_faillog;
    }
    # test that default is writable...
    if ($faillogname eq $default_faillog && !&_can_write_file ($faillogname)) {
        warn "warning - cannot write to \"$faillogname\".\n";
        $faillogname = "";
    }

    # a blank -sortmeta is the same as none at all
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;

    # dynamically load the 'docsave' module so that a collection-specific
    # docsave.pm is picked up if one is present.
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins for this collection and set any options that
    # were specified in the collect.cfg (all import.pl options except
    # -collectdir, -out and -faillog may be specified in the collect.cfg (these
    # options must be known before we read the collect.cfg))
    my $plugins = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (!-e $configfilename) {
        die "Couldn't find the configuration file $configfilename\n";
    }

    $collectcfg = &colcfg::read_collect_cfg ($configfilename);
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    # command-line values take precedence; collect.cfg only fills the gaps
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'removeold'}) {
        if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
            $removeold = 1;
        }
        if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
            $removeold = 0;
        }
    }
    if (defined $collectcfg->{'keepold'}) {
        if ($collectcfg->{'keepold'} =~ /^false$/i && !$keepold) {
            $removeold = 1;
        }
    }
    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    # 1 is the parse default, so it is treated as "not given"
    if ($groupsize == 1) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        }
    }
    if ($OIDtype !~ /^(hash|incremental)$/) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        } else {
            $OIDtype = "hash"; # the default
        }
    }
    if (defined $collectcfg->{'sortmeta'} && !defined $sortmeta) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }

    # check sortmeta.  This has to happen after collect.cfg has been
    # merged in, because either sortmeta or groupsize may have come from
    # there rather than from the command line.
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        print $out "WARNING: import.pl cannot sort documents when groupsize > 1\n";
        print $out " sortmeta option will be ignored\n\n";
        $sortmeta = undef;
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillogname);
    if (scalar(@$pluginfo) == 0) {
        print $out "No plugins were loaded.\n";
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
        print $out "Removing current contents of the archives directory\n";
        sleep(3); # just in case...
        &util::rm_r ($archivedir);
    }

    # read the archive information file
    if (!$debug) {
        $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new ();
        $archive_info->load_info ($archive_info_filename);

        # create a docsave object to process the documents
        $processor = docsave->new ($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
        $processor->setarchivedir ($archivedir);
        $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
        $processor->set_OIDtype ($OIDtype);
    } else {
        # -debug: documents go to a docprint processor instead of being saved
        $processor = docprint->new ();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs);

    &plugin::end($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info($archive_info_filename);
    }
    close OUT if $close_out;
}

# Private helper for main: returns 1 if $filename can be opened for
# writing and 0 otherwise.  The probe opens the file with '>', so an
# existing file is truncated and a missing one is created.
sub _can_write_file {
    my ($filename) = @_;

    open (TESTOPEN, '>', $filename) || return 0;
    # only close the handle when the open actually succeeded
    close (TESTOPEN);
    return 1;
}
Note: See TracBrowser for help on using the repository browser.