source: tags/gsdl-2_37-distribution/gsdl/bin/script/import.pl@ 2843

Last change on this file since 2843 was 2785, checked in by sjboddie, 23 years ago

The build process now creates a summary of how many files were included,
which were rejected, etc. A link to a page containing this summary is
provided from the final page of the collector (once the collection is built
successfully) and from the default "about this collection" text for
collections built by the collector.

Also did a little bit of tidying in a couple of places

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.9 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
# Compile-time setup: insist that the Greenstone environment is configured,
# then put the Greenstone library directories at the front of @INC so the
# 'use' statements that follow can locate the project modules.
BEGIN {
    # both variables are required; report the first one that is missing
    for my $required (qw(GSDLHOME GSDLOS)) {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # a single unshift of the list, giving the search order
    # classify, plugins, perllib (followed by the previous @INC contents)
    my $perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift (@INC, "$perllib/classify", "$perllib/plugins", $perllib);
}
40
41use arcinfo;
42use colcfg;
43use plugin;
44use docprint;
45use util;
46use parsargv;
47use FileHandle;
48
# print_usage --
# Writes the command-line help text for import.pl to STDOUT.
# Takes no arguments; the return value is not meaningful.
# Reads $0 (for the usage line), $ENV{'GSDLHOME'} (to show the default
# collect directory) and $ENV{'GSDLOS'} (to decide whether to terminate
# the final line with a newline).
sub print_usage {
    # default locations quoted in the help text
    my $default_collectdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect");
    my $default_faillog = &util::filename_cat ("<collectdir>", "colname", "etc", "fail.log");

    print STDOUT
        "\n",
        "import.pl: Converts documents in collections -importdir directory into\n",
        " gml documents which are written to the -archivedir directory.\n\n",
        " usage: $0 [options] collection-name\n\n",
        " options:\n",
        " -verbosity number 0=none, 3=lots\n",
        " -importdir directory Where the original material lives\n",
        " -archivedir directory Where the converted material ends up\n",
        " -keepold Will not destroy the current contents of the\n",
        " archives directory (the default)\n",
        " -removeold Will remove the old contents of the archives\n",
        " directory -- use with care\n",
        " -gzip Use gzip to compress resulting gml documents\n",
        " (don't forget to include ZIPPlug in your plugin\n",
        " list when building from compressed documents)\n",
        " -maxdocs number Maximum number of documents to import\n",
        " -groupsize number Number of GML documents to group into one file\n",
        " -OIDtype hash|incremental The method to use when generating unique\n",
        " identifiers for each document. \"hash\" (the\n",
        " default) hashes the contents of the file and so\n",
        " will be the same every time the collection is\n",
        " imported. \"incremental\" is a simple document\n",
        " count and so will be significantly faster than\n",
        " \"hash\". It is not guaranteed to always assign\n",
        " the same identifier to a given document though\n",
        " and does not allow further documents to be added\n",
        " to existing gml archives\n",
        " -sortmeta metadata Sort documents alphabetically by metadata for\n",
        " building. This will be disabled if groupsize > 1\n",
        " -debug Print imported text to STDOUT\n",
        " -collectdir directory Collection directory (defaults to " . $default_collectdir . ")\n",
        " -out name Filename or handle to print output status to.\n",
        " The default is STDERR\n",
        " -statsfile name Filename or handle to print import statistics to.\n",
        " The default is STDERR\n",
        " -faillog name Fail log filename. This log receives the filenames\n",
        # the original text had a stray full stop after "defaults", which
        # broke the sentence "(defaults to <path>)" in two
        " of any files which fail to be processed (defaults\n",
        " to " . $default_faillog . ")\n",
        " [Type \"perl -S import.pl | more\" if this help text scrolled off your screen]";

    # the trailing newline is deliberately omitted when GSDLOS is "windows"
    print STDOUT "\n" unless $ENV{'GSDLOS'} =~ /^windows$/i;
}
93
&main();

# main --
# Entry point of import.pl. Takes no arguments; works on @ARGV.
#
# Steps, in order:
#   1. parse the command-line options (prints usage and dies on failure)
#   2. open the status output and the fail log
#   3. select the collection and read its etc/collect.cfg, letting the
#      config supply values for options not given on the command line
#   4. load the collection's plugins
#   5. optionally remove the old archives directory
#   6. run the plugins over the import directory, saving documents through
#      a docsave processor (or a docprint processor when -debug is set)
#   7. save the archive information file and write the import statistics
#
# Dies if the options cannot be parsed, the output file or fail log cannot
# be opened, no collection is selected, collect.cfg is missing, or no
# plugins were loaded.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
        $removeold, $gzip, $groupsize, $OIDtype, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $archive_info_filename, $statsfile,
        $archive_info, $processor, $out, $faillog, $collectdir);

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/', \$verbosity,
                         'importdir/.*/', \$importdir,
                         'archivedir/.*/', \$archivedir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'groupsize/\d+/1', \$groupsize,
                         'OIDtype/^(hash|incremental)$/', \$OIDtype,
                         'sortmeta/.*/', \$sortmeta,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out,
                         'statsfile/.*/STDERR', \$statsfile,
                         'faillog/.*/', \$faillog)) {
        &print_usage();
        die "\n";
    }

    # $out ends up holding the *name* of a filehandle; it is used as a
    # symbolic handle by the print statements below and is also handed to
    # the plugin and docsave code
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
        # the option is matched case-insensitively, but only the upper-case
        # names are real handles, so normalise (e.g. "stderr" -> "STDERR")
        $out = uc ($out);
    } else {
        # three-argument open: the filename is never interpreted as a mode
        open (OUT, '>', $out) || die "Couldn't open output file $out: $!\n";
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # -keepold takes precedence over -removeold
    $removeold = 0 if ($keepold);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &print_usage();
        die "\n";
    }

    # the fail log defaults to etc/fail.log inside the collection directory
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    open (FAILLOG, '>', $faillog) || die "Couldn't open fail log $faillog: $!\n";
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # treat an empty or whitespace-only -sortmeta as "not given"
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;

    # dynamically load the 'docsave' module so that a collection-specific
    # docsave.pm is picked up if the collection provides one
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins for this collection and set any options that
    # were specified in the collect.cfg (all import.pl options except
    # -collectdir, -out and -faillog may be specified in the collect.cfg (these
    # options must be known before we read the collect.cfg))
    my $plugins = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (!-e $configfilename) {
        die "Couldn't find the configuration file $configfilename\n";
    }

    $collectcfg = &colcfg::read_collect_cfg ($configfilename);
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    # in each case below a value given on the command line wins over the
    # collect.cfg value, which in turn wins over the built-in default
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'removeold'}) {
        if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
            $removeold = 1;
        }
        if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
            $removeold = 0;
        }
    }
    if (defined $collectcfg->{'keepold'}) {
        if ($collectcfg->{'keepold'} =~ /^false$/i && !$keepold) {
            $removeold = 1;
        }
    }
    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    # a groupsize of 1 is the parse default, so it is taken to mean
    # "not given on the command line"
    if ($groupsize == 1) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        }
    }
    if ($OIDtype !~ /^(hash|incremental)$/) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        } else {
            $OIDtype = "hash"; # the default
        }
    }
    # $sortmeta is undef here unless a usable value came from the command line
    if (defined $collectcfg->{'sortmeta'} && !defined $sortmeta) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }

    # check sortmeta. This has to happen after collect.cfg has been read,
    # because either sortmeta or groupsize may have come from there
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        print $out "WARNING: import.pl cannot sort documents when groupsize > 1\n";
        print $out " sortmeta option will be ignored\n\n";
        $sortmeta = undef;
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog);
    if (scalar(@$pluginfo) == 0) {
        print $out "No plugins were loaded.\n";
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
        print $out "Removing current contents of the archives directory\n";
        sleep(3); # just in case...
        &util::rm_r ($archivedir);
    }

    # read the archive information file
    if (!$debug) {
        $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new ();
        $archive_info->load_info ($archive_info_filename);

        # create a docsave object to process the documents
        $processor = docsave->new ($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
        $processor->setarchivedir ($archivedir);
        $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
        $processor->set_OIDtype ($OIDtype);
    } else {
        # -debug: use a docprint processor instead of saving to the archives
        $processor = docprint->new ();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs);

    &plugin::end($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info($archive_info_filename);
    }

    # write out import stats; like $out, $statsfile ends up holding the
    # name of a filehandle
    my $close_stats = 0;
    if ($statsfile =~ /^(STDERR|STDOUT)$/i) {
        # normalise to the real (upper-case) handle name
        $statsfile = uc ($statsfile);
    } elsif (open (STATS, '>', $statsfile)) {
        $statsfile = 'import::STATS';
        $close_stats = 1;
    } else {
        # failing to open the stats file is not fatal
        print $out "WARNING: couldn't open stats file $statsfile\n";
        print $out " will print stats to STDERR instead\n";
        $statsfile = 'STDERR';
    }

    print $out "\n";
    print $out "*********************************************\n";
    print $out "Import Complete\n";
    print $out "*********************************************\n";

    &plugin::write_stats($pluginfo, $statsfile);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}
314
Note: See TracBrowser for help on using the repository browser.