source: trunk/gsdl/bin/script/import.pl@ 2355

Last change on this file since 2355 was 2355, checked in by sjboddie, 23 years ago

All options to import.pl and buildcol.pl may now be specified from
within a collect.cfg file

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.4 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
BEGIN {
    # Refuse to start unless the Greenstone environment has been set up.
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'}   or die "GSDLOS not set\n";

    # Put the Greenstone library directories at the front of the module
    # search path.  Search order: classify, then plugins, then perllib.
    my $gsdl_home = $ENV{'GSDLHOME'};
    unshift (@INC,
             "$gsdl_home/perllib/classify",
             "$gsdl_home/perllib/plugins",
             "$gsdl_home/perllib");
}
40
41use arcinfo;
42use colcfg;
43use plugin;
44use docprint;
45use util;
46use parsargv;
47use FileHandle;
48
# print_usage --
# Writes the command-line help for import.pl to STDERR: a short description,
# the usage line (with $0 as the program name) and one entry per option.
# Takes no arguments.
sub print_usage {
    # the default shown for -collectdir is the "collect" directory under GSDLHOME
    my $default_collectdir = util::filename_cat ($ENV{'GSDLHOME'}, "collect");

    my @usage_text = (
        "\n",
        "import.pl: Converts documents in collections -importdir directory into\n",
        " gml documents which are written to the -archivedir directory.\n\n",
        " usage: $0 [options] collection-name\n\n",
        " options:\n",
        " -verbosity number 0=none, 3=lots\n",
        " -importdir directory Where the original material lives\n",
        " -archivedir directory Where the converted material ends up\n",
        " -keepold Will not destroy the current contents of the\n",
        " archives directory (the default)\n",
        " -removeold Will remove the old contents of the archives\n",
        " directory -- use with care\n",
        " -gzip Use gzip to compress resulting gml documents\n",
        " (don't forget to include ZIPPlug in your plugin\n",
        " list when building from compressed documents)\n",
        " -maxdocs number Maximum number of documents to import\n",
        " -groupsize number Number of GML documents to group into one file\n",
        " -OIDtype hash|incremental The method to use when generating unique\n",
        " identifiers for each document. \"hash\" (the\n",
        " default) hashes the contents of the file and so\n",
        " will be the same every time the collection is\n",
        " imported. \"incremental\" is a simple document\n",
        " count and so will be significantly faster than\n",
        " \"hash\". It is not guaranteed to always assign\n",
        " the same identifier to a given document though\n",
        " and does not allow further documents to be added\n",
        " to existing gml archives\n",
        " -sortmeta metadata Sort documents alphabetically by metadata for\n",
        " building. This will be disabled if groupsize > 1\n",
        " -debug Print imported text to STDOUT\n",
        " -collectdir directory Collection directory (defaults to " . $default_collectdir . ")\n",
        " -out Filename or handle to print output status to.\n",
        " The default is STDERR\n\n",
    );

    # emit the chunks one at a time, in order
    foreach my $chunk (@usage_text) {
        print STDERR $chunk;
    }
}
85
# run the import as soon as the script has been compiled
main();

# main --
# Entry point of import.pl.  Takes no arguments; reads its options from @ARGV.
#
# Steps, in order:
#   1. parse the command line options
#   2. set up the status output handle ($out)
#   3. select the collection (first non-option argument)
#   4. read etc/collect.cfg and use it to fill in every option that was not
#      given on the command line, falling back to built-in defaults
#   5. load the plugins, optionally clear the archives directory, and run
#      the plugins over the import directory
#   6. save the archive information file (unless -debug was given)
#
# Dies (after printing the usage text where appropriate) if the options
# cannot be parsed, no collection can be determined, the output file cannot
# be opened, collect.cfg does not exist, or no plugins could be loaded.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
        $removeold, $gzip, $groupsize, $OIDtype, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $archive_info_filename,
        $archive_info, $processor, $out, $collectdir);

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    # NOTE(review): the tests against "" and /\d+/ further down presumably
    # rely on parsargv::parse leaving unspecified options as "" -- confirm
    # against parsargv.pm
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/', \$verbosity,
                         'importdir/.*/', \$importdir,
                         'archivedir/.*/', \$archivedir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'groupsize/\d+/', \$groupsize,
                         'OIDtype/^(hash|incremental)$/', \$OIDtype,
                         'sortmeta/.*/', \$sortmeta,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out)) {
        print_usage();
        die "\n";
    }

    # $out holds the *name* of the handle that status messages go to
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
        # the match is case-insensitive but the handle names are not, so
        # "-out stderr" has to be turned into "STDERR" before it is used
        $out = uc ($out);
    } else {
        # anything else is a file name; three-argument open stops special
        # characters in the name from being interpreted as an open mode
        open (OUT, '>', $out) || die "Couldn't open output file $out\n";
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # -keepold on the command line always wins over -removeold
    $removeold = 0 if ($keepold);

    # get and check the collection name.  Only the first remaining argument
    # is the collection; passing all of @ARGV would let $collectdir slide
    # into the collection slot when no collection name was given
    $collection = util::use_collection ($ARGV[0], $collectdir);
    if ($collection eq "") {
        print_usage();
        die "\n";
    }

    # load the 'docsave' module at run time so that a collection specific
    # docsave.pm (in the collection's perllib directory) is picked up in
    # preference to the default one
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins for this collection and set any options that
    # were specified in the collect.cfg (all import.pl options except
    # -collectdir and -out may be specified in the collect.cfg (these
    # options must be known before we read the collect.cfg))
    my $plugins = [];
    $configfilename = util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (!-e $configfilename) {
        die "Couldn't find the configuration file $configfilename\n";
    }

    $collectcfg = colcfg::read_collect_cfg ($configfilename);
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    # in each case below a command line value takes precedence over the
    # collect.cfg value, which takes precedence over the built-in default
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'removeold'}) {
        if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
            $removeold = 1;
        }
        if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
            $removeold = 0;
        }
    }
    if (defined $collectcfg->{'keepold'}) {
        if ($collectcfg->{'keepold'} =~ /^false$/i && !$keepold) {
            $removeold = 1;
        }
    }
    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    if ($groupsize !~ /\d+/) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        } else {
            $groupsize = 1; # the default
        }
    }
    if ($OIDtype !~ /^(hash|incremental)$/) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        } else {
            $OIDtype = "hash"; # the default
        }
    }
    if (defined $collectcfg->{'sortmeta'}
        && !(defined $sortmeta && $sortmeta =~ /\S/)) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }

    # check sortmeta.  This has to happen after the collect.cfg values have
    # been merged in: only now is $groupsize guaranteed to be set, and both
    # sortmeta and groupsize may have come from the collect.cfg
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        print $out "WARNING: import.pl cannot sort documents when groupsize > 1\n";
        print $out " sortmeta option will be ignored\n\n";
        $sortmeta = undef;
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    # load all the plugins
    $pluginfo = plugin::load_plugins ($plugins, $verbosity, $out);
    if (scalar(@$pluginfo) == 0) {
        print $out "No plugins were loaded.\n";
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
        print $out "Warning - removing current contents of the archives directory\n";
        print $out " in preparation for the import\n";
        sleep(5); # just in case...
        util::rm_r ($archivedir);
    }

    # choose the document processor: in debug mode documents are handed to
    # a docprint object, otherwise to a docsave object backed by the
    # archive information file
    if (!$debug) {
        # read the archive information file
        $archive_info_filename = util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new ();
        $archive_info->load_info ($archive_info_filename);

        # create a docsave object to process the documents
        $processor = docsave->new ($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
        $processor->setarchivedir ($archivedir);
        $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
        $processor->set_OIDtype ($OIDtype);
    } else {
        $processor = docprint->new ();
    }

    plugin::begin ($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs);

    plugin::end ($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info ($archive_info_filename);
    }
    close OUT if $close_out;
}
Note: See TracBrowser for help on using the repository browser.