source: trunk/gsdl/bin/script/import.pl@ 2447

Last change on this file since 2447 was 2359, checked in by sjboddie, 23 years ago

Altered the help text a little for mkcol.pl, import.pl, buildcol.pl, and
build so that they now suggest using the "more" pager if the help text
scrolls off the screen (brought about by usability studies under DOS).
Note that this means some debug info that was once printed to STDERR is
now being printed to STDOUT.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.6 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
# Compile-time setup: refuse to start unless the Greenstone environment
# variables are present, then put the Greenstone perl library directories
# at the front of @INC so the 'use' statements below can find them.
BEGIN {
    foreach my $required (qw(GSDLHOME GSDLOS)) {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # a single unshift keeps the same final search order as three separate
    # ones would give: classify first, then plugins, then perllib itself
    my $perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift (@INC, "$perllib/classify", "$perllib/plugins", $perllib);
}
40
41use arcinfo;
42use colcfg;
43use plugin;
44use docprint;
45use util;
46use parsargv;
47use FileHandle;
48
# print_usage --
# Writes the help text for import.pl to STDOUT. The text names the script
# as it was invoked ($0) and shows the default collection directory, which
# is built from GSDLHOME. The final line is left without a trailing newline
# when GSDLOS is "windows".
sub print_usage {
    my $default_collectdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect");
    my $on_windows = ($ENV{'GSDLOS'} =~ /^windows$/i);

    # the heredoc body is emitted verbatim (apart from $0 and
    # $default_collectdir), so its leading spaces are significant
    print STDOUT <<"END_OF_USAGE";

import.pl: Converts documents in collections -importdir directory into
 gml documents which are written to the -archivedir directory.

 usage: $0 [options] collection-name

 options:
 -verbosity number 0=none, 3=lots
 -importdir directory Where the original material lives
 -archivedir directory Where the converted material ends up
 -keepold Will not destroy the current contents of the
 archives directory (the default)
 -removeold Will remove the old contents of the archives
 directory -- use with care
 -gzip Use gzip to compress resulting gml documents
 (don't forget to include ZIPPlug in your plugin
 list when building from compressed documents)
 -maxdocs number Maximum number of documents to import
 -groupsize number Number of GML documents to group into one file
 -OIDtype hash|incremental The method to use when generating unique
 identifiers for each document. "hash" (the
 default) hashes the contents of the file and so
 will be the same every time the collection is
 imported. "incremental" is a simple document
 count and so will be significantly faster than
 "hash". It is not guaranteed to always assign
 the same identifier to a given document though
 and does not allow further documents to be added
 to existing gml archives
 -sortmeta metadata Sort documents alphabetically by metadata for
 building. This will be disabled if groupsize > 1
 -debug Print imported text to STDOUT
 -collectdir directory Collection directory (defaults to $default_collectdir)
 -out Filename or handle to print output status to.
 The default is STDERR

END_OF_USAGE

    print STDOUT " [Type \"perl -S import.pl | more\" if this help text scrolled off your screen]";
    print STDOUT "\n" unless $on_windows;
}
87
&main();

# main --
# Entry point of import.pl. Takes no arguments; everything is read from
# @ARGV and from the collection's etc/collect.cfg file.
#
# Steps, in order:
#   1. parse the command line (prints the usage text and dies on failure)
#   2. set up the status output stream named by -out
#   3. select the collection via util::use_collection
#   4. merge in options from collect.cfg (command line values win)
#   5. load the plugins and, unless -debug is given, the archive info file
#   6. run plugin::begin / plugin::read / plugin::end over the import dir
#   7. save the archive info file (non-debug runs only)
#
# Dies if the arguments cannot be parsed, the output file cannot be
# opened, no collection is found, collect.cfg is missing, or no plugins
# were loaded.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
        $removeold, $gzip, $groupsize, $OIDtype, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $archive_info_filename,
        $archive_info, $processor, $out, $collectdir);

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/', \$verbosity,
                         'importdir/.*/', \$importdir,
                         'archivedir/.*/', \$archivedir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'groupsize/\d+/', \$groupsize,
                         'OIDtype/^(hash|incremental)$/', \$OIDtype,
                         'sortmeta/.*/', \$sortmeta,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out)) {
        &print_usage();
        die "\n";
    }

    # $out ends up holding the *name* of a filehandle, which is then used
    # symbolically by print and handed on to the plugin and docsave code.
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
        # Only the upper-case names are predefined in every package; inside
        # "package import" a lower-case "stderr" would name an unopened
        # handle, so normalise whatever case the user typed.
        $out = uc ($out);
    } else {
        # three-argument open: the filename is taken literally, so any
        # mode characters or surrounding whitespace in it are not
        # interpreted
        open (OUT, '>', $out) || die "Couldn't open output file $out\n";
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # an explicit -keepold always overrides -removeold
    $removeold = 0 if ($keepold);

    # get and check the collection name. Only the first remaining argument
    # is used as the collection name, so that $collectdir always stays in
    # the second position even when no (or more than one) name was given.
    # NOTE(review): assumes util::use_collection takes
    # (collection, collectdir) -- confirm against util.pm
    my $collection_arg = (scalar (@ARGV) > 0) ? $ARGV[0] : "";
    if (($collection = &util::use_collection($collection_arg, $collectdir)) eq "") {
        &print_usage();
        die "\n";
    }

    # dynamically load 'docsave' module so it can pick up on a collection
    # specific docsave.pm if one is specified.
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins for this collection and set any options that
    # were specified in the collect.cfg (all import.pl options except
    # -collectdir and -out may be specified in the collect.cfg (these
    # options must be known before we read the collect.cfg))
    my $plugins = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (!-e $configfilename) {
        die "Couldn't find the configuration file $configfilename\n";
    }

    $collectcfg = &colcfg::read_collect_cfg ($configfilename);
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    # for each option below a value given on the command line wins; the
    # collect.cfg value is only a fallback, and the built-in default is
    # used when neither supplies a usable value
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'removeold'}) {
        if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
            $removeold = 1;
        }
        if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
            $removeold = 0;
        }
    }
    if (defined $collectcfg->{'keepold'}) {
        if ($collectcfg->{'keepold'} =~ /^false$/i && !$keepold) {
            $removeold = 1;
        }
    }
    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    if ($groupsize !~ /\d+/) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        } else {
            $groupsize = 1; # the default
        }
    }
    if ($OIDtype !~ /^(hash|incremental)$/) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        } else {
            $OIDtype = "hash"; # the default
        }
    }
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }

    # check sortmeta. This has to happen after the collect.cfg values have
    # been merged in, otherwise a groupsize or sortmeta that comes from the
    # configuration file would never be checked.
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        print $out "WARNING: import.pl cannot sort documents when groupsize > 1\n";
        print $out " sortmeta option will be ignored\n\n";
        $sortmeta = undef;
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    foreach my $dir ($importdir, $archivedir) {
        # $dir aliases the real variable, so both are tidied in place
        $dir =~ s/[\\\/]+/\//g;
        $dir =~ s/\/$//;
    }

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out);
    if (scalar(@$pluginfo) == 0) {
        print $out "No plugins were loaded.\n";
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
        print $out "Warning - removing current contents of the archives directory\n";
        print $out " in preparation for the import\n";
        sleep(5); # just in case...
        &util::rm_r ($archivedir);
    }

    if (!$debug) {
        # read the archive information file
        $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new ();
        $archive_info->load_info ($archive_info_filename);

        # create a docsave object to process the documents
        $processor = docsave->new ($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
        $processor->setarchivedir ($archivedir);
        $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
        $processor->set_OIDtype ($OIDtype);
    } else {
        # -debug: documents go to a docprint object instead of docsave,
        # and no archive information file is read or written
        $processor = docprint->new ();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs);

    &plugin::end($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info($archive_info_filename);
    }
    close OUT if $close_out;
}
Note: See TracBrowser for help on using the repository browser.