source: main/trunk/greenstone2/bin/script/parallel_import.pl@ 22642

Last change on this file since 22642 was 22446, checked in by davidb, 14 years ago

Incorporation of top-level parallel_import functionality into code base.

  • Property svn:executable set to *
File size: 8.4 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# parallel_import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package parallel_import;
32
# Compile-time setup of the module search path.
#
# Dies unless both GSDLHOME and GSDLOS are set in the environment, then
# prepends the Greenstone library directories (perllib, perllib/cpan,
# perllib/plugins, perllib/plugouts) to @INC: first those directly under
# GSDLHOME, then those of every extension named in the colon-separated
# GSDLEXTS list (under GSDLHOME/ext) and GSDL3EXTS list (under
# GSDL3SRCHOME/ext).  Every directory is unshifted individually, so
# directories added later end up earlier in @INC.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};

    # Sub-directories added beneath every prefix; the order determines
    # the resulting @INC order and must not be changed.
    my @lib_subdirs = ('perllib',
                       'perllib/cpan',
                       'perllib/plugins',
                       'perllib/plugouts');

    # Prefixes in the order they are to be processed.
    my @prefixes = ($ENV{'GSDLHOME'});

    if (defined $ENV{'GSDLEXTS'}) {
        foreach my $ext (split(/:/, $ENV{'GSDLEXTS'})) {
            push(@prefixes, "$ENV{'GSDLHOME'}/ext/$ext");
        }
    }

    if (defined $ENV{'GSDL3EXTS'}) {
        if (defined $ENV{'GSDL3SRCHOME'}) {
            foreach my $ext (split(/:/, $ENV{'GSDL3EXTS'})) {
                push(@prefixes, "$ENV{'GSDL3SRCHOME'}/ext/$ext");
            }
        }
        else {
            # Without GSDL3SRCHOME the paths would be built from undef,
            # giving uninitialized-value warnings and root-relative
            # "/ext/..." entries in @INC; skip them and say why instead.
            warn "GSDL3EXTS is set but GSDL3SRCHOME is not; "
                . "ignoring GSDL3 extensions\n";
        }
    }

    foreach my $prefix (@prefixes) {
        foreach my $subdir (@lib_subdirs) {
            unshift(@INC, "$prefix/$subdir");
        }
    }
}
64
65use strict;
66
67use inexport;
68
# Values offered for the OIDtype option (used as its 'list' attribute).
# Each entry pairs the value's name with a reference to its description.
my $oidtype_list = [
    { name => "hash",        desc => "{import.OIDtype.hash}" },
    { name => "assigned",    desc => "{import.OIDtype.assigned}" },
    { name => "incremental", desc => "{import.OIDtype.incremental}" },
    { name => "dirname",     desc => "{import.OIDtype.dirname}" },
];
78
79
# Values offered for the saveas option, which controls the output file
# format.  Each entry pairs the value's name with a reference to its
# description.
my $saveas_list = [
    { name => "GreenstoneXML",  desc => "{export.saveas.GreenstoneXML}" },
    { name => "GreenstoneMETS", desc => "{export.saveas.GreenstoneMETS}" },
];
87
88
89# Possible attributes for each argument
90# name: The name of the argument
91# desc: A description (or more likely a reference to a description) for this argument
92# type: The type of control used to represent the argument. Options include: string, int, flag, regexp, metadata, language, enum etc
93# reqd: Is this argument required?
94# hiddengli: Is this argument hidden in GLI?
95# modegli: The lowest detail mode this argument is visible at in GLI
96
# Description of the "saveas" option: an enum over $saveas_list that
# defaults to "GreenstoneXML".  It is the first entry of $arguments.
my $saveas_argument = {
    name    => "saveas",
    desc    => "{import.saveas}",
    type    => "enum",
    list    => $saveas_list,
    deft    => "GreenstoneXML",
    reqd    => "no",
    modegli => "3",
};
105
106
# Full list of command-line options understood by this script, one hash
# per option.  The order of the entries is kept exactly as it was.
my $arguments = [
    $saveas_argument,
    {
        name      => "archivedir",
        desc      => "{import.archivedir}",
        type      => "string",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name      => "importdir",
        desc      => "{import.importdir}",
        type      => "string",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name      => "collectdir",
        desc      => "{import.collectdir}",
        type      => "string",
        # parsearg left "" as default
        #'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        deft      => "",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name      => "site",
        desc      => "{import.site}",
        type      => "string",
        deft      => "",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name      => "manifest",
        desc      => "{import.manifest}",
        type      => "string",
        deft      => "",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name      => "debug",
        desc      => "{import.debug}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name    => "faillog",
        desc    => "{import.faillog}",
        type    => "string",
        # parsearg left "" as default
        #'deft' => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
        deft    => "",
        reqd    => "no",
        modegli => "3",
    },
    {
        name      => "incremental",
        desc      => "{import.incremental}",
        type      => "flag",
        hiddengli => "yes",
    },
    {
        name      => "keepold",
        desc      => "{import.keepold}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name      => "removeold",
        desc      => "{import.removeold}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name      => "language",
        desc      => "{scripts.language}",
        type      => "string",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name    => "maxdocs",
        desc    => "{import.maxdocs}",
        type    => "int",
        reqd    => "no",
        # parsearg left "" as default
        #'deft' => "-1",
        range   => "1,",
        modegli => "1",
    },
    # don't set the default to hash - want to allow this to come from
    # entry in collect.cfg but want to override it here
    {
        name    => "OIDtype",
        desc    => "{import.OIDtype}",
        type    => "enum",
        list    => $oidtype_list,
        # parsearg left "" as default
        #'deft' => "hash",
        reqd    => "no",
        modegli => "2",
    },
    {
        name    => "OIDmetadata",
        desc    => "{import.OIDmetadata}",
        type    => "string",
        #'type' => "metadata", #doesn't work properly in GLI
        # parsearg left "" as default
        #'deft' => "dc.Identifier",
        reqd    => "no",
        modegli => "2",
    },
    {
        name      => "out",
        desc      => "{import.out}",
        type      => "string",
        deft      => "STDERR",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name    => "sortmeta",
        desc    => "{import.sortmeta}",
        type    => "string",
        #'type' => "metadata", #doesn't work properly in GLI
        reqd    => "no",
        modegli => "2",
    },
    {
        name    => "removeprefix",
        desc    => "{BasClas.removeprefix}",
        type    => "regexp",
        deft    => "",
        reqd    => "no",
        modegli => "3",
    },
    {
        name    => "removesuffix",
        desc    => "{BasClas.removesuffix}",
        type    => "regexp",
        deft    => "",
        reqd    => "no",
        modegli => "3",
    },
    {
        name    => "groupsize",
        desc    => "{import.groupsize}",
        type    => "int",
        deft    => "1",
        reqd    => "no",
        modegli => "2",
    },
    {
        name    => "gzip",
        desc    => "{import.gzip}",
        type    => "flag",
        reqd    => "no",
        modegli => "3",
    },
    {
        name      => "statsfile",
        desc      => "{import.statsfile}",
        type      => "string",
        deft      => "STDERR",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name    => "verbosity",
        desc    => "{import.verbosity}",
        type    => "int",
        range   => "0,",
        # parsearg left "" as default
        #'deft' => "2",
        reqd    => "no",
        modegli => "3",
    },
    {
        name      => "gli",
        desc      => "{scripts.gli}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name      => "xml",
        desc      => "{scripts.xml}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
    # jobs and epoch added for parallel processing
    # [hs, 1 july 2010]
    {
        name      => "epoch",
        desc      => "{import.epoch}",
        type      => "int",
        range     => "1,",
        deft      => "1",
        reqd      => "no",
        hiddengli => "yes",
    },
    {
        name      => "jobs",
        desc      => "{import.jobs}",
        type      => "int",
        range     => "1,",
        deft      => "1",
        reqd      => "no",
        hiddengli => "yes",
    },
];
273
# Top-level description of this script (name, description reference and
# argument list); main() passes it to the inexport constructor and to
# read_collection_cfg().
my $options = {
    name => "import.pl",
    desc => "{import.desc}",
    args => $arguments,
};
277
278
279
280
281
# Entry point of the script.
#
# Creates an inexport object in "import" mode from @ARGV and $options,
# asks it for the collection, reads that collection's configuration and
# applies it, then processes the files and generates statistics from the
# plugin information that process_files() returns.
sub main
{
    # Direct method call rather than the indirect object form
    # "new inexport(...)", which Perl can mis-parse.
    my $inexport = inexport->new("import", \@ARGV, $options);

    my $collection = $inexport->get_collection();
    my ($config_filename, $collect_cfg)
        = $inexport->read_collection_cfg($collection, $options);
    $inexport->set_collection_options($collect_cfg);

    my $pluginfo = $inexport->process_files($config_filename, $collect_cfg);

    $inexport->generate_statistics($pluginfo);
}

main();
Note: See TracBrowser for help on using the repository browser.