source: gs2-extensions/parallel-building/trunk/src/bin/script/parallel_import.pl@ 28768

Last change on this file since 28768 was 28768, checked in by jmt12, 10 years ago

Initially added microtime to this script, but then remembered it isn't actually used any more. Added message about being obsolete - and then changed file format to utf-8-unix, thus forever obscuring any other changes I've made. sigh. Oh well, I'm going to remove this as soon as I get it up-to-date

  • Property svn:executable set to *
File size: 8.8 KB
Line 
1#!/usr/bin/perl -w
2
3###############################################################################
4#
5# parallel_import.pl -- A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the University of Waikato,
7# New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify it under
12# the terms of the GNU General Public License as published by the Free Software
13# Foundation; either version 2 of the License, or (at your option) any later
14# version.
15#
16# This program is distributed in the hope that it will be useful, but WITHOUT
17# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
19# details.
20#
21# You should have received a copy of the GNU General Public License along with
22# this program; if not, write to the Free Software Foundation, Inc., 675 Mass
23# Ave, Cambridge, MA 02139, USA.
24#
25###############################################################################
26
27# This program will import a number of files into a particular collection
28
29package parallel_import;
30
# Compile-time setup: put the Greenstone Perl library directories (and those
# of any enabled extensions) at the front of @INC so that the 'use' statements
# later in this file can be resolved.
#
# Environment variables read:
#   GSDLHOME     - required; the script dies with "GSDLHOME not set" without it
#   GSDLEXTS     - optional, colon-separated extension names under
#                  $GSDLHOME/ext
#   GSDL3EXTS    - optional, colon-separated extension names under
#                  $GSDL3SRCHOME/ext
#   GSDL3SRCHOME - needed only when GSDL3EXTS is set
BEGIN
{
    if (!defined $ENV{'GSDLHOME'})
    {
        die("GSDLHOME not set\n");
    }

    # Sub-directories registered beneath every library root. Each one is
    # unshifted individually and in this order, so the last sub-directory of
    # the last root processed ends up first in @INC.
    my @lib_subdirs = ('perllib', 'perllib/cpan', 'perllib/plugins', 'perllib/plugouts');

    my $add_library_root = sub
    {
        my ($root) = @_;
        foreach my $subdir (@lib_subdirs)
        {
            unshift(@INC, $root . '/' . $subdir);
        }
    };

    $add_library_root->($ENV{'GSDLHOME'});

    if (defined $ENV{'GSDLEXTS'})
    {
        foreach my $e (split(/:/, $ENV{'GSDLEXTS'}))
        {
            $add_library_root->($ENV{'GSDLHOME'} . '/ext/' . $e);
        }
    }

    # We may want to make use of GSDL3 Extensions too, such as Solr
    if (defined $ENV{'GSDL3EXTS'})
    {
        if (defined $ENV{'GSDL3SRCHOME'})
        {
            foreach my $e (split(/:/, $ENV{'GSDL3EXTS'}))
            {
                $add_library_root->($ENV{'GSDL3SRCHOME'} . '/ext/' . $e);
            }
        }
        else
        {
            # Without GSDL3SRCHOME the extension prefix would collapse to the
            # root-relative "/ext/<name>", so skip rather than add bogus
            # directories to @INC.
            warn("GSDL3EXTS is set but GSDL3SRCHOME is not - ignoring GSDL3 extensions\n");
        }
    }
}
68
69use strict;
70
71use Time::HiRes qw( gettimeofday tv_interval );
72use inexport;
73
# Values offered by the 'OIDtype' enum argument below.
my $oidtype_list = [
    { name => 'hash',        desc => '{import.OIDtype.hash}' },
    { name => 'assigned',    desc => '{import.OIDtype.assigned}' },
    { name => 'incremental', desc => '{import.OIDtype.incremental}' },
    { name => 'dirname',     desc => '{import.OIDtype.dirname}' },
];

# Values offered by the 'saveas' enum argument; used to control the output
# file format.
my $saveas_list = [
    { name => 'GreenstoneXML',  desc => '{export.saveas.GreenstoneXML}' },
    { name => 'GreenstoneMETS', desc => '{export.saveas.GreenstoneMETS}' },
];

# Keys that may appear in an argument description:
#   name      - the name of the argument
#   desc      - a description (more likely a reference to a description)
#   type      - the control used to represent the argument: string, int,
#               flag, regexp, metadata, language, enum etc
#   reqd      - is this argument required?
#   hiddengli - is this argument hidden in GLI?
#   modegli   - the lowest detail mode this argument is visible at in GLI

my $saveas_argument = {
    name    => 'saveas',
    desc    => '{import.saveas}',
    type    => 'enum',
    list    => $saveas_list,
    deft    => 'GreenstoneXML',
    reqd    => 'no',
    modegli => '3',
};

my $arguments = [
    $saveas_argument,
    {
        name      => 'archivedir',
        desc      => '{import.archivedir}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'importdir',
        desc      => '{import.importdir}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        # Default deliberately left empty (parsearg left "" as default); the
        # alternative once considered was
        # &util::filename_cat ($ENV{'GSDLHOME'}, "collect").
        name      => 'collectdir',
        desc      => '{import.collectdir}',
        type      => 'string',
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'site',
        desc      => '{import.site}',
        type      => 'string',
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'manifest',
        desc      => '{import.manifest}',
        type      => 'string',
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'debug',
        desc      => '{import.debug}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        # Default deliberately left empty (parsearg left "" as default); the
        # alternative once considered was
        # &util::filename_cat("<collectdir>", "colname", "etc", "fail.log").
        name    => 'faillog',
        desc    => '{import.faillog}',
        type    => 'string',
        deft    => '',
        reqd    => 'no',
        modegli => '3',
    },
    {
        name      => 'incremental',
        desc      => '{import.incremental}',
        type      => 'flag',
        hiddengli => 'yes',
    },
    {
        name      => 'keepold',
        desc      => '{import.keepold}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'removeold',
        desc      => '{import.removeold}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'language',
        desc      => '{scripts.language}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        # No 'deft' on purpose (parsearg left "" as default; "-1" was the
        # alternative once considered).
        name    => 'maxdocs',
        desc    => '{import.maxdocs}',
        type    => 'int',
        range   => '1,',
        reqd    => 'no',
        modegli => '1',
    },
    {
        # No 'deft' on purpose: don't set the default to hash - the value
        # should be allowed to come from collect.cfg, while still being
        # overridable here.
        name    => 'OIDtype',
        desc    => '{import.OIDtype}',
        type    => 'enum',
        list    => $oidtype_list,
        reqd    => 'no',
        modegli => '2',
    },
    {
        # Typed 'string' because 'metadata' doesn't work properly in GLI.
        # No 'deft' on purpose (parsearg left "" as default; "dc.Identifier"
        # was the alternative once considered).
        name    => 'OIDmetadata',
        desc    => '{import.OIDmetadata}',
        type    => 'string',
        reqd    => 'no',
        modegli => '2',
    },
    {
        name      => 'out',
        desc      => '{import.out}',
        type      => 'string',
        deft      => 'STDERR',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        # Typed 'string' because 'metadata' doesn't work properly in GLI.
        name    => 'sortmeta',
        desc    => '{import.sortmeta}',
        type    => 'string',
        reqd    => 'no',
        modegli => '2',
    },
    {
        name    => 'removeprefix',
        desc    => '{BasClas.removeprefix}',
        type    => 'regexp',
        deft    => '',
        reqd    => 'no',
        modegli => '3',
    },
    {
        name    => 'removesuffix',
        desc    => '{BasClas.removesuffix}',
        type    => 'regexp',
        deft    => '',
        reqd    => 'no',
        modegli => '3',
    },
    {
        name    => 'groupsize',
        desc    => '{import.groupsize}',
        type    => 'int',
        deft    => '1',
        reqd    => 'no',
        modegli => '2',
    },
    {
        name    => 'gzip',
        desc    => '{import.gzip}',
        type    => 'flag',
        reqd    => 'no',
        modegli => '3',
    },
    {
        name      => 'statsfile',
        desc      => '{import.statsfile}',
        type      => 'string',
        deft      => 'STDERR',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        # No 'deft' on purpose (parsearg left "" as default; "2" was the
        # alternative once considered).
        name    => 'verbosity',
        desc    => '{import.verbosity}',
        type    => 'int',
        range   => '0,',
        reqd    => 'no',
        modegli => '3',
    },
    {
        name      => 'gli',
        desc      => '{scripts.gli}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'xml',
        desc      => '{scripts.xml}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    # jobs and epoch added for parallel processing
    # [hs, 1 july 2010]
    {
        name      => 'epoch',
        desc      => '{import.epoch}',
        type      => 'int',
        range     => '1,',
        deft      => '1',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'jobs',
        desc      => '{import.jobs}',
        type      => 'int',
        range     => '1,',
        deft      => '1',
        reqd      => 'no',
        hiddengli => 'yes',
    },
];

# Top-level description of the script handed to inexport in main().
my $options = {
    name => 'import.pl',
    desc => '{import.desc}',
    args => $arguments,
};
282
## @function main()
#
# Entry point of the (obsolete) parallel import script. Prints a timestamped
# start banner and an obsolescence warning to STDOUT, builds an inexport
# object from @ARGV and the file-level $options table, reads the collection
# configuration, processes the files, generates statistics, and finally prints
# a timestamped completion line including the elapsed wall-clock time.
#
# Takes no arguments; its return value is not used by the caller.
#
sub main
{
    my $start_time = [gettimeofday()];
    # gettimeofday() yields (seconds, microseconds). The microsecond part has
    # to be zero-padded to six digits, otherwise e.g. 5000us would be printed
    # as ".5000" and read as half a second.
    printf("[%d.%06d] Parallel Import Started\n", @{$start_time});

    print "Warning! Script is obsolete - use: import.pl -workers <int> [-batchsize <int>] <collection>\n";

    my $inexport = inexport->new("import", \@ARGV, $options);

    # get the collection configuration (we'll need it to init the plugins)
    my $collection = $inexport->get_collection();
    my ($config_filename, $collect_cfg) = $inexport->read_collection_cfg($collection, $options);
    $inexport->set_collection_options($collect_cfg);

    my $pluginfo = $inexport->process_files($config_filename, $collect_cfg);

    $inexport->generate_statistics($pluginfo);

    # Done
    my $end_time = [gettimeofday()];
    my $duration = tv_interval($start_time, $end_time);
    printf("[%d.%06d] Parallel Import Complete in: %0.6f seconds\n", @{$end_time}, $duration);
}
## main() ##
309
# Run the import as soon as the script is loaded.
main();

# Finish the file with a true value.
1;
Note: See TracBrowser for help on using the repository browser.