source: main/trunk/greenstone2/bin/script/g2f-buildcol.pl@ 26183

Last change on this file since 26183 was 26183, checked in by ak19, 12 years ago

Committing changes that Dr Bainbridge and I made to the g2f-import and g2f-buildcol scripts long ago. Note that g2f-import runs both import and export.

  • Property svn:executable set to *
File size: 12.3 KB
Line 
1#!/usr/bin/perl -w
2
BEGIN
{
    # Environment variables that must already be set, checked in this order.
    # Each entry is: variable name, then the two lines written to STDERR
    # before the script exits with status 1.
    my @mandatory = (
        [ 'GSDLHOME',
          "Environment variable GSDLHOME not set.\n",
          " Have you sourced Greenstone's 'setup.bash' file?\n" ],
        [ 'JAVA_HOME',
          "Environment variable JAVA_HOME not set.\n",
          "Needed by Fedora command line scripts.\n" ],
    );

    foreach my $check (@mandatory) {
        my ($var_name, @complaint) = @$check;
        next if defined $ENV{$var_name};

        print STDERR $_ foreach @complaint;
        exit 1;
    }

    # Fedora connection settings: a value already present in the environment
    # always wins; otherwise the default below is installed.
    my %fedora_default = (
        'FEDORA_HOSTNAME'      => "localhost",
        'FEDORA_SERVER_PORT'   => "8080",
        'FEDORA_USER'          => "fedoraAdmin",
        'FEDORA_PASS'          => "fedoraAdmin",
        'FEDORA_PROTOCOL'      => "http",
        'FEDORA_PID_NAMESPACE' => "greenstone",
        'FEDORA_PREFIX'        => "/fedora",
    );

    foreach my $var_name (keys %fedora_default) {
        $ENV{$var_name} = $fedora_default{$var_name}
            unless defined $ENV{$var_name};
    }

    # Put Greenstone's perllib first on the module search path so the
    # 'use' statements that follow resolve against it.
    unshift @INC, $ENV{'GSDLHOME'} . "/perllib/";
}
28
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and vice versa
32no strict 'subs'; # allow barewords (e.g. STDERR) as function arguments
33
34use util;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use cfgread;
39use colcfg;
40
41use g2futil;
42
43use dbutil;
44
# Command-line options understood by this script, in the table format that
# parse2::parse() and the PrintUsage routines are handed further down.
# The Fedora connection options default to the FEDORA_* environment
# variables, which the BEGIN block guarantees are defined by this point.
my $arguments =
    [
      { 'name'      => "verbosity",
        'desc'      => "Level of verbosity generated",
        'type'      => "string",
        'deft'      => '1',
        'reqd'      => "no",
        'hiddengli' => "no" },
      { 'name'      => "hostname",
        'desc'      => "Domain hostname of Fedora server",
        'type'      => "string",
        'deft'      => $ENV{'FEDORA_HOSTNAME'},
        'reqd'      => "no",
        'hiddengli' => "no" },
      { 'name'      => "port",
        'desc'      => "Port that the Fedora server is running on.",
        'type'      => "string",
        'deft'      => $ENV{'FEDORA_SERVER_PORT'},
        'reqd'      => "no",
        'hiddengli' => "no" },
      { 'name'      => "username",
        'desc'      => "Fedora admin username",
        'type'      => "string",
        'deft'      => $ENV{'FEDORA_USER'},
        'reqd'      => "no",
        'hiddengli' => "no" },
      { 'name'      => "password",
        'desc'      => "Fedora admin password",
        'type'      => "string",
        'deft'      => $ENV{'FEDORA_PASS'},
        'reqd'      => "no",
        'hiddengli' => "no" },
      { 'name'      => "protocol",
        'desc'      => "Fedora protocol, e.g. 'http' or 'https'",
        'type'      => "string",
        'deft'      => $ENV{'FEDORA_PROTOCOL'},
        'reqd'      => "no",
        'hiddengli' => "no" },
      { 'name'      => "pidnamespace",
        'desc'      => "Fedora prefix for PIDs",
        'type'      => "string",
        'deft'      => $ENV{'FEDORA_PID_NAMESPACE'},
        'reqd'      => "no",
        'hiddengli' => "no" },
      { 'name'      => "gli",
        'desc'      => "",
        'type'      => "flag",
        'reqd'      => "no",
        'hiddengli' => "yes" },
      { 'name'      => "xml",
        'desc'      => "{scripts.xml}",
        'type'      => "flag",
        'reqd'      => "no",
        'hiddengli' => "yes" },
      { 'name'      => "removeold",
        'desc'      => "{import.removeold}",
        'type'      => "flag",
        'reqd'      => "no",
        'modegli'   => "3" },
      { 'name'      => "language",
        'desc'      => "{scripts.language}",
        'type'      => "string",
        'reqd'      => "no",
        'modegli'   => "3" },
      { 'name'      => "collectdir",
        'desc'      => "{import.collectdir}",
        'type'      => "string",
        'deft'      => "",
        'reqd'      => "no",
        'hiddengli' => "yes" }
    ];

# Program description used when printing usage. The name matches the real
# script file name (g2f-buildcol.pl), which is also the name this script uses
# in its own error messages.
my $prog_options
    = { 'name' => "g2f-buildcol.pl",
        'desc' => "Ingest Greenstone directory of FedoraMETS documents into Fedora",
        'args' => $arguments };
121
122
# Replace the characters that are special in XML text and attribute values
# with their predefined entities, so that an interpolated value (for example
# a URL containing '&') cannot make the generated METS document malformed.
sub _xml_escape
{
    my ($text) = @_;

    $text =~ s/&/&amp;/g;     # must run first so later entities are not re-escaped
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    $text =~ s/'/&apos;/g;

    return $text;
}

# Return the last path component of a collection name, i.e. the name with any
# leading collect-group directories ("group/col" or "group\col") removed.
sub _collection_tail_name
{
    my ($collection) = @_;

    (my $tail = $collection) =~ s{^.*[\\/]}{};
    return $tail;
}

# Build the minimal FedoraMETS document for one URL key.
#   $fedora_pid - used for both OBJID and LABEL
#   $title      - written as the single dc:title value
# Returns the document as a string.
sub _build_key_mets
{
    my ($fedora_pid, $title) = @_;

    my $pid_xml   = _xml_escape($fedora_pid);
    my $title_xml = _xml_escape($title);

    return join("",
        qq{<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n},
        qq{<mets:mets xmlns:mets="http://www.loc.gov/METS/"\n},
        qq{ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n},
        qq{ xmlns:gsdl3="http://www.greenstone.org/namespace/gsdlmetadata/1.0/"\n},
        qq{ xmlns:xlink="http://www.w3.org/1999/xlink"\n},
        qq{ xsi:schemaLocation="http://www.loc.gov/METS/\n},
        qq{ http://www.loc.gov/standards/mets/mets.xsd\n},
        qq{ http://www.greenstone.org/namespace/gsdlmetadata/1.0/\n},
        qq{ http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd"\n},
        qq{ OBJID="$pid_xml"\n},
        qq{ TYPE="FedoraObject" LABEL="$pid_xml" EXT_VERSION="1.1">\n},
        qq{<mets:metsHdr RECORDSTATUS="A"/>\n},
        qq{ <mets:amdSec ID="DC" >\n},
        qq{ <mets:techMD ID="DC.0">\n},
        qq{ <mets:mdWrap LABEL="Metadata" MDTYPE="OTHER" OTHERMDTYPE="gsdl3" ID="DCgsdl1">\n},
        qq{ <mets:xmlData>\n},
        qq{ <oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" >\n},
        qq{ <dc:title>$title_xml</dc:title>\n},
        qq{ </oai_dc:dc>\n},
        qq{ </mets:xmlData>\n},
        qq{ </mets:mdWrap>\n},
        qq{ </mets:techMD>\n},
        qq{ </mets:amdSec>\n},
        qq{</mets:mets>\n});
}

# Ingest every exported document directory into Fedora, one docmets.xml at a
# time. A PID that g2futil::run_datastore_info reports with status 0 is
# treated as already present and is purged before the fresh ingest.
# Exits with status 1 if $export_dir cannot be opened.
sub _ingest_exported_documents
{
    my ($export_dir, $gs_col, $options) = @_;

    my $gli           = $options->{'gli'};
    my $pid_namespace = $options->{'pidnamespace'};

    # opendir is only an accessibility probe (it gives a useful $! on
    # failure); the directory listing itself comes from g2futil.
    my $probe_dh;
    if (!opendir($probe_dh, $export_dir)) {
        print STDERR "Error: Unable to open directory $export_dir: $!\n";
        exit 1;
    }
    closedir($probe_dh);

    foreach my $hash_dir (g2futil::get_all_hash_dirs($export_dir)) {

        my $hash_id = g2futil::get_hash_id($hash_dir);

        if (defined $hash_id) {
            my $pid = "$pid_namespace:$gs_col-$hash_id";

            if (g2futil::run_datastore_info($pid, $options) == 0) {
                print " $pid being updated.\n";
                g2futil::run_purge($pid, $options);
            }
            else {
                print " $pid not present.\n";
            }
        }

        my $docmets_filename = util::filename_cat($hash_dir, "docmets.xml");

        # NOTE(review): the <Build> markers are only emitted under -gli;
        # presumably GLI parses them from STDERR - confirm.
        print STDERR "<Build>\n" if $gli;
        print "Ingesting $docmets_filename\n";
        g2futil::run_ingest($docmets_filename, $options);
        print STDERR "</Build>\n" if $gli;
    }

    return;
}

# Run Greenstone's own buildcol.pl in infodb mode for the collection.
sub _run_greenstone_buildcol
{
    my ($collectdir, $gs_col, $options) = @_;

    my $gs_opts = " -verbosity $options->{'verbosity'}";
    $gs_opts .= " -gli" if ($options->{'gli'});
    $gs_opts .= " -collectdir \"$collectdir\"" if ($collectdir);
    $gs_opts .= " -mode infodb";

    g2futil::run_cmd("buildcol.pl", "$gs_opts $gs_col", $options);

    return;
}

# For every key in the collection's building/text info database that starts
# with "http://", write a small FedoraMETS file into $export_dir whose
# dc:title is the Fedora PID of the document the key refers to, then ingest
# that file. Exits with status 1 if the collection configuration cannot be
# read or a METS file cannot be written.
sub _ingest_url_keys
{
    my ($collectdir, $gs_col, $export_dir, $options) = @_;

    my $gli           = $options->{'gli'};
    my $pid_namespace = $options->{'pidnamespace'};

    # Check up front that the collection configuration is readable so the
    # failure is reported with the script name and $!.
    my $collectcfg = util::filename_cat($collectdir, $gs_col, "etc", "collectionConfig.xml");
    my $cfg_fh;
    if (!open($cfg_fh, '<', $collectcfg)) {
        print STDERR "g2f-buildcol.pl: Unable to open $collectcfg...ERROR: $!\n";
        exit 1;
    }
    close($cfg_fh);

    # For now GS3 is assumed, since that is what the following was written for.
    my $collect_cfg = colcfg::read_collection_cfg($collectcfg, "gs3");
    # The database type may be absent from the configuration file.
    my $infodbtype = $collect_cfg->{'infodbtype'} || dbutil::get_default_infodb_type();

    # The database file is named after the collection's tail name, i.e.
    # without any collect-group prefix.
    my $colname          = _collection_tail_name($gs_col);
    my $building_txt_dir = util::filename_cat($collectdir, $gs_col, "building", "text");
    my $building_txt_db  = dbutil::get_infodb_file_path($infodbtype, $colname, $building_txt_dir);

    my $db_keys = {};
    dbutil::read_infodb_keys($infodbtype, $building_txt_db, $db_keys);

    # Debug dump of the effective options, built once; the Fedora password is
    # masked so it never reaches the log.
    my %loggable = %$options;
    $loggable{'password'} = '********' if defined $loggable{'password'};
    my $options_dump = "#### ".join(",", %loggable)."\n";

    # Sorted so that the order of ingests and log lines is reproducible.
    foreach my $key (sort keys %$db_keys) {
        next unless $key =~ m@^http://@;

        # The record's first 'section' entry is the document OID; turn it
        # into the Fedora PID of that document.
        my $src_rec_string = dbutil::read_infodb_entry($infodbtype, $building_txt_db, $key);
        my $src_rec        = dbutil::convert_infodb_string_to_hash($src_rec_string);
        my $OID_hash_value = "$pid_namespace:$gs_col-".$src_rec->{'section'}->[0];

        # Fedora PIDs cannot carry the '/' and ':' of a URL.
        (my $pid_suffix = $key) =~ s@/@_@g;
        $pid_suffix =~ s@:@-@g;
        my $fedora_pid = "$pid_namespace-http:$gs_col-$pid_suffix";

        # The file name must also be acceptable on Windows.
        my $fedora_key_file_name = $fedora_pid;
        $fedora_key_file_name =~ s@\.@-@g;
        $fedora_key_file_name =~ s/\:/=/g;
        $fedora_key_file_name .= ".xml";
        print STDERR "+++++ fpid: $fedora_pid, fedora-key filename: $fedora_key_file_name\n";

        # The HASHID really should not be the title (duplicate titles make
        # unique searches hard); making the file name the dc.title and the
        # HASHID the dc.identifier is a possible improvement.
        my $contents = _build_key_mets($fedora_pid, $OID_hash_value);

        # The file is purged by g2f-import.pl, so it is left in the export
        # directory after ingest.
        my $fedora_key_file_path = util::filename_cat($export_dir, $fedora_key_file_name);
        my $out_fh;
        if (!open($out_fh, '>', $fedora_key_file_path)) {
            print STDERR "g2f-buildcol.pl: Unable to open $fedora_key_file_path...ERROR: $!\n";
            exit 1;
        }
        print {$out_fh} $contents;
        # Buffered write errors only surface at close, so check it.
        if (!close($out_fh)) {
            print STDERR "g2f-buildcol.pl: Unable to write $fedora_key_file_path...ERROR: $!\n";
            exit 1;
        }

        print STDERR "<Build>\n" if $gli;
        print STDERR "Ingesting $fedora_key_file_name\n";
        print STDERR $options_dump;

        g2futil::run_ingest($fedora_key_file_path, $options);
        print STDERR "</Build>\n" if $gli;
    }

    return;
}

# Entry point: ingest a Greenstone collection's exported FedoraMETS
# documents into Fedora.
#
# Arguments: the command line, i.e. options followed by exactly one
# collection name. Prints usage and returns when the collection name is
# missing; dies after printing usage when option parsing fails; exits with
# status 1 when the collection, its export directory or its configuration
# cannot be accessed.
sub main
{
    my (@args) = @_;

    my $options = {};
    my $args_left = parse2::parse(\@args, $arguments, $options, "allow_extra_options");

    # Something went wrong with parsing
    if ($args_left == -1) {
        PrintUsage::print_txt_usage($prog_options, "[options] greenstone-col");
        die "\n";
    }

    # Exactly one non-option argument (the collection) is expected.
    if ($args_left != 1) {
        if ($options->{'xml'}) {
            PrintUsage::print_xml_usage($prog_options);
        }
        else {
            PrintUsage::print_txt_usage($prog_options, "[options] greenstone-col");
        }
        print "\n";
        return;
    }

    my $gs_col        = $args[0];
    my $pid_namespace = $options->{'pidnamespace'};

    # The following are needed in the FedoraMETS plugout
    $ENV{'FEDORA_HOSTNAME'}    = $options->{'hostname'};
    $ENV{'FEDORA_SERVER_PORT'} = $options->{'port'};

    # Without -collectdir, use the GS3 site's collect directory when
    # GSDL3HOME is set, otherwise GSDLHOME/collect.
    my $collectdir = $options->{'collectdir'};
    if (!$collectdir) {
        if ($ENV{'GSDL3HOME'}) {
            $collectdir = util::filename_cat($ENV{'GSDL3HOME'}, "sites", "localsite", "collect");
        }
        else {
            $collectdir = util::filename_cat($ENV{'GSDLHOME'}, "collect");
        }
    }

    my $full_gs_col = util::filename_cat($collectdir, $gs_col);
    if (!-e $full_gs_col) {
        print STDERR "Unable to find Greenstone collection $full_gs_col\n";
        exit 1;
    }

    my $export_dir = util::filename_cat($full_gs_col, "export");

    print "***\n";
    print "* Ingesting Greenstone processed files into Fedora $pid_namespace\n";
    print "***\n";

    # A single bulk "fedora-ingest dir ..." run falls foul of Schematron rule
    # checking, so every document is ingested individually.
    _ingest_exported_documents($export_dir, $gs_col, $options);

    # can possibly use inexport instead of running buildcol.pl through system()
    print STDERR "**** Just for now, also run Greenstone's buildcol.pl\n";
    _run_greenstone_buildcol($collectdir, $gs_col, $options);

    _ingest_url_keys($collectdir, $gs_col, $export_dir, $options);

    # If successful!!! Then need to think about:
    # [CLX] nodes
    # Doing this with FedoraMETSPlugin

    return;
}

main(@ARGV);
382
383
384
Note: See TracBrowser for help on using the repository browser.