source: main/trunk/greenstone2/bin/script/g2f-buildcol.pl

Last change on this file was 31756, checked in by ak19, 7 years ago

For gs-fedora scripts, check for JRE_HOME as fallback if JAVA_HOME is not set

  • Property svn:executable set to *
File size: 14.6 KB
Line 
1#!/usr/bin/perl -w
2
# Compile-time environment check: verifies the Greenstone and Java
# environment variables, fills in defaults for any unset FEDORA_* settings
# and puts Greenstone's perllib directory at the front of @INC so that the
# 'use' statements further down can be resolved.
BEGIN
{
    # Nothing else can be located without GSDLHOME.
    unless (defined $ENV{'GSDLHOME'}) {
        print STDERR "Environment variable GSDLHOME not set.\n";
        print STDERR " Have you sourced Greenstone's 'setup.bash' file?\n";
        exit 1;
    }

    # Either a JDK or a JRE location is acceptable.
    unless (defined $ENV{'JAVA_HOME'} || defined $ENV{'JRE_HOME'}) {
        print STDERR "Neither JAVA_HOME nor JRE_HOME set.\n";
        print STDERR "Needed by Fedora command line scripts.\n";
        exit 1;
    }

    # Fallback values, applied only to variables the caller has not set.
    my %fedora_defaults = (
        'FEDORA_HOSTNAME'      => "localhost",
        'FEDORA_SERVER_PORT'   => "8080",
        'FEDORA_USER'          => "fedoraAdmin",
        'FEDORA_PASS'          => "fedoraAdmin",
        'FEDORA_PROTOCOL'      => "http",
        'FEDORA_PID_NAMESPACE' => "greenstone",
        'FEDORA_PREFIX'        => "/fedora",
    );

    foreach my $var_name (keys %fedora_defaults) {
        next if defined $ENV{$var_name};
        $ENV{$var_name} = $fedora_defaults{$var_name};
    }

    unshift(@INC, "$ENV{'GSDLHOME'}/perllib/");
}
28
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and vice versa
32no strict 'subs'; # allow barewords (e.g. STDERR) as function arguments
33
34use util;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use cfgread;
39use colcfg;
40
41use g2futil;
42
43use dbutil;
44
# Table of the command-line options this script accepts. It is handed to
# parse2::parse() in main() and, via $prog_options, to the PrintUsage
# routines. Defaults for the Fedora connection options come from the
# FEDORA_* environment variables.
my $arguments = [
    # General
    { name => 'verbosity',    type => 'string', reqd => 'no', hiddengli => 'no',
      deft => '1',
      desc => "Level of verbosity generated" },

    # Fedora connection settings
    { name => 'hostname',     type => 'string', reqd => 'no', hiddengli => 'no',
      deft => $ENV{'FEDORA_HOSTNAME'},
      desc => "Domain hostname of Fedora server" },
    { name => 'port',         type => 'string', reqd => 'no', hiddengli => 'no',
      deft => $ENV{'FEDORA_SERVER_PORT'},
      desc => "Port that the Fedora server is running on." },
    { name => 'username',     type => 'string', reqd => 'no', hiddengli => 'no',
      deft => $ENV{'FEDORA_USER'},
      desc => "Fedora admin username" },
    { name => 'password',     type => 'string', reqd => 'no', hiddengli => 'no',
      deft => $ENV{'FEDORA_PASS'},
      desc => "Fedora admin password" },
    { name => 'protocol',     type => 'string', reqd => 'no', hiddengli => 'no',
      deft => $ENV{'FEDORA_PROTOCOL'},
      desc => "Fedora protocol, e.g. 'http' or 'https'" },
    { name => 'pidnamespace', type => 'string', reqd => 'no', hiddengli => 'no',
      deft => $ENV{'FEDORA_PID_NAMESPACE'},
      desc => "Fedora prefix for PIDs" },

    # Flags hidden from GLI
    { name => 'gli',          type => 'flag',   reqd => 'no', hiddengli => 'yes',
      desc => "" },
    { name => 'xml',          type => 'flag',   reqd => 'no', hiddengli => 'yes',
      desc => "{scripts.xml}" },

    # Options carrying a GLI mode
    { name => 'removeold',    type => 'flag',   reqd => 'no', modegli => '3',
      desc => "{import.removeold}" },
    { name => 'language',     type => 'string', reqd => 'no', modegli => '3',
      desc => "{scripts.language}" },

    # Collection location
    { name => 'collectdir',   type => 'string', reqd => 'no', hiddengli => 'yes',
      deft => "",
      desc => "{import.collectdir}" },
];

# Program description passed to the PrintUsage routines.
my $prog_options = {
    name => 'g2fbuildcol.pl',
    desc => "Ingest Greenstone directory of FedoraMETS documents into Fedora",
    args => $arguments,
};
121
122
# Ingest a Greenstone collection's exported FedoraMETS documents into Fedora.
#
# Arguments: the command-line argument list, "[options] greenstone-col".
#
# Steps, in order:
#   1. Parse the options; print usage and return if the collection name is
#      missing (dies if option parsing itself failed).
#   2. For every hash directory under <collection>/export, purge any existing
#      Fedora object with the same PID (removing it from the GSearch index
#      first when a GSearch webapp is found), then ingest its docmets.xml and
#      update the GSearch index if the ingest returned status 0.
#   3. Run Greenstone's buildcol.pl in infodb mode.
#   4. For every "http://" key in the collection's info database, write a
#      small FedoraMETS file into the export directory and ingest it.
#   5. Create index/buildConfig.xml with a FedoraServiceProxy ServiceRack,
#      unless that file already exists.
#
# Exits with status 1 if the collection, the export directory or a required
# file cannot be found, opened or written.
sub main
{
    my (@ARGV) = @_;

    my $options = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$options,"allow_extra_options");

    # Something went wrong with parsing
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($prog_options, "[options] greenstone-col");
        die "\n";
    }

    my $xml = $options->{'xml'};
    my $gli = $options->{'gli'};

    # Exactly one argument (the collection name) must be left after parsing
    if ($intArgLeftinAfterParsing != 1)
    {
        if ($xml) {
            &PrintUsage::print_xml_usage($prog_options);
        }
        else {
            &PrintUsage::print_txt_usage($prog_options, "[options] greenstone-col");
        }
        print "\n";
        return;
    }

    my $gs_col = $ARGV[0];

    my $verbosity     = $options->{'verbosity'};
    my $hostname      = $options->{'hostname'};
    my $port          = $options->{'port'};
    my $pid_namespace = $options->{'pidnamespace'};

    # The following are needed in the FedoraMETS plugout
    $ENV{'FEDORA_HOSTNAME'} = $hostname;
    $ENV{'FEDORA_SERVER_PORT'} = $port;

    # Default collect directory: the GS3 local site if GSDL3HOME is set,
    # otherwise the GS2 collect folder
    my $collectdir = $options->{'collectdir'};
    if (!$collectdir) {
        if ($ENV{'GSDL3HOME'}) {
            $collectdir = &util::filename_cat($ENV{'GSDL3HOME'},"sites","localsite","collect");
        } else {
            $collectdir = &util::filename_cat($ENV{'GSDLHOME'},"collect");
        }
    }

    my $full_gs_col = &util::filename_cat($collectdir,$gs_col);

    if (!-e $full_gs_col) {
        print STDERR "Unable to find Greenstone collection $full_gs_col\n";
        exit 1;
    }

    my $export_dir = &util::filename_cat($full_gs_col,"export");

    print "***\n";
    print "* Ingesting Greenstone processed files into Fedora $pid_namespace\n";
    print "***\n";

    # A single bulk "fedora-ingest dir $export_dir ..." call falls foul of
    # Schematron rule checking
    # => Ingest individually!

    # set up fedoragsearch for updating the index upon ingesting documents
    my $fedoragsearch_webapp = &g2futil::gsearch_webapp_folder();

    # need the username and password preset in order to run fedoraGSearch's RESTClient script
    # this assumes that the fedoragsearch authentication details are the same as for fedora
    if (defined $fedoragsearch_webapp) {
        $ENV{'fgsUserName'} = $options->{'username'};
        $ENV{'fgsPassword'} = $options->{'password'};
    }

    # The directory is opened only to confirm it is readable; the hash
    # directories themselves are collected by g2futil.
    my $export_dh;
    if (!opendir($export_dh, $export_dir)) {
        print STDERR "Error: Unable to open directory $export_dir: $!\n";
        exit 1;
    }
    closedir($export_dh);

    my @hash_dirs = &g2futil::get_all_hash_dirs($export_dir);

    foreach my $hd (@hash_dirs) {

        my $hash_id = &g2futil::get_hash_id($hd);

        # The Fedora PID is only known when the directory yields a hash id
        my $pid;
        $pid = "$pid_namespace:$gs_col-$hash_id" if (defined $hash_id);

        # purge any object already stored under this PID
        if (defined $pid) {
            my $dsinfo_status = &g2futil::run_datastore_info($pid,$options);

            if ($dsinfo_status == 0) {
                # first remove the doc from the gsearch index before removing it from the fedora repository
                print " deleting $pid from GSearch index\n";
                &g2futil::run_delete_from_index($fedoragsearch_webapp,$pid,$options) if defined $fedoragsearch_webapp;

                print " $pid being updated.\n";
                &g2futil::run_purge($pid,$options);
            }
            else {
                print " $pid not present.\n";
            }
        }

        my $docmets_filename = &util::filename_cat($hd,"docmets.xml");

        print STDERR "<Build>\n" if $gli;

        print "Ingesting $docmets_filename\n";

        my $status = &g2futil::run_ingest($docmets_filename,$options);

        # if the document was ingested into Fedora successfully, index it with GSearch next
        if ($status == 0 && defined $pid) {
            # now update the fedoragsearch index with the newly ingested document
            &g2futil::run_update_index($fedoragsearch_webapp,$pid,$options) if defined $fedoragsearch_webapp;
        }

        print STDERR "</Build>\n" if $gli;
    }

    # can possibly use inexport instead of running buildcol.pl through system()
    print STDERR "**** Just for now, also run Greenstone's buildcol.pl\n";

    my $gs_opts = " -verbosity $verbosity";
    $gs_opts .= " -gli" if ($gli);
    $gs_opts .= " -collectdir \"$collectdir\"" if ($collectdir);
    $gs_opts .= " -mode infodb";

    my $gs_buildcol_arguments = "$gs_opts $gs_col";

    &g2futil::run_cmd("buildcol.pl", $gs_buildcol_arguments, $options);

    # read in collect cfg file to work out db type
    my $collectcfg = &util::filename_cat ($collectdir, $gs_col, "etc", "collectionConfig.xml");

    # Opened only to fail early, with the system error, if it is unreadable
    my $cfg_fh;
    unless (open($cfg_fh, '<', $collectcfg)) {
        print STDERR "g2f-buildcol.pl: Unable to open $collectcfg...ERROR: $!\n";
        exit 1;
    }
    close($cfg_fh);

    # for now we assume GS3, since that's what the following gets implemented for
    my $collect_cfg = &colcfg::read_collection_cfg ($collectcfg, "gs3");
    # get the database type for this collection from its configuration file (may be undefined)
    my $infodbtype = $collect_cfg->{'infodbtype'} || &dbutil::get_default_infodb_type();

    # open the database file in building/text, using dbutil;
    # any collect group is removed from the collection name to get the tailname
    my $colname = _collection_tail($gs_col);

    my $building_txt_dir = &util::filename_cat ($collectdir, $gs_col, "building", "text");
    my $building_txt_db = &dbutil::get_infodb_file_path($infodbtype, "$colname", $building_txt_dir);

    # foreach key that matches http://dir1/dir2/....file.xxx
    my $db_keys = {};
    &dbutil::read_infodb_keys($infodbtype,$building_txt_db, $db_keys);

    foreach my $key (keys %$db_keys) {
        next unless ($key =~ m@^http://@);

        # get value for the key
        my $src_rec = &dbutil::read_infodb_entry($infodbtype,$building_txt_db, $key);
        my $OID_hash_value = $src_rec->{'section'}->[0];
        $OID_hash_value = "$pid_namespace:$gs_col-".$OID_hash_value; # convert to fedoraPID

        # its fedora pid = "greenstone-http:$colname-http:||dir|file.xxx"
        # except that fedorapids don't like extra colons and don't like |
        # CAN'T HAVE | OR : (as in "http:||one|two.html") in fedoraPID
        my $pid_key = $key;
        $pid_key =~ s@/@_@g;
        $pid_key =~ s@:@-@g;
        my $fedora_pid = "$pid_namespace-http:$gs_col-$pid_key";

        # To run fedora ingest on the new file need to have sensible
        # filenames that won't offend windows
        my $fedora_key_file_name = "$fedora_pid";
        $fedora_key_file_name =~ s@\.@-@g;
        $fedora_key_file_name =~ s/\:/=/g;
        $fedora_key_file_name .= ".xml";

        # write out a FedoraMets File for this key
        # -> it has one metadata value, which is 'dc:title' = HASHxxxxxx
        #
        # The HASHID shouldn't be the title: then will have
        # duplicate titles and it will be hard to search for
        # unique ones. What about making the filename the
        # dc.title and the HASHID the dc.identifier
        my $contents = _key_mets_xml($fedora_pid, $OID_hash_value);

        # write out the file and then run fedora ingest on that file
        # The file gets purged in g2f-import.pl, so don't remove it from export dir now
        my $fedora_key_file_path = &util::filename_cat($export_dir, $fedora_key_file_name);
        _write_text_file($fedora_key_file_path, $contents);

        print STDERR "<Build>\n" if $gli;
        print STDERR "Ingesting $fedora_key_file_name\n";

        &g2futil::run_ingest($fedora_key_file_path,$options);
        print STDERR "</Build>\n" if $gli;
    }

    # If successful!!! Then need to think about:
    #   [CLX] nodes
    #   Doing this with FedoraMETSPlugin

    # for the Greenstone reader interface to make the new Fedora collection available,
    # need to write out buildConfig.xml with FedoraServiceProxy as a new ServiceRack element
    # Kathy thinks it's better to create a buildConfig.xml than put it in collectionConfig.xml

    my $indexdir = &util::filename_cat ($collectdir, $gs_col, "index");
    &util::mk_dir($indexdir) unless &util::dir_exists($indexdir);

    my $buildcfg = &util::filename_cat ($indexdir, "buildConfig.xml");
    if (-e $buildcfg) {
        print STDERR "***** $buildcfg already exists for this fedora collection.\n";
        print STDERR "***** Not modifying it to insert a FedoraServiceProxy ServiceRack.\n";
    }
    else { # or do I just have a template buildConfig.xml that I copy over?

        my $contents = "<buildConfig>\n";
        $contents .= " <metadataList/>\n";
        $contents .= " <serviceRackList>\n";
        $contents .= " <serviceRack name=\"FedoraServiceProxy\" />\n";
        $contents .= " </serviceRackList>\n";
        $contents .= "</buildConfig>\n";

        _write_text_file($buildcfg, $contents);
    }
}

# Return the last path component of a collection name, i.e. the name with
# any leading collect group ("group/col" or "group\col") removed. A name
# without a separator is returned unchanged.
sub _collection_tail
{
    my ($collection) = @_;

    $collection =~ s/[\\\/]+$//;   # ignore trailing separators
    $collection =~ s/^.*[\\\/]//;  # drop everything up to the last separator

    return $collection;
}

# Build the FedoraMETS document for one "http://" database key: an object
# whose OBJID and LABEL are $fedora_pid and whose only metadata value is a
# dc:title holding $title.
# NOTE(review): both values are inserted verbatim; a key containing XML
# special characters such as '&' would give malformed XML - confirm whether
# such keys can occur.
sub _key_mets_xml
{
    my ($fedora_pid, $title) = @_;

    my $contents = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
    $contents .= "<mets:mets xmlns:mets=\"http://www.loc.gov/METS/\"\n";
    $contents .= " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
    $contents .= " xmlns:gsdl3=\"http://www.greenstone.org/namespace/gsdlmetadata/1.0/\"\n";
    $contents .= " xmlns:xlink=\"http://www.w3.org/1999/xlink\"\n";
    $contents .= " xsi:schemaLocation=\"http://www.loc.gov/METS/\n";
    $contents .= " http://www.loc.gov/standards/mets/mets.xsd\n";
    $contents .= " http://www.greenstone.org/namespace/gsdlmetadata/1.0/\n";
    $contents .= " http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd\"\n";
    $contents .= " OBJID=\"$fedora_pid\"\n";
    $contents .= " TYPE=\"FedoraObject\" LABEL=\"$fedora_pid\" EXT_VERSION=\"1.1\">\n";
    $contents .= "<mets:metsHdr RECORDSTATUS=\"A\"/>\n";
    $contents .= " <mets:amdSec ID=\"DC\" >\n";
    $contents .= " <mets:techMD ID=\"DC.0\">\n";
    $contents .= " <mets:mdWrap LABEL=\"Metadata\" MDTYPE=\"OTHER\" OTHERMDTYPE=\"gsdl3\" ID=\"DCgsdl1\">\n";
    $contents .= " <mets:xmlData>\n";
    $contents .= " <oai_dc:dc xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\" >\n";
    $contents .= " <dc:title>$title</dc:title>\n";
    $contents .= " </oai_dc:dc>\n";
    $contents .= " </mets:xmlData>\n";
    $contents .= " </mets:mdWrap>\n";
    $contents .= " </mets:techMD>\n";
    $contents .= " </mets:amdSec>\n";
    $contents .= "</mets:mets>\n";

    return $contents;
}

# Write $contents to the file $path, replacing any existing file. Prints an
# error and exits with status 1 if the file cannot be opened, or if closing
# it reports a failure (buffered write errors surface at close).
sub _write_text_file
{
    my ($path, $contents) = @_;

    my $out_fh;
    unless (open($out_fh, '>', $path)) {
        print STDERR "g2f-buildcol.pl: Unable to open $path...ERROR: $!\n";
        exit 1;
    }
    print $out_fh $contents;
    unless (close($out_fh)) {
        print STDERR "g2f-buildcol.pl: Unable to write $path...ERROR: $!\n";
        exit 1;
    }
}
432
# Script entry point: run main() on the command-line arguments.
main(@ARGV);
434
435
436
Note: See TracBrowser for help on using the repository browser.