source: main/trunk/greenstone2/bin/script/g2f-buildcol.pl@ 26324

Last change on this file since 26324 was 26324, checked in by ak19, 12 years ago
  1. When purging documents from the fedora repository during import and buildcol, these documents are also removed from the Fedora GSearch index. Likewise, when documents are ingested into the Fedora repository during buildcol, these documents are also indexed by Fedora GSearch. 2. Added methods in g2futil.pm to be able to do this
  • Property svn:executable set to *
File size: 14.6 KB
Line 
1#!/usr/bin/perl -w
2
BEGIN
{
    # Greenstone must have been set up in this shell, otherwise the
    # perllib directory added to @INC below cannot be located.
    unless (defined $ENV{'GSDLHOME'}) {
        print STDERR "Environment variable GSDLHOME not set.\n";
        print STDERR " Have you sourced Greenstone's 'setup.bash' file?\n";
        exit 1;
    }

    # The Fedora command line scripts need a Java installation.
    unless (defined $ENV{'JAVA_HOME'}) {
        print STDERR "Environment variable JAVA_HOME not set.\n";
        print STDERR "Needed by Fedora command line scripts.\n";
        exit 1;
    }

    # Fallback Fedora connection settings. A value already present in the
    # environment always wins; only undefined variables are filled in.
    my %fedora_default_for = (
        'FEDORA_HOSTNAME'      => "localhost",
        'FEDORA_SERVER_PORT'   => "8080",
        'FEDORA_USER'          => "fedoraAdmin",
        'FEDORA_PASS'          => "fedoraAdmin",
        'FEDORA_PROTOCOL'      => "http",
        'FEDORA_PID_NAMESPACE' => "greenstone",
        'FEDORA_PREFIX'        => "/fedora",
    );

    foreach my $env_name (keys %fedora_default_for) {
        next if defined $ENV{$env_name};
        $ENV{$env_name} = $fedora_default_for{$env_name};
    }

    # Make the Greenstone Perl modules loadable by the 'use' lines below.
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib/");
}
28
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and vice versa
32no strict 'subs'; # allow barewords (e.g. STDERR) as function arguments
33
34use util;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use cfgread;
39use colcfg;
40
41use g2futil;
42
43use dbutil;
44
# Command line options understood by this script, in the structure that
# parse2::parse() and the PrintUsage routines are handed further down.
# The Fedora connection options default to the FEDORA_* environment
# variables, which the BEGIN block above guarantees are defined.
my $arguments =
    [
      { 'name' => "verbosity",
        'desc' => "Level of verbosity generated",
        'type' => "string",
        'deft' => '1',
        'reqd' => "no",
        'hiddengli' => "no" },
      { 'name' => "hostname",
        'desc' => "Domain hostname of Fedora server",
        'type' => "string",
        'deft' => $ENV{'FEDORA_HOSTNAME'},
        'reqd' => "no",
        'hiddengli' => "no" },
      { 'name' => "port",
        'desc' => "Port that the Fedora server is running on.",
        'type' => "string",
        'deft' => $ENV{'FEDORA_SERVER_PORT'},
        'reqd' => "no",
        'hiddengli' => "no" },
      { 'name' => "username",
        'desc' => "Fedora admin username",
        'type' => "string",
        'deft' => $ENV{'FEDORA_USER'},
        'reqd' => "no",
        'hiddengli' => "no" },
      { 'name' => "password",
        'desc' => "Fedora admin password",
        'type' => "string",
        'deft' => $ENV{'FEDORA_PASS'},
        'reqd' => "no",
        'hiddengli' => "no" },
      { 'name' => "protocol",
        'desc' => "Fedora protocol, e.g. 'http' or 'https'",
        'type' => "string",
        'deft' => $ENV{'FEDORA_PROTOCOL'},
        'reqd' => "no",
        'hiddengli' => "no" },
      { 'name' => "pidnamespace",
        'desc' => "Fedora prefix for PIDs",
        'type' => "string",
        'deft' => $ENV{'FEDORA_PID_NAMESPACE'},
        'reqd' => "no",
        'hiddengli' => "no" },
      { 'name' => "gli",
        'desc' => "",
        'type' => "flag",
        'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "xml",
        'desc' => "{scripts.xml}",
        'type' => "flag",
        'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "removeold",
        'desc' => "{import.removeold}",
        'type' => "flag",
        'reqd' => "no",
        'modegli' => "3" },
      { 'name' => "language",
        'desc' => "{scripts.language}",
        'type' => "string",
        'reqd' => "no",
        'modegli' => "3" },
      { 'name' => "collectdir",
        'desc' => "{import.collectdir}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no",
        'hiddengli' => "yes" }
    ];

# Program description used when printing usage information. The name is
# the script's real file name (with the hyphen), so that the usage text
# matches what the user has to type and the "g2f-buildcol.pl: ..." error
# messages printed by main().
my $prog_options
    = { 'name' => "g2f-buildcol.pl",
        'desc' => "Ingest Greenstone directory of FedoraMETS documents into Fedora",
        'args' => $arguments };
121
122
# Return the tail of a (possibly grouped) collection name: for
# "group/col" or "group\col" this is "col". A name that contains no
# path separator is returned unchanged.
sub _collection_tailname
{
    my ($gs_col) = @_;

    my $tailname = $gs_col;
    $tailname =~ s/[\\\/]+$//;    # ignore any trailing separators
    $tailname =~ s/^.*[\\\/]//;   # drop everything up to the last separator

    return $tailname;
}

# Write $contents to the file $path, replacing any existing file.
# Returns 1 on success and 0 on failure, in which case $! holds the
# reason. Buffered write errors only surface at close(), so that is
# checked as well.
sub _write_text_file
{
    my ($path, $contents) = @_;

    open(my $out_fh, '>', $path) or return 0;
    my $written = print {$out_fh} $contents;
    my $closed  = close($out_fh);

    return ($written && $closed) ? 1 : 0;
}

# Build the minimal FedoraMETS document ingested for an "http://..."
# infodb key. $fedora_pid becomes both the OBJID and the LABEL of the
# object; $OID_hash_value is stored as its single dc:title value.
sub _url_key_mets_contents
{
    my ($fedora_pid, $OID_hash_value) = @_;

    # The HASHID shouldn't really be the title: it leads to duplicate
    # titles that are hard to search for. An alternative would be to make
    # the filename the dc:title and the HASHID a dc:identifier.
    return join("",
        qq{<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n},
        qq{<mets:mets xmlns:mets="http://www.loc.gov/METS/"\n},
        qq{ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n},
        qq{ xmlns:gsdl3="http://www.greenstone.org/namespace/gsdlmetadata/1.0/"\n},
        qq{ xmlns:xlink="http://www.w3.org/1999/xlink"\n},
        qq{ xsi:schemaLocation="http://www.loc.gov/METS/\n},
        qq{ http://www.loc.gov/standards/mets/mets.xsd\n},
        qq{ http://www.greenstone.org/namespace/gsdlmetadata/1.0/\n},
        qq{ http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd"\n},
        qq{ OBJID="$fedora_pid"\n},
        qq{ TYPE="FedoraObject" LABEL="$fedora_pid" EXT_VERSION="1.1">\n},
        qq{<mets:metsHdr RECORDSTATUS="A"/>\n},
        qq{ <mets:amdSec ID="DC" >\n},
        qq{ <mets:techMD ID="DC.0">\n},
        qq{ <mets:mdWrap LABEL="Metadata" MDTYPE="OTHER" OTHERMDTYPE="gsdl3" ID="DCgsdl1">\n},
        qq{ <mets:xmlData>\n},
        qq{ <oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" >\n},
        qq{ <dc:title>$OID_hash_value</dc:title>\n},
        qq{ </oai_dc:dc>\n},
        qq{ </mets:xmlData>\n},
        qq{ </mets:mdWrap>\n},
        qq{ </mets:techMD>\n},
        qq{ </mets:amdSec>\n},
        qq{</mets:mets>\n});
}

# Entry point. Expects the command line as its argument list:
# "[options] greenstone-col".
#
# Steps, in order:
#   1. parse the options; print usage and stop unless exactly one
#      collection name is left over
#   2. for every hash directory in the collection's export folder, purge
#      the matching Fedora object if it is already present, ingest its
#      docmets.xml and, when a GSearch webapp folder is found, update the
#      GSearch index
#   3. run Greenstone's buildcol.pl in infodb mode
#   4. ingest one small FedoraMETS object for every "http://" key found
#      in the collection's building/text infodb
#   5. create index/buildConfig.xml with a FedoraServiceProxy service
#      rack, unless a buildConfig.xml already exists
#
# Exits with status 1 when the collection, its export directory, its
# collectionConfig.xml or an output file cannot be accessed.
sub main
{
    # A private copy of the arguments; parse2::parse() is given a
    # reference to it and the collection name is read from it afterwards.
    my (@args) = @_;

    my $options = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing
	= parse2::parse(\@args, $arguments, $options, "allow_extra_options");

    # Something went wrong with parsing
    if ($intArgLeftinAfterParsing == -1) {
	PrintUsage::print_txt_usage($prog_options, "[options] greenstone-col");
	die "\n";
    }

    my $xml = $options->{'xml'};
    my $gli = $options->{'gli'};

    # Exactly one non-option argument (the collection) is required.
    if ($intArgLeftinAfterParsing != 1) {
	if ($xml) {
	    PrintUsage::print_xml_usage($prog_options);
	}
	else {
	    PrintUsage::print_txt_usage($prog_options, "[options] greenstone-col");
	}
	print "\n";
	return;
    }

    my $gs_col = $args[0];

    my $verbosity     = $options->{'verbosity'};
    my $hostname      = $options->{'hostname'};
    my $port          = $options->{'port'};
    my $username      = $options->{'username'};
    my $password      = $options->{'password'};
    my $protocol      = $options->{'protocol'};
    my $pid_namespace = $options->{'pidnamespace'};

    # The following are needed in the FedoraMETS plugout
    $ENV{'FEDORA_HOSTNAME'}    = $hostname;
    $ENV{'FEDORA_SERVER_PORT'} = $port;

    # Without -collectdir, fall back to the Greenstone 3 collect folder of
    # the "localsite" site when GSDL3HOME is set, else to GSDLHOME/collect.
    my $collectdir = $options->{'collectdir'};
    if (!$collectdir) {
	if ($ENV{'GSDL3HOME'}) {
	    $collectdir = util::filename_cat($ENV{'GSDL3HOME'},"sites","localsite","collect");
	}
	else {
	    $collectdir = util::filename_cat($ENV{'GSDLHOME'},"collect");
	}
    }

    my $full_gs_col = util::filename_cat($collectdir,$gs_col);
    if (!-e $full_gs_col) {
	print STDERR "Unable to find Greenstone collection $full_gs_col\n";
	exit 1;
    }

    my $export_dir = util::filename_cat($full_gs_col,"export");

    print "***\n";
    print "* Ingesting Greenstone processed files into Fedora $pid_namespace\n";
    print "***\n";

    # Ingesting the whole export directory with a single "fedora-ingest dir"
    # call falls foul of Schematron rule checking
    # => Ingest individually!

    # set up fedoragsearch for updating the index upon ingesting documents
    my $fedoragsearch_webapp = g2futil::gsearch_webapp_folder();

    # need the username and password preset in order to run fedoraGSearch's RESTClient script
    # this assumes that the fedoragsearch authentication details are the same as for fedora
    if (defined $fedoragsearch_webapp) {
	$ENV{'fgsUserName'} = $username;
	$ENV{'fgsPassword'} = $password;
    }

    # Only used to confirm that the export directory can be opened; the
    # directory listing itself comes from g2futil::get_all_hash_dirs().
    my $export_dh;
    if (!opendir($export_dh, $export_dir)) {
	print STDERR "Error: Unable to open directory $export_dir: $!\n";
	exit 1;
    }
    closedir($export_dh);

    my @hash_dirs = g2futil::get_all_hash_dirs($export_dir);

    # for each hash dir, purge its respective PID and (re)ingest its docmets.xml
    foreach my $hd (@hash_dirs) {

	my $hash_id = g2futil::get_hash_id($hd);
	my $pid = (defined $hash_id) ? "$pid_namespace:$gs_col-$hash_id" : undef;

	if (defined $pid) {
	    my $dsinfo_status = g2futil::run_datastore_info($pid,$options);

	    if ($dsinfo_status == 0) {
		# first remove the doc from the gsearch index before removing it from the fedora repository
		if (defined $fedoragsearch_webapp) {
		    print " deleting $pid from GSearch index\n";
		    g2futil::run_delete_from_index($fedoragsearch_webapp,$pid,$options);
		}

		print " $pid being updated.\n";
		g2futil::run_purge($pid,$options);
	    }
	    else {
		print " $pid not present.\n";
	    }
	}

	my $docmets_filename = util::filename_cat($hd,"docmets.xml");

	print STDERR "<Build>\n" if $gli;

	print "Ingesting $docmets_filename\n";

	my $status = g2futil::run_ingest($docmets_filename,$options);

	# if the document was ingested into Fedora successfully, index it with GSearch next
	if ($status == 0 && defined $pid && defined $fedoragsearch_webapp) {
	    g2futil::run_update_index($fedoragsearch_webapp,$pid,$options);
	}

	print STDERR "</Build>\n" if $gli;
    }


    # can possibly use inexport instead of running buildcol.pl through system()
    print STDERR "**** Just for now, also run Greenstone's buildcol.pl\n";

    my $gs_opts = " -verbosity $verbosity";
    $gs_opts .= " -gli" if ($gli);
    $gs_opts .= " -collectdir \"$collectdir\"" if ($collectdir);
    $gs_opts .= " -mode infodb";

    my $gs_buildcol_arguments = "$gs_opts $gs_col";

    g2futil::run_cmd("buildcol.pl", $gs_buildcol_arguments, $options);

    # read in collect cfg file to work out db type;
    # first make sure that it can be opened at all
    my $collectcfg = util::filename_cat($collectdir, $gs_col, "etc", "collectionConfig.xml");
    if (open(my $cfg_fh, '<', $collectcfg)) {
	close($cfg_fh);
    }
    else {
	print STDERR "g2f-buildcol.pl: Unable to open $collectcfg...ERROR: $!\n";
	exit 1;
    }

    # for now we assume GS3, since that's what the following gets implemented for
    my $collect_cfg = colcfg::read_collection_cfg($collectcfg, "gs3");
    # get the database type for this collection from its configuration file (may be undefined)
    my $infodbtype = $collect_cfg->{'infodbtype'} || dbutil::get_default_infodb_type();

    # locate the infodb file in building/text using dbutil; it is named
    # after the collection's tailname, i.e. without any collect group
    my $colname = _collection_tailname($gs_col);

    my $building_txt_dir = util::filename_cat($collectdir, $gs_col, "building", "text");
    my $building_txt_db = dbutil::get_infodb_file_path($infodbtype, "$colname", $building_txt_dir);

    my $db_keys = {};
    dbutil::read_infodb_keys($infodbtype,$building_txt_db, $db_keys);

    # Copy of the options for the debug output below, with the Fedora
    # password masked so that it does not end up in logs.
    my %loggable_options = %$options;
    $loggable_options{'password'} = "********" if defined $loggable_options{'password'};

    # foreach key that matches http://dir1/dir2/....file.xxx
    foreach my $key (keys %$db_keys) {
	next unless $key =~ m@^http://@;

	# get value for the key
	my $src_rec_string = dbutil::read_infodb_entry($infodbtype,$building_txt_db, $key);
	my $src_rec = dbutil::convert_infodb_string_to_hash($src_rec_string);
	my $OID_hash_value = $src_rec->{'section'}->[0];
	$OID_hash_value = "$pid_namespace:$gs_col-".$OID_hash_value; # convert to fedoraPID

	# CAN'T HAVE / OR : (as in "http://one/two.html") in fedoraPID,
	# so work on a sanitised copy of the key
	my $pid_key = $key;
	$pid_key =~ s@/@_@g;
	$pid_key =~ s@:@-@g;
	my $fedora_pid = "$pid_namespace-http:$gs_col-$pid_key";

	# To run fedora ingest on the new file need to have sensible
	# filenames that won't offend windows
	my $fedora_key_file_name = "$fedora_pid";
	$fedora_key_file_name =~ s@\.@-@g;
	$fedora_key_file_name =~ s/\:/=/g;
	$fedora_key_file_name .= ".xml";
	print STDERR "+++++ fpid: $fedora_pid, fedora-key filename: $fedora_key_file_name\n";

	# write out a FedoraMets file for this key and then run fedora ingest on it
	# -> it has one metadata value, which is 'dc:title' = HASHxxxxxx
	# The file gets purged in g2f-import.pl, so don't remove it from export dir now
	my $contents = _url_key_mets_contents($fedora_pid, $OID_hash_value);
	my $fedora_key_file_path = util::filename_cat($export_dir, $fedora_key_file_name);
	if (!_write_text_file($fedora_key_file_path, $contents)) {
	    print STDERR "g2f-buildcol.pl: Unable to open $fedora_key_file_path...ERROR: $!\n";
	    exit 1;
	}

	print STDERR "<Build>\n" if $gli;
	print STDERR "Ingesting $fedora_key_file_name\n";
	print STDERR "#### ".join(",", %loggable_options)."\n";

	g2futil::run_ingest($fedora_key_file_path,$options);
	print STDERR "</Build>\n" if $gli;
    }


    # If successful!!! Then need to think about:
    #   [CLX] nodes
    #   Doing this with FedoraMETSPlugin


    # for the Greenstone reader interface to make the new Fedora collection available,
    # need to write out buildConfig.xml with FedoraServiceProxy as a new ServiceRack element
    # Kathy thinks it's better to create a buildConfig.xml than put it in collectionConfig.xml

    my $indexdir = util::filename_cat($collectdir, $gs_col, "index");
    util::mk_dir($indexdir) unless util::dir_exists($indexdir);

    my $buildcfg = util::filename_cat($indexdir, "buildConfig.xml");
    if (-e $buildcfg) {
	print STDERR "***** $buildcfg already exists for this fedora collection.\n";
	print STDERR "***** Not modifying it to insert a FedoraServiceProxy ServiceRack.\n";
    }
    else { # or do I just have a template buildConfig.xml that I copy over?

	my $contents = "<buildConfig>\n";
	$contents .= " <metadataList/>\n";
	$contents .= " <serviceRackList>\n";
	$contents .= " <serviceRack name=\"FedoraServiceProxy\" />\n";
	$contents .= " </serviceRackList>\n";
	$contents .= "</buildConfig>\n";

	if (!_write_text_file($buildcfg, $contents)) {
	    print STDERR "g2f-buildcol.pl: Unable to open $buildcfg...ERROR: $!\n";
	    exit 1;
	}
    }
}

main(@ARGV);
435
436
437
Note: See TracBrowser for help on using the repository browser.