root/main/trunk/greenstone2/bin/script/g2f-buildcol.pl @ 26324

Revision 26324, 14.6 KB (checked in by ak19, 7 years ago)

1. When purging documents from the fedora repository during import and buildcol, these documents are also removed from the Fedora GSearch index. Likewise, when documents are ingested into the Fedora repository during buildcol, these documents are also indexed by Fedora GSearch. 2. Added methods in g2futil.pm to be able to do this

  • Property svn:executable set to *
Line 
1#!/usr/bin/perl -w
2
# Compile-time environment setup: verify the variables this script cannot run
# without, fill in defaults for the Fedora connection settings, and make the
# Greenstone perllib directory visible to the 'use' statements that follow.
BEGIN
{
    # GSDLHOME locates the Greenstone installation (and its perllib).
    unless (defined $ENV{'GSDLHOME'}) {
        print STDERR "Environment variable GSDLHOME not set.\n";
        print STDERR "  Have you sourced Greenstone's 'setup.bash' file?\n";
        exit 1;
    }

    # JAVA_HOME is required by the Fedora command line scripts.
    unless (defined $ENV{'JAVA_HOME'}) {
        print STDERR "Environment variable JAVA_HOME not set.\n";
        print STDERR "Needed by Fedora command line scripts.\n";
        exit 1;
    }

    # Fallback values, applied only where the caller's environment has not
    # already defined the variable.
    my @fedora_defaults = (
        [ 'FEDORA_HOSTNAME',      "localhost"   ],
        [ 'FEDORA_SERVER_PORT',   "8080"        ],
        [ 'FEDORA_USER',          "fedoraAdmin" ],
        [ 'FEDORA_PASS',          "fedoraAdmin" ],
        [ 'FEDORA_PROTOCOL',      "http"        ],
        [ 'FEDORA_PID_NAMESPACE', "greenstone"  ],
        [ 'FEDORA_PREFIX',        "/fedora"     ],
    );

    foreach my $default (@fedora_defaults) {
        my ($env_name, $fallback) = @$default;
        $ENV{$env_name} = $fallback unless defined $ENV{$env_name};
    }

    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib/");
}
28
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and vice versa
32no strict 'subs'; # allow barewords (e.g. STDERR) as function arguments
33
34use util;
35use gsprintf 'gsprintf';
36use printusage;
37use parse2;
38use cfgread;
39use colcfg;
40
41use g2futil;
42
43use dbutil;
44
# Command line options understood by this script, in the list-of-hashes form
# that is passed to parse2::parse() and to the PrintUsage routines.
my $arguments = do {

    # The Fedora connection options all share one shape: a visible, optional
    # string whose default is read from an environment variable.
    # Each row is: option name, description, environment variable.
    my @fedora_settings = (
        [ "hostname",     "Domain hostname of Fedora server",           'FEDORA_HOSTNAME'      ],
        [ "port",         "Port that the Fedora server is running on.", 'FEDORA_SERVER_PORT'   ],
        [ "username",     "Fedora admin username",                      'FEDORA_USER'          ],
        [ "password",     "Fedora admin password",                      'FEDORA_PASS'          ],
        [ "protocol",     "Fedora protocol, e.g. 'http' or 'https'",    'FEDORA_PROTOCOL'      ],
        [ "pidnamespace", "Fedora prefix for PIDs",                     'FEDORA_PID_NAMESPACE' ],
    );

    my @fedora_args = map {
        my ($opt_name, $opt_desc, $env_name) = @$_;
        +{ 'name'      => $opt_name,
           'desc'      => $opt_desc,
           'type'      => "string",
           'deft'      => $ENV{$env_name},
           'reqd'      => "no",
           'hiddengli' => "no" };
    } @fedora_settings;

    [
        { 'name'      => "verbosity",
          'desc'      => "Level of verbosity generated",
          'type'      => "string",
          'deft'      => '1',
          'reqd'      => "no",
          'hiddengli' => "no" },

        @fedora_args,

        { 'name'      => "gli",
          'desc'      => "",
          'type'      => "flag",
          'reqd'      => "no",
          'hiddengli' => "yes" },
        { 'name'      => "xml",
          'desc'      => "{scripts.xml}",
          'type'      => "flag",
          'reqd'      => "no",
          'hiddengli' => "yes" },
        { 'name'      => "removeold",
          'desc'      => "{import.removeold}",
          'type'      => "flag",
          'reqd'      => "no",
          'modegli'   => "3" },
        { 'name'      => "language",
          'desc'      => "{scripts.language}",
          'type'      => "string",
          'reqd'      => "no",
          'modegli'   => "3" },
        { 'name'      => "collectdir",
          'desc'      => "{import.collectdir}",
          'type'      => "string",
          'deft'      => "",
          'reqd'      => "no",
          'hiddengli' => "yes" },
    ];
};
116
# Program description handed to the PrintUsage routines together with the
# option table above.
# NOTE(review): 'name' lacks the hyphen used in this script's own error
# messages ("g2f-buildcol.pl") -- confirm which spelling is intended.
my $prog_options = {};
$prog_options->{'name'} = "g2fbuildcol.pl";
$prog_options->{'desc'} = "Ingest Greenstone directory of FedoraMETS documents into Fedora";
$prog_options->{'args'} = $arguments;
121
122
# Write $contents to the file at $path, replacing anything already there.
# Returns 1 on success and 0 on failure; on failure $! holds the reason.
sub _write_text_file
{
    my ($path, $contents) = @_;

    open(my $fh, '>', $path) or return 0;
    my $printed = print {$fh} $contents;
    # Always close the handle: buffered write errors only surface here.
    my $closed = close($fh);

    return ($printed && $closed) ? 1 : 0;
}

# Build the minimal FedoraMETS document used to register a web (http://) key.
#   $fedora_pid - used for both the OBJID and the LABEL of the object
#   $title      - written as the single dc:title value
# Returns the XML document as a string.
# NOTE(review): neither value is XML-escaped, so a key containing '&', '<'
# or '"' yields a malformed document -- confirm whether database keys can
# contain such characters before adding escaping here.
sub _fedora_key_mets
{
    my ($fedora_pid, $title) = @_;

    my $contents = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
    $contents .= "<mets:mets xmlns:mets=\"http://www.loc.gov/METS/\"\n";
    $contents .= " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
    $contents .= " xmlns:gsdl3=\"http://www.greenstone.org/namespace/gsdlmetadata/1.0/\"\n";
    $contents .= " xmlns:xlink=\"http://www.w3.org/1999/xlink\"\n";
    $contents .= " xsi:schemaLocation=\"http://www.loc.gov/METS/\n";
    $contents .= " http://www.loc.gov/standards/mets/mets.xsd\n";
    $contents .= " http://www.greenstone.org/namespace/gsdlmetadata/1.0/\n";
    $contents .= " http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd\"\n";
    $contents .= " OBJID=\"$fedora_pid\"\n";
    $contents .= " TYPE=\"FedoraObject\" LABEL=\"$fedora_pid\" EXT_VERSION=\"1.1\">\n";
    $contents .= "<mets:metsHdr RECORDSTATUS=\"A\"/>\n";
    $contents .= "   <mets:amdSec ID=\"DC\" >\n";
    $contents .= "      <mets:techMD ID=\"DC.0\">\n";
    $contents .= "         <mets:mdWrap LABEL=\"Metadata\" MDTYPE=\"OTHER\" OTHERMDTYPE=\"gsdl3\" ID=\"DCgsdl1\">\n";
    $contents .= "            <mets:xmlData>\n";
    $contents .= "               <oai_dc:dc xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\" >\n";
    $contents .= "                  <dc:title>$title</dc:title>\n";
    $contents .= "               </oai_dc:dc>\n";
    $contents .= "            </mets:xmlData>\n";
    $contents .= "         </mets:mdWrap>\n";
    $contents .= "      </mets:techMD>\n";
    $contents .= "   </mets:amdSec>\n";
    $contents .= "</mets:mets>\n";

    return $contents;
}

# Entry point: ingest a Greenstone collection's exported FedoraMETS documents
# into Fedora.
#
# Arguments: the command line ("[options] greenstone-col"); after option
# parsing exactly one argument, the collection name, must remain.
#
# Steps, in order:
#   1. For each hash directory under <collection>/export: purge any existing
#      Fedora object with the same PID (removing it from the GSearch index
#      first when GSearch is available), ingest its docmets.xml, and index the
#      new object with GSearch if the ingest succeeded.
#   2. Run Greenstone's buildcol.pl in infodb mode.
#   3. For each http:// key in the collection's info database, write a small
#      FedoraMETS file into the export directory and ingest it.
#   4. Create index/buildConfig.xml naming the FedoraServiceProxy service
#      rack, unless that file already exists.
#
# Returns nothing. Exits with status 1 when the collection, export directory
# or configuration file cannot be read, or when a file cannot be written;
# dies if option parsing fails.
sub main
{
    my (@args) = @_;

    my $options = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@args,$arguments,$options,"allow_extra_options");

    # Something went wrong with parsing
    if ($intArgLeftinAfterParsing == -1)
    {
        PrintUsage::print_txt_usage($prog_options, "[options] greenstone-col");
        die "\n";
    }

    my $xml = $options->{'xml'};
    my $gli = $options->{'gli'};

    # Exactly one positional argument (the collection name) must be left over.
    if ($intArgLeftinAfterParsing != 1)
    {
        if ($xml) {
            PrintUsage::print_xml_usage($prog_options);
        }
        else {
            PrintUsage::print_txt_usage($prog_options, "[options] greenstone-col");
        }
        print "\n";
        return;
    }

    my $gs_col = $args[0];

    my $verbosity     = $options->{'verbosity'};
    my $hostname      = $options->{'hostname'};
    my $port          = $options->{'port'};
    my $pid_namespace = $options->{'pidnamespace'};

    # The following are needed in the FedoraMETS plugout
    $ENV{'FEDORA_HOSTNAME'} = $hostname;
    $ENV{'FEDORA_SERVER_PORT'} = $port;

    my $collectdir = $options->{'collectdir'};

    if (!$collectdir) {
        if ($ENV{'GSDL3HOME'}) {
            $collectdir = util::filename_cat($ENV{'GSDL3HOME'},"sites","localsite","collect");
        } else {
            $collectdir = util::filename_cat($ENV{'GSDLHOME'},"collect");
        }
    }

    my $full_gs_col = util::filename_cat($collectdir,$gs_col);

    if (!-e $full_gs_col) {
        print STDERR "Unable to find Greenstone collection $full_gs_col\n";
        exit 1;
    }

    my $export_dir = util::filename_cat($full_gs_col,"export");

    print "***\n";
    print "* Ingesting Greenstone processed files into Fedora $pid_namespace\n";
    print "***\n";

    # A single bulk "fedora-ingest dir ..." run over the export directory
    # falls foul of Schematron rule checking.
    # => Ingest individually!

    # set up fedoragsearch for updating the index upon ingesting documents
    my $fedoragsearch_webapp = g2futil::gsearch_webapp_folder();

    # need the username and password preset in order to run fedoraGSearch's RESTClient script
    # this assumes that the fedoragsearch authentication details are the same as for fedora
    if (defined $fedoragsearch_webapp) {
        $ENV{'fgsUserName'} = $options->{'username'};
        $ENV{'fgsPassword'} = $options->{'password'};
    }

    # The opendir is only a readability check; the directory is walked by
    # g2futil::get_all_hash_dirs below.
    my $export_dh;
    if (!opendir($export_dh, $export_dir)) {
        print STDERR "Error: Unable to open directory $export_dir: $!\n";
        exit 1;
    }
    closedir($export_dh);

    my @hash_dirs = g2futil::get_all_hash_dirs($export_dir);

    # for each hash dir, purge its respective PID and ingest the new version
    foreach my $hd (@hash_dirs) {

        my $hash_id = g2futil::get_hash_id($hd);
        my $pid = (defined $hash_id) ? "$pid_namespace:$gs_col-$hash_id" : undef;

        if (defined $pid) {
            my $dsinfo_status = g2futil::run_datastore_info($pid,$options);

            if ($dsinfo_status == 0) {
                # first remove the doc from the gsearch index before removing it from the fedora repository
                if (defined $fedoragsearch_webapp) {
                    print "  deleting $pid from GSearch index\n";
                    g2futil::run_delete_from_index($fedoragsearch_webapp,$pid,$options);
                }

                print "  $pid being updated.\n";
                g2futil::run_purge($pid,$options);
            }
            else {
                print "  $pid not present.\n";
            }
        }

        my $docmets_filename = util::filename_cat($hd,"docmets.xml");

        print STDERR "<Build>\n" if $gli;

        print "Ingesting $docmets_filename\n";

        my $status = g2futil::run_ingest($docmets_filename,$options);

        # if the document was ingested into Fedora successfully, index it with GSearch next
        if ($status == 0 && defined $pid && defined $fedoragsearch_webapp) {
            g2futil::run_update_index($fedoragsearch_webapp,$pid,$options);
        }

        print STDERR "</Build>\n" if $gli;
    }

    # can possibly use inexport instead of running buildcol.pl through system()
    print STDERR "**** Just for now, also run Greenstone's buildcol.pl\n";

    my $gs_opts = " -verbosity $verbosity";
    $gs_opts .= " -gli" if ($gli);
    $gs_opts .= " -collectdir \"$collectdir\"" if ($collectdir);
    $gs_opts .= " -mode infodb";

    my $gs_buildcol_arguments = "$gs_opts $gs_col";

    g2futil::run_cmd("buildcol.pl", $gs_buildcol_arguments, $options);

    # read in collect cfg file to work out db type
    my $collectcfg = util::filename_cat($collectdir, $gs_col, "etc", "collectionConfig.xml");

    # Check up front that the configuration file is readable.
    my $cfg_fh;
    unless (open($cfg_fh, '<', $collectcfg)) {
        print STDERR "g2f-buildcol.pl: Unable to open $collectcfg...ERROR: $!\n";
        exit 1;
    }
    close($cfg_fh);

    # for now we assume GS3, since that's what the following gets implemented for
    my $collect_cfg = colcfg::read_collection_cfg($collectcfg, "gs3");
    # get the database type for this collection from its configuration file (may be undefined)
    my $infodbtype = $collect_cfg->{'infodbtype'} || dbutil::get_default_infodb_type();

    # The info database is named after the collection's tail name, i.e. the
    # last path component once any collect group ("group/col") is removed.
    my ($colname) = ($gs_col =~ m{([^\\/]+)[\\/]*$});
    $colname = $gs_col unless defined $colname;

    # open the database file in building/text/, using dbutil
    my $building_txt_dir = util::filename_cat($collectdir, $gs_col, "building", "text");
    my $building_txt_db = dbutil::get_infodb_file_path($infodbtype, $colname, $building_txt_dir);

    my $db_keys = {};
    dbutil::read_infodb_keys($infodbtype,$building_txt_db, $db_keys);

    # Copy of the options for the diagnostic output below, with the Fedora
    # password masked so that it does not end up in build logs.
    my %loggable_options = %$options;
    $loggable_options{'password'} = "********" if defined $loggable_options{'password'};

    # foreach key that matches http://dir1/dir2/....file.xxx
    foreach my $key (keys %$db_keys) {
        next unless $key =~ m{^http://};

        # get value for the key
        my $src_rec_string = dbutil::read_infodb_entry($infodbtype,$building_txt_db, $key);
        my $src_rec = dbutil::convert_infodb_string_to_hash($src_rec_string);
        my $OID_hash_value = $src_rec->{'section'}->[0];
        $OID_hash_value = "$pid_namespace:$gs_col-".$OID_hash_value; # convert to fedoraPID

        # its fedora pid would be "greenstone-http:$colname-http://dir/file.xxx",
        # except that a fedoraPID can't contain extra ':' or '/' characters
        my $pid_key = $key;
        $pid_key =~ s{/}{_}g;
        $pid_key =~ s{:}{-}g;
        my $fedora_pid = "$pid_namespace-http:$gs_col-$pid_key";

        # To run fedora ingest on the new file need to have sensible
        # filenames that won't offend windows
        my $fedora_key_file_name = $fedora_pid;
        $fedora_key_file_name =~ s{\.}{-}g;
        $fedora_key_file_name =~ s{:}{=}g;
        $fedora_key_file_name .= ".xml";
        print STDERR "+++++ fpid: $fedora_pid, fedora-key filename: $fedora_key_file_name\n";

        # write out a FedoraMets File for this key
        # -> it has one metadata value, which is 'dc:title' = HASHxxxxxx
        #
        # The HASHID shouldn't be the title: then will have
        # duplicate titles and it will be hard to search for
        # unique ones. What about making the filename the
        # dc.title and the HASHID the dc.identifier
        my $contents = _fedora_key_mets($fedora_pid, $OID_hash_value);

        # write out the file and then run fedora ingest on that file
        # The file gets purged in g2f-import.pl, so don't remove it from export dir now
        my $fedora_key_file_path = util::filename_cat($export_dir, $fedora_key_file_name);
        unless (_write_text_file($fedora_key_file_path, $contents)) {
            print STDERR "g2f-buildcol.pl: Unable to write $fedora_key_file_path...ERROR: $!\n";
            exit 1;
        }

        print STDERR "<Build>\n" if $gli;
        print STDERR "Ingesting $fedora_key_file_name\n";
        print STDERR "#### ".join(",", %loggable_options)."\n";

        g2futil::run_ingest($fedora_key_file_path,$options);
        print STDERR "</Build>\n" if $gli;
    }

    # If successful!!! Then need to think about:
    #    [CLX] nodes
    #    Doing this with FedoraMETSPlugin

    # for the Greenstone reader interface to make the new Fedora collection available,
    # need to write out buildConfig.xml with FedoraServiceProxy as a new ServiceRack element
    # Kathy thinks it's better to create a buildConfig.xml than put it in collectionConfig.xml

    my $indexdir = util::filename_cat($collectdir, $gs_col, "index");
    util::mk_dir($indexdir) unless util::dir_exists($indexdir);

    my $buildcfg = util::filename_cat($indexdir, "buildConfig.xml");
    if (-e $buildcfg) {
        print STDERR "***** $buildcfg already exists for this fedora collection.\n";
        print STDERR "***** Not modifying it to insert a FedoraServiceProxy ServiceRack.\n";
    }
    else { # or do I just have a template buildConfig.xml that I copy over?

        my $contents = "<buildConfig>\n";
        $contents .= "  <metadataList/>\n";
        $contents .= "  <serviceRackList>\n";
        $contents .= "    <serviceRack name=\"FedoraServiceProxy\" />\n";
        $contents .= "  </serviceRackList>\n";
        $contents .= "</buildConfig>\n";

        unless (_write_text_file($buildcfg, $contents)) {
            print STDERR "g2f-buildcol.pl: Unable to write $buildcfg...ERROR: $!\n";
            exit 1;
        }
    }
}
433
# Run the script with the command line arguments.
main(@ARGV);
435
436
437
Note: See TracBrowser for help on using the browser.