root/main/trunk/greenstone2/perllib/g2futil.pm @ 26324

Revision 26324, 15.8 KB (checked in by ak19, 7 years ago)

1. When purging documents from the fedora repository during import and buildcol, these documents are also removed from the Fedora GSearch index. Likewise, when documents are ingested into the Fedora repository during buildcol, these documents are also indexed by Fedora GSearch. 2. Added methods in g2futil.pm to be able to do this

Line 
1package g2futil;
2
3
# Compile-time setup for this module: refuse to load when FEDORA_HOME is not
# set, and append FEDORA_HOME/client/bin to PATH (presumably so that the
# fedora-* client scripts launched by run_cmd() can be found -- the scripts'
# location is not shown in this file).
BEGIN
{
    if (!defined $ENV{'FEDORA_HOME'}) {
        print STDERR "Error: Environment variable FEDORA_HOME not set.\n";
        exit 1;
    }

    # This block runs before the file-level "use util;" further down has been
    # compiled, so load util here rather than relying on the calling script
    # having loaded it already.
    require util;

    my $fedora_client_bin = &util::filename_cat($ENV{'FEDORA_HOME'},"client","bin");
    &util::envvar_append("PATH",$fedora_client_bin);
}
14
15use strict;
16use util;
17
# Older command runner (note the _old suffix): runs $cmd through system().
#
# Parameters:
#   $cmd            - complete command line to execute
#   $verbosity      - 0 discards the command's stdout; >= 2 echoes the command
#                     and its exit status to STDOUT
#   $tolerate_error - pass the literal string "tolerate_error" to discard the
#                     command's stdout and to suppress the error report
#
# Returns the raw status from system(): 0 on success, the wait status
# (exit code * 256) on failure, or -1 if the program could not be started.
sub run_cmd_old
{
    my ($cmd,$verbosity,$tolerate_error) = @_;

    my $tolerant = (defined $tolerate_error) && ($tolerate_error eq "tolerate_error");

    if (($verbosity == 0) || $tolerant) {
        # Only stdout is discarded; stderr of the command still gets through.
        my $null_device = ($ENV{'GSDLOS'} =~ /^windows$/i) ? "nul" : "/dev/null";
        $cmd .= " > $null_device";
    }

    if ($verbosity >= 2) {
        print "Running command:\n";
        print "$cmd\n";
    }

    my $status = system($cmd);

    if ($verbosity >= 2) {
        print "Exit status = ", $status/256, "\n";
    }

    # system() returns -1 when the program could not be launched at all, so
    # test for "not zero" rather than "greater than zero"; otherwise launch
    # failures would go unreported.
    if (!$tolerant && ($status != 0)) {
        print STDERR "Error executing:\n$cmd\n";
        print STDERR "$!\n";
    }

    return $status;
}
51
52
# Runs "$prog $arguments" with stderr merged into stdout and collects the
# combined output.
#
# Parameters:
#   $prog           - program to run. Names starting with "fedora-" or matching
#                     run[A-Z]*Client get ".bat" (when GSDLOS starts with
#                     "windows") or ".sh" appended; on windows a name ending
#                     in ".pl" is launched through the perl executable
#                     reported by util::get_perl_exec().
#   $arguments      - argument string appended to the program name
#   $verbosity      - >= 2 also reports the command on failure, >= 3 also
#                     reports the captured output
#   $tolerate_error - pass the literal string "tolerate_error" to suppress both
#                     the echoing of the command's output and the failure report
#
# Returns 0 on success, otherwise a non-zero status: the value of $? after
# closing the pipe, or -1 when the output contains a line starting with
# "Error:" despite a zero exit status, or when the pipe could not be opened
# at all (previously undef, which compares equal to 0 numerically).
sub run_cmd
{
    my ($prog,$arguments,$verbosity,$tolerate_error) = @_;

    my $report = (!defined $tolerate_error) || ($tolerate_error ne "tolerate_error");
    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows/);

    if ($prog =~ m/^fedora-/ || $prog =~ m/^run[A-Z]*Client/) { # fedora or fedoragsearch script
        $prog .= $on_windows ? ".bat" : ".sh";
    }
    # The dot is escaped so that only a real ".pl" extension qualifies.
    if ($on_windows && ($prog =~ m/\.pl$/i)) {
        $prog = "\"".&util::get_perl_exec()."\" -S $prog";
    }

    my $cmd = "$prog $arguments";

    my $cmd_fh;
    if (!open($cmd_fh, "-|", "$cmd 2>&1")) {
        print STDERR "Error: failed to execute $cmd\n";
        return -1;
    }

    my $result = "";
    while (defined (my $line = <$cmd_fh>)) {
        $result .= $line;
        print $line if $report;
    }

    # Closing a piped handle waits for the child and leaves its status in $?.
    close($cmd_fh);
    my $cmd_status = $?;

    # Fedora script generated an error, but did not exit with an error
    # status => artificially raise one.
    if (($cmd_status == 0) && ($result =~ m/^Error\s*:/m)) {
        $cmd_status = -1;
    }

    if (($cmd_status != 0) && $report) {
        print STDERR "Error: processing command failed.  Exit status $cmd_status\n";

        if ($verbosity >= 2) {
            print STDERR "  Command was: $cmd\n";
        }
        if ($verbosity >= 3) {
            print STDERR "result: $result\n";
        }
    }

    return $cmd_status;
}
127
128
# Runs the fedora-dsinfo client script for object $pid.
#
# $options is a hash ref supplying hostname, port, username, password,
# protocol and verbosity. The command is run in "tolerate_error" mode, so
# run_cmd() neither echoes its output nor reports a failure.
# Returns the status produced by run_cmd().
sub run_datastore_info
{
    my ($pid,$options) = @_;

    my @connection = map { $options->{$_} } ('hostname','port','username','password');
    my $arguments = join(" ", @connection, $pid, $options->{'protocol'});

    my $status = run_cmd("fedora-dsinfo",$arguments,$options->{'verbosity'},"tolerate_error");

    return $status;
}
147
# Runs the fedora-purge client script to remove object $pid from the
# repository at hostname:port.
#
# $options is a hash ref supplying hostname, port, username, password,
# protocol and verbosity. The fixed text "Automated_purge_by_g2f_script" is
# passed as the final argument. Returns the status produced by run_cmd().
sub run_purge
{
    my ($pid,$options) = @_;

    my $server = $options->{'hostname'}.":".$options->{'port'};

    my @purge_args = ($server,
                      $options->{'username'},
                      $options->{'password'},
                      $pid,
                      $options->{'protocol'});

    # The final argument is placed on a continuation line of the command.
    my $arguments = join(" ", @purge_args) . " \\\n \"Automated_purge_by_g2f_script\"";

    my $status = run_cmd("fedora-purge",$arguments,$options->{'verbosity'});

    return $status;
}
170
# Removes object $pid from the Fedora GSearch index by running
#   <fedoragsearch_webapp>/client/runRESTClient.sh <host:port> updateIndex deletePID <PID>
#
# $options is a hash ref; only hostname, port and verbosity are used here
# (username and password are not passed on to the client script).
# Returns the status produced by run_cmd().
sub run_delete_from_index
{
    my ($fedoragsearch_webapp,$pid,$options) = @_;

    my $rest_client = &util::filename_cat($fedoragsearch_webapp, "client", "runRESTClient.sh");

    my $server = $options->{'hostname'}.":".$options->{'port'};
    my $arguments = join(" ", $server, "updateIndex deletePID", $pid);

    my $status = run_cmd($rest_client,$arguments,$options->{'verbosity'});

    return $status;
}
198
# Indexes object $pid in Fedora GSearch by running
#   <fedoragsearch_webapp>/client/runRESTClient.sh <host:port> updateIndex fromPID <PID>
#
# $options is a hash ref; only hostname, port and verbosity are used here
# (username and password are not passed on to the client script).
# Returns the status produced by run_cmd().
sub run_update_index
{
    my ($fedoragsearch_webapp,$pid,$options) = @_;

    my $rest_client = &util::filename_cat($fedoragsearch_webapp, "client", "runRESTClient.sh");

    my $server = $options->{'hostname'}.":".$options->{'port'};
    my $arguments = join(" ", $server, "updateIndex fromPID", $pid);

    my $status = run_cmd($rest_client,$arguments,$options->{'verbosity'});

    return $status;
}
226
# Locates the fedoragsearch web application folder.
#
# Candidates, in order of preference:
#   1. GSDL3SRCHOME/packages/tomcat/webapps/fedoragsearch (only if GSDL3SRCHOME is set)
#   2. FEDORA_HOME/tomcat/webapps/fedoragsearch           (only if FEDORA_HOME is set)
#
# Returns the first candidate that exists as a directory, or undef when none
# does. (Previously the last candidate path was returned even though it did
# not exist, despite the intent to return undef.)
sub gsearch_webapp_folder
{
    # if GS3, first look for a fedoragsearch webapp installed in Greenstone's tomcat
    if (defined $ENV{'GSDL3SRCHOME'}) {
        my $gs3_webapp = &util::filename_cat($ENV{'GSDL3SRCHOME'},"packages","tomcat","webapps","fedoragsearch");
        return $gs3_webapp if (&util::dir_exists($gs3_webapp));
    }

    # next look for a fedoragsearch webapp installed in Fedora's tomcat
    if (defined $ENV{'FEDORA_HOME'}) {
        my $fedora_webapp = &util::filename_cat($ENV{'FEDORA_HOME'},"tomcat","webapps","fedoragsearch");
        return $fedora_webapp if (&util::dir_exists($fedora_webapp));
    }

    ## check for a user-defined $ENV{'FEDORA_GSEARCH'} variable first, which points to a gsearch webapp folder??

    # no fedoragsearch found
    return undef;
}
248
249
# Runs the fedora-ingest client script on the file $docmets_filename, sending
# it to the repository at hostname:port.
#
# $options is a hash ref supplying hostname, port, username, password,
# protocol and verbosity. The format argument depends on the FEDORA_VERSION
# environment variable: "metslikefedora1" when it starts with "2", otherwise
# "info:fedora/fedora-system:METSFedoraExt-1.1".
# Returns the status produced by run_cmd().
sub run_ingest
{
    my ($docmets_filename,$options) = @_;

    my $verbosity = $options->{'verbosity'};

    my $hostname = $options->{'hostname'};
    my $port     = $options->{'port'};
    my $username = $options->{'username'};
    my $password = $options->{'password'};
    my $protocol = $options->{'protocol'};

    my $server = "$hostname:$port";

    my $prog = "fedora-ingest";

    # checking if major version is 2
    my $type = ($ENV{'FEDORA_VERSION'} =~ m/^2/)
        ? "metslikefedora1"
        : "info:fedora/fedora-system:METSFedoraExt-1.1";

    my $arguments = "file \"$docmets_filename\" $type $server $username $password $protocol";
    # Final argument is the log message for this operation; it used to be the
    # text copied from run_purge ("Automated_purge_by_g2f_script").
    $arguments .= " \\\n \"Automated_ingest_by_g2f_script\"";

    my $status = run_cmd($prog,$arguments,$verbosity);

    return $status;
}
282
283
# Recursively collects the full paths of all sub-directories of $full_dir
# whose names end in ".dir", appending them to the array referenced by
# $all_dirs.
#
# Entries whose names start with "." are skipped. Directories ending in ".dir"
# are recorded and not descended into; every other directory is searched
# recursively, after the ".dir" entries at the current level were recorded.
# A directory that cannot be opened is reported on STDERR and skipped.
sub rec_get_all_hash_dirs
{
    my ($full_dir,$all_dirs) = @_;

    # A lexical handle keeps each level of the recursion independent of any
    # other DIR handle in the package.
    my $dir_handle;
    if (!opendir($dir_handle, $full_dir)) {
        print STDERR "Warning: Unable to open directory \"$full_dir\"\n";
        return;
    }

    my @hash_dirs = ();
    my @rec_dirs = ();
    foreach my $entry (readdir($dir_handle)) {
        next if ($entry =~ /^\./);

        my $entry_path = &util::filename_cat($full_dir,$entry);
        next unless (-d $entry_path);

        if ($entry =~ m/\.dir$/) {
            push(@hash_dirs,$entry_path);
        }
        else {
            push(@rec_dirs,$entry_path);
        }
    }
    closedir($dir_handle);

    push(@$all_dirs,@hash_dirs);

    foreach my $full_rec_dir (@rec_dirs) {
        rec_get_all_hash_dirs($full_rec_dir,$all_dirs);
    }
}
306
# Returns the list of ".dir" directories found beneath $start_dir (collected
# by rec_get_all_hash_dirs), in the order they were discovered.
#
# When $maxdocs is defined and not the empty string, at most $maxdocs entries
# are returned. Asking for more entries than exist simply returns them all;
# the result is never padded with undef entries.
sub get_all_hash_dirs
{
    my ($start_dir,$maxdocs) = @_;

    my @all_dirs = ();
    rec_get_all_hash_dirs($start_dir,\@all_dirs);

    if ((defined $maxdocs) && ($maxdocs ne "")) {
        my @maxdoc_dirs = ();
        foreach my $hash_dir (@all_dirs) {
            # Stop once the limit is reached; running out of directories
            # first ends the loop naturally.
            last if (scalar(@maxdoc_dirs) >= $maxdocs);
            push(@maxdoc_dirs,$hash_dir);
        }
        @all_dirs = @maxdoc_dirs;
    }

    return @all_dirs;
}
324
# Extracts the identifier of the document stored in $hash_dir.
#
# Reads $hash_dir/docmets.xml line by line and returns the text of the first
# <dc:identifier>...</dc:identifier> element that sits on a single line.
# Returns undef when the file has no such element; when the file cannot be
# opened a warning is printed to STDERR and undef is returned.
sub get_hash_id
{
    my ($hash_dir) = @_;

    my $hash_id = undef;

    my $docmets_filename = &util::filename_cat($hash_dir,"docmets.xml");

    # Three-argument open: the file name is taken literally, so leading
    # whitespace or mode characters in the path cannot be misinterpreted.
    if (open(my $docmets_fh, "<", $docmets_filename))
    {
        while (defined (my $line = <$docmets_fh>))
        {
            if ($line =~ m/<dc:identifier>(.*?)<\/dc:identifier>/)
            {
                $hash_id = $1;
                last;
            }
        }

        close($docmets_fh);
    }
    else
    {
        print STDERR "Warning: Unable to open \"$docmets_filename\"\n";
    }

    return $hash_id;
}
354
355
# Subroutine to write the gsdl.xml file in FEDORA_HOME/tomcat/conf/Catalina/<host/localhost>/
# This xml file will tell Fedora where to find the parent folder of the GS collect dir
# so that it can obtain the FedoraMETS files for ingestion.
# It depends on the Fedora server being on the same machine as the Greenstone server that
# this code is part of.
#
# Parameters:
#   $fedora_host - name of the host folder under tomcat/conf/Catalina to try first;
#                  "localhost" is used when that folder has no fedora.xml
#   $collect_dir - path of the Greenstone collect directory
#   $options     - hash ref; hostname, port and protocol are used to build the
#                  URL that is polled after the restart
#
# If gsdl.xml already holds the required contents nothing is changed. Otherwise
# Fedora's tomcat is stopped, gsdl.xml is rewritten, tomcat is started again and
# the routine waits (up to 20 attempts, one second apart) for
# protocol://hostname:port/fedora/search to respond to wget.
#
# Returns the string "gsdl.xml" on success. Dies if gsdl.xml cannot be written
# or if the server is still not responding after the last attempt.
sub write_gsdl_xml_file
{
    my ($fedora_host, $collect_dir, $options) = @_;
    my $hostname = $options->{'hostname'};
    my $port     = $options->{'port'};
    my $protocol = $options->{'protocol'};

    print STDERR "Ensuring that a correct gsdl.xml file exists on the Fedora server end\n";

    # The top of this file has already made sure that FEDORA_HOME is set, but for GS3
    # CATALINA_HOME is set to GS' own tomcat. Since we'll be working with fedora, we need
    # to temporarily set CATALINA_HOME to fedora's tomcat. (Catalina is undefined for GS2.)
    # "local" puts the previous value (or its absence) back on every way out of
    # this routine, including the early return and the die calls below.
    my $fedora_home = $ENV{'FEDORA_HOME'};
    local $ENV{'CATALINA_HOME'} = &util::filename_cat($fedora_home, "tomcat");

    # 1. Find out which folder to write to: fedora_host or localhost
    # whichever contains fedora.xml is the one we want
    my $base_path = &util::filename_cat($fedora_home, "tomcat", "conf", "Catalina");

    my $host_path = &util::filename_cat($base_path, $fedora_host);
    my $fedora_xml = &util::filename_cat($host_path, "fedora.xml");
    if (!-e $fedora_xml) {
        # check if the folder localhost contains fedora.xml
        $host_path = &util::filename_cat($base_path, "localhost");
        $fedora_xml = &util::filename_cat($host_path, "fedora.xml");
        if (!-e $fedora_xml) {
            # try putting gsdl in this folder, but still print a warning
            print STDERR "$host_path does not contain file fedora.xml. Hoping gsdl.xml belongs there anyway\n";
        }
    }

    # 2. Construct the string we are going to write to the gsdl.xml file
    # a. get the parent directory of collect_dir by removing the final path
    # component "collect" and any optional OS-type slash after it.
    # (Path slash direction does not matter here.)
    my $collectParentDir = $collect_dir;
    unless ($collectParentDir =~ s/(^|[\/\\])collect[\/\\]?$/$1/) {
        # collect_dir does not end in a "collect" component: fall back to
        # removing the first occurrence of the word, as was always done.
        $collectParentDir =~ s/collect(\/|\\)?//;
    }

    # b. Use the collectParentDir to create the contents of gsdl.xml
    my $greenstone_url_prefix = &util::get_greenstone_url_prefix(); # would have the required slash at front
    my $gsdlXMLcontents = "<?xml version='1.0' encoding='utf-8'?>\n<Context docBase=\"";
    $gsdlXMLcontents = $gsdlXMLcontents.$collectParentDir."\" path=\"$greenstone_url_prefix\"></Context>";

    # 3. If there is already a gsdl.xml file in host_path, compare the string we
    # want to write with what is already in there. If they're the same, we can return
    my $xmlFile = &util::filename_cat($host_path, "gsdl.xml");
    if (-e $xmlFile) {
        if (open(my $fin, "<", $xmlFile)) {
            my $xml_contents = do { local $/ = undef; <$fin> }; # read entire file at once
            close($fin);

            if ((defined $xml_contents) && ($xml_contents eq $gsdlXMLcontents)) {
                print STDERR "Fedora links to the FLI import folder through gsdl.xml.\n";
                # it already contains what we want, we're done
                return "gsdl.xml";
            }
        }
        else {
            print STDERR "g2f-import.pl: Unable to open existing $xmlFile for comparing...Recoverable. $!\n";
            # doesn't matter, we'll just overwrite it then
        }
    }

    # 4. If we're here, the contents of gsdl.xml need to be updated:
    # a. First stop the fedora server
    my $script_ext = ($ENV{'GSDLOS'} =~ m/^windows/) ? ".bat" : ".sh";
    my $stop_tomcat = &util::filename_cat($fedora_home, "tomcat", "bin", "shutdown".$script_ext);
    $! = 0;
    my $status = system($stop_tomcat);
    if ($status != 0) { # to get the actual exit value, divide by 256, but not useful here
        # possible tomcat was already stopped - it's not the end of the world
        print STDERR "Failed to stop Fedora server. Perhaps it was not running. $!\n";
        print "Exit status = ", $status/256, "\n";
    }

    # b. overwrite the file that has outdated contents with the contents we just constructed
    open(my $fout, ">", $xmlFile)  # create or overwrite gsdl.xml file
        or die "g2f-import.pl: Unable to open $xmlFile for telling Fedora where the collect dir is...ERROR: $!\n";
    print $fout $gsdlXMLcontents;
    # buffered write errors only surface when the handle is closed
    close($fout)
        or die "g2f-import.pl: Unable to finish writing $xmlFile...ERROR: $!\n";

    # c. Restart the fedora server
    my $start_tomcat = &util::filename_cat($fedora_home, "tomcat", "bin", "startup".$script_ext);
    $! = 0;
    $status = system($start_tomcat);
    if ($status != 0) {
        print STDERR "Failed to restart the Fedora server... ERROR: $!\n";
        print "Exit status = ", $status/256, "\n";
    }

    # Starting up the Fedora server takes a long time. We need to wait for the server to be
    # ready before import can continue, because g2f-import relies on an up-and-running Fedora
    # server to purge the collection from it while g2f-build.pl needs a ready Fedora server
    # in order to make it ingest the FedoraMETS. Sleeping is not sufficient since the
    # subsequent steps depend on a proper server restart, so the fedora search page is probed
    # with wget instead. --spider checks that the page is there without downloading it and
    # -q keeps wget quiet; wget exits with 0 on success.
    #
    # wget's own --tries/--waitretry options are not used: fatal errors such as
    # "connection refused" are not retried by wget, which is exactly what happens while the
    # server is still starting. Hence the explicit loop: retry every second, 20 times at most.
    print STDERR "Fedora server restarted. Waiting for it to become ready...\n";
    $! = 0;

    my $search_url = "$protocol://$hostname:$port/fedora/search";
    my $fedoraServerReady = 0;
    my $count = 0;
    do {
        # list form: the URL is handed to wget as a single argument, no shell involved
        $fedoraServerReady = system("wget", "-q", "--spider", $search_url);
        if ($fedoraServerReady != 0) {
            sleep(1);
            $count++;
        }
    } while ($fedoraServerReady != 0 && $count < 20);

    if ($fedoraServerReady != 0) {
        print STDERR "Fedora server is still not ready... ERROR: $!\n";
        print "Exit status = ", $fedoraServerReady/256, "\n";
        die "Exiting....\n";
    }

    # return some indication that things went well
    return "gsdl.xml";
}
508
509
5101;
Note: See TracBrowser for help on using the browser.