root/gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl @ 24447

Revision 24447, 11.3 KB (checked in by davidb, 9 years ago)

Tidy up of code (removing commented out redundant code), plus tweaking of code that starts and stops jetty to cope with situation where the server is already running

Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# solr_passes.pl -- perl wrapper, akin to mgpp_passes, for Solr
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# Heavily based on lucene_passes.pl, but does not need a SolrWrapper.jar
29# style solution as Solr has its own XML syntax:
30#
31#  http://wiki.apache.org/solr/UpdateXmlMessages
32#
33# This syntax is rather similar to what we already use, so the
34# main task of monitor_xml() is to translate the XML syntax Greenstone uses
35# into that needed by the solr server. 
36
37
# Compile-time check of the environment this script relies on.  The
# perllib directory under GSDLHOME is placed at the front of @INC.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'} or die "GSDLOS not set\n";
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
    defined $ENV{'GEXT_SOLR'} or die "GEXT_SOLR not set\n";
}
44
45use strict;
46use util;
47
48# Not quite OO, but close enough for now
49#
# Script-wide state shared by the server start/stop routines:
#   full_server_jar          - path of the located solr-jetty-server.jar
#   jetty_explicitly_started - set to 1 only when this script started Jetty
#   jetty_stop_key           - value handed to Jetty as -DSTOP.KEY
my $self = {
    full_server_jar          => undef,
    jetty_explicitly_started => undef,
    jetty_stop_key           => 'greenstone-solr',
};
54
55
56
# Return the full path of $suffix under the first directory of
# @$search_path where it exists as a plain file.
#
# Parameters:
#   $search_path - array ref of directories, tried in order
#   $suffix      - relative path of the file being looked for
#
# If no directory contains the file, an error is written to STDERR and
# the script exits with -1.
sub locate_file
{
    my ($search_path, $suffix) = @_;

    for my $dir (@$search_path) {
        my $candidate = &util::filename_cat($dir, $suffix);
        return $candidate if (-f $candidate);
    }

    # Every directory was tried without success
    print STDERR "Error: Failed to find '$suffix'\n";
    print STDERR "  Looked in: ", join(", ", @$search_path), "\n";
    exit -1;
}
75
# Launch the Solr/Jetty server and return once its output shows that it is
# listening, or that another server already occupies the port.
#
# Parameters:
#   $search_path - array ref of directories searched (via locate_file) for
#                  lib/java/solr-jetty-server.jar
#
# Environment read: GEXT_SOLR, JETTY_STOP_PORT, SOLR_JETTY_PORT
#
# Side effects:
#   - changes the working directory to GEXT_SOLR
#   - stores the located jar in $self->{'full_server_jar'}
#   - sets $self->{'jetty_explicitly_started'} when this call started Jetty
#   - forks a child process that keeps reading the server's output
#
# Exits the script with -1 when the server command cannot be run or the
# server reports that it failed to start.
sub start_solr_server
{
    my ($search_path) = @_;

    my $solr_home         = $ENV{'GEXT_SOLR'};
    my $jetty_stop_port   = $ENV{'JETTY_STOP_PORT'};
    my $jetty_server_port = $ENV{'SOLR_JETTY_PORT'};

    # The server command below is run from within the Solr extension
    # directory; say so if that directory cannot be entered.
    if (!chdir($solr_home)) {
        print STDERR "Warning: failed to change directory to $solr_home: $!\n";
    }

    my $solr_etc = &util::filename_cat($solr_home,"etc");

    my $server_props = "-DSTOP.PORT=$jetty_stop_port";
    $server_props .= " -DSTOP.KEY=".$self->{'jetty_stop_key'};
    $server_props .= " -Dsolr.solr.home=$solr_etc";

    my $server_jar = &util::filename_cat("lib","java","solr-jetty-server.jar");
    my $full_server_jar = locate_file($search_path,$server_jar);
    $self->{'full_server_jar'} = $full_server_jar;

    my $server_java_cmd = "java $server_props -jar \"$full_server_jar\"";

    # NOTE(review): SIN is kept as a package-level handle on purpose.  A
    # lexical handle would be closed when this sub returns, and closing a
    # piped handle waits for the command (the server) to exit.
    if (!open(SIN,"$server_java_cmd 2>&1 |")) {
        my $open_error = $!;

        print STDERR "Error: failed to start solr-jetty-server\n";
        print STDERR "$open_error\n\n";
        print STDERR "Command attempted was:\n";
        print STDERR "  $server_java_cmd\n";
        print STDERR "run from directory:\n";
        print STDERR "  $solr_home\n";
        print STDERR "----\n";

        exit -1;
    }

    my $server_status = "unknown";

    while (defined(my $line = <SIN>)) {
        # Scan through output until you see a line like:
        #   2011-08-22 .. :INFO::Started SocketConnector@0.0.0.0:8983
        # which signifies that the server has started up and is
        # "ready and listening"

        # Warnings and errors seen during start-up are echoed to STDOUT
        if (($line =~ m/^(WARN|ERROR|SEVERE):/)
            || ($line =~ m/^[0-9 :-]*(WARN|ERROR|SEVERE)::/)) {
            print $line;
        }

        if ($line =~ m/WARN::failed SocketConnector/) {
            if ($line =~ m/Address already in use/) {
                $server_status = "already-running";
            }
            else {
                $server_status = "failed-to-start";
            }
            last;
        }

        if ($line =~ m/INFO::Started SocketConnector/) {
            $server_status = "explicitly-started";
            last;
        }
    }

    if ($server_status eq "explicitly-started") {
        $self->{'jetty_explicitly_started'} = 1;
        print STDERR "Jetty server ready and listening for connections\n";
    }
    elsif ($server_status eq "already-running") {
        print STDERR "Using existing server detected on port $jetty_server_port\n";
    }
    else {
        # Also reached when the output ended without either marker line
        print STDERR "Failed to start Solr/Jetty web server on $jetty_server_port\n";
        exit -1;
    }

    # Now we know a server is ready to accept connections, fork a child
    # process that continues to read whatever the server command writes.
    my $pid = fork();

    if (!defined $pid) {
        # fork() returns undef on failure.  Comparing that with 0 would
        # send the parent down the child branch, so treat it separately
        # and carry on without a reader process.
        print STDERR "Warning: failed to fork a process to read Jetty output: $!\n";
        return;
    }

    if ($pid == 0) {
        # Child process: read the remaining output to end-of-file and
        # discard it.  Nothing is printed from here.
        while (defined(my $line = <SIN>)) {
            # intentionally empty
        }
        close(SIN);

        # And now stop nicely
        exit 0;
    }

    # If get to here then server started (and ready and listening)
    # *and* we are the parent process of the fork()

}
182
183
184
# Ask the Jetty server to shut down by running the server jar recorded in
# $self->{'full_server_jar'} with --stop, using the stop port from
# JETTY_STOP_PORT and the stop key held in $self.
#
# On success, waits for a child process to finish and reports the shutdown
# on STDERR.  On failure, reports why and exits the script with -1.
sub stop_solr_server
{
    my $full_server_jar = $self->{'full_server_jar'};
    my $jetty_stop_port = $ENV{'JETTY_STOP_PORT'};

    my $server_props = "-DSTOP.PORT=$jetty_stop_port";
    $server_props   .= " -DSTOP.KEY=".$self->{'jetty_stop_key'};
    my $server_java_cmd = "java $server_props -jar \"$full_server_jar\" --stop";

    my $server_status = system($server_java_cmd);
    my $os_error      = $!;   # saved before any further I/O can change it

    if ($server_status != 0) {
        print STDERR "Error: failed to stop solr-jetty-server\n";
        if ($server_status == -1) {
            # system() returns -1 when the command could not be run at
            # all; only then does $! describe the failure
            print STDERR "$os_error\n";
        }
        else {
            print STDERR "Command exit status: ", ($server_status >> 8),
                         " (raw wait status $server_status)\n";
        }
        exit -1;
    }

    wait(); # let the child process finish
    print STDERR "Jetty server shutdown\n";
}
206
207
# Start (or attach to) the Solr server and open PIPEOUT as a pipe into
# solr-post.jar, which posts whatever is written to it to the update URL
# of the core named "$collect-$doc_tag_level".
#
# Parameters:
#   $collect       - collection name (first part of the core name)
#   $doc_tag_level - document tag level (second part of the core name)
#   $full_builddir - build directory
#   $indexdir      - index directory name inside $full_builddir
#   $removeold     - when true, the existing index directory is removed first
#
# Environment read: GSDLCOLLECTDIR, GSDLHOME, GEXT_SOLR, SOLR_JETTY_PORT
#
# Dies if the pipe to solr-post.jar cannot be opened.
sub open_java_solr
{
  my ($collect, $doc_tag_level,$full_builddir,$indexdir,$removeold) = @_;

  # if removeold set, then delete the current index directory
  if ($removeold) {
      my $full_indexdir = &util::filename_cat($full_builddir,$indexdir);
      &util::rm_r($full_indexdir);
  }

  # Directories searched for the jar files, in order of preference
  my $search_path = [];

  push(@$search_path,$ENV{'GSDLCOLLECTDIR'}) if defined $ENV{'GSDLCOLLECTDIR'};
  push(@$search_path,$ENV{'GSDLHOME'})       if defined $ENV{'GSDLHOME'};
  push(@$search_path,$ENV{'GEXT_SOLR'})      if defined $ENV{'GEXT_SOLR'};

  # The following returns once Jetty has generated its
  # "ready and listening" line
  #
  start_solr_server($search_path);

  # Now run the solr-post command from the Solr extension directory
  if (!chdir($ENV{'GEXT_SOLR'})) {
      print STDERR "Warning: failed to change directory to $ENV{'GEXT_SOLR'}: $!\n";
  }

  my $post_jar      = &util::filename_cat("lib","java","solr-post.jar");
  my $full_post_jar = locate_file($search_path,$post_jar);

  my $jetty_server_port = $ENV{'SOLR_JETTY_PORT'};

  my $post_props = "-Durl=http://localhost:$jetty_server_port/solr/$collect-$doc_tag_level/update";
  $post_props .= " -Ddata=stdin";
  $post_props .= " -Dcommit=yes";

  my $post_java_cmd = "java $post_props -jar \"$full_post_jar\"";

  # PIPEOUT is a package-level handle: pass_on_xml_stream() writes to it
  # and close_java_solr() closes it.
  open (PIPEOUT, "| $post_java_cmd")
      || die "Error in solr_passes.pl: Failed to run $post_java_cmd\n$!\n";
}
252
253
254
# Close the pipe to solr-post.jar and, if this script started Jetty
# itself, shut the server down again.  A failed close is reported on
# STDERR but does not prevent the server from being stopped.
sub close_java_solr
{
    # closing the pipe has the effect of shutting down solr-post.jar
    if (!close(PIPEOUT)) {
        my ($os_error, $child_status) = ($!, $?);

        # For a piped handle, close() also fails when the command exits
        # with a non-zero status; in that case $! is 0 and $? holds the
        # wait status.
        if ($os_error) {
            print STDERR "Warning: error closing pipe to solr-post: $os_error\n";
        }
        else {
            print STDERR "Warning: solr-post exited with status ",
                         ($child_status >> 8), "\n";
        }
    }

    if ($self->{'jetty_explicitly_started'}) {
        stop_solr_server();
    }
}
264
265
266#----
267
# Write one document's XML to $output_filename inside $full_textdir,
# creating the directory part of the path first.
#
# Parameters:
#   $full_textdir    - directory the text files are stored under
#   $output_filename - file name (may include sub-directories) for the doc
#   $doc_xml         - the XML text to write
#
# Dies if the file cannot be opened, written or closed.
#
# This is identical in purpose to the one in lucene_passes.pl, and should
# be moved in to a package and shared ####
sub save_xml_doc
{
    my ($full_textdir,$output_filename,$doc_xml) = @_;

    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename = &util::filename_cat($full_textdir,$output_filename);

    # Everything up to and including the last directory separator
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    &util::mk_all_dir($full_output_dir) if defined $full_output_dir;

    # Three-argument open, so the file name is never parsed for a mode
    open(my $doc_out, '>', $full_output_filename)
        || die "Unable to open $full_output_filename: $!";

    print {$doc_out} $doc_xml
        or die "Unable to write to $full_output_filename: $!";

    # Buffered write errors only surface when the handle is closed
    close($doc_out)
        or die "Unable to close $full_output_filename: $!";
}
290
291
# Compress a stored document file in place by running gzip on it.
#
# Parameters:
#   $full_textdir    - directory the text files are stored under
#   $output_filename - file name of the document within that directory
#
# A failure to run gzip is reported on STDERR.
#
# This is identical in purpose to the one in lucene_passes.pl, and should
# be moved in to a package and shared ####
sub compress_xml_doc
{
    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    # Greenstone ships with gzip for Windows
    #
    # List-form system() hands the file name to gzip as a single argument
    # without going through a shell, so spaces or shell metacharacters in
    # the name cannot alter the command.
    my $gzip_status = system("gzip", $full_output_filename);

    if ($gzip_status != 0) {
        print STDERR "Warning: gzip failed for $full_output_filename\n";
    }

    return;
}
305
306
# Read document XML from STDIN and save each document to its own file
# under $full_textdir.  Based on lucene's monitor_xml_stream, but
# simplified as it is only used in "text" mode.
#
# Lines are accumulated until a closing </Doc> line is seen.  The file
# name comes from the file="..." attribute of the opening <Doc ...> line.
sub monitor_xml_stream
{
    my ($full_textdir) = @_;

    my ($doc_xml, $output_filename) = ("", "");

    while (defined(my $line = <STDIN>)) {

        $doc_xml .= $line;

        # Opening tag names the file this document is saved to
        $output_filename = $1 if ($line =~ m/^<Doc.+file=\"(.*?)\".*>$/);

        # Keep accumulating until the document is complete
        next unless ($line =~ m/^<\/Doc>$/);

        save_xml_doc($full_textdir,$output_filename,$doc_xml);

        # Compression of the saved file was taken out for efficiency
        # reasons.  Consider putting it back in as a switch, so a
        # collection builder can choose between saving disk space and
        # avoiding the overhead of uncompressing at runtime:
        #
        ###     compress_xml_doc($full_textdir,$output_filename);

        # Start afresh for the next document
        ($doc_xml, $output_filename) = ("", "");
    }
}
344
345
# Copy STDIN, line by line and unchanged, to the PIPEOUT handle.
sub pass_on_xml_stream
{
    while (my $xml_line = <STDIN>) {
        print PIPEOUT $xml_line;
    }
}
353
354
355
356
357# /** This checks the arguments on the command line, filters the
358#  *  unknown command line arguments and then calls the open_java_solr
359#  *  function to begin processing.
360#  */
sub main
{
  # Parameters: the command-line arguments (normally @ARGV).
  #
  # Minus options are filtered out first; the five remaining positional
  # arguments are: collect, mode ("text" or "index"), doc-tag-level,
  # build-dir and index-name.  Exits with 1 after printing usage when
  # there are too few arguments or the mode is not recognised.
  my (@argv) = @_;
  my $argc = scalar(@argv);

  my $usage = "Usage: solr_passes.pl [-removeold|-verbosity num] collect \"text\"|\"index\" doc-tag-level build-dir index-name\n";

  my $removeold = 0;
  my @filtered_argv = ();

  my $i = 0;
  while ($i<$argc) {
    if ($argv[$i] =~ m/^\-(.*)$/) {

      my $option = $1;

      # -removeold causes the existing index to be overwritten
      if ($option eq "removeold") {
        print STDERR "\n-removeold set (new index will be created)\n";
        $removeold = 1;
      }
      # -verbosity <num>
      elsif ($option eq "verbosity") {
        # solr indexing has no support for verbosity
        # => parse to be compatible with calling program, but suppress it
        #    for solr-post.jar by stepping over the <num> value
        $i++;
      }
      else {
        print STDERR "Unrecognised minus option: -$option\n";
      }
    }
    else {
        push(@filtered_argv,$argv[$i]);
    }
    $i++;
  }

  my $filtered_argc = scalar(@filtered_argv);

  if ($filtered_argc < 5) {
    print STDERR $usage;
    exit 1;
  }

  my $collect       = $filtered_argv[0];
  my $mode          = $filtered_argv[1];
  my $doc_tag_level = $filtered_argv[2];
  my $full_builddir = $filtered_argv[3];
  my $indexdir      = $filtered_argv[4];

  # Reject anything other than the two supported modes.  Without this
  # check an unknown mode would be streamed to PIPEOUT even though that
  # handle is only ever opened for "index".
  if (($mode ne "text") && ($mode ne "index")) {
    print STDERR "Error: unrecognised mode '$mode'\n";
    print STDERR $usage;
    exit 1;
  }

  if ($mode eq "text") {
      print STDERR "Monitoring for input!\n";
      my $full_textdir = &util::filename_cat($full_builddir,"text");
      monitor_xml_stream($full_textdir);
  }
  else {
      # We only need the Solr handle opened if we are indexing the
      # documents, not if we are just storing the text
      open_java_solr($collect, $doc_tag_level, $full_builddir, $indexdir, $removeold);

      print STDERR "Streaming document input onto Solr server!\n";
      pass_on_xml_stream();

      close_java_solr();
  }
}
434
435
# Script entry point: hand the command-line arguments to main()
main(@ARGV);
Note: See TracBrowser for help on using the browser.