source: gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl@ 24447

Last change on this file since 24447 was 24447, checked in by davidb, 11 years ago

Tidy up of code (removing commented out redundant code), plus tweaking of code that starts and stops jetty to cope with situation where the server is already running

File size: 11.3 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# solr_passes.pl -- perl wrapper, akin to mgpp_passes, for Solr
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# Heavily based on lucene_passes.pl, but does not need a SolrWrapper.jar
29# style solution as Solr has its own XML syntax:
30#
31# http://wiki.apache.org/solr/UpdateXmlMessages
32#
33# This syntax is rather similar to what we already use, so the
34# main task of monitor_xml() is to translate the XML syntax Greenstone uses
35# into that needed by the solr server.
36
37
BEGIN {
    # Refuse to compile unless the Greenstone environment is set up, and
    # put Greenstone's perllib directory at the front of the module search
    # path (presumably where the 'util' module used below is found).
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");

    # The Solr extension must be configured too
    die "GEXT_SOLR not set\n" unless defined $ENV{'GEXT_SOLR'};
}
44
45use strict;
46use util;
47
# Script-wide state shared by the subroutines below: a plain hash
# reference rather than a real object, but close enough for now
#
my $self = {
    # full path of solr-jetty-server.jar, recorded by start_solr_server()
    full_server_jar          => undef,
    # set to 1 only when this script itself started Jetty
    jetty_explicitly_started => undef,
    # value passed as -DSTOP.KEY when starting and when stopping Jetty
    jetty_stop_key           => "greenstone-solr",
};
54
55
56
sub locate_file
{
    # Look for $suffix (a relative file path) beneath each directory of
    # @$search_path in turn and return the full path of the first one that
    # exists as a plain file.
    #
    # If none of the directories contains it, an error listing the
    # directories tried is written to STDERR and the script exits (-1).

    my ($search_path,$suffix) = @_;

    for my $dir (@$search_path) {
        my $candidate = &util::filename_cat($dir,$suffix);
        return $candidate if (-f $candidate);
    }

    # Nothing matched anywhere on the search path
    my $dirs_tried = join(", ", @$search_path);

    print STDERR "Error: Failed to find '$suffix'\n",
                 " Looked in: ", $dirs_tried, "\n";
    exit -1;
}
75
sub start_solr_server
{
    # Launch solr-jetty-server.jar (located via @$search_path) with
    # GEXT_SOLR as the working directory, then read Jetty's output until
    # it shows whether the server is listening.
    #
    # Outcomes:
    #   - Jetty reports "Started SocketConnector":
    #       $self->{'jetty_explicitly_started'} is set, so that
    #       close_java_solr() knows to shut the server down again
    #   - Jetty reports "failed SocketConnector ... Address already in use":
    #       an existing server is assumed to be running and is used as is
    #   - anything else (including end of output): exit -1
    #
    # The STOP.PORT is taken from the JETTY_STOP_PORT environment variable;
    # SOLR_JETTY_PORT is only used in the messages printed here.  The
    # located jar is recorded in $self->{'full_server_jar'} for
    # stop_solr_server().
    #
    # On return we are the parent process; a forked child keeps reading the
    # server's output in the background.

    my ($search_path) = @_;

    my $solr_home         = $ENV{'GEXT_SOLR'};
    my $jetty_stop_port   = $ENV{'JETTY_STOP_PORT'};
    my $jetty_server_port = $ENV{'SOLR_JETTY_PORT'};

    chdir($solr_home);

    my $solr_etc = &util::filename_cat($solr_home,"etc");

    my $server_props = "-DSTOP.PORT=$jetty_stop_port";
    $server_props .= " -DSTOP.KEY=".$self->{'jetty_stop_key'};
    $server_props .= " -Dsolr.solr.home=$solr_etc";

    my $server_jar = &util::filename_cat("lib","java","solr-jetty-server.jar");
    my $full_server_jar = locate_file($search_path,$server_jar);
    $self->{'full_server_jar'} = $full_server_jar;

    my $server_java_cmd = "java $server_props -jar \"$full_server_jar\"";

##  print STDERR "**** server cmd = $server_java_cmd\n";

    if (!open(SIN,"$server_java_cmd 2>&1 |")) {
        # Capture the OS error before any further I/O can overwrite it
        my $open_error = $!;

        print STDERR "Error: failed to start solr-jetty-server\n";
        print STDERR "$open_error\n\n";
        print STDERR "Command attempted was:\n";
        print STDERR " $server_java_cmd\n";
        print STDERR "run from directory:\n";
        print STDERR " $solr_home\n";
        print STDERR "----\n";

        exit -1;
    }

    my $server_status = "unknown";

    my $line;
    while (defined($line=<SIN>)) {
        # Scan through output until you see a line like:
        #   2011-08-22 .. :INFO::Started SocketConnector@0.0.0.0:8983
        # which signifies that the server has started up and is
        # "ready and listening"

##      print STDERR "**** $line";

        if (($line =~ m/^(WARN|ERROR|SEVERE):/)
            || ($line =~ m/^[0-9 :-]*(WARN|ERROR|SEVERE)::/)) {
            print $line;
        }

        if ($line =~ m/WARN::failed SocketConnector/) {
            if ($line =~ m/Address already in use/) {
                $server_status = "already-running";
            }
            else {
                $server_status = "failed-to-start";
            }
            last;
        }

        if ($line =~ m/INFO::Started SocketConnector/) {
            $server_status = "explicitly-started";
            last;
        }
    }

    if ($server_status eq "explicitly-started") {
        $self->{'jetty_explicitly_started'} = 1;
        print STDERR "Jetty server ready and listening for connections\n";
    }
    elsif ($server_status eq "already-running") {
        print STDERR "Using existing server detected on port $jetty_server_port\n";
    }
    else {
        print STDERR "Failed to start Solr/Jetty web server on $jetty_server_port\n";
        exit -1;
    }

    # now we know the server is ready to accept connections, fork a
    # child process that continues to listen to the output and
    # prints out any lines that are not INFO lines

    my $monitor_pid = fork();

    if (!defined $monitor_pid) {
        # fork() returns undef on failure.  Comparing that with 0 would
        # send the parent down the child branch, so handle it explicitly.
        my $fork_error = $!;

        print STDERR "Error: failed to fork process to monitor solr-jetty-server output\n";
        print STDERR "$fork_error\n";

        # Do not leave behind a server that this script started
        if ($self->{'jetty_explicitly_started'}) {
            stop_solr_server();
        }
        exit -1;
    }

    if ($monitor_pid == 0) {
        # child process

        while (defined ($line = <SIN>)) {
            # skip routine INFO chatter
            next if ($line =~ m/^INFO:/);
            next if ($line =~ m/^[0-9 :-]*INFO::/);
            next if ($line =~ m/^\d{2}\/\d{2}\/\d{4}\s+/);

            # anything left is worth showing to the user
            print $line;
        }
        close(SIN);

        # And now stop nicely
        exit 0;
    }

    # If get to here then server started (and ready and listening)
    # *and* we are the parent process of the fork()

}
182
183
184
sub stop_solr_server
{
    # Ask the Jetty server started by start_solr_server() to shut down, by
    # running the same jar with --stop and the matching STOP.PORT (from the
    # JETTY_STOP_PORT environment variable) and STOP.KEY.
    #
    # On success, waits for a child process to finish and reports the
    # shutdown on STDERR.  On failure, prints the reason and exits (-1).

    my $full_server_jar = $self->{'full_server_jar'};
    my $jetty_stop_port = $ENV{'JETTY_STOP_PORT'};

    my $server_props = "-DSTOP.PORT=$jetty_stop_port";
    $server_props .= " -DSTOP.KEY=".$self->{'jetty_stop_key'};
    my $server_java_cmd = "java $server_props -jar \"$full_server_jar\" --stop";

    my $server_status = system($server_java_cmd);

    # Remember the OS error straight away: later I/O may overwrite $!
    my $os_error = $!;

    if ($server_status!=0) {
        print STDERR "Error: failed to stop solr-jetty-server\n";

        if ($server_status == -1) {
            # the command could not be run at all
            print STDERR "$os_error\n";
        }
        elsif ($server_status & 127) {
            print STDERR "Command was terminated by signal ", ($server_status & 127), "\n";
        }
        else {
            print STDERR "Command exited with status ", ($server_status >> 8), "\n";
        }
        exit -1;
    }
    else {
        wait(); # let the child process finish
        print STDERR "Jetty server shutdown\n";
    }
}
206
207
sub open_java_solr
{
    # Get everything ready for indexing:
    #   1. if $removeold is set, delete the index directory
    #      ($full_builddir/$indexdir)
    #   2. make sure a Solr/Jetty server is available (start_solr_server)
    #   3. open the PIPEOUT handle as a pipe into solr-post.jar, which posts
    #      whatever is written to it to the update URL of the Solr core
    #      named "$collect-$doc_tag_level"
    #
    # The server port is taken from the SOLR_JETTY_PORT environment
    # variable.  Dies if the pipe to solr-post.jar cannot be opened.

    my ($collect, $doc_tag_level,$full_builddir,$indexdir,$removeold) = @_;

    # if removeold set, then delete the current index directory
    if ($removeold) {
        my $full_indexdir = &util::filename_cat($full_builddir,$indexdir);
        &util::rm_r($full_indexdir);
    }

    my $search_path = [];

    push(@$search_path,$ENV{'GSDLCOLLECTDIR'}) if defined $ENV{'GSDLCOLLECTDIR'};
    push(@$search_path,$ENV{'GSDLHOME'})       if defined $ENV{'GSDLHOME'};
    push(@$search_path,$ENV{'GEXT_SOLR'})      if defined $ENV{'GEXT_SOLR'};

    # The following returns once Jetty has generated its
    # "ready and listening" line
    #
    start_solr_server($search_path);

    # Now run the solr-post command

    chdir($ENV{'GEXT_SOLR'});

    my $post_jar = &util::filename_cat("lib","java","solr-post.jar");
    my $full_post_jar = locate_file($search_path,$post_jar);

    my $jetty_server_port = $ENV{'SOLR_JETTY_PORT'};

    my $post_props = "-Durl=http://localhost:$jetty_server_port/solr/$collect-$doc_tag_level/update";
    $post_props .= " -Ddata=stdin";
    $post_props .= " -Dcommit=yes";

    my $post_java_cmd = "java $post_props -jar \"$full_post_jar\"";

### print STDERR "**** post cmd = $post_java_cmd\n";

    # Report the OS error ($!) on failure
    open (PIPEOUT, "| $post_java_cmd")
        || die "Error in solr_passes.pl: Failed to run $post_java_cmd\n$!\n";
}
252
253
254
sub close_java_solr
{
    # Finish an indexing run: close the pipe to solr-post.jar and, if this
    # script started the Jetty server itself, shut the server down again.
    #
    # A problem reported when closing the pipe is written to STDERR but
    # does not prevent the server from being stopped.

    # closing the pipe has the effect of shutting down solr-post.jar
    if (!close(PIPEOUT)) {
        # For a pipe, close() fails either because of an I/O error ($! set)
        # or because the command exited with a non-zero status ($? set).
        my ($os_error, $child_status) = ($!, $?);

        if ($os_error != 0) {
            print STDERR "Warning: problem closing pipe to solr-post.jar: $os_error\n";
        }
        else {
            print STDERR "Warning: solr-post.jar exited with status ", ($child_status >> 8), "\n";
        }
    }

    if ($self->{'jetty_explicitly_started'}) {
        stop_solr_server();
    }
}
264
265
266#----
267
sub save_xml_doc
{
    # Write $doc_xml to the file $output_filename beneath $full_textdir,
    # creating any missing directories on the way.  Dies if the file cannot
    # be opened or fully written.
    #
    # This is the same job as the one in lucene_passes.pl, and should be
    # moved in to a package and shared ####

    my ($full_textdir,$output_filename,$doc_xml) = @_;

    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename = &util::filename_cat($full_textdir,$output_filename);

    # Everything up to and including the last directory separator
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    &util::mk_all_dir($full_output_dir);

    # Three-argument open: the filename is used exactly as given, with no
    # interpretation of mode characters or stripping of whitespace
    open(my $doc_out, '>', $full_output_filename)
        || die "Unable to open $full_output_filename: $!";

    print $doc_out $doc_xml;

    # Buffered write errors only surface when the handle is closed
    close($doc_out)
        || die "Unable to finish writing $full_output_filename: $!";

    return;
}
290
291
sub compress_xml_doc
{
    # Compress the file $output_filename beneath $full_textdir by running
    # gzip on it.  A failure is reported on STDERR.
    #
    # This is the same job as the one in lucene_passes.pl, and should be
    # moved in to a package and shared ####

    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    # Greenstone ships with gzip for Windows
    #
    # List form of system(): the filename is handed to gzip as a single
    # argument without going through a shell, so spaces and shell
    # metacharacters in the path are harmless
    my $gzip_status = system("gzip", $full_output_filename);

    if ($gzip_status != 0) {
        print STDERR "Warning: gzip failed for '$full_output_filename'\n";
    }

    return;
}
305
306
sub monitor_xml_stream
{
    # Read the document stream arriving on STDIN and save each document as
    # its own file beneath $full_textdir.
    #
    # Lines are accumulated until a closing </Doc> line is seen; the file
    # name comes from the file="..." attribute of the opening <Doc ...>
    # line.  Based on lucene's monitor_xml_stream, but simplified as it is
    # now only used when in "text" mode.

    my ($full_textdir) = @_;

    my $pending_xml  = "";
    my $pending_file = "";

    while (defined (my $line = <STDIN>)) {

        $pending_xml .= $line;

        if ($line =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
            $pending_file = $1;
        }

        # Keep collecting until the end of the current document
        next unless ($line =~ m/^<\/Doc>$/);

        save_xml_doc($full_textdir,$pending_file,$pending_xml);

        # Compressing the saved file was taken out for efficiency reasons.
        # Consider putting it back in as a switch, so a collection builder
        # can decide case by case whether saving disk space is worth the
        # overhead of uncompressing at runtime

###     compress_xml_doc($full_textdir,$pending_file);

        # Start afresh for the next document
        $pending_xml  = "";
        $pending_file = "";
    }
}
344
345
sub pass_on_xml_stream
{
    # Copy everything arriving on STDIN, line by line and unmodified, to
    # the PIPEOUT handle (the pipe opened by open_java_solr)
    while (defined (my $xml_line = <STDIN>)) {
        print PIPEOUT $xml_line;
    }
}
353
354
355
356
# /** Checks the arguments on the command line, filters out the minus
#  *  options, and then processes the document stream arriving on STDIN
#  *  according to the requested mode:
#  *
#  *    "text"  - save each document as a file under <build-dir>/text
#  *    "index" - stream the documents on to the Solr server
#  *
#  *  Prints a usage message and exits (1) if fewer than five positional
#  *  arguments are given or the mode is not one of the above.
#  */
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    my $usage = "Usage: solr_passes.pl [-removeold|-verbosity num] collect \"text\"|\"index\" doc-tag-level build-dir index-name\n";

    my $removeold = 0;
    my @filtered_argv = ();

    my $i = 0;
    while ($i<$argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {

            my $option = $1;

            # -removeold causes the existing index to be overwritten
            if ($option eq "removeold") {
                print STDERR "\n-removeold set (new index will be created)\n";
                $removeold = 1;
            }
            # -verbosity <num>
            elsif ($option eq "verbosity") {
                # solr indexing has no support for verbosity
                # => consume the value to stay compatible with the calling
                #    program, but suppress it for solr-post.jar
                $i++;
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv,$argv[$i]);
        }
        $i++;
    }

    my $filtered_argc = scalar(@filtered_argv);

    if ($filtered_argc < 5) {
        print STDERR $usage;
        exit 1;
    }

    my $collect       = $filtered_argv[0];
    my $mode          = $filtered_argv[1];
    my $doc_tag_level = $filtered_argv[2];
    my $full_builddir = $filtered_argv[3];
    my $indexdir      = $filtered_argv[4];

    if ($mode eq "text") {
        print STDERR "Monitoring for input!\n";
        my $full_textdir = &util::filename_cat($full_builddir,"text");
        monitor_xml_stream($full_textdir);
    }
    elsif ($mode eq "index") {
        # We only need the Solr handle opened if we are indexing the
        # documents, not if we are just storing the text
        open_java_solr($collect, $doc_tag_level, $full_builddir, $indexdir, $removeold);

        print STDERR "Streaming document input onto Solr server!\n";
        pass_on_xml_stream();

        close_java_solr();
    }
    else {
        # Any other mode would end up streaming to a pipe that was never
        # opened, so reject it up front
        print STDERR "Error: unrecognised mode '$mode'\n";
        print STDERR $usage;
        exit 1;
    }
}
434
435
# Script entry point: hand the command-line arguments to main()
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.