########################################################################### # # solrutil.pm -- support module for Solr extension # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package solrutil; use strict; sub locate_file { my ($search_path,$suffix) = @_; foreach my $sp (@$search_path) { my $full_path = &util::filename_cat($sp,$suffix); if (-f $full_path) { return $full_path; } } # if get to here, then failed to find match print STDERR "Error: Failed to find '$suffix'\n"; print STDERR " Looked in: ", join(", ", @$search_path), "\n"; exit -1; } sub get_search_path { my $search_path = []; push(@$search_path,$ENV{'GSDLCOLLECTDIR'}) if defined $ENV{'GSDLCOLLECTDIR'}; push(@$search_path,$ENV{'GSDLHOME'}) if defined $ENV{'GSDLHOME'}; push(@$search_path,$ENV{'GEXT_SOLR'}) if defined $ENV{'GEXT_SOLR'}; return $search_path; } # The get-solr-servlet-url ant target can be run from anywhere by specifying the # location of GS3's ant build.xml buildfile. # GSDL3SRCHOME will be set for GS3 by gs3-setup.sh. # Based on servercontrol::get_library_URL. sub get_solr_servlet_url { # Set up fall backs, incl. old way of using solr host and port values that's already in the environment my $solr_url = "http://".$ENV{'SOLR_HOST'}.$ENV{'SOLR_PORT'}."/solr"; # fallback to default my $perl_command = "ant -buildfile \"$ENV{'GSDL3SRCHOME'}/build.xml\" get-solr-servlet-url"; if (open(PIN, "$perl_command |")) { while (defined (my $perl_output_line = )) { if($perl_output_line =~ m@(https?):\/\/(\S*)@) { # grab all the non-whitespace chars $solr_url="$1://".$2; # preserve the http protocol } } close(PIN); #print STDERR "XXXXXXXXXX SOLR URL: $solr_url\n"; } else { print STDERR "*** ERROR IN solrutil::get_solr_servlet_url:\n"; print STDERR " Failed to run $perl_command to work out GS3's solr URL\n"; print STDERR " falling back to using original solr_URL: $solr_url\n"; } return $solr_url; } # Given the solr base url (e.g. http://localhost:8383/solr by default), this function # returns the url's parts: protocol, host, port, solr servlet sub get_solr_url_parts { my $solr_url = shift (@_); # Set up fall backs, incl. old way of using solr host and port values that's already in the environment my ($protocol, $server_host, $server_port, $servlet_name) = ("http://", $ENV{'SOLR_HOST'}, $ENV{'SOLR_PORT'}, "solr"); # http://stackoverflow.com/questions/8206135/storing-regex-result-in-a-new-variable if($solr_url =~ m@(https?)://([^:]*):([0-9]*)/(.*)$@) { # m@https?://([^:]*):([^/])/(.*)@) { ($protocol, $server_host, $server_port, $servlet_name) = ($1, $2, $3, $4); #print STDERR "XXXXXXXXXX PROTOCOL: $protocol, SOLR_HOST: $server_host, SOLR_PORT: $server_port, servlet: $servlet_name\n"; } else { print STDERR "*** WARNING: in solrutil::get_solr_url_parts(): solr servlet URL not in expected format\n"; } return ($protocol, $server_host, $server_port, $servlet_name); } sub get_post_pipe_cmd { my ($core, $solr_base_url) = @_; my $search_path = get_search_path(); chdir($ENV{'GEXT_SOLR'}); my $post_jar = &util::filename_cat("lib","java","solr-post.jar"); my $full_post_jar = solrutil::locate_file($search_path,$post_jar); # Now run solr-post command # See https://wiki.apache.org/solr/UpdateXmlMessages # also https://lucene.apache.org/solr/4_2_1/tutorial.html # suffixing commit=true/commitWithin=10000 to solr's /update servlet didn't work, because # when using SimplePostTool, the commit only happens after the pipe to the tool is closed my $post_props = "-Durl=$solr_base_url/$core/update"; # robustness of protocol is taken care of too $post_props .= " -Ddata=stdin"; $post_props .= " -Dcommit=yes"; # increased VM mem from 512 to 1024, but increasing to 2048M didn't help either when too much # data streamed to SimplePostTool before commit. Nothing works short of committing before the # data streamed gets too large. The solution is to close and reopen the pipe to force commits. my $post_java_cmd = "java -Xmx1024M $post_props -jar \"$full_post_jar\""; ##print STDERR "**** post cmd = $post_java_cmd\n"; return $post_java_cmd; } sub open_post_pipe { my ($core, $solr_base_url) = @_; my $post_java_cmd = &get_post_pipe_cmd($core, $solr_base_url); open (PIPEOUT, "| $post_java_cmd") || die "Error in solr_passes.pl: Failed to run $post_java_cmd\n!$\n"; return $post_java_cmd; # return the post_java_cmd so caller can store it and reopen_post_pipe() } sub reopen_post_pipe { my $post_java_cmd = shift(@_); open (PIPEOUT, "| $post_java_cmd") || die "Error in solrutil::reopen_post_pipe: Failed to run $post_java_cmd\n!$\n"; } sub print_to_post_pipe { my ($line) = @_; print PIPEOUT $line; } sub close_post_pipe { # closing the pipe has the effect of shutting down solr-post.jar close(PIPEOUT); } 1;