source: gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl@ 32088

Last change on this file since 32088 was 32088, checked in by ak19, 6 years ago

Martin (mwilliman email id) on the mailing list found that solr got SIGPIPE errors when he built his 3020 doc solr collection. The problem occurred when the docs were sent in a single stream for solr ingestion using the SimplePostTool (post.jar/solr-post.jar). The problem is that the data stream becomes too large, since SimplePostTool doesn't cause a commit until after the pipe to it is closed. Initially other methods were attempted: increasing the Java VM mem size from 512 to 2048, which only helped process a certain additional number of docs before resulting in a SIGPIPE again. We tried changing the solr update url to have ?commit=true and ?commitWithin=15000 (ms) suffixed to it, but as the commit isn't done until after the pipe to SimplePostTool is closed, the url change had no effect with SimplePostTool. Though we retained an increase to 1024 of the Java VM when launching SimplePostTool, the actual present solution was to close and reopen the pipe to the post tool jar file executable after every x number of docs. Currently this batch size is set to 20. However, if any file is gigantic, we could get to see this problem again: it has to do with the overall size of the data stream rather than the number of docs. The actual problem lies in the HttpURLConnection that SimplePostTool opens, rather than how often we open/close the pipe to the post tool. This commit contains 3 changes: 1. changed Java VM memory to 1024 when launching SimplePostTool (solr-post.jar); 2. code changes to solrutil.pm and solr_passes.pl to close and reopen the pipe to flush the data after every 20 docs to force a commit to solr; 3. the existing code changes work with the old solr-post.jar (version 1.3) but committing version 1.5 since it has a larger buffer and is found to be better by Dr Bainbridge. The new, v1.5 solr-post.jar is from solr-4.7.2's example/examples/post.jar, renamed to the usual solr-post.jar.

  • Property svn:executable set to *
File size: 8.3 KB
RevLine 
[24446]1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# solr_passes.pl -- perl wrapper, akin to mgpp_passes, for Solr
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# Heavily based on lucene_passes.pl, but does not need a SolrWrapper.jar
29# style solution as Solr has its own XML syntax:
30#
31# http://wiki.apache.org/solr/UpdateXmlMessages
32#
33# This syntax is rather similar to what we already use, so the
34# main task of monitor_xml() is to translate the XML syntax Greenstone uses
35# into that needed by the solr server.
36
37
BEGIN {
    # Fail fast if the Greenstone environment has not been sourced.
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");

    # The solr extension supplies solrutil.pm and solrserver.pm,
    # so its perllib must also be on @INC before the use statements run.
    die "GEXT_SOLR not set\n" unless defined $ENV{'GEXT_SOLR'};
    unshift (@INC, "$ENV{'GEXT_SOLR'}/perllib");
}
47
48use strict;
49use util;
[24453]50use solrutil;
51use solrserver;
[24446]52
# Number of complete <add>...</add> documents streamed to SimplePostTool
# before pass_on_xml_stream() closes and reopens the post pipe.  Closing the
# pipe is what forces SimplePostTool to commit; batching keeps its HTTP
# stream to solr from growing without bound.
my $DOC_BATCH_SIZE = 20;

# Not quite OO, but close enough for now
#
# 'solr_server'   - solrserver instance, set once open_java_solr() starts it
# 'post_java_cmd' - command used to open the post pipe (cached by
#                   open_java_solr() so the pipe can be reopened later)
my $self = { 'solr_server' => undef };
[24446]58
# Starts the Solr/Jetty server (if needed) and opens the pipe to the
# solr-post tool, recording both in $self for later use.
sub open_java_solr
{
    my ($core, $full_builddir, $indexdir) = @_;

    # If the Solr/Jetty server is not already running, start() launches it
    # and only returns once the server is "reading and listening".
    my $server = solrserver->new($full_builddir);
    $server->start();
    $self->{'solr_server'} = $server;

    # Launch the solr-post command; cache the open command that is returned
    # so flush_and_reopen_java_solr() can reopen the pipe later.
    $self->{'post_java_cmd'}
	= &solrutil::open_post_pipe($core, $server->get_solr_base_url());
}
73
# To commit any stream of data to solr that's amassed so far on the pipe to SimplePostTool,
# close the pipe. Then reopen it to continue streaming data to it.
# Relies on $self->{'post_java_cmd'} having been cached by open_java_solr(),
# so this must not be called before open_java_solr().
sub flush_and_reopen_java_solr
{
    &solrutil::close_post_pipe();
    &solrutil::reopen_post_pipe($self->{'post_java_cmd'});
}
81
# Closes the stream to SimplePostTool (triggering the final commit) and
# stops the Jetty/Solr server, but only if this script was the one that
# started it.
sub close_java_solr
{
    &solrutil::close_post_pipe();

    my $server = $self->{'solr_server'};
    $server->stop() if $server->explicitly_started();
}
91
[24447]92#----
[24446]93
sub save_xml_doc
{
    # This is identical to the one in lucene_passes.pl, and should be
    # moved in to a package and shared ####
    #
    # Writes one document's accumulated XML into the collection's text
    # directory, creating any missing intermediate directories.
    #
    #   $full_textdir    - absolute path of the "text" build directory
    #   $output_filename - doc file path, relative to $full_textdir
    #   $doc_xml         - the complete <Doc>...</Doc> XML to save
    #
    # Dies if the output file cannot be opened or fully written.

    my ($full_textdir,$output_filename,$doc_xml) = @_;

    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename = &util::filename_cat($full_textdir,$output_filename);
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    &FileUtils::makeAllDirectories($full_output_dir);

    # Three-arg open with a lexical filehandle: the original two-arg bareword
    # open(DOCOUT,">$file") was vulnerable to mode injection if a filename
    # ever began with '>' or '|'.
    open(my $docout, '>', $full_output_filename)
	|| die "Unable to open $full_output_filename: $!";

    print $docout $doc_xml;

    # Check close on a write handle: buffered write errors (e.g. disk full)
    # only surface here.
    close($docout)
	|| die "Unable to close $full_output_filename: $!";

    # The original computed a throwaway list of <Sec gs2:id=...> matches here
    # (its own comment asked what it was for); the result was never used, so
    # the dead code has been removed.
}
116
117
sub compress_xml_doc
{
    # This is identical to the one in lucene_passes.pl, and should be
    # moved in to a package and shared ####
    #
    # Gzip-compresses the saved document file in place (producing "$file.gz").
    # Currently unused: the call site in monitor_xml_stream() is commented out.

    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
	= &util::filename_cat($full_textdir,$output_filename);

    # Greenstone ships with gzip for Windows.
    # List-form system() instead of backticks: it bypasses the shell, so
    # filenames containing spaces or shell metacharacters are passed through
    # safely, and a failure can be reported rather than silently ignored.
    system('gzip', $full_output_filename) == 0
	or warn "gzip failed for $full_output_filename: $?\n";
}
131
132
sub monitor_xml_stream
{
    # Based on lucene's monitor_xml_stream, but simplified as it is now only
    # used in "text" mode: accumulate each <Doc>...</Doc> element arriving on
    # STDIN and save it under the filename named in the <Doc> tag's
    # file attribute.

    my ($full_textdir) = @_;

    my $accumulated_xml  = "";
    my $current_filename = "";

    while (defined (my $line = <STDIN>)) {

	$accumulated_xml .= $line;

	# Opening <Doc ... file="..."> tag carries the output filename
	if ($line =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
	    $current_filename = $1;
	}

	# Closing </Doc> tag completes one document: write it out
	if ($line =~ m/^<\/Doc>$/) {
	    save_xml_doc($full_textdir,$current_filename,$accumulated_xml);

	    # Compress file
	    #
	    # The compress option was taken out for efficiency
	    # reasons. Consider putting it back in but making it a
	    # switch so a collection builder can decide for themselves on a
	    # case by case basis if they want to save on diskspace, but have
	    # the overhead of uncompressing at runtime

### compress_xml_doc($full_textdir,$current_filename);

	    # Reset the accumulators for the next document
	    $accumulated_xml  = "";
	    $current_filename = "";
	}
    }
}
170
171
# Called when mode = index,
# This is the function that passes the contents of the docs for ingesting into solr
# as one long stream.
# Since if we have many docs in a collection, there will be one long stream of bytes
# sent to the SimplePostTool/(solr-)post.jar, which remain uncommitted until
# the pipe is closed. And for a long bytestream, this will result in an out of heap memory
# https://stackoverflow.com/questions/2082057/outputstream-outofmemoryerror-when-sending-http
# So we need to close and then reopen solr post pipe to force commit after some $DOC_BATCH_SIZE
# We could still have the same issue if any one document is very large (long stream of bytes),
# but then we'd need to take care of the problem at the root, see the StackOverFlow page
# and SimplePostTool.java, as the problem lies in HttpURLConnection.
sub pass_on_xml_stream
{
    # the xml_stream sent to solr looks like:
    # <update>
    #   <add>
    #     <doc>
    #     ....
    #     </doc>
    #   </add>
    #   <add>
    #     <doc>
    #     ....
    #     </doc>
    #   </add>
    #   ...
    # </update>

    # number of complete <add>...</add> documents streamed since the post
    # pipe was last (re)opened
    my $doc_count = 0;

    my $line;
    while (defined ($line = <STDIN>)) {
	# monitor for end of each document: a closing </add> marks one
	# complete document handed to SimplePostTool
	if ($line =~ m/^\s*<\/add>$/) {
	    $doc_count++;
	}
	else {
	    if ($doc_count == $DOC_BATCH_SIZE) {
		# evidence that there is still more documents to process
		# => but reached batch size, so flush and reopen
		# to force a commit of past docs to solr
		# before sending this current line on its way
		#
		# NOTE(review): the flush is deferred until the next <add>
		# actually arrives, so a final partial batch at end-of-stream
		# is never flushed here — presumably close_post_pipe() emits
		# the closing </update> and commits it; confirm in solrutil.pm.

		if ($line =~ m/^\s*<add>$/) {
		    # close the current <update> element before closing the pipe
		    &solrutil::print_to_post_pipe("</update>\n");
		    flush_and_reopen_java_solr();

		    # start next doc: open a fresh <update> element on the new pipe
		    &solrutil::print_to_post_pipe("<update>\n");
		    $doc_count = 0;
		}
	    }
	}

	# every input line (including the one that triggered a flush) is
	# passed on to the post pipe unchanged
	&solrutil::print_to_post_pipe($line);

    }
}
229
230
# /** This checks the arguments on the command line, filters the
#  * unknown command line arguments and then calls the open_java_solr
#  * function to begin processing.
#  */
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    my @filtered_argv = ();

    my $pos = 0;
    while ($pos < $argc) {
	my $arg = $argv[$pos];

	if ($arg =~ m/^\-(.*)$/) {
	    my $option = $1;

	    # -verbosity <num>
	    if ($option eq "verbosity") {
		# solr indexing has no support for verbosity
		# => parse to be compatible with calling program, but
		# suppress it (skip its value) for solr-post.jar
		$pos++;
	    }
	    else {
		print STDERR "Unrecognised minus option: -$option\n";
	    }
	}
	else {
	    push(@filtered_argv, $arg);
	}
	$pos++;
    }

    if (scalar(@filtered_argv) < 4) {
	print STDERR "Usage: solr_passes.pl [-verbosity num] core \"text\"|\"index\" build-dir index-name\n";
	exit 1;
    }

    my ($core, $mode, $full_builddir, $indexdir) = @filtered_argv;

    # We only need the Solr handle opened if we are indexing the
    # documents, not if we are just storing the text
    open_java_solr($core, $full_builddir, $indexdir) if $mode eq "index";

    if ($mode eq "text") {
	print STDERR "Monitoring for input!\n";
	my $full_textdir = &util::filename_cat($full_builddir,"text");
	monitor_xml_stream($full_textdir);
    }
    else {
	print STDERR "Streaming document input onto Solr server!\n";
	pass_on_xml_stream();
    }

    close_java_solr() if $mode eq "index";
}
301
302
# Plain call syntax: the legacy &main(@ARGV) form disables prototype
# checking and is discouraged in modern Perl; behavior is identical here.
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.