root/gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl @ 32088

Revision 32088, 8.3 KB (checked in by ak19, 3 years ago)

Martin (mwilliman email id) on the mailing list found that solr got SIGPIPE errors when he built his 3020 doc solr collection. The problem occurred when the docs were sent in a single stream for solr ingestion using the SimplePostTool? (post.jar/solr-post.jar). The problem is that the data stream becomes too large, since SimplePostTool? doesn't cause a commit until after the pipe to it is closed. Initially other methods were attempted: increasing the Java VM mem size from 512 to 2048, which only helped process a certain additional number of docs before resulting in a SIGPIPE again. We tried changing the solr update url to have ?commit=true and ?commitWithin=15000 (ms) suffixed to it, but as the commit isn't done until after the pipe to SimplePostTool? is closed, the url change had no effect with SimplePostTool?. Though we retained an increase to 1024 of the Java VM when launching SimplePostTool?, the actual present solution was to close and reopen the pipe to the post tool jar file executable after every x number of docs. Currently this batch size is set to 20. However, if any file is gigantic, we could get to see this problem again: it has to do with the overall size of the data stream rather than number of docs. The actual problem lies in HttpURLConnection that SimplePostTool? opens, rather than how often we open/close the pipe to the post tool. This commit contains 3 changes: 1. changed Java VM memory to 1024 when launching SimplePostTool? (solr-post.jar); 2. code changes to solrutil.pm and solr_passes.pl to close and reopen the pipe to flush the data after every 20 docs to force a commit to solr; 3. the existing code changes work with the old solr-post.jar (version 1.3) but committing version 1.5 since it has a larger buffer and is found to be better by Dr Bainbridge. The new, v1.5 solr-post.jar is from solr-4.7.2's example/examples/post.jar, renamed to the usual solr-post.jar.

  • Property svn:executable set to *
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# solr_passes.pl -- perl wrapper, akin to mgpp_passes, for Solr
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# Heavily based on lucene_passes.pl, but does not need a SolrWrapper.jar
29# style solution as Solr has its own XML syntax:
30#
31#  http://wiki.apache.org/solr/UpdateXmlMessages
32#
33# This syntax is rather similar to what we already use, so the
34# main task of monitor_xml() is to translate the XML syntax Greenstone uses
35# into that needed by the solr server. 
36
37
38BEGIN {
39    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
40    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
41    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
42    die "GEXT_SOLR not set\n" unless defined $ENV{'GEXT_SOLR'};
43
44    my $solr_ext = $ENV{'GEXT_SOLR'};
45    unshift (@INC, "$solr_ext/perllib");
46}
47
48use strict;
49use util;
50use solrutil;
51use solrserver;
52
53my $DOC_BATCH_SIZE = 20;
54
55# Not quite OO, but close enough for now
56#
57my $self = { 'solr_server' => undef };
58
59sub open_java_solr
60{
61  my ($core,$full_builddir,$indexdir) = @_;
62
63  # If the Solr/Jetty server is not already running, the following starts
64  # it up, and only returns when the server is "reading and listening"
65 
66  my $solr_server = new solrserver($full_builddir);
67  $solr_server->start();
68  $self->{'solr_server'} = $solr_server;
69
70  # Now start up the solr-post command and store the open cmd that is returned
71  $self->{'post_java_cmd'} = &solrutil::open_post_pipe($core, $solr_server->get_solr_base_url());
72}
73
74# To commit any stream of data to solr that's amassed so far on the pipe to SimplePostTool,
75# close the pipe. Then reopen it to continue streaming data to it.
76sub flush_and_reopen_java_solr
77{
78    &solrutil::close_post_pipe();
79    &solrutil::reopen_post_pipe($self->{'post_java_cmd'});
80}
81
82sub close_java_solr
83{
84    &solrutil::close_post_pipe();
85     
86    my $solr_server = $self->{'solr_server'};
87    if ($solr_server->explicitly_started()) {
88    $solr_server->stop();
89    }
90}
91
92#----
93
94sub save_xml_doc
95{
96    # This is identical to the one in lucene_passes.pl, and should be
97    # moved in to a package and shared ####
98
99    my ($full_textdir,$output_filename,$doc_xml) = @_;
100
101    my $dir_sep = &util::get_os_dirsep();
102
103    my $full_output_filename = &util::filename_cat($full_textdir,$output_filename);
104    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
105    &FileUtils::makeAllDirectories($full_output_dir);
106
107    open(DOCOUT,">$full_output_filename")
108    || die "Unable to open $full_output_filename";
109
110    print DOCOUT $doc_xml;
111    close(DOCOUT);
112
113    # What this the purpose of the following? ####
114    my @secs =  ($doc_xml =~ m/<Sec\s+gs2:id="\d+"\s*>.*?<\/Sec>/sg);
115}
116
117
118sub compress_xml_doc
119{
120    # This is identical to the one in lucene_passes.pl, and should be
121    # moved in to a package and shared ####
122
123    my ($full_textdir,$output_filename) = @_;
124
125    my $full_output_filename
126    = &util::filename_cat($full_textdir,$output_filename);
127
128    # Greenstone ships with gzip for Windows
129    `gzip $full_output_filename`;
130}
131
132
133sub monitor_xml_stream
134{
135    # based on lucene's monitor_xml_stream, but simplified
136    # as only now used when in "text" mode
137
138    my ($full_textdir) = @_;
139
140    my $doc_xml = "";
141    my $output_filename = "";
142
143    my $line;
144    while (defined ($line = <STDIN>)) {
145
146    $doc_xml .= $line;
147
148    if ($line =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
149        $output_filename = $1;     
150    }
151   
152    if ($line =~ m/^<\/Doc>$/) {
153        save_xml_doc($full_textdir,$output_filename,$doc_xml);
154
155        # Compress file
156        #
157        # The compress option was taken out for efficiency
158        # reasons.  Consider putting it back in but making it a
159        # switch so a collection builder can decide for themselves on a
160        # case by case basis if they want to save on diskspace, but have
161        # the overhead of uncompressing at runtime
162       
163###     compress_xml_doc($full_textdir,$output_filename);
164
165        $doc_xml = "";
166        $output_filename = "";
167    }
168    }
169}
170
171
172# Called when mode = index,
173# This is the function that passes the contents of the docs for ingesting into solr
174# as one long stream.
175# Since if we have many docs in a collection, there will be one long stream of bytes
176# sent to the SimplePostTool/(solr-)post.jar, which remain uncommitted until
177# the pipe is closed. And for a long bytestream, this will result in an out of heap memory
178# https://stackoverflow.com/questions/2082057/outputstream-outofmemoryerror-when-sending-http
179# So we need to close and then reopen solr post pipe to force commit after some $DOC_BATCH_SIZE
180# We could still have the same issue if any one document is very large (long stream of bytes),
181# but then we'd need to take care of the problem at the root, see the StackOverFlow page
182# and SimplePostTool.java, as the problem lies in HttpURLConnection.
183sub pass_on_xml_stream
184{
185    # the xml_stream sent to solr looks like:
186    # <update>
187    #   <add>
188    #     <doc>
189    #       ....
190    #     </doc>
191    #   </add>
192    #     <doc>
193    #       ....
194    #     </doc>
195    #   </add>
196    #   ...
197    # </update>   
198   
199    my $doc_count = 0;
200   
201    my $line;
202    while (defined ($line = <STDIN>)) {
203    # monitor for end of each document     
204    if ($line =~ m/^\s*<\/add>$/) {
205        $doc_count++;
206    }
207    else {
208        if ($doc_count == $DOC_BATCH_SIZE) {
209        # evidence that there is still more documents to process
210        # => but reached batch size, so flush and reopen
211        # to force a commit of past docs to solr
212        # before sending this current line on its way
213       
214        if ($line =~ m/^\s*<add>$/) {
215            &solrutil::print_to_post_pipe("</update>\n");
216            flush_and_reopen_java_solr();
217       
218            # start next doc
219            &solrutil::print_to_post_pipe("<update>\n");
220            $doc_count = 0;
221        }
222        }
223    }
224           
225    &solrutil::print_to_post_pipe($line);
226
227    }
228}
229
230
231# /** This checks the arguments on the command line, filters the
232#  *  unknown command line arguments and then calls the open_java_solr
233#  *  function to begin processing.
234#  */
235sub main
236{
237  my (@argv) = @_;
238  my $argc = scalar(@argv);
239
240  my @filtered_argv = ();
241
242  my $i = 0;
243  while ($i<$argc) {
244    if ($argv[$i] =~ m/^\-(.*)$/) {
245
246      my $option = $1;
247
248      # -verbosity <num>
249      if ($option eq "verbosity") {
250        $i++;
251        if ($i<$argc)
252    {
253      # solr indexing has no support for verbosity
254      # => parse to be compatible with calling program, but supress it
255      #    for solr-post.jar
256        }
257      }
258      else {
259        print STDERR "Unrecognised minus option: -$option\n";
260      }
261    }
262    else {
263        push(@filtered_argv,$argv[$i]);
264    }
265    $i++;
266  }
267
268  my $filtered_argc = scalar(@filtered_argv);
269
270  if ($filtered_argc < 4) {
271    print STDERR "Usage: solr_passes.pl [-verbosity num] core \"text\"|\"index\" build-dir index-name\n";
272    exit 1;
273  }
274
275  my $core          = $filtered_argv[0];
276  my $mode          = $filtered_argv[1];
277  my $full_builddir = $filtered_argv[2];
278  my $indexdir      = $filtered_argv[3];
279
280  # We only need the Solr handle opened if we are indexing the
281  # documents, not if we are just storing the text
282  if ($mode eq "index") {
283    open_java_solr($core, $full_builddir, $indexdir);
284  }
285
286  if ($mode eq "text") {
287      print STDERR "Monitoring for input!\n";
288      my $full_textdir = &util::filename_cat($full_builddir,"text");
289      monitor_xml_stream($full_textdir);
290  }
291  else {
292      print STDERR "Streaming document input onto Solr server!\n";
293      pass_on_xml_stream();
294  }
295
296
297  if ($mode eq "index") {
298    close_java_solr();
299  }
300}
301
302
303&main(@ARGV);
Note: See TracBrowser for help on using the browser.