Ignore:
Timestamp:
2017-12-08T17:58:07+13:00 (6 years ago)
Author:
ak19
Message:

Martin (mwilliman email id) on the mailing list found that solr got SIGPIPE errors when he built his 3020 doc solr collection. The problem occurred when the docs were sent in a single stream for solr ingestion using the SimplePostTool (post.jar/solr-post.jar). The problem is that the data stream becomes too large, since SimplePostTool doesn't cause a commit until after the pipe to it is closed. Initially other methods were attempted: increasing the Java VM mem size from 512 to 2048, which only helped process a certain additional number of docs before resulting in a SIGPIPE again. We tried changing the solr update url to have ?commit=true and ?commitWithin=15000 (ms) suffixed to it, but as the commit isn't done until after the pipe to SimplePostTool is closed, the url change had no effect with SimplePostTool. Though we retained an increase to 1024 of the Java VM when launching SimplePostTool, the actual present solution was to close and reopen the pipe to the post tool jar file executable after every x number of docs. Currently this batch size is set to 20. However, if any file is gigantic, we could get to see this problem again: it has to do with the overall size of the data stream rather than number of docs. The actual problem lies in HttpURLConnection that SimplePostTool opens, rather than how often we open/close the pipe to the post tool. This commit contains 3 changes: 1. changed Java VM memory to 1024 when launching SimplePostTool (solr-post.jar); 2. code changes to solrutil.pm and solr_passes.pl to close and reopen the pipe to flush the data after every 20 docs to force a commit to solr; 3. the existing code changes work with the old solr-post.jar (version 1.3) but committing version 1.5 since it has a larger buffer and is found to be better by Dr Bainbridge. The new, v1.5 solr-post.jar is from solr-4.7.2's example/examples/post.jar, renamed to the usual solr-post.jar.

Location:
gs3-extensions/solr/trunk/src
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl

    r31490 r32088  
    5050use solrutil;
    5151use solrserver;
     52
     53my $DOC_BATCH_SIZE = 20;
    5254
    5355# Not quite OO, but close enough for now
     
    6668  $self->{'solr_server'} = $solr_server;
    6769
    68   # Now start up the solr-post command
    69   &solrutil::open_post_pipe($core, $solr_server->get_solr_base_url());
     70  # Now start up the solr-post command and store the open cmd that is returned
     71  $self->{'post_java_cmd'} = &solrutil::open_post_pipe($core, $solr_server->get_solr_base_url());
     72}
     73
     74# To commit any stream of data to solr that's amassed so far on the pipe to SimplePostTool,
     75# close the pipe. Then reopen it to continue streaming data to it.
     76sub flush_and_reopen_java_solr
     77{
     78    &solrutil::close_post_pipe();
     79    &solrutil::reopen_post_pipe($self->{'post_java_cmd'});
    7080}
    7181
     
    160170
    161171
     172# Called when mode = index,
     173# This is the function that passes the contents of the docs for ingesting into solr
     174# as one long stream.
     175# Since if we have many docs in a collection, there will be one long stream of bytes
     176# sent to the SimplePostTool/(solr-)post.jar, which remain uncommitted until
     177# the pipe is closed. And for a long bytestream, this will result in an out of heap memory
     178# https://stackoverflow.com/questions/2082057/outputstream-outofmemoryerror-when-sending-http
     179# So we need to close and then reopen solr post pipe to force commit after some $DOC_BATCH_SIZE
     180# We could still have the same issue if any one document is very large (long stream of bytes),
     181# but then we'd need to take care of the problem at the root, see the StackOverFlow page
     182# and SimplePostTool.java, as the problem lies in HttpURLConnection.
    162183sub pass_on_xml_stream
    163184{
     185    # the xml_stream sent to solr looks like:
     186    # <update>
     187    #   <add>
     188    #     <doc>
     189    #       ....
     190    #     </doc>
     191    #   </add>
     192    #     <doc>
     193    #       ....
     194    #     </doc>
     195    #   </add>
     196    #   ...
     197    # </update>   
     198   
     199    my $doc_count = 0;
     200   
    164201    my $line;
    165202    while (defined ($line = <STDIN>)) {
     203    # monitor for end of each document     
     204    if ($line =~ m/^\s*<\/add>$/) {
     205        $doc_count++;
     206    }
     207    else {
     208        if ($doc_count == $DOC_BATCH_SIZE) {
     209        # evidence that there is still more documents to process
     210        # => but reached batch size, so flush and reopen
     211        # to force a commit of past docs to solr
     212        # before sending this current line on its way
     213       
     214        if ($line =~ m/^\s*<add>$/) {
     215            &solrutil::print_to_post_pipe("</update>\n");
     216            flush_and_reopen_java_solr();
     217       
     218            # start next doc
     219            &solrutil::print_to_post_pipe("<update>\n");
     220            $doc_count = 0;
     221        }
     222        }
     223    }
     224           
    166225    &solrutil::print_to_post_pipe($line);
    167     }
    168 }
    169 
    170 
     226
     227    }
     228}
    171229
    172230
  • gs3-extensions/solr/trunk/src/perllib/solrutil.pm

    r31490 r32088  
    112112}
    113113
    114 
    115 sub open_post_pipe
     114sub get_post_pipe_cmd
    116115{
    117116    my ($core, $solr_base_url) = @_;
     
    125124   
    126125    # Now run solr-post command
     126    # See https://wiki.apache.org/solr/UpdateXmlMessages
     127    # also https://lucene.apache.org/solr/4_2_1/tutorial.html
     128        # suffixing commit=true/commitWithin=10000 to solr's /update servlet didn't work, because
     129        # when using SimplePostTool, the commit only happens after the pipe to the tool is closed
    127130    my $post_props = "-Durl=$solr_base_url/$core/update"; # robustness of protocol is taken care of too
    128131
    129132    $post_props .= " -Ddata=stdin";
    130133    $post_props .= " -Dcommit=yes";
     134
     135    # increased VM mem from 512 to 1024, but increasing to 2048M didn't help either when too much
     136    # data streamed to SimplePostTool before commit. Nothing works short of committing before the
     137    # data streamed gets too large. The solution is to close and reopen the pipe to force commits.
     138    my $post_java_cmd = "java -Xmx1024M $post_props -jar \"$full_post_jar\"";
    131139   
    132     my $post_java_cmd = "java -Xmx512M $post_props -jar \"$full_post_jar\"";
     140       ##print STDERR "**** post cmd = $post_java_cmd\n";
    133141   
    134     ##print STDERR "**** post cmd = $post_java_cmd\n";
     142    return $post_java_cmd;
     143}
     144
     145sub open_post_pipe
     146{
     147    my ($core, $solr_base_url) = @_;
     148    my $post_java_cmd = &get_post_pipe_cmd($core, $solr_base_url);
     149
     150    open (PIPEOUT, "| $post_java_cmd")
     151    || die "Error in solr_passes.pl: Failed to run $post_java_cmd\n!$\n";
     152
     153    return $post_java_cmd; # return the post_java_cmd so caller can store it and reopen_post_pipe()
     154}
     155
     156sub reopen_post_pipe
     157{
     158    my $post_java_cmd = shift(@_);
    135159   
    136160    open (PIPEOUT, "| $post_java_cmd")
    137     || die "Error in solr_passes.pl: Failed to run $post_java_cmd\n!$\n";
     161    || die "Error in solrutil::reopen_post_pipe: Failed to run $post_java_cmd\n!$\n";
    138162   
    139163}
Note: See TracChangeset for help on using the changeset viewer.