Changeset 32088 for gs3-extensions

Show
Ignore:
Timestamp:
08.12.2017 17:58:07 (17 months ago)
Author:
ak19
Message:

Martin (mwilliman email id) on the mailing list found that solr got SIGPIPE errors when he built his 3020 doc solr collection. The problem occurred when the docs were sent in a single stream for solr ingestion using the SimplePostTool (post.jar/solr-post.jar). The problem is that the data stream becomes too large, since SimplePostTool doesn't cause a commit until after the pipe to it is closed. Initially other methods were attempted: increasing the Java VM mem size from 512 to 2048, which only helped process a certain additional number of docs before resulting in a SIGPIPE again. We tried changing the solr update url to have ?commit=true and ?commitWithin=15000 (ms) suffixed to it, but as the commit isn't done until after the pipe to SimplePostTool is closed, the url change had no effect with SimplePostTool. Though we retained an increase to 1024 of the Java VM when launching SimplePostTool, the actual present solution was to close and reopen the pipe to the post tool jar file executable after every x number of docs. Currently this batch size is set to 20. However, if any file is gigantic, we could get to see this problem again: it has to do with the overall size of the data stream rather than the number of docs. The actual problem lies in the HttpURLConnection that SimplePostTool opens, rather than how often we open/close the pipe to the post tool. This commit contains 3 changes: 1. changed Java VM memory to 1024 when launching SimplePostTool (solr-post.jar); 2. code changes to solrutil.pm and solr_passes.pl to close and reopen the pipe to flush the data after every 20 docs to force a commit to solr; 3. the existing code changes work with the old solr-post.jar (version 1.3) but committing version 1.5 since it has a larger buffer and is found to be better by Dr Bainbridge. The new, v1.5 solr-post.jar is from solr-4.7.2's example/examples/post.jar, renamed to the usual solr-post.jar.

Location:
gs3-extensions/solr/trunk/src
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl

    r31490 r32088  
    5050use solrutil; 
    5151use solrserver; 
     52 
     53my $DOC_BATCH_SIZE = 20; 
    5254 
    5355# Not quite OO, but close enough for now 
     
    6668  $self->{'solr_server'} = $solr_server; 
    6769 
    68   # Now start up the solr-post command 
    69   &solrutil::open_post_pipe($core, $solr_server->get_solr_base_url()); 
     70  # Now start up the solr-post command and store the open cmd that is returned 
     71  $self->{'post_java_cmd'} = &solrutil::open_post_pipe($core, $solr_server->get_solr_base_url()); 
     72} 
     73 
     74# To commit any stream of data to solr that's amassed so far on the pipe to SimplePostTool, 
     75# close the pipe. Then reopen it to continue streaming data to it. 
     76sub flush_and_reopen_java_solr 
     77{ 
     78    &solrutil::close_post_pipe(); 
     79    &solrutil::reopen_post_pipe($self->{'post_java_cmd'}); 
    7080} 
    7181 
     
    160170 
    161171 
     172# Called when mode = index, 
     173# This is the function that passes the contents of the docs for ingesting into solr 
     174# as one long stream. 
     175# Since if we have many docs in a collection, there will be one long stream of bytes 
     176# sent to the SimplePostTool/(solr-)post.jar, which remain uncommitted until 
     177# the pipe is closed. And for a long bytestream, this will result in an out of heap memory 
     178# https://stackoverflow.com/questions/2082057/outputstream-outofmemoryerror-when-sending-http 
     179# So we need to close and then reopen solr post pipe to force commit after some $DOC_BATCH_SIZE 
     180# We could still have the same issue if any one document is very large (long stream of bytes), 
     181# but then we'd need to take care of the problem at the root, see the StackOverFlow page 
     182# and SimplePostTool.java, as the problem lies in HttpURLConnection. 
    162183sub pass_on_xml_stream 
    163184{ 
     185    # the xml_stream sent to solr looks like: 
     186    # <update> 
     187    #   <add> 
     188    #     <doc> 
     189    #       .... 
     190    #     </doc> 
     191    #   </add> 
     192    #     <doc> 
     193    #       .... 
     194    #     </doc> 
     195    #   </add> 
     196    #   ... 
     197    # </update>     
     198     
     199    my $doc_count = 0; 
     200     
    164201    my $line; 
    165202    while (defined ($line = <STDIN>)) { 
     203    # monitor for end of each document       
     204    if ($line =~ m/^\s*<\/add>$/) { 
     205        $doc_count++; 
     206    } 
     207    else { 
     208        if ($doc_count == $DOC_BATCH_SIZE) { 
     209        # evidence that there is still more documents to process 
     210        # => but reached batch size, so flush and reopen 
     211        # to force a commit of past docs to solr 
     212        # before sending this current line on its way 
     213         
     214        if ($line =~ m/^\s*<add>$/) { 
     215            &solrutil::print_to_post_pipe("</update>\n"); 
     216            flush_and_reopen_java_solr(); 
     217         
     218            # start next doc 
     219            &solrutil::print_to_post_pipe("<update>\n"); 
     220            $doc_count = 0; 
     221        } 
     222        } 
     223    } 
     224             
    166225    &solrutil::print_to_post_pipe($line); 
    167     } 
    168 } 
    169  
    170  
     226 
     227    } 
     228} 
    171229 
    172230 
  • gs3-extensions/solr/trunk/src/perllib/solrutil.pm

    r31490 r32088  
    112112} 
    113113 
    114  
    115 sub open_post_pipe 
     114sub get_post_pipe_cmd 
    116115{ 
    117116    my ($core, $solr_base_url) = @_; 
     
    125124     
    126125    # Now run solr-post command 
     126    # See https://wiki.apache.org/solr/UpdateXmlMessages 
     127    # also https://lucene.apache.org/solr/4_2_1/tutorial.html 
     128        # suffixing commit=true/commitWithin=10000 to solr's /update servlet didn't work, because 
     129        # when using SimplePostTool, the commit only happens after the pipe to the tool is closed 
    127130    my $post_props = "-Durl=$solr_base_url/$core/update"; # robustness of protocol is taken care of too 
    128131 
    129132    $post_props .= " -Ddata=stdin"; 
    130133    $post_props .= " -Dcommit=yes"; 
     134 
     135    # increased VM mem from 512 to 1024, but increasing to 2048M didn't help either when too much 
     136    # data streamed to SimplePostTool before commit. Nothing works short of committing before the 
     137    # data streamed gets too large. The solution is to close and reopen the pipe to force commits. 
     138    my $post_java_cmd = "java -Xmx1024M $post_props -jar \"$full_post_jar\""; 
    131139     
    132     my $post_java_cmd = "java -Xmx512M $post_props -jar \"$full_post_jar\""; 
     140       ##print STDERR "**** post cmd = $post_java_cmd\n"; 
    133141     
    134     ##print STDERR "**** post cmd = $post_java_cmd\n"; 
     142    return $post_java_cmd; 
     143} 
     144 
     145sub open_post_pipe 
     146{ 
     147    my ($core, $solr_base_url) = @_; 
     148    my $post_java_cmd = &get_post_pipe_cmd($core, $solr_base_url); 
     149 
     150    open (PIPEOUT, "| $post_java_cmd")  
     151    || die "Error in solr_passes.pl: Failed to run $post_java_cmd\n!$\n"; 
     152 
     153    return $post_java_cmd; # return the post_java_cmd so caller can store it and reopen_post_pipe() 
     154} 
     155 
     156sub reopen_post_pipe 
     157{ 
     158    my $post_java_cmd = shift(@_); 
    135159     
    136160    open (PIPEOUT, "| $post_java_cmd")  
    137     || die "Error in solr_passes.pl: Failed to run $post_java_cmd\n!$\n"; 
     161    || die "Error in solrutil::reopen_post_pipe: Failed to run $post_java_cmd\n!$\n"; 
    138162     
    139163}