Index: /gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl
===================================================================
--- /gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl (revision 32087)
+++ /gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl (revision 32088)
@@ -50,4 +50,6 @@
use solrutil;
use solrserver;
+
+my $DOC_BATCH_SIZE = 20; # number of docs streamed to the post pipe before it is closed and reopened to force a commit
# Not quite OO, but close enough for now
@@ -66,6 +68,14 @@
$self->{'solr_server'} = $solr_server;
- # Now start up the solr-post command
- &solrutil::open_post_pipe($core, $solr_server->get_solr_base_url());
+ # Now start up the solr-post command and store the open cmd that is returned
+ $self->{'post_java_cmd'} = &solrutil::open_post_pipe($core, $solr_server->get_solr_base_url());
+}
+
+# To commit any stream of data to solr that's amassed so far on the pipe to SimplePostTool,
+# close the pipe. Then reopen it with the stored command to continue streaming data to it.
+sub flush_and_reopen_java_solr
+{
+ &solrutil::close_post_pipe(); # the commit only happens once the pipe to the tool is closed
+ &solrutil::reopen_post_pipe($self->{'post_java_cmd'}); # NOTE(review): $self is not declared in this sub; presumably a file-scoped variable - confirm
}
@@ -160,13 +170,61 @@
+# Called when mode = index.
+# This is the function that passes the contents of the docs for ingesting into solr
+# as one long stream.
+# If a collection has many docs, one long stream of bytes is
+# sent to the SimplePostTool/(solr-)post.jar, and it remains uncommitted until
+# the pipe is closed. For a long bytestream, this results in an out of heap memory error:
+# https://stackoverflow.com/questions/2082057/outputstream-outofmemoryerror-when-sending-http
+# So we need to close and then reopen the solr post pipe to force a commit after every $DOC_BATCH_SIZE docs.
+# We could still have the same issue if any one document is very large (long stream of bytes),
+# but then we'd need to take care of the problem at the root, see the StackOverflow page
+# and SimplePostTool.java, as the problem lies in HttpURLConnection.
sub pass_on_xml_stream
{
+ # the xml_stream sent to solr looks like (NOTE(review): the XML tags of this example appear to have been stripped from this listing - restore from the repository copy):
+ #
+ #
+ #
+ # ....
+ #
+ #
+ #
+ # ....
+ #
+ #
+ # ...
+ #
+
+ my $doc_count = 0;
+
my $line;
while (defined ($line = )) { # NOTE(review): the readline operand is missing in this listing (presumably <STDIN>) - TODO confirm
+ # monitor for end of each document
+ if ($line =~ m/^\s*<\/add>$/) {
+ $doc_count++; # one more document has been fully streamed since the last flush
+ }
+ else {
+ if ($doc_count == $DOC_BATCH_SIZE) {
+ # evidence that there are still more documents to process
+ # => but reached batch size, so flush and reopen
+ # to force a commit of past docs to solr
+ # before sending this current line on its way
+
+ if ($line =~ m/^\s*$/) { # NOTE(review): as shown this matches a blank line; presumably a start-of-document tag was stripped from the pattern - TODO confirm
+ &solrutil::print_to_post_pipe("\n"); # NOTE(review): as shown only a newline is sent; presumably a closing wrapper tag was stripped - TODO confirm
+ flush_and_reopen_java_solr();
+
+ # start next doc
+ &solrutil::print_to_post_pipe("\n"); # NOTE(review): likewise, presumably an opening wrapper tag was stripped - TODO confirm
+ $doc_count = 0; # begin counting a fresh batch
+ }
+ }
+ }
+
&solrutil::print_to_post_pipe($line); # every input line is forwarded, including the one that triggered the flush
- }
-}
-
-
+
+ }
+}
Index: /gs3-extensions/solr/trunk/src/perllib/solrutil.pm
===================================================================
--- /gs3-extensions/solr/trunk/src/perllib/solrutil.pm (revision 32087)
+++ /gs3-extensions/solr/trunk/src/perllib/solrutil.pm (revision 32088)
@@ -112,6 +112,5 @@
}
-
-sub open_post_pipe
+sub get_post_pipe_cmd
{
my ($core, $solr_base_url) = @_;
@@ -125,15 +124,40 @@
# Now run solr-post command
+ # See https://wiki.apache.org/solr/UpdateXmlMessages
+ # also https://lucene.apache.org/solr/4_2_1/tutorial.html
+ # suffixing commit=true/commitWithin=10000 to solr's /update servlet didn't work, because
+ # when using SimplePostTool, the commit only happens after the pipe to the tool is closed
my $post_props = "-Durl=$solr_base_url/$core/update"; # robustness of protocol is taken care of too
$post_props .= " -Ddata=stdin";
$post_props .= " -Dcommit=yes";
+
+ # increased VM mem from 512 to 1024, but increasing to 2048M didn't help either when too much
+ # data streamed to SimplePostTool before commit. Nothing works short of committing before the
+ # data streamed gets too large. The solution is to close and reopen the pipe to force commits.
+ my $post_java_cmd = "java -Xmx1024M $post_props -jar \"$full_post_jar\"";
- my $post_java_cmd = "java -Xmx512M $post_props -jar \"$full_post_jar\"";
+ ##print STDERR "**** post cmd = $post_java_cmd\n";
- ##print STDERR "**** post cmd = $post_java_cmd\n";
+ return $post_java_cmd;
+}
+
+# Build the solr-post java command for the given core and open a write pipe
+# (PIPEOUT) to it.
+# Params:  $core          - core name, used in the update URL of the command
+#          $solr_base_url - base URL of the solr server, used likewise
+# Returns: the java command string, so the caller can store it and later hand
+#          it to reopen_post_pipe().
+# Dies if the pipe cannot be opened.
+sub open_post_pipe
+{
+ my ($core, $solr_base_url) = @_;
+ my $post_java_cmd = &get_post_pipe_cmd($core, $solr_base_url);
+
+ # Report the OS error ($!) with the command. Writing "!$\n" would interpolate
+ # the $\ variable followed by a literal "n" and lose the error text.
+ open (PIPEOUT, "| $post_java_cmd")
+ || die "Error in solr_passes.pl: Failed to run $post_java_cmd\n$!\n";
+
+ return $post_java_cmd; # return the post_java_cmd so caller can store it and reopen_post_pipe()
+}
+
+# Open the write pipe (PIPEOUT) again using a previously built java command.
+# Params:  $post_java_cmd - the command string, as returned by open_post_pipe()
+# Dies if the pipe cannot be opened. The error message includes the OS error
+# ($!); writing "!$\n" would interpolate the $\ variable instead.
+sub reopen_post_pipe
+{
+ my $post_java_cmd = shift(@_);
open (PIPEOUT, "| $post_java_cmd")
- || die "Error in solr_passes.pl: Failed to run $post_java_cmd\n!$\n";
+ || die "Error in solrutil::reopen_post_pipe: Failed to run $post_java_cmd\n$!\n";
}