Changeset 32088
- Timestamp:
- 2017-12-08T17:58:07+13:00 (5 years ago)
- Location:
- gs3-extensions/solr/trunk/src
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl
# solr_passes.pl as of r32088 (previous revision: r31490).
# Only the excerpts shown by the changeset viewer are reproduced here;
# regions the viewer elided are marked with "…".
use solrutil;
use solrserver;

# How many documents are streamed to solr before a commit is forced.
my $DOC_BATCH_SIZE = 20;

# Not quite OO, but close enough for now
# … (lines not shown in the changeset view) …

    # NOTE(review): tail of a sub whose opening lines fall inside the elided
    # region above.
    $self->{'solr_server'} = $solr_server;

    # Start the solr-post command and remember the command line that
    # open_post_pipe() returns, so the pipe can be reopened later on.
    $self->{'post_java_cmd'} = &solrutil::open_post_pipe($core, $solr_server->get_solr_base_url());
}

# Closing the pipe to SimplePostTool is what commits the data streamed to
# solr so far. Reopening it straight afterwards, with the command line that
# was stored when the pipe was first opened, lets streaming carry on.
sub flush_and_reopen_java_solr
{
    my $stored_post_cmd = $self->{'post_java_cmd'};

    &solrutil::close_post_pipe();
    &solrutil::reopen_post_pipe($stored_post_cmd);
}

# … (lines not shown in the changeset view) …


# Called when mode = index.
# Relays the document contents arriving on STDIN to the solr post pipe for
# ingesting into solr.
#
# Data sent to SimplePostTool/(solr-)post.jar stays uncommitted until the
# pipe is closed, and for a collection with many documents one uninterrupted
# byte stream ends in an out-of-heap-memory error:
# https://stackoverflow.com/questions/2082057/outputstream-outofmemoryerror-when-sending-http
# The pipe is therefore closed and reopened after every $DOC_BATCH_SIZE
# documents to force a commit. A single very large document could still hit
# the same problem; that would have to be solved at the root, in
# SimplePostTool.java, since the problem lies in HttpURLConnection (see the
# StackOverflow page).
#
# The xml stream sent to solr looks like:
#   <update>
#     <add>
#       <doc>
#         ....
#       </doc>
#     </add>
#     <add>
#       <doc>
#         ....
#       </doc>
#     </add>
#     ...
#   </update>
sub pass_on_xml_stream
{
    my $docs_since_flush = 0;

    while (defined (my $xml_line = <STDIN>)) {
        if ($xml_line =~ m/^\s*<\/add>$/) {
            # a closing </add> marks the end of one more document
            $docs_since_flush++;
        }
        elsif ($docs_since_flush == $DOC_BATCH_SIZE && $xml_line =~ m/^\s*<add>$/) {
            # A full batch has gone out and another document is starting:
            # close off the current <update>, flush and reopen the pipe to
            # force a commit of the documents sent so far, then begin a
            # fresh <update> before the current line is sent on its way.
            &solrutil::print_to_post_pipe("</update>\n");
            flush_and_reopen_java_solr();
            &solrutil::print_to_post_pipe("<update>\n");
            $docs_since_flush = 0;
        }

        &solrutil::print_to_post_pipe($xml_line);
    }
}
gs3-extensions/solr/trunk/src/perllib/solrutil.pm
# solrutil.pm as of r32088 (previous revision: r31490).
# Only the excerpts shown by the changeset viewer are reproduced here;
# regions the viewer elided are marked with "…".
}   # closes a sub that begins before this excerpt

# Builds the java command line that runs the solr post jar against the
# /update servlet of the given core, reading its data from stdin.
#   $core          - name of the solr core to post to
#   $solr_base_url - base URL of the solr server
# Returns the command line as a string; nothing is executed here.
sub get_post_pipe_cmd
{
    my ($core, $solr_base_url) = @_;
    # … (lines not shown in the changeset view) …

    # Now run solr-post command
    # See https://wiki.apache.org/solr/UpdateXmlMessages
    # also https://lucene.apache.org/solr/4_2_1/tutorial.html
    # suffixing commit=true/commitWithin=10000 to solr's /update servlet didn't work, because
    # when using SimplePostTool, the commit only happens after the pipe to the tool is closed
    my $post_props = "-Durl=$solr_base_url/$core/update"; # robustness of protocol is taken care of too

    $post_props .= " -Ddata=stdin";
    $post_props .= " -Dcommit=yes";

    # increased VM mem from 512 to 1024, but increasing to 2048M didn't help either when too much
    # data streamed to SimplePostTool before commit. Nothing works short of committing before the
    # data streamed gets too large. The solution is to close and reopen the pipe to force commits.
    my $post_java_cmd = "java -Xmx1024M $post_props -jar \"$full_post_jar\"";

    ##print STDERR "**** post cmd = $post_java_cmd\n";

    return $post_java_cmd;
}

# Opens the PIPEOUT pipe to the solr post command for the given core.
#   $core          - name of the solr core to post to
#   $solr_base_url - base URL of the solr server
# Returns the command line that was run, so the caller can store it and
# hand it to reopen_post_pipe() later. Dies if the command cannot be started.
sub open_post_pipe
{
    my ($core, $solr_base_url) = @_;
    my $post_java_cmd = &get_post_pipe_cmd($core, $solr_base_url);

    # The message ends in "$!" (the OS error). The earlier "!$\n" spelling
    # interpolated $\ followed by a literal "n", so the reason was never shown.
    open (PIPEOUT, "| $post_java_cmd")
        || die "Error in solr_passes.pl: Failed to run $post_java_cmd\n$!\n";

    return $post_java_cmd; # return the post_java_cmd so caller can store it and reopen_post_pipe()
}

# Re-opens the PIPEOUT pipe using a command line previously returned by
# open_post_pipe().
#   $post_java_cmd - the command line to run
# Dies if the command cannot be started.
sub reopen_post_pipe
{
    my $post_java_cmd = shift(@_);

    # "$!" reports the OS error; see the note in open_post_pipe().
    open (PIPEOUT, "| $post_java_cmd")
        || die "Error in solrutil::reopen_post_pipe: Failed to run $post_java_cmd\n$!\n";

}
Note:
See TracChangeset
for help on using the changeset viewer.