Changeset 32088 for gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl
- Timestamp:
- 2017-12-08T17:58:07+13:00 (6 years ago)
- File:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl
use solrutil;
use solrserver;

# Number of <add>...</add> document blocks to stream to SimplePostTool
# before forcing a commit (by closing and then reopening the post pipe).
my $DOC_BATCH_SIZE = 20;

# Not quite OO, but close enough for now
# NOTE(review): $self below is assumed to be a file-scoped lexical hash ref
# declared earlier in this file (outside this excerpt) -- confirm.

# [elided context -- inside the solr-server start-up code, cut off in this view:]
#     $self->{'solr_server'} = $solr_server;
#     # Now start up the solr-post command and store the open cmd that is returned
#     $self->{'post_java_cmd'} = &solrutil::open_post_pipe($core, $solr_server->get_solr_base_url());
# }

# To commit any stream of data to solr that's amassed so far on the pipe to
# SimplePostTool, close the pipe. Then reopen it (using the open command
# remembered from open_post_pipe) to continue streaming data to it.
sub flush_and_reopen_java_solr
{
    &solrutil::close_post_pipe();
    &solrutil::reopen_post_pipe($self->{'post_java_cmd'});
}

# Called when mode = index.
# This is the function that passes the contents of the docs for ingesting into
# solr as one long stream.
# If we have many docs in a collection, there will be one long stream of bytes
# sent to the SimplePostTool/(solr-)post.jar, which remain uncommitted until
# the pipe is closed. For a long bytestream this results in an out-of-heap-memory
# error, see
# https://stackoverflow.com/questions/2082057/outputstream-outofmemoryerror-when-sending-http
# So we need to close and then reopen the solr post pipe to force a commit
# after every $DOC_BATCH_SIZE documents.
# We could still have the same issue if any one document is very large (a long
# stream of bytes), but then we'd need to take care of the problem at the root,
# see the StackOverflow page and SimplePostTool.java, as the problem lies in
# HttpURLConnection.
sub pass_on_xml_stream
{
    # The xml_stream sent to solr looks like:
    #   <update>
    #     <add>
    #       <doc>
    #       ....
    #       </doc>
    #     </add>
    #     <add>
    #       <doc>
    #       ....
    #       </doc>
    #     </add>
    #     ...
    #   </update>

    my $doc_count = 0;

    my $line;
    while (defined ($line = <STDIN>)) {
	# monitor for end of each document
	if ($line =~ m/^\s*<\/add>$/) {
	    $doc_count++;
	}
	elsif ($doc_count == $DOC_BATCH_SIZE) {
	    # There are still more documents to process, but we've reached the
	    # batch size => flush and reopen to force a commit of past docs to
	    # solr before sending this current line on its way.
	    if ($line =~ m/^\s*<add>$/) {
		&solrutil::print_to_post_pipe("</update>\n");
		flush_and_reopen_java_solr();

		# start next batch
		&solrutil::print_to_post_pipe("<update>\n");
		$doc_count = 0;
	    }
	}

	&solrutil::print_to_post_pipe($line);
    }
}
Note:
See TracChangeset
for help on using the changeset viewer.