source: gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm@33392

Last change on this file since 33392 was 33392, checked in by ak19, 5 years ago

Kathy found a problem when she wanted to run consecutive buildcols, without activate, on a solr collection: she experienced file-locking issues on Windows, which the original solr-related building code would inevitably cause without activate. Dr Bainbridge's solution was to change our way of thinking about what activate and buildcol should now do for solr collections: unload the building- cores for the indexes at the end of buildcol, instead of only doing this during activate.pl. I've tried to be conservative with the changes to the existing code, so that activate still attempts to unload building- cores too, but first pings them (and any other cores it attempts to unload) to ensure they exist. During buildcol, too, the building- cores are pinged to check they exist before we attempt to unload them.
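
In outline, the guarded unload now performed at the end of buildcol looks like the sketch below (a minimal sketch; the method names are those used in this file, while the core name is illustrative):

    # $solr_server is the solrserver instance held in $self->{'solr_server'}.
    # Ping each building- core first and unload only those that respond, so
    # consecutive buildcols without activate no longer trip over missing cores.
    my $corename = "building-localsite-demo-didx"; # illustrative core name
    if ($solr_server->admin_ping_core($corename)) {
        # Unloading releases Solr's file locks (the Windows issue), but
        # deliberately leaves the on-disk index directory intact.
        $solr_server->admin_unload_core_explicitly_retaining_index($corename);
    }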

File size: 27.7 KB
###########################################################################
#
# solrbuilder.pm -- perl wrapper for building index with Solr
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


package solrbuilder;

use strict;
no strict 'refs';

use lucenebuilder;
use solrserver;

sub BEGIN {
    @solrbuilder::ISA = ('lucenebuilder');
}


sub new {
    my $class = shift(@_);
    my $self = new lucenebuilder (@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = "$solr_passes_script";
    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"".&util::get_perl_exec()."\" -S \"$solr_passes_script\"";
    return $self;
}


sub default_buildproc {
    my $self = shift (@_);

    return "solrbuildproc";
}

# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $build_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"");
    &FileUtils::makeAllDirectories($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $site = $self->{'site'};
        my $collect = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $core = $core_prefix; # unused in this call to solr_passes

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Executable: $solr_passes_exe\n";
        print STDERR "Sections: $solr_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $solr_passes_exe $core text \"$build_dir\" \"dummy\" $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core text \"$build_dir\" \"dummy\" $osextra"))
        {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}

#----


sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    if (open(SIN,"<$in_filename")) {

        if (open(SOUT,">$out_filename")) {

            my $line;
            while (defined ($line=<SIN>)) {
                chomp $line;

                my $done_insert = 0;
                foreach my $rule (@$replace_rules) {
                    my $line_re = $rule->{'regexp'};
                    my $insert = $rule->{'insert'};

                    if ($line =~ m/$line_re/) {
                        print SOUT $insert;
                        $done_insert = 1;
                        last;
                    }
                }
                if (!$done_insert) {
                    print SOUT "$line\n";
                }
            }

            close(SOUT);
        }
        else {
            print STDERR "Error: Failed to open $out_filename\n";
            print STDERR "       $!\n";
        }

        close(SIN);
    }
    else {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR "       $!\n";
    }

}
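
# Illustrative usage (the filenames and field shortname here are hypothetical):
# each rule replaces any input line matching 'regexp' with the literal 'insert'
# text; all non-matching lines are copied through unchanged:
#
#   filter_in_out_file("schema.xml.in", "schema.xml",
#       [ { 'regexp' => "^\\s*<!--\\s*##SOME-MARKER##\\s*-->\\s*\$",
#           'insert' => "    <field name=\"TI\" type=\"string\" />\n" } ]);
#
# Passing an empty rules list ([]) makes it a straight copy, as is done for
# solrconfig.xml and friends in premake_solr_auxiliary_files() below.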

# We need to push the list of indexfield to shortname mappings through to the
# build_cfg as, unlike in MGPP, we need these mappings in advance to configure
# Lucene/Solr. Unfortunately the original function found in mgbuilder.pm makes
# a mess of this - it only outputs fields that have been processed (none have)
# and it has a hardcoded renaming for 'text' so it becomes 'TX' according to
# the schema but 'TE' according to the XML sent to lucene_passes.pl/solr_passes.pl.
# This version is dumber - just copy them all across verbatim - but works. We
# do still need to support the special case of 'allfields'
sub make_final_field_list
{
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    # @todo support: $self->{'buildproc'}->{'extraindexfields'}
    foreach my $fields (@{$self->{'collect_cfg'}->{'indexes'}})
    {
        # remove subcoll stuff
        $fields =~ s/:.*$//;
        foreach my $field (split(';', $fields))
        {
            my $shortname = 'ERROR';
            if ($field eq 'allfields')
            {
                $shortname = 'ZZ';
            }
            elsif (defined $self->{'buildproc'}->{'indexfieldmap'}->{$field})
            {
                $shortname = $self->{'buildproc'}->{'indexfieldmap'}->{$field};
            }
            else
            {
                print STDERR 'Error! Couldn\'t find indexfieldmap for field: ' . $field . "\n";
            }
            push (@indexfieldmap, $field . '->' . $shortname);
            push (@indexfields, $field);
        }
    }

    if (scalar @indexfieldmap)
    {
        $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    }

    if (scalar @indexfields)
    {
        $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    }
}
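
# For example (the metadata shortname is illustrative - in practice shortnames
# are assigned by the build processor's indexfieldmap): an 'indexes' entry of
# "text;dc.Title" would produce build_cfg entries like
#
#   indexfieldmap => [ 'text->TX', 'dc.Title->TI' ]
#   indexfields   => [ 'text', 'dc.Title' ]
#
# with 'allfields' always mapping to the fixed shortname 'ZZ'.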

# Generate solr schema.xml file based on indexmapfield and other associated
# config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl


sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        my ($fullfieldname, $field) = ($ifm =~ m/^(.*)->(.*)$/);

        $schema_insert_xml .= " "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";

        if($field eq "CD" || $field eq "CS") {
            # Coordinate and CoordShort meta should not be split, but treated as a whole string
            # for searching. So type=string, not type=text_en_splitting.
            # Can't set type="location", which uses solr.LatLonType, since type=location fields
            # "must not be multivalued" as per conf/schema.xml.in. And we can have multiple
            # Coordinate (and multiple CoordShort) meta for one doc, so multiValued=true.
            # Not certain what to set stored to. As per conf/schema.xml.in, stored=false means
            # "you only need to search on the field but don't need to return the original value",
            # and they advise setting stored="false" for all fields possible (esp. large fields).
            # But stored=false makes the field not visible in Luke, so setting stored=true as for
            # other fields.
            # TermVector: "A term vector is a list of the document's terms and their number of
            # occurrences in that document. Each document has one term vector which is a list."
            # (http://makble.com/what-is-term-vector-in-lucene and the lucene API for Field.TermVector)
            # e.g. docA contains "cat" 5 times, "dog" 10 times. We don't care to treat Coordinate
            # meta as a term: it's not a "term" occurring in the doc, and we don't care how often
            # a Coordinate occurs in a document.
            # Consequently, we don't care about term positions and term offsets for Coordinate meta either.

            $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n";
        }

        elsif($field eq "ML") {
            # mapLabel: same attributes as for coord meta CD and CS above.
            # mapLabel is also like facets, with type="string" so it does not get tokenized, and
            # multiValued="true" to allow each shape's label to be stored distinctly
            $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n";
        }

        else {
            if($field eq "LT" || $field eq "LO") # full Latitude and Longitude coordinate meta, not the short variants (LatShort/LA and LongShort/LN)
            {
                # Latitude and Longitude are being phased out in favour of using Coord meta.
                # However, if ever returning to using Lat and Lng instead of Coord meta, note that
                # the way the Lat Lng meta is currently written out for type="location" is in the
                # wrong format: Lat and Lng shouldn't get written out separately but as "Lat,Lng".
                # It gets written out in solrbuildproc.pm, I think, so that would be where it
                # needs to be corrected.
                # For more info on type=location for our solr 4.7.2 or thereabouts, see
                # https://web.archive.org/web/20160312154250/https://wiki.apache.org/solr/SpatialSearchDev
                # which states:
                #   When indexing, the format is something like:
                #   <field name="store_lat_lon">12.34,-123.45</field>
                #
                $schema_insert_xml .= "type=\"location\" ";
            }

            # elsif ($field ne "ZZ" && $field ne "TX")
            # {
            #     $schema_insert_xml .= "type=\"string\" ";
            # }
            else
            {
                #$schema_insert_xml .= "type=\"text_en_splitting\" ";

                # original default solr field type for all fields is text_en_splitting
                my $solrfieldtype = "text_en_splitting";
                if(defined $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}) {
                    $solrfieldtype = $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'};
                    #print STDERR "@@@@#### found TYPE: $solrfieldtype\n";
                }
                $schema_insert_xml .= "type=\"$solrfieldtype\" ";

            }
            # set termVectors="true" when term vectors info is required,
            # see TermsResponse termResponse = solrResponse.getTermsResponse();
            $schema_insert_xml .= "indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"true\" termPositions=\"true\" termOffsets=\"true\" />\n";
        }
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
##  my $in_dirname = &FileUtils::filenameConcatenate($solr_home,"etc","conf");
    my $in_dirname = &FileUtils::filenameConcatenate($solr_home,"conf");
    my $schema_in_filename = &FileUtils::filenameConcatenate($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &FileUtils::filenameConcatenate($collect_home,"etc","conf");
    my $schema_out_filename = &FileUtils::filenameConcatenate($out_dirname,"schema.xml");

    # make sure output conf directory exists
    if (!&FileUtils::directoryExists($out_dirname)) {
        &FileUtils::makeDirectory($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt", "currency.xml", "elevate.xml" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &FileUtils::filenameConcatenate($in_dirname,$file.".in");
        my $out_filename = &FileUtils::filenameConcatenate($out_dirname,$file);

        if(&FileUtils::fileExists($in_filename)) {
            filter_in_out_file($in_filename,$out_filename,[]);
        }
    }

    my @in_dir_list = ( "lang" );
    foreach my $dir ( @in_dir_list ) {

        my $full_subdir_name = &FileUtils::filenameConcatenate($in_dirname,$dir);

        if(&FileUtils::directoryExists($full_subdir_name)) {
            &FileUtils::copyFilesRecursiveNoSVN($full_subdir_name, $out_dirname);
        }
    }
}
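
# As an illustration (the shortname is hypothetical): for a mapping
# 'dc.Title->TI' with no custom 'solrfieldtype' configured, the
# ##GREENSTONE-FIELDS## marker in schema.xml.in gets replaced by a line like:
#
#   <field name="TI" type="text_en_splitting" indexed="true" stored="true"
#          multiValued="true" termVectors="true" termPositions="true"
#          termOffsets="true" />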


sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is "reading and listening"

    my $solr_server = new solrserver($self->{'build_dir'});
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $all_metadata_specified = 0; # has the user added a 'metadata' index?
    my $allfields_index = 0;        # do we have an allfields index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @all_fields = ();

    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {

            # get the parameters for the output
            # split on : just in case there is subcoll and lang stuff
            my ($fields) = split (/:/, $index);

            foreach my $field (split (/;/, $fields)) {
                if ($field eq "metadata") {
                    $all_metadata_specified = 1;
                }
                else {
                    push(@all_fields,$field);
                }
            }
        }
    }

    if ($all_metadata_specified) {

        # (Unfortunately) we need to process all the documents in the collection
        # to figure out what the metadata_field_mapping is

        # set up the document processor
        $self->{'buildproc'}->set_output_handle (undef);
        $self->{'buildproc'}->set_mode ('index_field_mapping');
        $self->{'buildproc'}->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});

    }

    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $buildproc = $self->{'buildproc'};

        foreach my $field (@all_fields)
        {
            if (!defined $buildproc->{'indexfieldmap'}->{$field})
            {
                my $shortname = $buildproc->get_or_create_shortname($field);
                $buildproc->{'indexfieldmap'}->{$field} = $shortname;
                $buildproc->{'indexfieldmap'}->{$shortname} = 1;
            }
        }
    }

    # Write out solr 'schema.xml' (and related) file
    #
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores <colname>-Doc and <colname>-Sec

    my $site = $self->{'site'};
    my $collect = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collect" : $collect;

    # my $idx = $self->{'index_mapping'}->{$index};
    my $idx = "idx";

    my $build_dir = $self->{'build_dir'};

    foreach my $level (keys %{$self->{'levels'}}) {

        my ($pindex) = $level =~ /^(.)/;

        my $index_dir = $pindex.$idx;
        my $core = "$core_prefix-$index_dir";

        # force_removeold == opposite of being run in 'incremental' mode
        my $force_removeold = ($self->{'incremental'}) ? 0 : 1;

        if ($force_removeold) {
            print $outhandle "\n-removeold set (new index will be created)\n";

            # create cores under temporary core names, corresponding to building directory
            $core = "building-".$core;

            my $full_index_dir = &FileUtils::filenameConcatenate($build_dir,$index_dir);
            &FileUtils::removeFilesRecursive($full_index_dir);
            &FileUtils::makeDirectory($full_index_dir);

            my $full_tlog_dir = &FileUtils::filenameConcatenate($full_index_dir, "tlog");
            &FileUtils::makeDirectory($full_tlog_dir);

            # Solr then wants an "index" folder within this general index area!
#           my $full_index_index_dir = &FileUtils::filenameConcatenate($full_index_dir,"index");
#           &FileUtils::makeDirectory($full_index_index_dir);


            # now go on and create new index
            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);

        }
        else {
            # if collect==core is already in solr.xml (check with STATUS)
            # => use RELOAD* call to refresh fields now expressed in schema.xml
            #
            # else
            # => use CREATE API to add to solr.xml
            #
            # No longer calling RELOAD, because Georgy documented a memory leak with it (svn r32178).
            # Using unload + create to get the same effect as RELOAD without its side-effects.
            #

            my $check_core_exists = $solr_server->admin_ping_core($core);

            if ($check_core_exists) {
                print $outhandle "Unloading Solr core: $core\n";
                $solr_server->admin_unload_core($core);
            }

            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);

        }
    }

}
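
# Core naming example (site and collection names illustrative): for site
# "localsite" and collection "demo", the 'document' and 'section' levels map
# to index dirs "didx" and "sidx", giving cores "localsite-demo-didx" and
# "localsite-demo-sidx" - or "building-localsite-demo-didx" and
# "building-localsite-demo-sidx" when -removeold forces a full rebuild into
# the building directory.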

# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance

sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names and possibly the doc name for solr_passes
    my $solr_passes_sections = $llevel;

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach $subcollection (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put language expressions for the
    # ones we want in the index
    my @languages = ();
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    @languages = split /,/, $language if (defined $language);
    foreach my $language (@languages) {
        my $not=0;
        if ($language =~ s/^\!//) {
            $not = 1;
        }
        if($not) {
            push (@$langarr, "!$language");
        } else {
            push (@$langarr, "$language");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (solr_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $site = $self->{'site'};
        my $collect = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $ds_idx = $self->{'index_mapping'}->{$index};
        my $core = "$core_prefix-$ds_idx";

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Cmd: $solr_passes_exe $core index \"$build_dir\" \"$indexdir\" $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core index \"$build_dir\" \"$indexdir\" $osextra")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; # always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    if (defined $self->{'collect_cfg'}->{'sortfields'}) {
        $self->{'buildproc'}->set_sortfields ($self->{'collect_cfg'}->{'sortfields'});
    }
    if (defined $self->{'collect_cfg'}->{'facetfields'}) {
        $self->{'buildproc'}->set_facetfields ($self->{'collect_cfg'}->{'facetfields'});
    }
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

    print STDERR "@@@@@ FINISHED PROCESSING INDEX: indexlevel $self->{'index_mapping'}->{$index}\n\n";
}

sub post_build_indexes {
    my $self = shift(@_);

    # deliberately override to prevent the mgpp post_build_index() calling
    #    $self->make_final_field_list()
    # as this has been done in our pre_build_indexes() phase for solr

    my $solr_server = $self->{'solr_server'};

    # 1 Aug 2019: we now unload (remove) the building- cores for each index during buildcol
    # itself, instead of during activate.
    # Kathy described a problem whereby, when calling buildcol.pl successively without following
    # each call with activate, there were Windows file-locking issues when attempting to manually
    # remove the building folder. This was what activate was meant to solve; however, there's no
    # reason to call activate after buildcol in cases where it is known the buildcol failed in
    # some way. (In such cases, the user building the collection would have to manually unload
    # the building- cores through the solr servlet interface.)
    # Dr Bainbridge instructed that the building- cores should be unloaded again at the end
    # of buildcol, along with any symmetrical step during pre-building, if any is found necessary.
    # I'm still not sure this won't break activate in some way, for some combination,
    # as that is meant to ensure building- cores exist whenever the building folder exists...
    # But I was asked not to take too long on this, so I can't test all the different combinations
    # (removeold/incremental/..., or even the remote GS situation) in which building can happen
    # and in which buildcol can be combined or not with activate, or be sequenced with further
    # calls to buildcol, with or without -activate.
    # To compensate, I've tried to keep the code changes as conservative as possible, to keep the
    # chances of things going wrong to a minimum, by pinging for building-* cores before unloading
    # them here in solrbuilder.pm (note that unload doesn't delete the index directory associated
    # with the core). Then, in activate.pl, the building-* cores get pinged again to determine
    # whether they exist before attempting to unload them there as well, since we can no longer
    # assume the cores exist and can be unloaded. There is now the additional overhead of all the
    # extra pinging going on, but it helps ensure we only unload building-* cores when they exist.

    # Note that pre_build_indexes() was already creating the building- cores, so we don't need to
    # worry about the needed symmetry of creating building- cores at the start of buildcol to
    # match unloading them here at the end.

    my $site = $self->{'site'};
    my $collect = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
    my $build_dir = $self->{'build_dir'};
    my $idx = "idx";

    foreach my $level (keys %{$self->{'levels'}}) {

        my ($pindex) = $level =~ /^(.)/;

        my $index_dir = $pindex.$idx;
        my $corename = "building-$core_prefix-$index_dir";

        # unload the core if it exists. Since I can't think of exactly which cases
        # the building- cores will and won't exist in (e.g. removeold), I'll just always
        # first check if the building- core exists and then unload it.
        if ($solr_server->admin_ping_core($corename)) {
            print STDERR "@@@@ solrbuilder::post_build_indexes(): Now unloading this index's building core: $corename\n\n";
            $solr_server->admin_unload_core_explicitly_retaining_index($corename);
        }

    }

    # Also need to stop the Solr server (be it tomcat or jetty) if it was explicitly started
    # in pre_build_indexes()

    if ($solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}

sub build_cfg_extra {
    my $self = shift (@_);
    my ($build_cfg) = @_;

    $self->lucenebuilder::build_cfg_extra($build_cfg);

    # need to add in facet stuff
    my @facetfields = ();
    my @facetfieldmap = ();

    foreach my $sf (@{$self->{'buildproc'}->{'facetfields'}}) {
        if ($self->{'buildproc'}->{'actualfacetfields'}->{$sf}) {
            my $shortname = $self->{'buildproc'}->{'facetfieldnamemap'}->{$sf};
            push (@facetfields, $shortname);
            push (@facetfieldmap, "$sf->$shortname");
        }

    }
    $build_cfg->{'indexfacetfields'} = \@facetfields;
    $build_cfg->{'indexfacetfieldmap'} = \@facetfieldmap;
}

1;