source: gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm

Last change on this file was 37787, checked in by kjdon, 10 months ago

code updated to support subcollections and language partitions. we don't assume just idx anymore; we use the index defs, which include subcollection info. store the core names in buildcfg, for use later.

File size: 28.2 KB
Line 
1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
33use solrserver;
34
# Establish inheritance at compile time: solrbuilder reuses most of the
# Lucene build machinery, overriding only the Solr-specific parts.
sub BEGIN {
    @solrbuilder::ISA = ('lucenebuilder');
}
38
39
# Constructor.  Builds on lucenebuilder's object, tags the build type as
# "solr" and records how to invoke the solr_passes.pl helper script.
sub new {
    my $class = shift(@_);
    # Direct method call rather than indirect object syntax ("new lucenebuilder"),
    # which is deprecated and ambiguous to the parser.
    my $self = lucenebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = $solr_passes_script;
    # Tack perl on the beginning to ensure execution regardless of how the
    # OS handles the script's shebang line; -S searches PATH for the script.
    $self->{'solr_passes_exe'} = "\"".&util::get_perl_exec()."\" -S \"$solr_passes_script\"";
    return $self;
}
54
55
# Returns the name of the default document-processor class used when
# building a collection with Solr.
sub default_buildproc {
    my ($this) = @_;
    return "solrbuildproc";
}
61
# This writes a nice version of the text docs
#
# Pipes every document through solr_passes.pl in 'text' mode so the stored
# text ends up in the Solr index.  Does nothing when the collection is
# configured with no_text.
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritence
#
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $build_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"");
    &FileUtils::makeAllDirectories($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        # in debug mode the XML that would go to solr_passes goes to STDOUT
        $handle = *STDOUT;
    }
    else
    {
        my $site = $self->{'site'};
        my $collect = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $core = $core_prefix; # unused in this call to solr_passes

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Executable:  $solr_passes_exe\n";
        print STDERR "Sections:    $solr_passes_sections\n";
        print STDERR "Build Dir:   $build_dir\n";
        print STDERR "Cmd:         $solr_passes_exe $core text \"$build_dir\" \"dummy\" $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core text \"$build_dir\" \"dummy\" $osextra"))
        {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    # push every source document through the pipe to solr_passes
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
157
158#----
159
160
161
# Copy $in_filename to $out_filename line by line, applying @$replace_rules.
# Each rule is a hashref { 'regexp' => <pattern>, 'insert' => <text> }; the
# first rule whose pattern matches a line causes that line to be replaced by
# the rule's insert text (the insert supplies its own trailing newline(s)).
# Lines matching no rule are copied through unchanged.  Failure to open
# either file is reported to STDERR but is not fatal.
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    # three-argument open with lexical filehandles (the old two-argument
    # bareword form is vulnerable to mode injection via the filename)
    if (open(my $sin, '<', $in_filename)) {

        if (open(my $sout, '>', $out_filename)) {

            my $line;
            while (defined ($line=<$sin>)) {
                chomp $line;

                my $done_insert = 0;
                foreach my $rule (@$replace_rules) {
                    my $line_re = $rule->{'regexp'};
                    my $insert  = $rule->{'insert'};

                    if ($line =~ m/$line_re/) {
                        print $sout $insert;
                        $done_insert = 1;
                        last;   # first matching rule wins
                    }
                }
                if (!$done_insert) {
                    print $sout "$line\n";
                }
            }

            close($sout);
        }
        else {
            print STDERR "Error: Failed to open $out_filename\n";
            print STDERR "       $!\n";
        }

        close($sin);
    }
    else {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR "       $!\n";
    }

}
205
# Build the final index-field lists for build.cfg.  Unlike the MGPP
# version (which only emits already-processed fields and hardcodes a
# rename for 'text'), this one simply copies every field named in the
# collection's 'indexes' definitions, mapping each through the build
# processor's indexfieldmap.  'allfields' is special-cased to the short
# name 'ZZ'.  We need these mappings up-front to configure Lucene/Solr.
# Subcollection/language partitions repeat field lists with different
# suffixes, so duplicates are emitted only once.
sub make_final_field_list
{
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    my @fieldmap_entries = ();
    my @field_entries    = ();
    my %seen             = ();   # guards against duplicate fields

    # @todo support: $self->{'buildproc'}->{'extraindexfields'}
    foreach my $index_def (@{$self->{'collect_cfg'}->{'indexes'}})
    {
        # Work on a copy: strip the subcollection/language suffix (after ':')
        # for field discovery, but leave the original definition untouched
        # since it is still needed later.
        my $field_part = $index_def;
        $field_part =~ s/:.*$//;

        foreach my $field (split(';', $field_part))
        {
            next if $seen{$field};
            $seen{$field} = 1;

            my $shortname;
            if ($field eq 'allfields')
            {
                $shortname = 'ZZ';
            }
            elsif (defined $self->{'buildproc'}->{'indexfieldmap'}->{$field})
            {
                $shortname = $self->{'buildproc'}->{'indexfieldmap'}->{$field};
            }
            else
            {
                $shortname = 'ERROR';
                print STDERR 'Error! Couldn\'t find indexfieldmap for field: ' . $field . "\n";
            }

            push (@fieldmap_entries, $field . '->' . $shortname);
            push (@field_entries, $field);
        }
    }

    # only record the lists when they are non-empty (matches old behaviour)
    $self->{'build_cfg'}->{'indexfieldmap'} = \@fieldmap_entries if (scalar @fieldmap_entries);
    $self->{'build_cfg'}->{'indexfields'}   = \@field_entries    if (scalar @field_entries);
}
261
262
# Generate solr schema.xml file based on indexfieldmap and other associated
# config files, by filtering $GEXT_SOLR/conf templates into the collection's
# etc/conf directory.
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl


sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {
        # each entry has the form "<full-field-name>-><shortname>"
        my ($fullfieldname, $field) = ($ifm =~ m/^(.*)->(.*)$/);

        $schema_insert_xml .= "    "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";

        if($field eq "CD" || $field eq "CS") {
            # Coordinate and CoordShort meta should not be split but treated as a whole string for searching. So type=string, not type=text_en_splitting
            # Can't set to type="location", which uses solr.LatLonType, since type=location fields "must not be multivalued" as per conf/schema.xml.in.
            # And we can have multiple Coordinate (and multiple CoordShort) meta for one doc, so multivalued=true.
            # Not certain what to set stored to. As per conf/schema.xml.in, stored=false means "you only need to search on the field but
            # don't need to return the original value". And they advise to set stored="false" for all fields possible (esp large fields).
            # But stored=false makes it not visible in Luke. So setting stored=true as for other fields
            # TermVector: '"A term vector is a list of the document's terms and their number of occurrences in that document."
            # Each document has one term vector which is a list.' (http://makble.com/what-is-term-vector-in-lucene and lucene API for Field.TermVector)
            # e.g. docA contains, "cat" 5 times, "dog" 10 times. We don't care to treat Coordinate meta as a term: not a "term" occurring
            # in the doc, and don't care how often a Coordinate occurs in a document.
            # Consequently, we don't care about term positions and term offsets for Coordinate meta either.

            $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n";
        }

        elsif($field eq "ML") {
            # mapLabel: same attributes as for coord meta CD and CS above
            # mapLabel is also like facets with type="string" to not get tokenized, and multiValued="true" to allow each shape's label to be stored distinctly
            $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n";
        }

        else {
            if($field eq "LT" || $field eq "LO") # full Latitude and Longitude coordinate meta, not the short variants (LatShort/LA and LongShort/LN)
            {
                # Latitude and Longitude is being phased out in favour of using Coord meta.
                # However, if ever returning to using Lat and Lng instead of Coord meta, then the way the Lat Lng meta is currently written out for type="location"
                # is in the wrong format. Lat and Lng shouldn't get written out separately but as: Lat,Lng
                # It gets written out in solrbuildproc.pm, I think, so that would be where it needs to be corrected.
                # For more info on type=location for our solr 4.7.2 or thereabouts, see https://web.archive.org/web/20160312154250/https://wiki.apache.org/solr/SpatialSearchDev
                # which states:
                #   When indexing, the format is something like:
                #   <field name="store_lat_lon">12.34,-123.45</field>
                #
                $schema_insert_xml .= "type=\"location\" ";
            }


#           elsif ($field ne "ZZ" && $field ne "TX")
#           {
#               $schema_insert_xml .= "type=\"string\" ";
#           }
            else
            {
                #$schema_insert_xml .= "type=\"text_en_splitting\" ";

                # original default solr field type for all fields is text_en_splitting,
                # but a per-field override may be given in indexfieldoptions
                my $solrfieldtype = "text_en_splitting";
                if(defined $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}) {
                    $solrfieldtype = $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'};
                    #print STDERR "@@@@#### found TYPE: $solrfieldtype\n";
                }
                $schema_insert_xml .= "type=\"$solrfieldtype\" ";

            }
            # set termVectors=\"true\" when term vectors info is required,
            # see TermsResponse termResponse = solrResponse.getTermsResponse();
            $schema_insert_xml .= "indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"true\" termPositions=\"true\" termOffsets=\"true\" />\n";
        }
    }

    # just the one rule to date: splice the generated <field> lines in at the marker
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
##    my $in_dirname = &FileUtils::filenameConcatenate($solr_home,"etc","conf");
    my $in_dirname = &FileUtils::filenameConcatenate($solr_home,"conf");
    my $schema_in_filename = &FileUtils::filenameConcatenate($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &FileUtils::filenameConcatenate($collect_home,"etc","conf");
    my $schema_out_filename = &FileUtils::filenameConcatenate($out_dirname,"schema.xml");

    # make sure output conf directory exists
    if (!&FileUtils::directoryExists($out_dirname)) {
        &FileUtils::makeDirectory($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt", "currency.xml", "elevate.xml" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &FileUtils::filenameConcatenate($in_dirname,$file.".in");
        my $out_filename = &FileUtils::filenameConcatenate($out_dirname,$file);

        if(&FileUtils::fileExists($in_filename)) {
            # empty rule list => straight copy
            filter_in_out_file($in_filename,$out_filename,[]);
        }
    }

    # copy auxiliary directories (e.g. language-specific resources) wholesale
    my @in_dir_list = ( "lang" );
    foreach my $dir ( @in_dir_list ) {

        my $full_subdir_name = &FileUtils::filenameConcatenate($in_dirname,$dir);

        if(&FileUtils::directoryExists($full_subdir_name)) {
            &FileUtils::copyFilesRecursiveNoSVN($full_subdir_name, $out_dirname);
        }
    }
}
400
401
# Called before the indexes are built.  Starts the Solr server if it is not
# already running, works out the full set of index fields (either by
# scanning all documents when a 'metadata' index is specified, or directly
# from the 'indexes' definitions), writes out schema.xml and friends, and
# then creates (or unload+recreates) one Solr core per index/level pair.
sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is "reading and listening"

    my $solr_server = new solrserver($self->{'build_dir'});
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    # either build just the named index, or everything in the collect config
    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);


    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $all_metadata_specified = 0; # has the user added a 'metadata' index?
    my $allfields_index = 0;        # do we have an allfields index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @all_fields = ();

    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {

            # get the parameters for the output
            # split on : just in case there is subcoll and lang stuff
            my ($fields) = split (/:/, $index);

            foreach my $field (split (/;/, $fields)) {
                if ($field eq "metadata") {
                    $all_metadata_specified = 1;
                }
                else {
                    push(@all_fields,$field);
                }
            }
        }
    }

    if ($all_metadata_specified) {

        # (Unfortunately) we need to process all the documents in the collection
        # to figure out what the metadata_field_mapping is

        # set up the document processor
        $self->{'buildproc'}->set_output_handle (undef);
        $self->{'buildproc'}->set_mode ('index_field_mapping');
        $self->{'buildproc'}->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});

    }

    else {
        # Field mapping solely dependent of entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through it's static_indexfield_map

        my $buildproc = $self->{'buildproc'};

        foreach my $field (@all_fields)
        {
            if (!defined $buildproc->{'indexfieldmap'}->{$field})
            {
                # record the mapping both ways: field->shortname, and
                # shortname marked as taken
                my $shortname = $buildproc->get_or_create_shortname($field);
                $buildproc->{'indexfieldmap'}->{$field} = $shortname;
                $buildproc->{'indexfieldmap'}->{$shortname} = 1;
            }
        }
    }

    # Write out solr 'schema.xml' (and related) file
    #
    $self->make_final_field_list();

    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores per index: <colname>-Doc... and <colname>-Sec...

    my $site = $self->{'site'};
    my $collect = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
    my $build_dir = $self->{'build_dir'};

    $self->{'solrcores'} = [];
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {

            my $idx = $self->{'index_mapping'}->{$index};

            foreach my $level (keys %{$self->{'levels'}}) {

                # index dir name = first letter of level + mapped index name,
                # e.g. "didx"/"sidx"-style directory per level
                my ($pindex) = $level =~ /^(.)/;

                my $index_dir = $pindex.$idx;
                my $core = "$core_prefix-$index_dir";
                push (@{$self->{'solrcores'}}, $index_dir);
                # force_removeold == opposite of being run in 'incremental' mode
                my $force_removeold = ($self->{'incremental'}) ? 0 : 1;

                if ($force_removeold) {
                    print $outhandle "\n-removeold set (new index will be created)\n";

                    # create cores under temporary core names, corresponding to building directory
                    $core = "building-".$core;

                    my $full_index_dir = &FileUtils::filenameConcatenate($build_dir,$index_dir);
                    &FileUtils::removeFilesRecursive($full_index_dir);
                    &FileUtils::makeDirectory($full_index_dir);

                    my $full_tlog_dir = &FileUtils::filenameConcatenate($full_index_dir, "tlog");
                    &FileUtils::makeDirectory($full_tlog_dir);

                    # Solr then wants an "index" folder within this general index area!
#                   my $full_index_index_dir = &FileUtils::filenameConcatenate($full_index_dir,"index");
#                   &FileUtils::makeDirectory($full_index_index_dir);


                    # now go on and create new index
                    print $outhandle "Creating Solr core: $core\n";
                    $solr_server->admin_create_core($core);

                }
                else {
                    # if collect==core is already in solr.xml (check with STATUS)
                    # => use RELOAD* call to refresh fields now expressed in schema.xml
                    #
                    # else
                    # => use CREATE API to add to solr.xml
                    #
                    # No longer calling RELOAD, because Georgy documented a memory leak with it (svn r32178)
                    # Using unload + create to get the same effect as RELOAD without its side-effects.
                    #

                    my $check_core_exists = $solr_server->admin_ping_core($core);

                    if ($check_core_exists) {
                        print $outhandle "Unloading Solr core: $core\n";
                        $solr_server->admin_unload_core($core);
                    }

                    print $outhandle "Creating Solr core: $core\n";
                    $solr_server->admin_create_core($core);
                }
            } # foreach level
        } # if (want build index)
    } # foreach index

}
572
# Build one index (at one level) by piping documents through solr_passes.pl
# in 'index' mode.  $index is the index definition (may carry subcollection
# and language partitions after ':'), $llevel the Lucene-style level name.
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritence

sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names and possibly the doc name for solrpasses
    my $solr_passes_sections = $llevel;

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach $subcollection (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    @languages = split /,/, $language if (defined $language);
    foreach my $language (@languages) {
        # a leading '!' negates the language; preserve that marker
        my $not=0;
        if ($language =~ s/^\!//) {
            $not = 1;
        }
        if($not) {
            push (@$langarr, "!$language");
        } else {
            push (@$langarr, "$language");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (solr_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        # in debug mode the XML that would go to solr_passes goes to STDOUT
        $handle = *STDOUT;
    } else {
        my $site = $self->{'site'};
        my $collect = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $ds_idx = $self->{'index_mapping'}->{$index};
        my $core = "$core_prefix-$ds_idx";

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Cmd: $solr_passes_exe $core index \"$build_dir\" \"$indexdir\" $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core index \"$build_dir\" \"$indexdir\" $osextra")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # bug fix: was "\n!$\n" which printed a literal '!' and $\
            # instead of the OS error string in $!
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # map the Lucene-style level name ($llevel) back to the Greenstone
    # level name via mgppbuilder's level_map
    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    if (defined $self->{'collect_cfg'}->{'sortfields'}) {
        $self->{'buildproc'}->set_sortfields ($self->{'collect_cfg'}->{'sortfields'});
    }
    if (defined $self->{'collect_cfg'}->{'facetfields'}) {
        $self->{'buildproc'}->set_facetfields ($self->{'collect_cfg'}->{'facetfields'});
    }
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    # wrap the document stream in a solr <update> request
    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # restore the full level set for subsequent index builds
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}
714
715
# Called after all indexes are built.  Deliberately overrides the mgpp
# version so make_final_field_list() is NOT called again here - for solr
# that already happened during pre_build_indexes().
#
# Unloads every "building-" core carrying this collection's prefix
# (unloading does not delete the index directory on disk).  This is done
# during buildcol itself - rather than leaving it to activate.pl - so that
# a buildcol.pl run not followed by activate does not leave stale cores
# holding Windows file locks on the building folder.  With subcollection
# support we can no longer predict exactly which cores were created, so we
# remove all cores under the prefix; activate.pl pings for building-* cores
# before unloading, since it can no longer assume they exist.  (See svn
# r32178 and the 1 Aug 2019 change for why unload+create replaced RELOAD.)
#
# Finally stops the Solr server (tomcat or jetty), but only if it was
# explicitly started by pre_build_indexes().
sub post_build_indexes {
    my $self = shift(@_);

    my $server = $self->{'solr_server'};

    my $site       = $self->{'site'};
    my $collect    = $self->{'collection'};
    my $prefix     = (defined $site) ? "$site-$collect" : $collect;
    my $build_dir  = $self->{'build_dir'};

    # drop every temporary building- core for this collection
    $server->admin_unload_all_cores_for_prefix("building-$prefix");

    # only stop the server if we were the ones who started it
    if ($server->explicitly_started()) {
        $server->stop();
    }

    $self->{'solr_server'} = undef;

}
773
# Add the solr-specific entries to the build.cfg data structure: the facet
# field lists, and the names of the solr cores created for this collection
# (stored so activate.pl can use them later).  Delegates to lucenebuilder's
# version first for the entries common to both build types.
sub build_cfg_extra {
    my $self = shift (@_);
    my ($build_cfg) = @_;

    $self->lucenebuilder::build_cfg_extra($build_cfg);

    # collect the facet short names and the fullname->shortname mapping,
    # skipping facets that were never actually encountered
    my @facet_shortnames = ();
    my @facet_name_map   = ();

    foreach my $facet (@{$self->{'buildproc'}->{'facetfields'}}) {
        next unless $self->{'buildproc'}->{'actualfacetfields'}->{$facet};

        my $short = $self->{'buildproc'}->{'facetfieldnamemap'}->{$facet};
        push (@facet_shortnames, $short);
        push (@facet_name_map, $facet . '->' . $short);
    }

    $build_cfg->{'indexfacetfields'}   = \@facet_shortnames;
    $build_cfg->{'indexfacetfieldmap'} = \@facet_name_map;
    # store the core names in buildConfig, so that activate.pl can use them
    $build_cfg->{'solrcores'} = $self->{'solrcores'};

}
7981;
799
800
Note: See TracBrowser for help on using the repository browser.