source: gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm@33327

Last change on this file since 33327 was 33327, checked in by ak19, 5 years ago

In order to get map coordinate metadata stored correctly in solr, changes were required. These changes revealed that the way some index fields were stored in solr, and also in lucene, was not exactly correct and required changing too.

1. Coordinate/CD, CoordShort/CS and GPSMapOverlayLabel/ML meta are now being stored. The schema elements created for these indexed fields notably need to say they're multivalued (multiple values per docOID) and of type=string, rather than type=text_en_splitting as the other meta have been so far. No term-related information is stored for them, as that doesn't appear important for these indexed fields.

2. Changes to solrbuildproc were required, and these changes were also repeated in lucenebuildproc: before this commit, a single <field name=... /> element was stored for all meta values of that field. It sort of worked out so far since these fields were of type=text_en_splitting. However, this meant that, for example, all Coordinate meta for a docOID went into one <field name=CD .../> element, separated by spaces, rather than one <field name=CD .../> element per Coordinate meta value. We wanted the latter behaviour for CD, CS and ML meta, but also for all other indexed meta fields such as TI for titles, and for indexed fields that combine multiple meta in one index, such as a hypothetical TT covering dc.Title,ex.Title,text. In that case too we want a separate <field name=TT /> element for each title meta and for the text meta.

3. The num_processed_bytes calculation is left untouched: it still includes the encapsulating <field name=.../> element and has not been changed to be calculated over just the metadata value of each field. This is not only because it is calculated to include the field in the super -buildproc.pm classes, but also because num_processed_bytes is defined in basebuilder.pm as the number of bytes actually passed to (mg) for the current index, and the lucene and mgpp buildprocs both include the enclosing element in that calculation, which seems deliberate. Further, num_processed_bytes contrasts with num_bytes, declared and defined in basebuildproc.pm as "the actual number of bytes in the collection, normally the same as what's processed during text compression". num_bytes seems to be what Dr Bainbridge had in mind today when he said that the enclosing <field/> element shouldn't actually be included in the calculation of num_processed_bytes. Since the definition of num_processed_bytes now seems ambiguous to me, I am leaving it alone until it can be discussed with Dr Bainbridge again, as there are many places that would otherwise need changing.
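To illustrate the change in point 2 (field name and values are examples only): two Coordinate values for one docOID were previously emitted as a single element,

    <field name="CD">value1 value2</field>

whereas each value now gets its own element:

    <field name="CD">value1</field>
    <field name="CD">value2</field>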

File size: 24.5 KB
###########################################################################
#
# solrbuilder.pm -- perl wrapper for building index with Solr
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


package solrbuilder;

use strict;
no strict 'refs';

use lucenebuilder;
use solrserver;
sub BEGIN {
    @solrbuilder::ISA = ('lucenebuilder');
}


sub new {
    my $class = shift(@_);
    my $self = new lucenebuilder (@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = "$solr_passes_script";
    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"".&util::get_perl_exec()."\" -S \"$solr_passes_script\"";
    return $self;
}


sub default_buildproc {
    my $self = shift (@_);

    return "solrbuildproc";
}

# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $build_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"");
    &FileUtils::makeAllDirectories($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $site = $self->{'site'};
        my $collect = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $core = $core_prefix; # unused in this call to solr_passes

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Executable: $solr_passes_exe\n";
        print STDERR "Sections: $solr_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $solr_passes_exe $core text \"$build_dir\" \"dummy\" $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core text \"$build_dir\" \"dummy\" $osextra"))
        {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}

#----


sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    if (open(SIN,"<$in_filename")) {

        if (open(SOUT,">$out_filename")) {

            my $line;
            while (defined ($line=<SIN>)) {
                chomp $line;

                my $done_insert = 0;
                foreach my $rule (@$replace_rules) {
                    my $line_re = $rule->{'regexp'};
                    my $insert = $rule->{'insert'};

                    if ($line =~ m/$line_re/) {
                        print SOUT $insert;
                        $done_insert = 1;
                        last;
                    }
                }
                if (!$done_insert) {
                    print SOUT "$line\n";
                }
            }

            close(SOUT);
        }
        else {
            print STDERR "Error: Failed to open $out_filename\n";
            print STDERR " $!\n";
        }

        close(SIN);
    }
    else {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR " $!\n";
    }

}
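
# Illustrative use of filter_in_out_file() (filenames and marker are made up;
# this mirrors how premake_solr_auxiliary_files() below calls it): every line
# matching 'regexp' is replaced by the 'insert' text, all other lines are
# copied through verbatim, and an empty rules list ([]) copies the file as-is:
#
#   filter_in_out_file($schema_in, $schema_out,
#                      [ { 'regexp' => "^\\s*<!--\\s*##SOME-MARKER##\\s*-->\\s*\$",
#                          'insert' => $generated_xml } ]);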

# We need to push the list of indexfield to shortname mappings through to the
# build_cfg as, unlike in MGPP, we need these mappings in advance to configure
# Lucene/Solr. Unfortunately the original function found in mgbuilder.pm makes
# a mess of this - it only outputs fields that have been processed (none have)
# and it has a hardcoded renaming for 'text' so it becomes 'TX' according to
# the schema but 'TE' according to XML sent to lucene_passes.pl/solr_passes.pl
# This version is dumber - just copy them all across verbatim - but works. We
# do still need to support the special case of 'allfields'
sub make_final_field_list
{
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    # @todo support: $self->{'buildproc'}->{'extraindexfields'}
    foreach my $fields (@{$self->{'collect_cfg'}->{'indexes'}})
    {
        # remove subcoll stuff
        $fields =~ s/:.*$//;
        foreach my $field (split(';', $fields))
        {
            my $shortname = 'ERROR';
            if ($field eq 'allfields')
            {
                $shortname = 'ZZ';
            }
            elsif (defined $self->{'buildproc'}->{'indexfieldmap'}->{$field})
            {
                $shortname = $self->{'buildproc'}->{'indexfieldmap'}->{$field};
            }
            else
            {
                print STDERR 'Error! Couldn\'t find indexfieldmap for field: ' . $field . "\n";
            }
            push (@indexfieldmap, $field . '->' . $shortname);
            push (@indexfields, $field);
        }
    }

    if (scalar @indexfieldmap)
    {
        $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    }

    if (scalar @indexfields)
    {
        $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    }
}
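
# Example of the build_cfg entries this produces (shortnames are illustrative;
# they come from the buildproc's indexfieldmap, except 'allfields' -> ZZ):
#
#   indexes       = [ "dc.Title;text", "allfields" ]
#   indexfieldmap = [ "dc.Title->TI", "text->TX", "allfields->ZZ" ]
#   indexfields   = [ "dc.Title", "text", "allfields" ]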

# Generate solr schema.xml file based on indexmapfield and other associated
# config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl


sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        my ($fullfieldname, $field) = ($ifm =~ m/^(.*)->(.*)$/);

        $schema_insert_xml .= " "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";

        if($field eq "CD" || $field eq "CS") {
            # Coordinate and CoordShort meta should not be split but treated as whole strings for searching. So type=string, not type=text_en_splitting.
            # Can't set to type="location", which uses solr.LatLonType, since type=location fields "must not be multivalued" as per conf/schema.xml.in.
            # And we can have multiple Coordinate (and multiple CoordShort) meta for one doc, so multiValued=true.
            # Not certain what to set stored to. As per conf/schema.xml.in, stored=false means "you only need to search on the field but
            # don't need to return the original value". And they advise setting stored="false" for all fields possible (esp large fields).
            # But stored=false makes it not visible in Luke. So setting stored=true as for other fields.
            # TermVector: '"A term vector is a list of the document's terms and their number of occurrences in that document."
            # Each document has one term vector which is a list.' (http://makble.com/what-is-term-vector-in-lucene and lucene API for Field.TermVector)
            # e.g. docA contains "cat" 5 times, "dog" 10 times. We don't care to treat Coordinate meta as a term: it's not a "term" occurring
            # in the doc, and we don't care how often a Coordinate occurs in a document.
            # Consequently, we don't care about term positions and term offsets for Coordinate meta either.

            $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n";
        }

        elsif($field eq "ML") {
            # mapLabel: same attributes as for coord meta CD and CS above
            # mapLabel is also like facets with type="string" to not get tokenized, and multiValued="true" to allow each shape's label to be stored distinctly
            $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n";
        }

        else {
            if($field eq "LT" || $field eq "LO") # full Latitude and Longitude coordinate meta, not the short variants (LatShort/LA and LongShort/LN)
            {
                # Latitude and Longitude are being phased out in favour of using Coord meta.
                # However, if ever returning to using Lat and Lng instead of Coord meta, note that the way the Lat Lng meta is currently written out for type="location"
                # is in the wrong format. Lat and Lng shouldn't get written out separately but as: Lat,Lng
                # It gets written out in solrbuildproc.pm, I think, so that would be where it needs to be corrected.
                # For more info on type=location for our solr 4.7.2 or thereabouts, see https://web.archive.org/web/20160312154250/https://wiki.apache.org/solr/SpatialSearchDev
                # which states:
                #   When indexing, the format is something like:
                #   <field name="store_lat_lon">12.34,-123.45</field>
                #
                $schema_insert_xml .= "type=\"location\" ";
            }


#           elsif ($field ne "ZZ" && $field ne "TX")
#           {
#               $schema_insert_xml .= "type=\"string\" ";
#           }
            else
            {
                #$schema_insert_xml .= "type=\"text_en_splitting\" ";

                # original default solr field type for all fields is text_en_splitting
                my $solrfieldtype = "text_en_splitting";
                if(defined $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}) {
                    $solrfieldtype = $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'};
                    #print STDERR "@@@@#### found TYPE: $solrfieldtype\n";
                }
                $schema_insert_xml .= "type=\"$solrfieldtype\" ";

            }
            # set termVectors=\"true\" when term vectors info is required,
            # see TermsResponse termResponse = solrResponse.getTermsResponse();
            $schema_insert_xml .= "indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"true\" termPositions=\"true\" termOffsets=\"true\" />\n";
        }
    }
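
    # For illustration (the TI shortname is an example only): an indexfieldmap
    # entry "dc.Title->TI" with no custom solrfieldtype results in a generated
    # schema line of the form:
    #
    #   <field name="TI" type="text_en_splitting" indexed="true" stored="true"
    #          multiValued="true" termVectors="true" termPositions="true" termOffsets="true" />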

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
##  my $in_dirname = &FileUtils::filenameConcatenate($solr_home,"etc","conf");
    my $in_dirname = &FileUtils::filenameConcatenate($solr_home,"conf");
    my $schema_in_filename = &FileUtils::filenameConcatenate($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &FileUtils::filenameConcatenate($collect_home,"etc","conf");
    my $schema_out_filename = &FileUtils::filenameConcatenate($out_dirname,"schema.xml");

    # make sure output conf directory exists
    if (!&FileUtils::directoryExists($out_dirname)) {
        &FileUtils::makeDirectory($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt", "currency.xml", "elevate.xml" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &FileUtils::filenameConcatenate($in_dirname,$file.".in");
        my $out_filename = &FileUtils::filenameConcatenate($out_dirname,$file);

        if(&FileUtils::fileExists($in_filename)) {
            filter_in_out_file($in_filename,$out_filename,[]);
        }
    }

    my @in_dir_list = ( "lang" );
    foreach my $dir ( @in_dir_list ) {

        my $full_subdir_name = &FileUtils::filenameConcatenate($in_dirname,$dir);

        if(&FileUtils::directoryExists($full_subdir_name)) {
            &FileUtils::copyFilesRecursiveNoSVN($full_subdir_name, $out_dirname);
        }
    }
}


sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is "reading and listening"

    my $solr_server = new solrserver($self->{'build_dir'});
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $all_metadata_specified = 0; # has the user added a 'metadata' index?
    my $allfields_index = 0;        # do we have an allfields index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @all_fields = ();

    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {

            # get the parameters for the output
            # split on : just in case there is subcoll and lang stuff
            my ($fields) = split (/:/, $index);

            foreach my $field (split (/;/, $fields)) {
                if ($field eq "metadata") {
                    $all_metadata_specified = 1;
                }
                else {
                    push(@all_fields,$field);
                }
            }
        }
    }

    if ($all_metadata_specified) {

        # (Unfortunately) we need to process all the documents in the collection
        # to figure out what the metadata_field_mapping is

        # set up the document processor
        $self->{'buildproc'}->set_output_handle (undef);
        $self->{'buildproc'}->set_mode ('index_field_mapping');
        $self->{'buildproc'}->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});

    }

    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $buildproc = $self->{'buildproc'};

        foreach my $field (@all_fields)
        {
            if (!defined $buildproc->{'indexfieldmap'}->{$field})
            {
                my $shortname = '';
                if (defined $buildproc->{'fieldnamemap'}->{$field})
                {
                    $shortname = $buildproc->{'fieldnamemap'}->{$field};
                }
                else
                {
                    $shortname = $buildproc->create_shortname($field);
                }
                $buildproc->{'indexfieldmap'}->{$field} = $shortname;
                $buildproc->{'indexfieldmap'}->{$shortname} = 1;
            }
        }
    }

    # Write out solr 'schema.xml' (and related) file
    #
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores <colname>-Doc and <colname>-Sec

    my $site = $self->{'site'};
    my $collect = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
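
    # Core names are derived below as "<core_prefix>-<level-letter>idx".
    # Illustrative example (site/collection names made up): for site "localsite",
    # collection "demo" and level "document", the core is "localsite-demo-didx"
    # ("building-localsite-demo-didx" while a full, non-incremental rebuild
    # is in progress).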

    # my $idx = $self->{'index_mapping'}->{$index};
    my $idx = "idx";

    my $build_dir = $self->{'build_dir'};

    foreach my $level (keys %{$self->{'levels'}}) {

        my ($pindex) = $level =~ /^(.)/;

        my $index_dir = $pindex.$idx;
        my $core = "$core_prefix-$index_dir";

        # force_removeold == opposite of being run in 'incremental' mode
        my $force_removeold = ($self->{'incremental'}) ? 0 : 1;

        if ($force_removeold) {
            print $outhandle "\n-removeold set (new index will be created)\n";

            # create cores under temporary core names, corresponding to building directory
            $core = "building-".$core;

            my $full_index_dir = &FileUtils::filenameConcatenate($build_dir,$index_dir);
            &FileUtils::removeFilesRecursive($full_index_dir);
            &FileUtils::makeDirectory($full_index_dir);

            my $full_tlog_dir = &FileUtils::filenameConcatenate($full_index_dir, "tlog");
            &FileUtils::makeDirectory($full_tlog_dir);

            # Solr then wants an "index" folder within this general index area!
#           my $full_index_index_dir = &FileUtils::filenameConcatenate($full_index_dir,"index");
#           &FileUtils::makeDirectory($full_index_index_dir);


            # now go on and create new index
            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);

        }
        else {
            # if collect==core already in solr.xml (check with STATUS)
            # => use RELOAD call to refresh fields now expressed in schema.xml
            #
            # else
            # => use CREATE API to add to solr.xml

            my $check_core_exists = $solr_server->admin_ping_core($core);

            if ($check_core_exists) {
                print $outhandle "Unloading Solr core: $core\n";
                $solr_server->admin_unload_core($core);
            }

            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);

        }
    }

}

# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance

sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names (and possibly the doc name) for solr_passes
    my $solr_passes_sections = $llevel;

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
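    # Illustrative example (names made up): an index entry such as
    # "dc.Title;text:sub1,sub2:en,!fr" splits below into $fields="dc.Title;text",
    # $subcollection="sub1,sub2" and $language="en,!fr".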
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach $subcollection (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put language expressions for the
    # ones we want in the index
    my @languages = ();
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    @languages = split /,/, $language if (defined $language);
    foreach my $language (@languages) {
        my $not=0;
        if ($language =~ s/^\!//) {
            $not = 1;
        }
        if($not) {
            push (@$langarr, "!$language");
        } else {
            push (@$langarr, "$language");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (solr_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $site = $self->{'site'};
        my $collect = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $ds_idx = $self->{'index_mapping'}->{$index};
        my $core = "$core_prefix-$ds_idx";

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Cmd: $solr_passes_exe $core index \"$build_dir\" \"$indexdir\" $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core index \"$build_dir\" \"$indexdir\" $osextra")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; # always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    if (defined $self->{'collect_cfg'}->{'sortfields'}) {
        $self->{'buildproc'}->set_sortfields ($self->{'collect_cfg'}->{'sortfields'});
    }
    if (defined $self->{'collect_cfg'}->{'facetfields'}) {
        $self->{'buildproc'}->set_facetfields ($self->{'collect_cfg'}->{'facetfields'});
    }
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";
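    # Sketch of the stream piped to solr_passes.pl between the <update> tags
    # (assumed shape only - the actual elements are emitted by solrbuildproc;
    # the TI shortname and value are illustrative):
    #
    #   <add><doc>
    #     <field name="TI">Some title</field>
    #     ...
    #   </doc></add>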

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}


sub post_build_indexes {
    my $self = shift(@_);

    # deliberately override to prevent the mgpp post_build_index() calling
    #   $self->make_final_field_list()
    # as this has been done in our pre_build_indexes() phase for solr


    # Also need to stop the Solr server (be it tomcat or jetty) if it was explicitly started
    # in pre_build_indexes()

    my $solr_server = $self->{'solr_server'};

    if ($solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}

sub build_cfg_extra {
    my $self = shift (@_);
    my ($build_cfg) = @_;

    $self->lucenebuilder::build_cfg_extra($build_cfg);

    # need to add in facet stuff
    my @facetfields = ();
    my @facetfieldmap = ();

    foreach my $sf (@{$self->{'buildproc'}->{'facetfields'}}) {
        if ($sf eq "rank") {
            push(@facetfields, $sf);
        } elsif ($self->{'buildproc'}->{'actualsortfields'}->{$sf}) {
            my $shortname = $self->{'buildproc'}->{'sortfieldnamemap'}->{$sf};
            push(@facetfields, $shortname);
            push (@facetfieldmap, "$sf\-\>$shortname");
        }

    }
    $build_cfg->{'indexfacetfields'} = \@facetfields;
    $build_cfg->{'indexfacetfieldmap'} = \@facetfieldmap;
}

1;