root/gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm @ 33327

Revision 33327, 24.5 KB (checked in by ak19, 2 months ago)

In order to get map coordinate metadata stored correctly in solr, changes were required. These changes revealed that the way some index fields were stored in solr (and also in lucene) was not exactly correct and needed changing too.

1. Coordinate/CD, CoordShort/CS and GPSMapOverlayLabel/ML meta are now being stored. The schema elements created for these indexed fields notably need to say they're multivalued (multiple values per docOID) and are of type=string rather than type=text_en_splitting, as the other meta have been so far. No term-related information is stored for them, as that doesn't appear important for these indexed fields.

2. Changes to solrbuildproc were required, and these changes were also repeated in lucenebuildproc: in the code before this commit, a single <field name=... /> element was stored for all the meta values in that field. This sort of worked out so far, since these fields were of type=text_en_splitting. It however created the problem that, for example, all Coordinate meta for a docOID went into one <field name=CD .../> element separated by spaces, rather than into a <field name=CD .../> element for each Coordinate meta. We wanted the latter behaviour not just for CD, CS and ML meta, but for all other indexed meta fields too, such as TI for titles, and also for indexed fields that combine multiple meta in one index, such as a hypothetical TT that would include dc.Title, ex.Title and text. In that case too, we want a <field name=TT /> element for each title meta and for the text meta.

3. The num_processed_bytes calculation is left untouched: it still includes the encapsulating <field name=.../> element and has not been changed to be calculated over just the metadata value of each field. This is not only because the field is included in the calculation in the superclass -buildproc.pm code, but also because num_processed_bytes is defined in basebuilder.pm as the number of bytes actually passed to (mg) for the current index, and the lucene and mgpp buildprocs both include the enclosing element in the calculation, which seems deliberate. Further, num_processed_bytes contrasts with num_bytes, declared and defined in basebuildproc.pm as "the actual number of bytes in the collection, normally the same as what's processed during text compression". num_bytes seems to be what Dr Bainbridge had in mind today when he said that the enclosing <field/> element shouldn't actually be included in the calculation of num_processed_bytes. Since the definition of num_processed_bytes now seems ambiguous to me, I am leaving it alone until it is discussed with Dr Bainbridge again, as there are many places that would otherwise need changing.

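To illustrate point 2 with made-up coordinate values: where two Coordinate meta on one docOID were previously collapsed into a single element in the XML sent to solr_passes.pl,

    <field name="CD">-37.78 175.31 -37.79 175.32</field>

each meta value now gets an element of its own, matching the multivalued schema declaration:

    <field name="CD">-37.78 175.31</field>
    <field name="CD">-37.79 175.32</field>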
###########################################################################
#
# solrbuilder.pm -- perl wrapper for building index with Solr
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


package solrbuilder;

use strict;
no strict 'refs';

use lucenebuilder;
use solrserver;

sub BEGIN {
    @solrbuilder::ISA = ('lucenebuilder');
}


sub new {
    my $class = shift(@_);
    my $self = new lucenebuilder (@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = "$solr_passes_script";
    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"".&util::get_perl_exec()."\" -S \"$solr_passes_script\"";
    return $self;
}


sub default_buildproc {
    my $self  = shift (@_);

    return "solrbuildproc";
}

# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $build_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"");
    &FileUtils::makeAllDirectories($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $site        = $self->{'site'};
        my $collect     = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $core        = $core_prefix; # unused in this call to solr_passes

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Executable:    $solr_passes_exe\n";
        print STDERR "Sections:      $solr_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $solr_passes_exe $core text \"$build_dir\" \"dummy\"   $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core text \"$build_dir\" \"dummy\"   $osextra"))
        {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}

#----


sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    if (open(SIN,"<$in_filename")) {

        if (open(SOUT,">$out_filename")) {

            my $line;
            while (defined ($line=<SIN>)) {
                chomp $line;

                my $done_insert = 0;
                foreach my $rule (@$replace_rules) {
                    my $line_re = $rule->{'regexp'};
                    my $insert  = $rule->{'insert'};

                    if ($line =~ m/$line_re/) {
                        print SOUT $insert;
                        $done_insert = 1;
                        last;
                    }
                }
                if (!$done_insert) {
                    print SOUT "$line\n";
                }
            }

            close(SOUT);
        }
        else {
            print STDERR "Error: Failed to open $out_filename\n";
            print STDERR "       $!\n";
        }

        close(SIN);
    }
    else {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR "       $!\n";
    }
}
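# Example use of filter_in_out_file() (hypothetical variable values), mirroring
# the schema.xml call made from premake_solr_auxiliary_files() below:
#
#     my $rules = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
#                     'insert' => $schema_insert_xml } ];
#     filter_in_out_file($schema_in_filename, $schema_out_filename, $rules);
#
# Each input line matching a rule's 'regexp' is replaced by that rule's
# 'insert' text; all other lines are copied through unchanged.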

# We need to push the list of indexfield to shortname mappings through to the
# build_cfg as, unlike in MGPP, we need these mappings in advance to configure
# Lucene/Solr. Unfortunately the original function found in mgbuilder.pm makes
# a mess of this - it only outputs fields that have been processed (none have)
# and it has a hardcoded renaming for 'text' so it becomes 'TX' according to
# the schema but 'TE' according to the XML sent to lucene_passes.pl/solr_passes.pl.
# This version is dumber - it just copies them all across verbatim - but works.
# We do still need to support the special case of 'allfields'
sub make_final_field_list
{
  my $self = shift (@_);
  $self->{'build_cfg'} = {};
  my @indexfieldmap = ();
  my @indexfields = ();

  # @todo support: $self->{'buildproc'}->{'extraindexfields'}
  foreach my $fields (@{$self->{'collect_cfg'}->{'indexes'}})
  {
    # remove subcoll stuff
    $fields =~ s/:.*$//;
    foreach my $field (split(';', $fields))
    {
      my $shortname = 'ERROR';
      if ($field eq 'allfields')
      {
        $shortname = 'ZZ';
      }
      elsif (defined $self->{'buildproc'}->{'indexfieldmap'}->{$field})
      {
        $shortname = $self->{'buildproc'}->{'indexfieldmap'}->{$field};
      }
      else
      {
        print STDERR 'Error! Couldn\'t find indexfieldmap for field: ' . $field . "\n";
      }
      push (@indexfieldmap, $field . '->' . $shortname);
      push (@indexfields, $field);
    }
  }

  if (scalar @indexfieldmap)
  {
    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
  }

  if (scalar @indexfields)
  {
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
  }
}
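# For example (actual short names depend on the buildproc's mappings), an
# 'indexes' entry of "dc.Title;text" would typically yield build_cfg entries:
#     indexfieldmap => [ "dc.Title->TI", "text->TX" ]
#     indexfields   => [ "dc.Title", "text" ]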

# Generate the solr schema.xml file based on the indexfieldmap and other
# associated config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl


sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        my ($fullfieldname, $field) = ($ifm =~ m/^(.*)->(.*)$/);

        $schema_insert_xml .= "    "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";

        if($field eq "CD" || $field eq "CS") {
            # Coordinate and CoordShort meta should not be split but treated as a whole string for searching. So type=string, not type=text_en_splitting.
            # Can't set to type="location", which uses solr.LatLonType, since type=location fields "must not be multivalued" as per conf/schema.xml.in.
            # And we can have multiple Coordinate (and multiple CoordShort) meta for one doc, so multiValued=true.
            # Not certain what to set stored to. As per conf/schema.xml.in, stored=false means "you only need to search on the field but
            # don't need to return the original value", and they advise setting stored="false" for all fields where possible (esp. large fields).
            # But stored=false makes the field not visible in Luke, so setting stored=true as for the other fields.
            # TermVector: "A term vector is a list of the document's terms and their number of occurrences in that document.
            # Each document has one term vector which is a list." (http://makble.com/what-is-term-vector-in-lucene and the lucene API for Field.TermVector)
            # e.g. docA contains "cat" 5 times, "dog" 10 times. We don't care to treat Coordinate meta as a term: it is not a "term" occurring
            # in the doc, and we don't care how often a Coordinate occurs in a document.
            # Consequently, we don't care about term positions and term offsets for Coordinate meta either.

            $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n";
        }

        elsif($field eq "ML") {
            # mapLabel: same attributes as for coord meta CD and CS above
            # mapLabel is also like facets with type="string" to not get tokenized, and multiValued="true" to allow each shape's label to be stored distinctly
            $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n";
        }

        else {
            if($field eq "LT" || $field eq "LO") # full Latitude and Longitude coordinate meta, not the short variants (LatShort/LA and LongShort/LN)
            {
                # Latitude and Longitude are being phased out in favour of using Coord meta.
                # However, if ever returning to using Lat and Lng instead of Coord meta, note that the way the Lat Lng meta is currently written out for type="location"
                # is in the wrong format. Lat and Lng shouldn't get written out separately but as: Lat,Lng
                # It gets written out in solrbuildproc.pm, I think, so that would be where it needs to be corrected.
                # For more info on type=location for our solr 4.7.2 or thereabouts, see https://web.archive.org/web/20160312154250/https://wiki.apache.org/solr/SpatialSearchDev
                # which states:
                #    When indexing, the format is something like:
                #       <field name="store_lat_lon">12.34,-123.45</field>
                #
                $schema_insert_xml .= "type=\"location\" ";
            }


    #       elsif ($field ne "ZZ" && $field ne "TX")
    #       {
    #           $schema_insert_xml .=   "type=\"string\" ";
    #       }
            else
            {
                #$schema_insert_xml .= "type=\"text_en_splitting\" ";

                # original default solr field type for all fields is text_en_splitting
                my $solrfieldtype = "text_en_splitting";
                if(defined $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}) {
                    $solrfieldtype = $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'};
                    #print STDERR "@@@@#### found TYPE: $solrfieldtype\n";
                }
                $schema_insert_xml .= "type=\"$solrfieldtype\" ";

            }
            # set termVectors=\"true\" when term vector info is required,
            # see TermsResponse termResponse = solrResponse.getTermsResponse();
            $schema_insert_xml .= "indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"true\" termPositions=\"true\" termOffsets=\"true\" />\n";
        }
    }
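    # For illustration, the loop above emits schema declarations like the
    # following (field names are examples):
    #
    #    <field name="CD" type="string" indexed="true" stored="true" multiValued="true" termVectors="false" termPositions="false" termOffsets="false" />
    #    <field name="TI" type="text_en_splitting" indexed="true" stored="true" multiValued="true" termVectors="true" termPositions="true" termOffsets="true" />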

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
##    my $in_dirname = &FileUtils::filenameConcatenate($solr_home,"etc","conf");
    my $in_dirname = &FileUtils::filenameConcatenate($solr_home,"conf");
    my $schema_in_filename = &FileUtils::filenameConcatenate($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &FileUtils::filenameConcatenate($collect_home,"etc","conf");
    my $schema_out_filename = &FileUtils::filenameConcatenate($out_dirname,"schema.xml");

    # make sure output conf directory exists
    if (!&FileUtils::directoryExists($out_dirname)) {
        &FileUtils::makeDirectory($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt", "currency.xml", "elevate.xml" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &FileUtils::filenameConcatenate($in_dirname,$file.".in");
        my $out_filename = &FileUtils::filenameConcatenate($out_dirname,$file);

        if(&FileUtils::fileExists($in_filename)) {
            filter_in_out_file($in_filename,$out_filename,[]);
        }
    }

    my @in_dir_list = ( "lang" );
    foreach my $dir ( @in_dir_list ) {

        my $full_subdir_name = &FileUtils::filenameConcatenate($in_dirname,$dir);

        if(&FileUtils::directoryExists($full_subdir_name)) {
            &FileUtils::copyFilesRecursiveNoSVN($full_subdir_name, $out_dirname);
        }
    }
}


sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is ready and listening

    my $solr_server = new solrserver($self->{'build_dir'});
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $all_metadata_specified = 0; # has the user added a 'metadata' index?
    my $allfields_index = 0;        # do we have an allfields index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @all_fields = ();

    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {

            # get the parameters for the output
            # split on : just in case there is subcoll and lang stuff
            my ($fields) = split (/:/, $index);

            foreach my $field (split (/;/, $fields)) {
                if ($field eq "metadata") {
                    $all_metadata_specified = 1;
                }
                else {
                    push(@all_fields,$field);
                }
            }
        }
    }
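    # For example, an index spec of "dc.Title;text:sc1:en" contributes the
    # fields "dc.Title" and "text" here; the subcollection and language
    # parts after the first ':' are ignored at this point.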

    if ($all_metadata_specified) {

        # (Unfortunately) we need to process all the documents in the collection
        # to figure out what the metadata_field_mapping is

        # set up the document processor
        $self->{'buildproc'}->set_output_handle (undef);
        $self->{'buildproc'}->set_mode ('index_field_mapping');
        $self->{'buildproc'}->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});

    }

    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $buildproc = $self->{'buildproc'};

        foreach my $field (@all_fields)
        {
            if (!defined $buildproc->{'indexfieldmap'}->{$field})
            {
                my $shortname = '';
                if (defined $buildproc->{'fieldnamemap'}->{$field})
                {
                    $shortname = $buildproc->{'fieldnamemap'}->{$field};
                }
                else
                {
                    $shortname = $buildproc->create_shortname($field);
                }
                $buildproc->{'indexfieldmap'}->{$field} = $shortname;
                $buildproc->{'indexfieldmap'}->{$shortname} = 1;
            }
        }
    }

    # Write out solr 'schema.xml' (and related) file
    #
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores <colname>-Doc and <colname>-Sec

    my $site        = $self->{'site'};
    my $collect     = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collect" : $collect;

    # my $idx = $self->{'index_mapping'}->{$index};
    my $idx = "idx";

    my $build_dir = $self->{'build_dir'};

    foreach my $level (keys %{$self->{'levels'}}) {

        my ($pindex) = $level =~ /^(.)/;

        my $index_dir = $pindex.$idx;
        my $core = "$core_prefix-$index_dir";
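        # e.g. for site "localsite" and collection "demo" (hypothetical names),
        # the document- and section-level cores come out as localsite-demo-didx
        # and localsite-demo-sidx respectively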

        # force_removeold == opposite of being run in 'incremental' mode
        my $force_removeold = ($self->{'incremental'}) ? 0 : 1;

        if ($force_removeold) {
            print $outhandle "\n-removeold set (new index will be created)\n";

            # create cores under temporary core names, corresponding to building directory
            $core = "building-".$core;

            my $full_index_dir = &FileUtils::filenameConcatenate($build_dir,$index_dir);
            &FileUtils::removeFilesRecursive($full_index_dir);
            &FileUtils::makeDirectory($full_index_dir);

            my $full_tlog_dir = &FileUtils::filenameConcatenate($full_index_dir, "tlog");
            &FileUtils::makeDirectory($full_tlog_dir);

            # Solr then wants an "index" folder within this general index area!
#           my $full_index_index_dir = &FileUtils::filenameConcatenate($full_index_dir,"index");
#           &FileUtils::makeDirectory($full_index_index_dir);


            # now go on and create new index
            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);

        }
        else {
            # if collect==core already in solr.xml (check with STATUS)
            # => use RELOAD call to refresh fields now expressed in schema.xml
            #
            # else
            # => use CREATE API to add to solr.xml

            my $check_core_exists = $solr_server->admin_ping_core($core);

            if ($check_core_exists) {
                print $outhandle "Unloading Solr core: $core\n";
                $solr_server->admin_unload_core($core);
            }

            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);

        }
    }

}

# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance

sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names (and possibly the doc name) for solr_passes
    my $solr_passes_sections = $llevel;

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach $subcollection (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put language expressions for the
    # ones we want in the index
    my @languages = ();
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    @languages = split /,/, $language if (defined $language);
    foreach my $language (@languages) {
        my $not=0;
        if ($language =~ s/^\!//) {
            $not = 1;
        }
        if($not) {
            push (@$langarr, "!$language");
        } else {
            push (@$langarr, "$language");
        }
    }
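    # e.g. for an index spec of "dc.Title;text:sc1:en,!fr" (an illustrative
    # example), @$langarr ends up as ("en", "!fr"), where "!" marks a
    # language to exclude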

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (solr_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $site        = $self->{'site'};
        my $collect     = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $ds_idx      = $self->{'index_mapping'}->{$index};
        my $core        = "$core_prefix-$ds_idx";

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Cmd: $solr_passes_exe $core index \"$build_dir\" \"$indexdir\"   $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core index \"$build_dir\" \"$indexdir\"   $osextra")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; # always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    if (defined $self->{'collect_cfg'}->{'sortfields'}) {
        $self->{'buildproc'}->set_sortfields ($self->{'collect_cfg'}->{'sortfields'});
    }
    if (defined $self->{'collect_cfg'}->{'facetfields'}) {
        $self->{'buildproc'}->set_facetfields ($self->{'collect_cfg'}->{'facetfields'});
    }
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}


sub post_build_indexes {
    my $self = shift(@_);

    # deliberately override to prevent the mgpp post_build_index() calling
    #  $self->make_final_field_list()
    # as this has been done in our pre_build_indexes() phase for solr


    # Also need to stop the Solr server (be it tomcat or jetty) if it was explicitly started
    # in pre_build_indexes()

    my $solr_server = $self->{'solr_server'};

    if ($solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}

sub build_cfg_extra {
    my $self = shift (@_);
    my ($build_cfg) = @_;

    $self->lucenebuilder::build_cfg_extra($build_cfg);

    # need to add in facet stuff
    my @facetfields = ();
    my @facetfieldmap = ();

    foreach my $sf (@{$self->{'buildproc'}->{'facetfields'}}) {
        if ($sf eq "rank") {
            push(@facetfields, $sf);
        } elsif ($self->{'buildproc'}->{'actualsortfields'}->{$sf}) {
            my $shortname = $self->{'buildproc'}->{'sortfieldnamemap'}->{$sf};
            push(@facetfields, $shortname);
            push (@facetfieldmap, "$sf\-\>$shortname");
        }
    }
    $build_cfg->{'indexfacetfields'} = \@facetfields;
    $build_cfg->{'indexfacetfieldmap'} = \@facetfieldmap;
}

1;