root/gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm @ 27802

Revision 27802, 19.9 KB (checked in by kjdon, 6 years ago)

adding in code for sort fields. just copied from lucene build code

Line 
1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
33use solrserver;
34use Config; # for getting the perlpath in the recommended way
35
# Establish the inheritance chain at compile time: solrbuilder is-a
# lucenebuilder.
BEGIN {
    @solrbuilder::ISA = qw(lucenebuilder);
}
39
40
# Constructor. Builds the parent lucenebuilder object, re-blesses it into
# the requested class and records the Solr-specific settings on it.
#
# Arguments: the class name, followed by whatever the lucenebuilder
#            constructor takes (passed through untouched).
# Returns:   the blessed builder object, with 'buildtype', 'solr_passes'
#            and 'solr_passes_exe' set.
sub new {
    my $class = shift(@_);

    # Explicit method-call syntax. The indirect-object form
    # ("new lucenebuilder (...)") makes Perl guess between a method call
    # and a call to this package's own new(), so it is avoided here.
    my $self = lucenebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = $solr_passes_script;
    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";
    return $self;
}
55
56
# Returns the name of the document-processor class used by this builder
# when none is specified.
sub default_buildproc {
    my ($self) = @_;

    return 'solrbuildproc';
}
62
# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Runs every document through the build processor in 'text' mode, with
# indexing switched off, and pipes the processor's output to solr_passes.pl
# (or to STDOUT when $self->{'debug'} is set).
#
# Arguments: $textindex - index specification; a leading "section:" is
#                         stripped before it is handed to the buildproc.
# Does nothing when $self->{'no_text'} is set.
# Dies if the solr_passes.pl pipe cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $build_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"");
    &FileUtils::makeAllDirectories($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        # NOTE(review): $text_dir is not read again after this point, so
        # this conversion currently has no visible effect - confirm whether
        # $build_dir was intended.
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # The command line used to run solr_passes.pl
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $site        = $self->{'site'};
        my $collect     = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $core        = $core_prefix; # unused in this call to solr_passes

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Executable:    $solr_passes_exe\n";
        print STDERR "Sections:      $solr_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $solr_passes_exe $core text \"$build_dir\" \"dummy\"   $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core text \"$build_dir\" \"dummy\"   $osextra"))
        {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # Report the sub that actually failed (the message used to
            # name build_index).
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # STDOUT is left open in debug mode; only the pipe is closed
    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
158
159#----
160
161
162
# Copy $in_filename to $out_filename line by line, applying $replace_rules.
#
# Arguments:
#   $in_filename   - file to read
#   $out_filename  - file to (over)write
#   $replace_rules - array ref of hash refs, each holding
#                      'regexp' => pattern tested against each chomped line
#                      'insert' => text written instead of a matching line
#
# Only the first rule that matches a line is applied, and its 'insert' text
# is written verbatim (no newline is added). A line matching no rule is
# copied through followed by "\n".
#
# A failure to open either file is reported on STDERR and the sub returns
# without dying.
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    # Three-argument open with lexical handles: the filename is taken
    # literally (no whitespace trimming, no mode characters read from the
    # name) and no global bareword handles are shared between calls.
    my $in_fh;
    if (!open($in_fh, '<', $in_filename)) {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR "       $!\n";
        return;
    }

    my $out_fh;
    if (!open($out_fh, '>', $out_filename)) {
        print STDERR "Error: Failed to open $out_filename\n";
        print STDERR "       $!\n";
        close($in_fh);
        return;
    }

    while (defined (my $line = <$in_fh>)) {
        chomp $line;

        my $done_insert = 0;
        foreach my $rule (@$replace_rules) {
            my $line_re = $rule->{'regexp'};
            my $insert  = $rule->{'insert'};

            if ($line =~ m/$line_re/) {
                print $out_fh $insert;
                $done_insert = 1;
                last;
            }
        }

        if (!$done_insert) {
            print $out_fh "$line\n";
        }
    }

    # Buffered write errors only surface when the handle is closed
    if (!close($out_fh)) {
        print STDERR "Error: Failed to close $out_filename\n";
        print STDERR "       $!\n";
    }
    close($in_fh);

    return;
}
206
# We need to push the list of indexfield to shortname mappings through to the
# build_cfg as, unlike in MGPP, we need these mappings in advance to configure
# Lucene/Solr. Unfortunately the original function found in mgbuilder.pm makes
# a mess of this - it only outputs fields that have been processed (none have)
# and it has a hardcoded renaming for 'text' so it becomes 'TX' according to
# the schema but 'TE' according to XML sent to lucene_passes.pl/solr_passes.pl
# This version is dumber - just copy them all across verbatim - but works. We
# do still need to support the special case of 'allfields'
# Rebuilds $self->{'build_cfg'} from the configured indexes.
#
# For every field named in $self->{'collect_cfg'}->{'indexes'} (fields are
# separated by ';', anything after the first ':' is ignored) this records:
#   build_cfg->{'indexfieldmap'} - list of "<field>-><shortname>" strings
#   build_cfg->{'indexfields'}   - list of field names
# The shortname is 'ZZ' for 'allfields', otherwise the entry found in the
# buildproc's 'indexfieldmap'. If there is none, an error is printed to
# STDERR and the shortname 'ERROR' is recorded. Neither key is set when no
# fields were found.
sub make_final_field_list
{
  my $self = shift (@_);
  $self->{'build_cfg'} = {};
  my @indexfieldmap = ();
  my @indexfields = ();

  # @todo support: $self->{'buildproc'}->{'extraindexfields'}
  foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}})
  {
    # remove subcoll stuff - on a copy, because the loop variable aliases
    # the entry in collect_cfg and an in-place substitution would strip the
    # subcollection/language part from the collection configuration itself
    my $fields = $index;
    $fields =~ s/:.*$//;

    foreach my $field (split(';', $fields))
    {
      my $shortname = 'ERROR';
      if ($field eq 'allfields')
      {
        $shortname = 'ZZ';
      }
      elsif (defined $self->{'buildproc'}->{'indexfieldmap'}->{$field})
      {
        $shortname = $self->{'buildproc'}->{'indexfieldmap'}->{$field};
      }
      else
      {
        print STDERR 'Error! Couldn\'t find indexfieldmap for field: ' . $field . "\n";
      }
      push (@indexfieldmap, $field . '->' . $shortname);
      push (@indexfields, $field);
    }
  }

  if (scalar @indexfieldmap)
  {
    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
  }

  if (scalar @indexfields)
  {
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
  }
}
257
# Generate solr schema.xml file based on 'indexfieldmap' and other associated
# config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl
264
265
# Writes the Solr configuration files into <GSDLCOLLECTDIR>/etc/conf.
#
# schema.xml is produced from <GEXT_SOLR>/conf/schema.xml.in by replacing
# the marker line
#
#   <!-- ##GREENSTONE-FIELDS## -->
#
# with one line of the form
#
#   <field name="<shortname>" type="..." ... />
#
# per entry in build_cfg's 'indexfieldmap'. The remaining config files are
# copied from their ".in" templates without any substitution.
sub premake_solr_auxiliary_files
{
    my ($self) = @_;

    # Build the replacement text, one <field/> declaration per mapping
    my @field_decls = ();
    foreach my $mapping (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        # each mapping reads "<field>-><shortname>"; the shortname is what
        # Solr gets to see
        my ($shortname) = ($mapping =~ m/^.*->(.*)$/);

        # LA and LO get the "location" type, everything else is text
        my $type_attr = ($shortname eq "LA" || $shortname eq "LO")
            ? "type=\"location\" "
            : "type=\"text_en_splitting\" ";

        push(@field_decls,
             "    "                             # indent
             . "<field name=\"$shortname\" "
             . $type_attr
             . "indexed=\"true\" stored=\"false\" multiValued=\"true\" />\n");
    }
    my $schema_insert_xml = join("", @field_decls);

    # just the one rule to date
    my $marker_rule = {
        'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
        'insert' => $schema_insert_xml,
    };
    my $insert_rules = [ $marker_rule ];

    # templates live in the Solr extension, output goes into the collect area
    my $in_dirname  = &FileUtils::filenameConcatenate($ENV{'GEXT_SOLR'},"conf");
    my $out_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc","conf");

    my $schema_in_filename  = &FileUtils::filenameConcatenate($in_dirname,"schema.xml.in");
    my $schema_out_filename = &FileUtils::filenameConcatenate($out_dirname,"schema.xml");

    # make sure output conf directory exists
    &FileUtils::makeDirectory($out_dirname)
        unless FileUtils::directoryExists($out_dirname);

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering
    foreach my $file ("solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                      "synonyms.txt", "protwords.txt") {
        filter_in_out_file(&FileUtils::filenameConcatenate($in_dirname,$file.".in"),
                           &FileUtils::filenameConcatenate($out_dirname,$file),
                           []);
    }
}
338
339
# Prepares Solr before any index is built:
#   1. starts the Solr server object and keeps it in $self->{'solr_server'}
#   2. works out the indexfield -> shortname mapping
#   3. writes schema.xml and the related config files
#   4. creates (or reloads) one Solr core per entry in $self->{'levels'}
#
# Arguments: $indexname - optional; when it contains a word character only
#                         that index is considered, otherwise all of
#                         collect_cfg's 'indexes' are.
sub pre_build_indexes
{
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is "reading and listening"
    my $solr_server = solrserver->new($self->{'build_dir'});
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $all_metadata_specified = 0; # has the user added a 'metadata' index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @all_fields = ();

    foreach my $index (@$indexes) {
        next unless $self->want_built($index);

        # get the parameters for the output
        # split on : just in case there is subcoll and lang stuff
        my ($fields) = split (/:/, $index);

        foreach my $field (split (/;/, $fields)) {
            if ($field eq "metadata") {
                $all_metadata_specified = 1;
            }
            else {
                push(@all_fields,$field);
            }
        }
    }

    my $buildproc = $self->{'buildproc'};

    if ($all_metadata_specified) {

        # (Unfortunately) we need to process all the documents in the
        # collection to figure out what the metadata_field_mapping is

        # set up the document processor
        $buildproc->set_output_handle (undef);
        $buildproc->set_mode ('index_field_mapping');
        $buildproc->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});
    }
    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        foreach my $field (@all_fields) {
            # fields that already have a mapping are left alone
            next if defined $buildproc->{'indexfieldmap'}->{$field};

            my $shortname = (defined $buildproc->{'fieldnamemap'}->{$field})
                ? $buildproc->{'fieldnamemap'}->{$field}
                : $buildproc->create_shortname($field);

            $buildproc->{'indexfieldmap'}->{$field} = $shortname;
            $buildproc->{'indexfieldmap'}->{$shortname} = 1;
        }
    }

    # Write out solr 'schema.xml' (and related) file
    #
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores <colname>-Doc and <colname>-Sec

    my $site        = $self->{'site'};
    my $collect     = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collect" : $collect;

    # my $idx = $self->{'index_mapping'}->{$index};
    my $idx = "idx";

    my $build_dir = $self->{'build_dir'};

    # force_removeold == opposite of being run in 'incremental' mode
    my $force_removeold = ($self->{'incremental'}) ? 0 : 1;

    foreach my $level (keys %{$self->{'levels'}}) {

        # index directory is the first character of the level plus "idx"
        my ($pindex) = $level =~ /^(.)/;

        my $index_dir = $pindex.$idx;
        my $core = "$core_prefix-$index_dir";

        if ($force_removeold) {
            print $outhandle "\n-removeold set (new index will be created)\n";

            # create cores under temporary core names, corresponding to
            # building directory
            $core = "building-".$core;

            # start from an empty index directory
            my $full_index_dir = &FileUtils::filenameConcatenate($build_dir,$index_dir);
            &FileUtils::removeFilesRecursive($full_index_dir);
            &FileUtils::makeDirectory($full_index_dir);

            # now go on and create new index
            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);

            next;
        }

        # Incremental build:
        # if collect==core already in solr.xml (check with STATUS)
        # => use RELOAD call to refresh fields now expressed in schema.xml
        #
        # else
        # => use CREATE API to add to solr.xml

        if ($solr_server->admin_ping_core($core)) {
            print $outhandle "Reloading Solr core: $core\n";
            $solr_server->admin_reload_core($core);
        }
        else {
            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);
        }
    }

}
502
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
505
# Builds one index at one level by piping the build processor's output,
# wrapped in <update>...</update>, to solr_passes.pl (or to STDOUT when
# $self->{'debug'} is set).
#
# Arguments:
#   $index  - index specification "fields[:subcollections[:languages]]";
#             subcollections and languages are comma-separated lists
#   $llevel - level tag, translated to a level name through
#             %mgppbuilder::level_map; an unrecognised tag produces a
#             warning and falls back to "document"
#
# Dies if the solr_passes.pl pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($build_dir, $indexdir));

    # The command line used to run solr_passes.pl
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $lang (@languages) {
        # a leading "!" marks a negated language; it is kept on the entry
        my $not = ($lang =~ s/^\!//) ? 1 : 0;
        push (@$langarr, $not ? "!$lang" : "$lang");
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (solr_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $site        = $self->{'site'};
        my $collect     = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $core        = "$core_prefix-$indexdir";

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Cmd: $solr_passes_exe $core index \"$build_dir\" \"$indexdir\"   $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core index \"$build_dir\" \"$indexdir\"   $osextra")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # $! carries the reason the pipe could not be opened
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always

    # translate the level tag back to the name of a configured level
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    if (defined $self->{'collect_cfg'}->{'sortfields'}) {
        $self->{'buildproc'}->set_sortfields ($self->{'collect_cfg'}->{'sortfields'});
    }
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    # STDOUT is left open in debug mode; only the pipe is closed
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # put back the full set of levels now that this single level is done
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}
638
639
# Runs after all indexes have been built.
#
# Stops the Solr/jetty server held in $self->{'solr_server'} if that object
# reports it was explicitly started, then clears the reference. Does nothing
# beyond clearing the slot when no server object is recorded.
sub post_build_indexes {
    my $self = shift(@_);

    # deliberately override to prevent the mgpp post_build_index() calling
    #  $self->make_final_field_list()
    # as this has been done in our pre_build_indexes() phase for solr


    # Also need to stop the Solr/jetty server if it was explicitly started
    # in pre_build_indexes()

    my $solr_server = $self->{'solr_server'};

    # Guard against no server having been recorded, rather than dying on a
    # method call on an undefined value
    if (defined $solr_server && $solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}
660
661
6621;
663
664
Note: See TracBrowser for help on using the browser.