root/gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm @ 25846

Revision 25846, 17.2 KB (checked in by sjm84, 7 years ago)

Some fixes and additions to the Solr perl code

Line 
1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
33use solrserver;
34use Config; # for getting the perlpath in the recommended way
35
# Set up inheritance at compile time: solrbuilder is-a lucenebuilder.
BEGIN {
    @solrbuilder::ISA = qw(lucenebuilder);
}
39
40
# Constructor.
#
# Builds a lucenebuilder object from the supplied arguments, re-blesses it
# into the requested class and records the Solr specific settings:
#   'buildtype'       - always "solr"
#   'solr_passes'     - name of the script that feeds documents to Solr
#   'solr_passes_exe' - command line that runs that script with the perl
#                       interpreter named in %Config
#
# Returns the new object.
sub new {
    my $class = shift(@_);

    # Explicit class-method call rather than the indirect-object form
    # ("new lucenebuilder (...)"): how the latter is parsed depends on
    # what the compiler has already seen, and this package defines its own
    # sub called 'new'.
    my $self = lucenebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = $solr_passes_script;
    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";
    return $self;
}
55
56
# Name of the build processor used when none is configured explicitly.
sub default_buildproc {
    my ($self) = @_;

    return 'solrbuildproc';
}
62
# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Arguments:
#   $textindex - index specification; a leading "section:" is stripped
#
# Does nothing when $self->{'no_text'} is set.  Unless $self->{'debug'} is
# set (in which case the output goes to STDOUT) the documents are piped
# into solr_passes in 'text' mode.  Dies if that pipe cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i)
    {
        # NOTE(review): $text_dir is not used again after this point, while
        # build_index() converts $build_dir here - confirm which was meant
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $site    = $self->{'site'};
        my $collect = $self->{'collection'};
        # unused in this call to solr_passes
        my $core    = (defined $site) ? "$site-$collect" : $collect;

        # build the command once so that what is logged is what is run
        my $cmd = "$solr_passes_exe $core text \"$build_dir\" \"dummy\"   $osextra";

        print STDERR "Executable:    $solr_passes_exe\n";
        print STDERR "Sections:      $solr_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $cmd\n";
        if (!open($handle, "|-", $cmd))
        {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    if (!$self->{'debug'})
    {
        # closing a pipe waits for the child process; a false result means
        # the final write failed or the command exited with non-zero status
        if (!close ($handle))
        {
            print $outhandle "Warning: solr_passes (text pass) did not finish cleanly (exit status $?)\n";
        }
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
156
157#----
158
159
160
# Copies $in_filename to $out_filename line by line, applying replacement
# rules on the way.
#
# Arguments:
#   $in_filename   - file to read
#   $out_filename  - file to create/overwrite
#   $replace_rules - array ref of { 'regexp' => ..., 'insert' => ... }
#                    hashes.  Each input line is chomped and tested against
#                    the rules in order; for the first rule whose regexp
#                    matches, the rule's 'insert' text is written verbatim
#                    in place of the line.  Lines matching no rule are
#                    copied, terminated by a newline.
#
# Returns 1 on success.  If either file cannot be opened, or the output
# cannot be completely written, the problem is reported on STDERR and 0 is
# returned.
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    # Three-argument open with lexical handles: file names are taken
    # literally (no mode or whitespace interpretation) and there are no
    # global bareword handles to clash with other code.
    my $in_fh;
    if (!open($in_fh, "<", $in_filename)) {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR "       $!\n";
        return 0;
    }

    my $out_fh;
    if (!open($out_fh, ">", $out_filename)) {
        print STDERR "Error: Failed to open $out_filename\n";
        print STDERR "       $!\n";
        close($in_fh);
        return 0;
    }

    while (defined (my $line = <$in_fh>)) {
        chomp $line;

        my $done_insert = 0;
        foreach my $rule (@{ $replace_rules || [] }) {
            my $line_re = $rule->{'regexp'};

            if ($line =~ m/$line_re/) {
                # the insert text carries its own line endings
                print {$out_fh} $rule->{'insert'};
                $done_insert = 1;
                last;
            }
        }
        if (!$done_insert) {
            print {$out_fh} "$line\n";
        }
    }

    close($in_fh);

    # buffered write errors (e.g. disk full) only surface at close time
    if (!close($out_fh)) {
        print STDERR "Error: Failed to write $out_filename\n";
        print STDERR "       $!\n";
        return 0;
    }

    return 1;
}
204
# Generate the solr schema.xml file (from the build config's 'indexfieldmap')
# together with the other solr config files it goes with.
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl
#
# Templates are read from $GEXT_SOLR/conf/<name>.in and written to
# $GSDLCOLLECTDIR/etc/conf/<name>.  In schema.xml the marker line
#
#   <!-- ##GREENSTONE-FIELDS## -->
#
# is replaced by one line of the form
#
#   <field name="<field>" type="..." ... />
#
# for each <field> in 'indexfieldmap'.  The remaining files currently need
# no filtering and are passed through with an empty rule list.
sub premake_solr_auxiliary_files
{
    my ($self) = @_;

    # fields that do not get the default "string" type
    my %special_type = (
        'LA' => "location",
        'LO' => "location",
        'ZZ' => "text_en_splitting",
        'TX' => "text_en_splitting",
    );

    my $schema_insert_xml = "";

    foreach my $mapping (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        # the field name is whatever follows the (last) "->" in the entry
        my ($field) = ($mapping =~ m/^.*->(.*)$/);

        my $type = $special_type{$field} || "string";

        $schema_insert_xml .= "    <field name=\"$field\" type=\"$type\" "
                            . "indexed=\"true\" stored=\"false\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $marker_rule = { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
                        'insert' => $schema_insert_xml };

    my $in_dirname  = &util::filename_cat($ENV{'GEXT_SOLR'},"conf");
    my $out_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc","conf");

    # make sure output conf directory exists
    &util::mk_dir($out_dirname) unless (-d $out_dirname);

    filter_in_out_file(&util::filename_cat($in_dirname,"schema.xml.in"),
                       &util::filename_cat($out_dirname,"schema.xml"),
                       [ $marker_rule ]);

    # now do the same for solrconfig.xml, stopwords, ...
    foreach my $file (qw(solrconfig.xml stopwords.txt stopwords_en.txt
                         synonyms.txt protwords.txt)) {
        filter_in_out_file(&util::filename_cat($in_dirname,$file.".in"),
                           &util::filename_cat($out_dirname,$file),
                           []);
    }
}
284
285
# Gets Solr ready before any index is built:
#   - creates and starts a solrserver object, which is remembered in
#     $self->{'solr_server'} for post_build_indexes()
#   - establishes the index field mapping, either by running every document
#     through the build processor (when a 'metadata' index was requested)
#     or directly from the field names of the requested indexes
#   - writes schema.xml and the related Solr config files
#   - creates (or, in incremental mode, reloads if present) one Solr core
#     per entry of $self->{'levels'}
#
# Arguments:
#   $indexname - optional; if it contains a word character only that index
#                is considered, otherwise the collect config's 'indexes'
sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is "reading and listening"

    # explicit class-method call instead of indirect-object "new solrserver"
    my $solr_server = solrserver->new($self->{'build_dir'});
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $all_metadata_specified = 0; # has the user added a 'metadata' index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @all_fields = ();

    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {

            # get the parameters for the output
            # split on : just in case there is subcoll and lang stuff
            my ($fields) = split (/:/, $index);

            foreach my $field (split (/;/, $fields)) {
                if ($field eq "metadata") {
                    $all_metadata_specified = 1;
                }
                else {
                    push(@all_fields,$field);
                }
            }
        }
    }

    if ($all_metadata_specified) {

        # (Unfortunately) we need to process all the documents in the
        # collection to figure out what the metadata_field_mapping is

        # set up the document processor
        $self->{'buildproc'}->set_output_handle (undef);
        $self->{'buildproc'}->set_mode ('index_field_mapping');
        $self->{'buildproc'}->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});

    }
    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $buildproc = $self->{'buildproc'};

        foreach my $field (@all_fields) {
            if (!defined $buildproc->{'indexfieldmap'}->{$field}) {
                my $shortname = $buildproc->create_shortname($field);
                # record the mapping and mark the short name as taken
                $buildproc->{'indexfieldmap'}->{$field} = $shortname;
                $buildproc->{'indexfieldmap'}->{$shortname} = 1;
            }
        }
    }

    # Write out solr 'schema.xml' (and related) file
    #
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores <colname>-Doc and <colname>-Sec

    my $site        = $self->{'site'};
    my $collect     = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collect" : $collect;

    # my $idx = $self->{'index_mapping'}->{$index};
    my $idx = "idx";

    my $build_dir = $self->{'build_dir'};

    # force_removeold == opposite of being run in 'incremental' mode
    # (does not depend on the level, so worked out once)
    my $force_removeold = ($self->{'incremental'}) ? 0 : 1;

    foreach my $level (keys %{$self->{'levels'}}) {

        # index directories are named after the first letter of the level
        my ($pindex) = $level =~ /^(.)/;

        my $index_dir = $pindex.$idx;
        my $core = "$core_prefix-$index_dir";

        if ($force_removeold) {
            print $outhandle "\n-removeold set (new index will be created)\n";

            my $full_index_dir = &util::filename_cat($build_dir,$index_dir);
            &util::rm_r($full_index_dir);
            &util::mk_dir($full_index_dir);

            # Solr then wants an "index" folder within this general index area!
#           my $full_index_index_dir = &util::filename_cat($full_index_dir,"index");
#           &util::mk_dir($full_index_index_dir);

            # now go on and create new index
            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);

        }
        else {
            # if collect==core already in solr.xml (check with STATUS)
            # => use RELOAD call to refresh fields now expressed in schema.xml
            #
            # else
            # => use CREATE API to add to solr.xml

            my $check_core_exists = $solr_server->admin_ping_core($core);

            if ($check_core_exists) {
                print $outhandle "Reloading Solr core: $core\n";
                $solr_server->admin_reload_core($core);
            }
            else {
                print $outhandle "Creating Solr core: $core\n";
                $solr_server->admin_create_core($core);
            }
        }
    }

}
435
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Builds one index at one level by piping the collection's documents,
# wrapped in <update>...</update>, into solr_passes in 'index' mode (or to
# STDOUT when $self->{'debug'} is set).
#
# Arguments:
#   $index  - index specification "fields[:subcollections[:languages]]",
#             where subcollections and languages are comma separated
#   $llevel - level tag; matched against %mgppbuilder::level_map to pick
#             the level to index ("document" is used if nothing matches)
#
# Dies if the solr_passes pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $osextra = "";
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    # Each entry is passed on exactly as written, including any leading
    # "!" (negation) marker; stripping the marker only to put it straight
    # back on, as was done before, has no effect.
    my $langarr = (defined $language) ? [ split (/,/, $language) ] : [];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (solr_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $site        = $self->{'site'};
        my $collect     = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $core        = "$core_prefix-$indexdir";

        # build the command once so that what is logged is what is run
        my $cmd = "$solr_passes_exe $core index \"$build_dir\" \"$indexdir\"   $osextra";

        print STDERR "Cmd: $cmd\n";
        if (!open($handle, "|-", $cmd)) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # report the OS error ($!); this used to read "!$" by mistake
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    if (!$self->{'debug'}) {
        # closing a pipe waits for the child process; a false result means
        # the final write failed or the command exited with non-zero status
        if (!close ($handle)) {
            print $outhandle "Warning: solr_passes (index pass) did not finish cleanly (exit status $?)\n";
        }
    }

    $self->print_stats();

    # put back the full set of levels for whoever uses the buildproc next
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}
566
567
# Runs after all indexes have been built.
#
# deliberately override to prevent the mgpp post_build_index() calling
#  $self->make_final_field_list()
# as this has been done in our pre_build_indexes() phase for solr
#
# Also stops the Solr/jetty server if it was explicitly started in
# pre_build_indexes(), and forgets the server object.
sub post_build_indexes {
    my $self = shift(@_);

    my $solr_server = $self->{'solr_server'};

    # There is no server object if pre_build_indexes() never stored one;
    # calling a method on undef here would abort the whole build at the
    # very end, so only talk to the server when there is one.
    if (defined $solr_server && $solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}
588
589
5901;
591
592
Note: See TracBrowser for help on using the browser.