root/gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm @ 25889

Revision 25889, 17.6 KB (checked in by ak19, 7 years ago)

Second set of commits for getting activate.pl to deal with solr cores when moving building to index. This time it uses the building- prefix and things still work. However, if the GS3 server is already running, an ant restart is required before searches return results, and it is not yet certain whether the incremental case is covered properly. The index folder is still being created for some reason when building.

###########################################################################
#
# solrbuilder.pm -- perl wrapper for building index with Solr
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


package solrbuilder;

use strict;
no strict 'refs';

use lucenebuilder;
use solrserver;
use Config; # for getting the perlpath in the recommended way

sub BEGIN {
    @solrbuilder::ISA = ('lucenebuilder');
}


sub new {
    my $class = shift(@_);
    my $self = new lucenebuilder (@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = "$solr_passes_script";
    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";
    return $self;
}
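
# For illustration: with the perl path taken from $Config{perlpath}, the
# command string built above typically ends up looking something like
#
#   "/usr/bin/perl" -S "solr_passes.pl"
#
# where -S tells perl to locate solr_passes.pl on the search path.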


sub default_buildproc {
    my $self  = shift (@_);

    return "solrbuildproc";
}

# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $site        = $self->{'site'};
        my $collect     = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $core        = $core_prefix; # unused in this call to solr_passes

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold
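
        # Illustrative example: for a site called "localsite" and a collection
        # called "demo", $core would be "localsite-demo" for an incremental
        # build and "building-localsite-demo" for a full (-removeold) rebuild.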

        print STDERR "Executable:    $solr_passes_exe\n";
        print STDERR "Sections:      $solr_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $solr_passes_exe $core text \"$build_dir\" \"dummy\"   $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core text \"$build_dir\" \"dummy\"   $osextra"))
        {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}

#----



sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    if (open(SIN,"<$in_filename")) {

        if (open(SOUT,">$out_filename")) {

            my $line;
            while (defined ($line=<SIN>)) {
                chomp $line;

                my $done_insert = 0;
                foreach my $rule (@$replace_rules) {
                    my $line_re = $rule->{'regexp'};
                    my $insert  = $rule->{'insert'};

                    if ($line =~ m/$line_re/) {
                        print SOUT $insert;
                        $done_insert = 1;
                        last;
                    }
                }
                if (!$done_insert) {
                    print SOUT "$line\n";
                }
            }

            close(SOUT);
        }
        else {
            print STDERR "Error: Failed to open $out_filename\n";
            print STDERR "       $!\n";
        }

        close(SIN);
    }
    else {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR "       $!\n";
    }

}
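
# Example usage (illustrative only; the real calls are in
# premake_solr_auxiliary_files() below):
#
#   filter_in_out_file($schema_in_filename, $schema_out_filename,
#                      [ { 'regexp' => "^##SOME-MARKER##\$",
#                          'insert' => $generated_xml } ]);
#
# Any input line matching a rule's 'regexp' is replaced by its 'insert' text;
# all other lines are copied through unchanged.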

# Generate solr schema.xml file based on indexfieldmap and other associated
# config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl


sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'
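    #
    # For example (illustrative), an 'indexfieldmap' entry of the form
    # "dc.Title->TI" would contribute the line:
    #
    #   <field name="TI" type="string" indexed="true" stored="false" multiValued="true" />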

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        my ($field) = ($ifm =~ m/^.*->(.*)$/);

        $schema_insert_xml .= "    "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";

        if($field eq "LA" || $field eq "LO")
        {
            $schema_insert_xml .=   "type=\"location\" ";
        }
        elsif ($field ne "ZZ" && $field ne "TX")
        {
            $schema_insert_xml .=   "type=\"string\" ";
        }
        else
        {
            $schema_insert_xml .= "type=\"text_en_splitting\" ";
        }
        $schema_insert_xml .=  "indexed=\"true\" stored=\"false\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
##    my $in_dirname = &util::filename_cat($solr_home,"etc","conf");
    my $in_dirname = &util::filename_cat($solr_home,"conf");
    my $schema_in_filename = &util::filename_cat($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &util::filename_cat($collect_home,"etc","conf");
    my $schema_out_filename = &util::filename_cat($out_dirname,"schema.xml");

    # make sure output conf directory exists
    if (!-d $out_dirname) {
        &util::mk_dir($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &util::filename_cat($in_dirname,$file.".in");
        my $out_filename = &util::filename_cat($out_dirname,$file);
        filter_in_out_file($in_filename,$out_filename,[]);
    }
}
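
# Summary note: the net effect of premake_solr_auxiliary_files() is to read the
# *.in templates from $GEXT_SOLR/conf/ and write filtered copies (schema.xml,
# solrconfig.xml, stopwords.txt, stopwords_en.txt, synonyms.txt, protwords.txt)
# into the collection's etc/conf/ directory, with the generated <field> entries
# substituted into schema.xml.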


sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is ready and listening

    my $solr_server = new solrserver($self->{'build_dir'});
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $all_metadata_specified = 0; # has the user added a 'metadata' index?
    my $allfields_index = 0;        # do we have an allfields index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @all_fields = ();

    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {

            # get the parameters for the output
            # split on : just in case there is subcoll and lang stuff
            my ($fields) = split (/:/, $index);

            foreach my $field (split (/;/, $fields)) {
                if ($field eq "metadata") {
                    $all_metadata_specified = 1;
                }
                else {
                    push(@all_fields,$field);
                }
            }
        }
    }

    if ($all_metadata_specified) {

        # (Unfortunately) we need to process all the documents in the collection
        # to figure out what the metadata_field_mapping is

        # set up the document processor
        $self->{'buildproc'}->set_output_handle (undef);
        $self->{'buildproc'}->set_mode ('index_field_mapping');
        $self->{'buildproc'}->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});

    }

    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $buildproc = $self->{'buildproc'};

        foreach my $field (@all_fields) {
            if (!defined $buildproc->{'indexfieldmap'}->{$field}) {
                my $shortname = $buildproc->create_shortname($field);
                $buildproc->{'indexfieldmap'}->{$field} = $shortname;
                $buildproc->{'indexfieldmap'}->{$shortname} = 1;
            }
        }
    }

    # Write out solr 'schema.xml' (and related) file
    #
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores, one per index level (Doc and Sec)

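    # Illustrative example: for a site called "localsite" and a collection
    # called "demo" with 'document' and 'section' levels, the loop below would
    # create or reload the cores "localsite-demo-didx" and "localsite-demo-sidx"
    # (each prefixed with "building-" when doing a full rebuild).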
    my $site        = $self->{'site'};
    my $collect     = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collect" : $collect;

    # my $idx = $self->{'index_mapping'}->{$index};
    my $idx = "idx";

    my $build_dir = $self->{'build_dir'};

    foreach my $level (keys %{$self->{'levels'}}) {

        my ($pindex) = $level =~ /^(.)/;

        my $index_dir = $pindex.$idx;
        my $core = "$core_prefix-$index_dir";

        # force_removeold == opposite of being run in 'incremental' mode
        my $force_removeold = ($self->{'incremental'}) ? 0 : 1;

        if ($force_removeold) {
            print $outhandle "\n-removeold set (new index will be created)\n";

            # create cores under temporary core names, corresponding to building directory
            $core = "building-".$core;

            my $full_index_dir = &util::filename_cat($build_dir,$index_dir);
            &util::rm_r($full_index_dir);
            &util::mk_dir($full_index_dir);

            # Solr then wants an "index" folder within this general index area!
#           my $full_index_index_dir = &util::filename_cat($full_index_dir,"index");
#           &util::mk_dir($full_index_index_dir);


            # now go on and create new index
            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);

        }
        else {
            # if collect==core already in solr.xml (check with STATUS)
            # => use RELOAD call to refresh fields now expressed in schema.xml
            #
            # else
            # => use CREATE API to add to solr.xml

            my $check_core_exists = $solr_server->admin_ping_core($core);

            if ($check_core_exists) {
                print $outhandle "Reloading Solr core: $core\n";
                $solr_server->admin_reload_core($core);
            }
            else {
                print $outhandle "Creating Solr core: $core\n";
                $solr_server->admin_create_core($core);
            }
        }
    }

}

# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance

sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names (and possibly the doc name) for solr_passes
    my $solr_passes_sections = $llevel;

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
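    # Illustrative example: an index specification such as
    # "dc.Title;text:subcol1:en,!fr" would split into the fields "dc.Title;text",
    # the subcollection "subcol1" and the language list "en,!fr" processed below.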
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach $subcollection (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put language expressions for the
    # ones we want in the index
    my @languages = ();
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    @languages = split /,/, $language if (defined $language);
    foreach my $language (@languages) {
        my $not=0;
        if ($language =~ s/^\!//) {
            $not = 1;
        }
        if($not) {
            push (@$langarr, "!$language");
        } else {
            push (@$langarr, "$language");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (solr_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $site        = $self->{'site'};
        my $collect     = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $ds_idx      = $self->{'index_mapping'}->{$index};
        my $core        = "$core_prefix-$ds_idx";

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Cmd: $solr_passes_exe $core index \"$build_dir\" \"$indexdir\"   $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core index \"$build_dir\" \"$indexdir\"   $osextra")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; # always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

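    # The <update> open/close tags printed below bracket the documents streamed
    # out by plugin::read(), presumably so solr_passes.pl receives the piped
    # content as a single well-formed XML stream.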
    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}


sub post_build_indexes {
    my $self = shift(@_);

    # deliberately override to prevent the mgpp post_build_index() calling
    #  $self->make_final_field_list()
    # as this has been done in our pre_build_indexes() phase for solr


    # Also need to stop the Solr/jetty server if it was explicitly started
    # in pre_build_indexes()

    my $solr_server = $self->{'solr_server'};

    if ($solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}


1;