root/gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm @ 24447

Revision 24447, 14.8 KB (checked in by davidb, 9 years ago)

Tidy up of code (removing commented out redundant code), plus tweaking of code that starts and stops jetty to cope with situation where the server is already running

Line 
1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
33use Config; # for getting the perlpath in the recommended way
34
# Set up inheritance at compile time: solrbuilder derives from lucenebuilder.
sub BEGIN {
    @solrbuilder::ISA = qw(lucenebuilder);
}
38
39
# Constructor.
#
# Creates the object through the lucenebuilder constructor (all arguments
# are passed straight through), re-blesses it into the requested class and
# records the Solr-specific settings:
#   buildtype        - always "solr"
#   solr_passes      - name of the helper script ("solr_passes.pl")
#   solr_passes_exe  - command prefix used to run that script through the
#                      perl interpreter named in %Config
#
# Returns the blessed object.
sub new {
    my $class = shift(@_);

    # Explicit class-method call.  The indirect-object form
    # ("new lucenebuilder (...)") is resolved heuristically by the parser
    # and can silently bind to the wrong thing.
    my $self = lucenebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";
    $self->{'solr_passes'} = $solr_passes_script;

    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";

    return $self;
}
54
55
# Name of the build processor to use when none has been specified.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "solrbuildproc";
    return $buildproc_name;
}
61
# This writes a nice version of the text docs, by piping the collection's
# documents through solr_passes in 'text' mode.
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Arguments:
#   $textindex - index specification for the text pass; a leading
#                "section:" prefix is removed before it is handed to the
#                build processor
#
# Returns immediately if $self->{'no_text'} is set.  In debug mode the
# output goes to STDOUT; otherwise a pipe to solr_passes is opened and the
# routine dies if that pipe cannot be started.
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        # NOTE(review): $text_dir is not used after this point; only
        # $build_dir is passed to solr_passes.  Kept as in the original --
        # confirm whether $build_dir was the intended target.
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # The perl command used to run solr_passes
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $collection = $self->{'collection'};

        # Build the command line once so that what is logged is exactly
        # what is run.
        my $cmd = "$solr_passes_exe $collection text $solr_passes_sections \"$build_dir\" \"dummy\"   $osextra";

        print STDERR "Executable:    $solr_passes_exe\n";
        print STDERR "Sections:      $solr_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $cmd\n";

        # Deliberately a shell pipe: the command string carries its own
        # quoting and (optionally) a stderr redirection.
        if (!open($handle, "| $cmd"))
        {
            # Capture the OS error before any further I/O can overwrite it
            my $open_error = $!;
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$open_error\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # STDOUT is left open in debug mode; only the pipe is closed
    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
152
153#----
154
155
156
# Copy a text file line by line, replacing selected lines on the way.
#
# Arguments:
#   $in_filename   - file to read
#   $out_filename  - file to (over)write
#   $replace_rules - array ref of hash refs, each with:
#                      'regexp' => pattern tested against each input line
#                                  (line terminator already removed)
#                      'insert' => text written *instead of* a matching
#                                  line, exactly as given (no newline is
#                                  appended)
#                    The first rule that matches a line wins.  Lines that
#                    match no rule are copied through followed by "\n".
#
# Failures to open either file, or to flush the output, are reported on
# STDERR rather than raised.  Returns 1 on success, false otherwise.
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    # Three-argument open with lexical handles: the filename is taken
    # literally (no mode characters or whitespace stripping) and no
    # package-global handles are clobbered.
    my $in_fh;
    if (!open($in_fh, '<', $in_filename)) {
        my $open_error = $!;
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR "       $open_error\n";
        return;
    }

    my $out_fh;
    if (!open($out_fh, '>', $out_filename)) {
        my $open_error = $!;
        print STDERR "Error: Failed to open $out_filename\n";
        print STDERR "       $open_error\n";
        close($in_fh);
        return;
    }

    while (defined (my $line = <$in_fh>)) {
        chomp $line;

        my $done_insert = 0;
        foreach my $rule (@$replace_rules) {
            my $line_re = $rule->{'regexp'};
            my $insert  = $rule->{'insert'};

            if ($line =~ m/$line_re/) {
                print $out_fh $insert;
                $done_insert = 1;
                last;
            }
        }

        if (!$done_insert) {
            print $out_fh "$line\n";
        }
    }

    # Buffered write errors (e.g. disk full) only surface at close time
    my $ok = 1;
    if (!close($out_fh)) {
        my $close_error = $!;
        print STDERR "Error: Failed to write $out_filename\n";
        print STDERR "       $close_error\n";
        $ok = 0;
    }

    close($in_fh);

    return $ok;
}
200
# Generate solr schema.xml file based on indexfieldmap and other associated
# config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl
#
# Reads the "*.in" templates from $GEXT_SOLR/etc/conf and writes the
# resulting files to $GSDLCOLLECTDIR/etc/conf.  Takes no arguments beyond
# $self; the field list comes from $self->{'build_cfg'}->{'indexfieldmap'}.
sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        # Entries have the form "<something>-><field>"; the part after the
        # last "->" is the field name used in the schema
        my ($field) = ($ifm =~ m/^.*->(.*)$/);

        # An entry without a usable field name would otherwise produce
        # <field name="" .../> in schema.xml, so skip it and say so
        if (!defined $field || $field eq "") {
            print STDERR "Warning: ignoring malformed indexfieldmap entry '$ifm'\n";
            next;
        }

        # Need special case for Long/Lat
        # ... but for now treat everything as of type string

        $schema_insert_xml .= "    "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";
        $schema_insert_xml .=   "type=\"string\" indexed=\"true\" ";
        $schema_insert_xml .=   "stored=\"false\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
    my $in_dirname = &util::filename_cat($solr_home,"etc","conf");
    my $schema_in_filename = &util::filename_cat($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &util::filename_cat($collect_home,"etc","conf");
    my $schema_out_filename = &util::filename_cat($out_dirname,"schema.xml");

    # make sure output conf directory exists
    if (!-d $out_dirname) {
        &util::mk_dir($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &util::filename_cat($in_dirname,$file.".in");
        my $out_filename = &util::filename_cat($out_dirname,$file);
        # An empty rule list turns the filter into a plain copy
        filter_in_out_file($in_filename,$out_filename,[]);
    }
}
271
272
# Work out the index field mapping before any index is built, then write
# the Solr configuration files that depend on it.
#
# Arguments:
#   $indexname - optional; when it contains a word character only this
#                index is considered, otherwise every index listed in the
#                collection configuration is used
sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;

    # read in build.cfg if in incremental mode???

    my $wanted_indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $metadata_index_requested = 0; # has the user added a 'metadata' index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @field_names = ();

    foreach my $index (@$wanted_indexes) {
        next unless $self->want_built($index);

        # get the parameters for the output
        # split on : just in case there is subcoll and lang stuff
        my ($field_spec) = split (/:/, $index);

        foreach my $field (split (/;/, $field_spec)) {
            if ($field eq "metadata") {
                $metadata_index_requested = 1;
            }
            else {
                push(@field_names, $field);
            }
        }
    }

    if (!$metadata_index_requested) {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $buildproc = $self->{'buildproc'};
        my $fieldmap  = $buildproc->{'indexfieldmap'};

        foreach my $field (@field_names) {
            next if defined $fieldmap->{$field};

            my $shortname = $buildproc->create_shortname($field);
            $buildproc->{'indexfieldmap'}->{$field} = $shortname;
            $buildproc->{'indexfieldmap'}->{$shortname} = 1;

            # pick up the hash again in case it was only just created
            $fieldmap = $buildproc->{'indexfieldmap'};
        }
    }
    else {
        # (Unfortunately) we need to process all the documents in the
        # collection to figure out what the metadata_field_mapping is

        # set up the document processor
        my $buildproc = $self->{'buildproc'};
        $buildproc->set_output_handle (undef);
        $buildproc->set_mode ('index_field_mapping');
        $buildproc->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});
    }

    # write out solr 'schema.xml' (and related) file
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # if collect==core not already in solr.xml (check with STATUS)
    # => use CREATE API to add to solr.xml
    #
    # else
    # => use RELOAD call to refresh fields now expressed in schema.xml

}
363
# Build a single index by piping the collection's documents, wrapped in an
# <update> element, through solr_passes in 'index' mode.
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Arguments:
#   $index  - index specification, "fields[:subcollections[:languages]]",
#             where subcollections and languages are comma separated
#   $llevel - level tag handed to solr_passes; also used to look up the
#             matching entry in %mgppbuilder::level_map
#
# In debug mode the output goes to STDOUT; otherwise a pipe to solr_passes
# is opened and the routine dies if that pipe cannot be started.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # The perl command used to run solr_passes
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names and possibly the doc name for solrpasses
    my $solr_passes_sections = $llevel;

    # a non-incremental build starts from a clean index
    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);

    # Each entry is passed on unchanged; a negated language keeps its
    # leading "!"
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (solr_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $collection = $self->{'collection'};

        # Build the command line once so that what is logged is exactly
        # what is run.
        my $cmd = "$solr_passes_exe $opt_create_index $collection index $solr_passes_sections \"$build_dir\" \"$indexdir\"   $osextra";

        print STDERR "Cmd: $cmd\n";

        # Deliberately a shell pipe: the command string carries its own
        # quoting and (optionally) a stderr redirection.
        if (!open($handle, "| $cmd")) {
            # Capture the OS error before any further I/O can overwrite it
            my $open_error = $!;
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$open_error\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always

    # Find which of the stored levels corresponds to the requested tag
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    # STDOUT is left open in debug mode; only the pipe is closed
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # put back the full set of levels for whatever runs next
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}
493
494
# Intentionally does no work.
#
# This override exists to stop the inherited mgpp post_build_index() step
# from calling $self->make_final_field_list(): for solr that call has
# already been made in pre_build_indexes().
sub post_build_indexes {
    my $self = shift;
}
503
504
5051;
506
507
Note: See TracBrowser for help on using the browser.