root/gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm @ 24446

Revision 24446, 17.2 KB (checked in by davidb, 8 years ago)

Start of Solr extension for Greenstone3

Line 
1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
33use Config; # for getting the perlpath in the recommended way
34
# Establish the inheritance chain at compile time: solrbuilder derives
# from lucenebuilder.
BEGIN {
    @solrbuilder::ISA = qw(lucenebuilder);
}
38
39
# Constructor.
#
# Builds the object through lucenebuilder's constructor (all arguments are
# passed straight through), re-blesses it into $class and records the
# Solr-specific settings on it:
#
#   buildtype       - "solr"
#   solr_passes     - name of the helper script ("solr_passes.pl")
#   solr_passes_exe - command prefix that runs that script through the perl
#                     interpreter named by $Config{perlpath} (using -S)
#
# Returns the new object.
sub new {
    my $class = shift(@_);

    # Explicit class-method call.  The indirect-object form
    # ("new lucenebuilder (...)") is ambiguous to the parser inside a
    # package that defines its own new().
    my $self = lucenebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = "$solr_passes_script";
    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";
    return $self;
}
54
55
# Returns the name of the build processor that goes with this builder.
sub default_buildproc {
    my ($self) = @_;

    return 'solrbuildproc';
}
61
# This writes a nice version of the text docs by feeding the collection's
# documents through solr_passes.pl in "text" mode.
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Arguments:
#   $textindex - index specification handed to the build processor; a
#                leading "section:" is stripped first
#
# Does nothing when $self->{'no_text'} is set.  When $self->{'debug'} is set
# the build processor writes to STDOUT instead of a pipe to solr_passes.pl.
# Dies if the pipe to solr_passes.pl cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i)
    {
        # NOTE(review): $text_dir is not used after this point (the command
        # below is given $build_dir), so this conversion currently has no
        # effect; build_index() converts $build_dir instead -- confirm which
        # is intended.
        $text_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR")
    {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $collection = $self->{'collection'};

        # Build the command once so that what is logged is exactly what is run.
        my $cmd = "$solr_passes_exe $collection text $solr_passes_sections \"$build_dir\" \"dummy\"   $osextra";

        print STDERR "Executable:    $solr_passes_exe\n";
        print STDERR "Sections:      $solr_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $cmd\n";

        # '|-' with a single command string still goes through the shell,
        # which is needed for the "2>/dev/null" redirection in $osextra.
        if (!open($handle, '|-', $cmd))
        {
            # capture the reason before any further I/O can overwrite $!
            my $open_error = $!;
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$open_error\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # STDOUT is left open in debug mode; only the pipe is closed
    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
152
153#----
154
155
156
# Copy $in_filename to $out_filename line by line, substituting whole lines
# according to $replace_rules.
#
# Arguments:
#   $in_filename   - file to read
#   $out_filename  - file to write (truncated if it already exists)
#   $replace_rules - reference to an array of hashes, each with:
#                      'regexp' - pattern tested against the chomped line
#                      'insert' - text written in place of a matching line
#                                 (written verbatim: no newline is appended)
#                    Only the first matching rule is applied to a line.  An
#                    undefined or empty list copies the file unchanged.
#
# Every non-matching line is written followed by "\n", so the output always
# ends in a newline even when the input does not.
#
# Problems are reported on STDERR rather than raised.  Returns 1 on success
# and 0 if either file could not be opened or the output could not be
# written completely.
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    my @rules = @{ $replace_rules || [] };

    # Three-argument open with lexical handles: the filename is never
    # interpreted as a mode/pipe specification and surrounding whitespace in
    # it is preserved.
    my $in_fh;
    if (!open($in_fh, '<', $in_filename)) {
        my $open_error = $!;
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR "       $open_error\n";
        return 0;
    }

    my $out_fh;
    if (!open($out_fh, '>', $out_filename)) {
        my $open_error = $!;
        print STDERR "Error: Failed to open $out_filename\n";
        print STDERR "       $open_error\n";
        close($in_fh);
        return 0;
    }

    while (defined (my $line = <$in_fh>)) {
        chomp $line;

        my $done_insert = 0;
        foreach my $rule (@rules) {
            my $line_re = $rule->{'regexp'};
            my $insert  = $rule->{'insert'};

            if ($line =~ m/$line_re/) {
                print $out_fh $insert;
                $done_insert = 1;
                last;
            }
        }

        if (!$done_insert) {
            print $out_fh "$line\n";
        }
    }

    close($in_fh);

    # Buffered write errors (e.g. a full disk) only surface at close time.
    if (!close($out_fh)) {
        my $close_error = $!;
        print STDERR "Error: Failed to write $out_filename\n";
        print STDERR "       $close_error\n";
        return 0;
    }

    return 1;
}
200
# Generate solr schema.xml file based on 'indexfieldmap' and other associated
# config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl
#
# Reads the "*.in" templates from $ENV{'GEXT_SOLR'}/etc/conf and writes the
# resulting files to $ENV{'GSDLCOLLECTDIR'}/etc/conf, creating that
# directory if it does not exist.  Takes no arguments besides $self.
sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        # entries have the form "<something>-><field>"
        my ($field) = ($ifm =~ m/^.*->(.*)$/);

        # An entry without "->", or with nothing after it, would otherwise
        # produce a <field name=""> element in the schema; skip it.
        next unless (defined $field && $field ne "");

        # Need special case for Long/Lat
        # ... but for now treat everything as of type string

        $schema_insert_xml .= "    "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";
        $schema_insert_xml .=   "type=\"string\" indexed=\"true\" ";
        $schema_insert_xml .=   "stored=\"false\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
    my $in_dirname = &util::filename_cat($solr_home,"etc","conf");
    my $schema_in_filename = &util::filename_cat($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &util::filename_cat($collect_home,"etc","conf");
    my $schema_out_filename = &util::filename_cat($out_dirname,"schema.xml");

    # make sure output conf directory exists; it is two levels below
    # $collect_home, so create any missing intermediate directory as well
    if (!-d $out_dirname) {
        &util::mk_all_dir($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &util::filename_cat($in_dirname,$file.".in");
        my $out_filename = &util::filename_cat($out_dirname,$file);
        filter_in_out_file($in_filename,$out_filename,[]);
    }
}
271
272
# Work out the index field mapping before any indexing starts, then write
# the Solr configuration files (schema.xml and related).
#
# Arguments:
#   $indexname - optional; when it contains a word character only this index
#                is considered, otherwise every entry of
#                $self->{'collect_cfg'}->{'indexes'} is
sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;

    # read in build.cfg if in incremental mode???

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $wants_all_metadata = 0; # has the user added a 'metadata' index?

    # Using a hashmap here would avoid duplicates, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @named_fields = ();

    foreach my $index (@$indexes) {
        next unless $self->want_built($index);

        # get the parameters for the output
        # split on : just in case there is subcoll and lang stuff
        my ($fields) = split (/:/, $index);

        foreach my $field (split (/;/, $fields)) {
            if ($field eq "metadata") {
                $wants_all_metadata = 1;
                next;
            }
            push(@named_fields, $field);
        }
    }

    my $buildproc = $self->{'buildproc'};

    if ($wants_all_metadata) {
        # (Unfortunately) we need to process all the documents in the
        # collection to figure out what the metadata_field_mapping is

        # set up the document processor
        $buildproc->set_output_handle (undef);
        $buildproc->set_mode ('index_field_mapping');
        $buildproc->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $buildproc, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});

        # just make "delete" stop  ???
    }
    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        foreach my $field (@named_fields) {
            next if defined $buildproc->{'indexfieldmap'}->{$field};

            # record the mapping and mark the short name as taken
            my $shortname = $buildproc->create_shortname($field);
            $buildproc->{'indexfieldmap'}->{$field} = $shortname;
            $buildproc->{'indexfieldmap'}->{$shortname} = 1;
        }
    }

    # if collect==core not already in solr.xml (check with STATUS)
    # => use CREATE API to add to solr.xml
    #
    # else
    # => use RELOAD call to refresh fields now expressed in schema.xml
    # (neither of the above is implemented yet)

    # write out solr 'schema.xml' (and related) file
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();
}
375
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Runs pre_build_indexes() and then build_index() for every wanted index at
# every non-paragraph level.
#
# NOTE(review): the XXXX suffix presumably keeps this sub from overriding an
# inherited build_indexes() -- confirm before renaming.
#
# Arguments:
#   $indexname - optional; when it contains a word character only this index
#                is built, otherwise every entry of
#                $self->{'collect_cfg'}->{'indexes'} is considered
sub build_indexesXXXX {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # have we got para index?  (warn once, however many levels match)
    if (grep { $_ =~ /paragraph/ } keys %{$self->{'levels'}}) {
        print $outhandle "Warning: Paragraph level indexing not supported by Solr\n";
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    my $verbose = ($self->{'verbosity'} >= 1);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if $verbose;
            next;
        }

        my $idx = $self->{'index_mapping'}->{$index};

        foreach my $level (keys %{$self->{'levels'}}) {
            next if $level =~ /paragraph/; # we don't do para indexing

            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($pindex) = $level =~ /^(.)/;
            my $subdir = $pindex.$idx;
            $self->{'index_mapping'}->{$index} = $subdir;

            my $llevel = $mgppbuilder::level_map{$level};
            if ($verbose) {
                print $outhandle "\n*** building index $index at level $llevel in subdirectory $subdir\n";
            }
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }

        # put back the level-independent directory name
        $self->{'index_mapping'}->{$index} = $idx;
    }
}
431
432
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Build one index at one level by piping the build processor's output,
# wrapped in <update>...</update>, into solr_passes.pl in "index" mode.
#
# Arguments:
#   $index  - index specification "fields[:subcollections[:languages]]";
#             subcollections and languages are comma separated
#   $llevel - level name as found among the values of
#             %mgppbuilder::level_map; passed to solr_passes.pl as the
#             section name
#
# When $self->{'debug'} is set the output goes to STDOUT instead of a pipe.
# Dies if the pipe to solr_passes.pl cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names and possibly the doc name for solrpasses
    my $solr_passes_sections = $llevel;

    # a non-incremental build asks solr_passes.pl to remove the old index
    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subcoll (split /,/, $subcollection) {
            my $subcoll_expr = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
            push (@$indexexparr, $subcoll_expr) if defined $subcoll_expr;
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    # Entries are passed on unchanged, including any leading "!" marking a
    # negated language.
    my $langarr = [];
    push (@$langarr, split (/,/, $language)) if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (solr_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $collection = $self->{'collection'};

        # Build the command once so that what is logged is exactly what is run.
        my $cmd = "$solr_passes_exe $opt_create_index $collection index $solr_passes_sections \"$build_dir\" \"$indexdir\"   $osextra";

        print STDERR "Cmd: $cmd\n";

        # '|-' with a single command string still goes through the shell,
        # which is needed for the "2>/dev/null" redirection in $osextra.
        if (!open($handle, '|-', $cmd)) {
            # capture the reason before any further I/O can overwrite $!
            my $open_error = $!;
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$open_error\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always

    # map $llevel back to the key it belongs to in the stored levels
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";

    # NOTE(review): /tmp/solr.out appears to be a debugging trace of the
    # stream sent to Solr (it is reopened in append mode below, so something
    # else presumably writes to it in between) -- confirm whether it is still
    # wanted.  It is skipped quietly where /tmp is not writable.
    if (open(my $trace_fh, '>', '/tmp/solr.out')) {
        binmode($trace_fh,":utf8");
        print $trace_fh "<update>\n";
        close($trace_fh);
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    if (open(my $trace_fh, '>>', '/tmp/solr.out')) {
        binmode($trace_fh,":utf8");
        print $trace_fh "</update>\n";
        close($trace_fh);
    }

    # STDOUT is left open in debug mode; only the pipe is closed
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # put back the full set of levels for whoever uses the buildproc next
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
570
571
572
# Intentionally empty override.
#
# It prevents the mgpp post_build_index() from calling
#   $self->make_final_field_list()
# because for solr that has already been done in pre_build_indexes().
sub post_build_indexes {
    my $self = shift;
}
581
582
5831;
584
585
Note: See TracBrowser for help on using the browser.