source: gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm@ 24446

Last change on this file since 24446 was 24446, checked in by davidb, 13 years ago

Start of Solr extension for Greenstone3

File size: 17.2 KB
Line 
1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
33use Config; # for getting the perlpath in the recommended way
34
# Establish the inheritance chain at compile time: solrbuilder is a
# specialisation of lucenebuilder.
BEGIN {
    @solrbuilder::ISA = qw(lucenebuilder);
}
38
39
# Constructor.
#
# Creates the object through the lucenebuilder constructor (all arguments
# are forwarded untouched), re-blesses it into the requested class and
# records the Solr-specific settings:
#
#   buildtype        always "solr"
#   solr_passes      name of the helper script that feeds documents to Solr
#   solr_passes_exe  command-line prefix used to launch that script
#
# Returns the new builder object.
sub new {
    my $class = shift(@_);

    # Explicit class-method call.  The indirect-object form
    # ("new lucenebuilder (...)") is ambiguous inside a sub that is itself
    # called "new", so it is deliberately avoided.
    my $self = lucenebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = "$solr_passes_script";

    # Tack perl on the beginning to ensure execution; -S makes perl look
    # the script up on the PATH.
    $self->{'solr_passes_exe'} = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";

    return $self;
}
54
55
# Returns the name of the build-processor class that goes with this
# builder: always "solrbuildproc".
sub default_buildproc {
    my ($self) = @_;

    return "solrbuildproc";
}
61
# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Store the text of every document by running the build processor in
# 'text' mode and piping its output into solr_passes.pl ("text" pass).
#
# Arguments:
#   $textindex - index specification; a leading "section:" is stripped
#                before it is handed to the build processor
#
# Behaviour:
#   * returns immediately when $self->{'no_text'} is set
#   * when $self->{'debug'} is set the output is written to STDOUT and
#     solr_passes is not started
#   * dies if the solr_passes pipe cannot be opened
#   * prints a warning to STDERR if solr_passes does not finish cleanly
sub compress_text
{
    my $self = shift (@_);

    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        # NOTE(review): only $text_dir is converted to backslashes, yet it
        # is $build_dir that is passed to solr_passes below (build_index
        # converts $build_dir instead) -- confirm which one is intended.
        $text_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # The command used to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $collection = $self->{'collection'};

        # Build the command once so the logged text and the executed
        # command cannot drift apart.
        my $cmd = "$solr_passes_exe $collection text $solr_passes_sections \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Executable: $solr_passes_exe\n";
        print STDERR "Sections: $solr_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $cmd\n";

        # Three-argument pipe open: the command string still goes through
        # the shell (needed for the stderr redirection in $osextra).
        if (!open($handle, '|-', $cmd)) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($levels);
    $buildproc->set_db_level ($db_level);
    $buildproc->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child process; a false return means
        # either the close itself failed ($! set) or the child exited with
        # a non-zero status ($? set).
        if (!close($handle)) {
            my $reason = $! ? "error closing pipe: $!" : "exit status " . ($? >> 8);
            print STDERR "Warning: solrbuilder::compress_text - solr_passes did not finish cleanly ($reason)\n";
        }
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
152
153#----
154
155
156
# Copy a text file line by line, replacing selected lines on the way.
#
# Arguments:
#   $in_filename   - file to read
#   $out_filename  - file to write (created or truncated)
#   $replace_rules - reference to an array of rules, each a hash with
#                      'regexp' => pattern tested against the chomped line
#                      'insert' => text written INSTEAD of a matching line
#                                  (written verbatim: it must carry its own
#                                  trailing newline)
#                    Only the first matching rule is applied to a line.
#                    An empty list makes this a plain copy.
#
# Every non-replaced line is written followed by "\n".
#
# Returns 1 on success.  On failure an error is printed to STDERR and 0 is
# returned; nothing is thrown.  The output file is not touched when the
# input file cannot be opened.
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    my $in_handle;
    if (!open($in_handle, '<', $in_filename)) {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    my $out_handle;
    if (!open($out_handle, '>', $out_filename)) {
        print STDERR "Error: Failed to open $out_filename\n";
        print STDERR " $!\n";
        close($in_handle);
        return 0;
    }

    # Compile every pattern once instead of once per input line.
    my @compiled_rules = ();
    foreach my $rule (@{ $replace_rules || [] }) {
        my $pattern = $rule->{'regexp'};
        push(@compiled_rules, [ qr/$pattern/, $rule->{'insert'} ]);
    }

    while (defined (my $line = <$in_handle>)) {
        chomp $line;

        my $done_insert = 0;
        foreach my $compiled (@compiled_rules) {
            my ($line_re, $insert) = @$compiled;

            if ($line =~ $line_re) {
                print {$out_handle} $insert if defined $insert;
                $done_insert = 1;
                last;
            }
        }

        print {$out_handle} "$line\n" if !$done_insert;
    }

    close($in_handle);

    # Buffered write errors (e.g. disk full) only surface at close time.
    if (!close($out_handle)) {
        print STDERR "Error: Failed to write $out_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    return 1;
}
200
# Generate the solr schema.xml file based on 'indexfieldmap' and other
# associated config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl
207
208
# Write the Solr configuration files for the collection.
#
# schema.xml is produced from $GEXT_SOLR/etc/conf/schema.xml.in by replacing
# the marker line
#
#    <!-- ##GREENSTONE-FIELDS## -->
#
# with one line of the form
#
#    <field name="<field>" type="string" ... />
#
# for each entry of $self->{'build_cfg'}->{'indexfieldmap'} (entries have
# the form "<something>-><field>").  The remaining configuration files
# (solrconfig.xml, stopword/synonym/protected-word lists) are copied from
# their ".in" templates unchanged.
#
# Output goes to $GSDLCOLLECTDIR/etc/conf, which is created if necessary.
# Failures to read or write individual files are reported on STDERR by
# filter_in_out_file(); this sub does not die.
sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        my ($field) = ($ifm =~ m/^.*->(.*)$/);

        # An entry without "->" (or with nothing after it) would otherwise
        # produce <field name="" .../>, which is not a usable schema line.
        if (!defined $field || $field eq "") {
            print STDERR "Warning: ignoring malformed indexfieldmap entry '$ifm'\n";
            next;
        }

        # Need special case for Long/Lat
        # ... but for now treat everything as of type string

        $schema_insert_xml .= " "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";
        $schema_insert_xml .= "type=\"string\" indexed=\"true\" ";
        $schema_insert_xml .= "stored=\"false\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
    my $in_dirname = &util::filename_cat($solr_home,"etc","conf");
    my $schema_in_filename = &util::filename_cat($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &util::filename_cat($collect_home,"etc","conf");
    my $schema_out_filename = &util::filename_cat($out_dirname,"schema.xml");

    # Make sure the output conf directory exists.  The path is two levels
    # deep ("etc/conf"), so create any missing parent as well -- the same
    # helper the rest of this file uses for build directories.
    if (!-d $out_dirname) {
        &util::mk_all_dir($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &util::filename_cat($in_dirname,$file.".in");
        my $out_filename = &util::filename_cat($out_dirname,$file);
        filter_in_out_file($in_filename,$out_filename,[]);
    }
}
271
272
# Work out the index-field mapping before any index is built, then write
# the Solr configuration files that depend on it.
#
# Arguments:
#   $indexname - optional; when it contains a word character only this
#                index is considered, otherwise every index listed in
#                $self->{'collect_cfg'}->{'indexes'}
#
# Two strategies:
#   * if any wanted index names the pseudo-field "metadata", all documents
#     are run through the build processor in 'index_field_mapping' mode
#   * otherwise a short name is created for every field named in the
#     wanted indexes that is not yet in the build processor's
#     'indexfieldmap'
#
# Finally calls make_final_field_list() and premake_solr_auxiliary_files().
sub pre_build_indexes
{
    my ($self, $indexname) = @_;

    # read in build.cfg if in incremental mode???

    my $index_specs = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $wants_all_metadata = 0; # has the user added a 'metadata' index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @named_fields = ();

    foreach my $spec (@$index_specs) {
        next unless $self->want_built($spec);

        # split on : just in case there is subcoll and lang stuff
        my ($field_list) = split (/:/, $spec);

        foreach my $name (split (/;/, $field_list)) {
            if ($name eq "metadata") {
                $wants_all_metadata = 1;
                next;
            }
            push(@named_fields, $name);
        }
    }

    if ($wants_all_metadata) {
        # (Unfortunately) we need to process all the documents in the
        # collection to figure out what the metadata_field_mapping is

        # set up the document processor
        my $processor = $self->{'buildproc'};
        $processor->set_output_handle (undef);
        $processor->set_mode ('index_field_mapping');
        $processor->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});

        # just make "delete" stop ???
    }
    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $processor = $self->{'buildproc'};

        foreach my $name (@named_fields) {
            next if defined $processor->{'indexfieldmap'}->{$name};

            my $short = $processor->create_shortname($name);
            $processor->{'indexfieldmap'}->{$name} = $short;
            $processor->{'indexfieldmap'}->{$short} = 1;
        }
    }

    # write out solr 'schema.xml' (and related) file
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # if collect==core not already in solr.xml (check with STATUS)
    # => use CREATE API to add to solr.xml
    #
    # else
    # => use RELOAD call to refresh fields now expressed in schema.xml
}
375
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
378
# Build every wanted index, once per (non-paragraph) level.
#
# The XXXX suffix means this sub does not override an inherited
# build_indexes().  NOTE(review): it looks deliberately parked -- confirm
# before renaming it back.
#
# Arguments:
#   $indexname - optional; when it contains a word character only this
#                index is built, otherwise every index listed in
#                $self->{'collect_cfg'}->{'indexes'}
#
# For each wanted index and each level the entry in
# $self->{'index_mapping'} is temporarily prefixed with the first letter of
# the level name while build_index() runs, and restored afterwards.
sub build_indexesXXXX {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    my $index_specs;
    if (defined $indexname && $indexname =~ /\w/) {
        $index_specs = [ $indexname ];
    }
    else {
        $index_specs = $self->{'collect_cfg'}->{'indexes'};
    }

    # have we got para index?  (warn once, however many levels match)
    if (grep { $_ =~ /paragraph/ } keys %{$self->{'levels'}}) {
        print $outhandle "Warning: Paragraph level indexing not supported by Solr\n";
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($index_specs);

    # build each of the indexes
    foreach my $spec (@$index_specs) {
        if (!$self->want_built($spec)) {
            print $outhandle "\n*** ignoring index $spec\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        my $plain_dir = $self->{'index_mapping'}->{$spec};

        foreach my $level (keys %{$self->{'levels'}}) {
            next if $level =~ /paragraph/; # we don't do para indexing

            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($level_initial) = $level =~ /^(.)/;
            $self->{'index_mapping'}->{$spec} = $level_initial.$plain_dir;

            my $llevel = $mgppbuilder::level_map{$level};
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** building index $spec at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$spec}\n";
            }
            if ($self->{'gli'}) {
                print STDERR "<Stage name='Index' source='$spec' level=$llevel>\n";
            }

            $self->build_index($spec,$llevel);
        }

        # back to the un-prefixed directory name
        $self->{'index_mapping'}->{$spec} = $plain_dir;
    }
}
431
432
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
435
# Write $text to the debugging trace file /tmp/solr.out.
#
#   $mode - open mode, '>' to start a new trace or '>>' to append
#   $text - text to write
#
# The trace is best-effort: if the file cannot be opened (for instance on
# a system without /tmp) nothing is written.  Returns 1 if the text was
# written and the file closed cleanly, 0 otherwise.
sub _write_solr_debug_trace {
    my ($mode, $text) = @_;

    my $trace_handle;
    return 0 unless open($trace_handle, $mode, "/tmp/solr.out");

    binmode($trace_handle, ":utf8");
    print {$trace_handle} $text;

    return close($trace_handle) ? 1 : 0;
}

# Build one index at one level by running the build processor in 'text'
# mode (with text indexing switched on) and piping its output, wrapped in
# <update> ... </update>, into solr_passes.pl ("index" pass).
#
# Arguments:
#   $index  - index specification "fields[:subcollections[:languages]]",
#             with comma-separated subcollections and languages
#   $llevel - level tag handed to solr_passes; it is mapped back to a
#             level name through %mgppbuilder::level_map (unrecognised
#             tags fall back to "document" with a warning)
#
# Behaviour:
#   * the index directory below $self->{'build_dir'} is created if needed
#   * unless $self->{'incremental'} is set, solr_passes is told to remove
#     the old index (-removeold)
#   * when $self->{'debug'} is set the output is written to STDOUT and
#     solr_passes is not started
#   * dies if the solr_passes pipe cannot be opened
#   * prints a warning to STDERR if solr_passes does not finish cleanly
#   * the build processor's levels are reset to $self->{'levels'} at the end
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # The command used to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names for solrpasses
    my $solr_passes_sections = $llevel;

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split (":", $index);

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll_name (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name};
        push (@$indexexparr, $expression) if defined $expression;
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }

    # Each language code is passed on exactly as written, including a
    # leading "!" that marks a negated language.
    my $langarr = [];
    push (@$langarr, split /,/, $language) if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (solr_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $collection = $self->{'collection'};

        # Build the command once so the logged text and the executed
        # command cannot drift apart.
        my $cmd = "$solr_passes_exe $opt_create_index $collection index $solr_passes_sections \"$build_dir\" \"$indexdir\" $osextra";

        print STDERR "Cmd: $cmd\n";

        # Three-argument pipe open: the command string still goes through
        # the shell (needed for the stderr redirection in $osextra).
        if (!open($handle, '|-', $cmd)) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always

    # translate the level tag back into the level name used by buildproc
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($local_levels);
    $buildproc->set_db_level($db_level);
    $buildproc->reset();

    print $handle "<update>\n";
    _write_solr_debug_trace('>', "<update>\n");

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";
    _write_solr_debug_trace('>>', "</update>\n");

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child process; a false return means
        # either the close itself failed ($! set) or the child exited with
        # a non-zero status ($? set).
        if (!close($handle)) {
            my $reason = $! ? "error closing pipe: $!" : "exit status " . ($? >> 8);
            print STDERR "Warning: solrbuilder::build_index - solr_passes did not finish cleanly ($reason)\n";
        }
    }

    $self->print_stats();

    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
570
571
572
# Intentionally empty.
#
# This override exists so that the mgpp post_build_index() step does not
# call $self->make_final_field_list(): for solr that has already been
# done in pre_build_indexes().
sub post_build_indexes {
    my $self = shift;
}
581
582
5831;
584
585
Note: See TracBrowser for help on using the repository browser.