source: gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm@ 24447

Last change on this file since 24447 was 24447, checked in by davidb, 13 years ago

Tidy up of code (removing commented out redundant code), plus tweaking of code that starts and stops jetty to cope with situation where the server is already running

File size: 14.8 KB
Line 
1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
33use Config; # for getting the perlpath in the recommended way
34
# Set up inheritance at compile time: solrbuilder derives from lucenebuilder.
BEGIN {
    @solrbuilder::ISA = qw(lucenebuilder);
}
38
39
# Constructor.
#
# Creates the object through the lucenebuilder constructor (all arguments
# are passed through unchanged), re-blesses it into the requested class and
# records the Solr specific settings:
#
#   buildtype        "solr"
#   solr_passes      name of the helper script (solr_passes.pl)
#   solr_passes_exe  command prefix that runs that script through the perl
#                    interpreter named by $Config{perlpath}
#
# Returns the blessed object.
sub new {
    my $class = shift(@_);

    # Explicit class-method call instead of the indirect-object form
    # "new lucenebuilder (...)", whose parse depends on which subs and
    # packages perl has already seen at compile time.
    my $self = lucenebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = $solr_passes_script;
    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";

    return $self;
}
54
55
# Name of the build-processor class this builder uses by default.
sub default_buildproc {
    my ($self) = @_;

    return 'solrbuildproc';
}
61
# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
#   => refactor and make better use of inheritance
#
# Arguments:
#   $textindex - index specification; a leading "section:" is stripped
#
# Does nothing (returns immediately) when $self->{'no_text'} is set.
# In debug mode the document stream is written to STDOUT, otherwise it is
# piped into solr_passes.pl running in 'text' mode.
# Dies if the pipe to solr_passes.pl cannot be opened; a failure reported
# when the pipe is closed only produces a warning on STDERR.
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i)
    {
        $text_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR")
    {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $collection = $self->{'collection'};

        # Build the command once so the logged and the executed command
        # cannot drift apart.
        my $cmd = "$solr_passes_exe $collection text $solr_passes_sections \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Executable: $solr_passes_exe\n";
        print STDERR "Sections: $solr_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $cmd\n";
        if (!open($handle, "| $cmd"))
        {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Closing a pipe waits for the child process; a false result means the
    # close itself failed or the child exited with a non-zero status.
    if (!$self->{'debug'} && !close ($handle))
    {
        print STDERR "Warning: $solr_passes_exe did not finish cleanly (exit status $?)\n";
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
152
153#----
154
155
156
# Copies $in_filename to $out_filename line by line, applying replacement
# rules on the way.
#
# Arguments:
#   $in_filename   - file to read
#   $out_filename  - file to create / overwrite
#   $replace_rules - array ref of hash refs, each with the keys
#                      'regexp' : pattern tested against the chomped line
#                      'insert' : text written INSTEAD of a matching line
#                                 (written verbatim, no newline is appended)
#
# For each input line the first matching rule wins; a line that matches no
# rule is copied through followed by "\n".
#
# Problems are reported on STDERR rather than raised.  Returns 1 on success,
# 0 if either file could not be opened or the output could not be written
# completely.
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    # Three-argument open with lexical handles: file names are taken
    # literally (no mode characters or surrounding whitespace are
    # interpreted) and the handles cannot clash with package globals.
    my $sin;
    if (!open($sin, '<', $in_filename)) {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    my $sout;
    if (!open($sout, '>', $out_filename)) {
        print STDERR "Error: Failed to open $out_filename\n";
        print STDERR " $!\n";
        close($sin);
        return 0;
    }

    while (defined (my $line = <$sin>)) {
        chomp $line;

        my $done_insert = 0;
        foreach my $rule (@$replace_rules) {
            my $line_re = $rule->{'regexp'};
            my $insert = $rule->{'insert'};

            if ($line =~ m/$line_re/) {
                print {$sout} $insert;
                $done_insert = 1;
                last;
            }
        }
        if (!$done_insert) {
            print {$sout} "$line\n";
        }
    }

    close($sin);

    # Buffered write errors (e.g. a full disk) only surface at close time.
    if (!close($sout)) {
        print STDERR "Error: Failed to write $out_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    return 1;
}
200
# Generate solr schema.xml file based on indexfieldmap and other associated
# config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl
#
# Reads the templates from $ENV{GEXT_SOLR}/etc/conf (each named <file>.in)
# and writes the results to $ENV{GSDLCOLLECTDIR}/etc/conf.

sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        my ($field) = ($ifm =~ m/^.*->(.*)$/);

        # An entry without "->", or with nothing after it, yields no field
        # name; emitting <field name="" .../> would only corrupt the
        # schema, so skip such entries.
        if (!defined $field || $field eq "") {
            print STDERR "Warning: ignoring malformed indexfieldmap entry '$ifm'\n";
            next;
        }

        # Need special case for Long/Lat
        # ... but for now treat everything as of type string

        $schema_insert_xml .= " "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";
        $schema_insert_xml .= "type=\"string\" indexed=\"true\" ";
        $schema_insert_xml .= "stored=\"false\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
    my $in_dirname = &util::filename_cat($solr_home,"etc","conf");
    my $schema_in_filename = &util::filename_cat($in_dirname,"schema.xml.in");


    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &util::filename_cat($collect_home,"etc","conf");
    my $schema_out_filename = &util::filename_cat($out_dirname,"schema.xml");

    # make sure output conf directory exists
    if (!-d $out_dirname) {
        &util::mk_dir($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &util::filename_cat($in_dirname,$file.".in");
        my $out_filename = &util::filename_cat($out_dirname,$file);
        filter_in_out_file($in_filename,$out_filename,[]);
    }
}
271
272
# Establishes the index field mapping before any index pass runs, then
# writes the Solr configuration files that are generated from it.
#
# Arguments:
#   $indexname - optional; when it contains a word character only this
#                index is looked at, otherwise every entry of
#                $self->{'collect_cfg'}->{'indexes'} is.
sub pre_build_indexes
{
    my ($self, $indexname) = @_;

    # read in build.cfg if in incremental mode???

    my $index_list = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $wants_all_metadata = 0;   # has the user added a 'metadata' index?

    # Using a hashmap here would avoid duplicates, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @named_fields = ();

    foreach my $index_spec (@$index_list) {
        next unless $self->want_built($index_spec);

        # split on : just in case there is subcoll and lang stuff
        my ($field_part) = split (/:/, $index_spec);

        foreach my $name (split (/;/, $field_part)) {
            if ($name ne "metadata") {
                push(@named_fields, $name);
            }
            else {
                $wants_all_metadata = 1;
            }
        }
    }

    if (!$wants_all_metadata) {
        # Field mapping solely dependent on the entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $buildproc = $self->{'buildproc'};

        foreach my $name (@named_fields) {
            next if defined $buildproc->{'indexfieldmap'}->{$name};

            my $short = $buildproc->create_shortname($name);
            $buildproc->{'indexfieldmap'}->{$name} = $short;
            $buildproc->{'indexfieldmap'}->{$short} = 1;
        }
    }
    else {
        # (Unfortunately) we need to process all the documents in the
        # collection to figure out what the metadata_field_mapping is

        # set up the document processor
        $self->{'buildproc'}->set_output_handle (undef);
        $self->{'buildproc'}->set_mode ('index_field_mapping');
        $self->{'buildproc'}->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});
    }

    # write out solr 'schema.xml' (and related) file
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # if collect==core not already in solr.xml (check with STATUS)
    #   => use CREATE API to add to solr.xml
    #
    # else
    #   => use RELOAD call to refresh fields now expressed in schema.xml
}
363
# Builds one index by streaming the collection's documents through
# solr_passes.pl in 'index' mode.
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
#   => refactor and make better use of inheritance
#
# Arguments:
#   $index  - index specification "fields[:subcollections[:languages]]";
#             subcollections and languages are comma separated lists
#   $llevel - level tag; passed to solr_passes.pl as its sections argument
#             and looked up in %mgppbuilder::level_map to choose the
#             document level to work on (falls back to "document")
#
# In debug mode the document stream is written to STDOUT instead of being
# piped into solr_passes.pl.  Dies if the pipe cannot be opened; a failure
# reported when the pipe is closed only produces a warning on STDERR.

sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names and possibly the doc name for solrpasses
    my $solr_passes_sections = $llevel;

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    # (the leading fields part is not needed here)
    my (undef, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
        push (@$indexexparr, $expression) if (defined $expression);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    # Every language entry is handed on exactly as written, including a
    # leading "!" (presumably a negation marker -- confirm against the
    # build processor's set_index_languages).
    push (@$langarr, split (/,/, $language)) if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (solr_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $collection = $self->{'collection'};

        # Build the command once so the logged and the executed command
        # cannot drift apart.
        my $cmd = "$solr_passes_exe $opt_create_index $collection index $solr_passes_sections \"$build_dir\" \"$indexdir\" $osextra";

        print STDERR "Cmd: $cmd\n";
        if (!open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # report the OS error ($!) that made the open fail
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});


    print $handle "</update>\n";

    # Closing a pipe waits for the child process; a false result means the
    # close itself failed or the child exited with a non-zero status.
    if (!$self->{'debug'} && !close ($handle)) {
        print STDERR "Warning: $solr_passes_exe did not finish cleanly (exit status $?)\n";
    }

    $self->print_stats();

    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}
493
494
# Intentionally a no-op.
#
# deliberately override to prevent the mgpp post_build_index() calling
#   $self->make_final_field_list()
# as this has been done in our pre_build_indexes() phase for solr
sub post_build_indexes {
    my ($self) = (shift(@_));

    # the invocant is handed back, as it is the value of the only statement
    return $self;
}
503
504
5051;
506
507
Note: See TracBrowser for help on using the repository browser.