source: gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm@ 24483

Last change on this file since 24483 was 24483, checked in by davidb, 13 years ago
Reworking of code that detects existing running instance of Solr/Jetty server. This was due to Windows version of Perl not implementing '' on an open call. Code currently messy and needs a further tidy up.
File size: 16.1 KB
Line 
1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
33use solrserver;
34use Config; # for getting the perlpath in the recommended way
35
# Set up inheritance at compile time: solrbuilder derives from lucenebuilder.
BEGIN {
    @solrbuilder::ISA = qw(lucenebuilder);
}
39
40
# Constructor.
#
# Creates the object through the lucenebuilder constructor (all arguments
# are passed through unchanged), re-blesses it into $class and records the
# Solr-specific settings on it:
#   buildtype       - always "solr"
#   solr_passes     - file name of the helper script
#   solr_passes_exe - command prefix that runs the helper script through the
#                     perl binary named by $Config{perlpath}
#
# Returns the blessed object.
sub new {
    my $class = shift(@_);

    # Explicit class-method call.  The indirect-object form
    # ("new lucenebuilder (...)") relies on parser heuristics, which is
    # fragile in a package that defines its own new().
    my $self = lucenebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = $solr_passes_script;
    # Tack perl on the beginning to ensure execution
    # (-S makes perl search PATH for the script)
    $self->{'solr_passes_exe'} = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";

    return $self;
}
55
56
# Returns the name of the default build processor for this builder,
# which is always the fixed string "solrbuildproc".
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "solrbuildproc";
    return $buildproc_name;
}
62
63# This writes a nice version of the text docs
64#
65# Essentially the same as the lucenebuilder.pm version, only using solr_passes
66# => refactor and make better use of inheritance
67#
# Pumps the collection's documents through solr_passes.pl in "text" mode.
#
# Arguments:
#   $textindex - index specification; a leading "section:" is stripped
#
# Behaviour:
#   * returns immediately when $self->{'no_text'} is set
#   * in debug mode the build processor writes to STDOUT; otherwise its
#     output is piped into the solr_passes command
#   * dies if the solr_passes pipe cannot be opened
#   * prints a warning (non-fatal) if the pipe does not close cleanly
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        # NOTE(review): $text_dir is not used after this point, so this
        # conversion has no visible effect here -- confirm whether
        # $build_dir was the intended target.
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $collection = $self->{'collection'};

        print STDERR "Executable: $solr_passes_exe\n";
        print STDERR "Sections: $solr_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $solr_passes_exe $collection text dummy \"$build_dir\" \"dummy\" $osextra\n";
        if (!open($handle, "| $solr_passes_exe $collection text dummy \"$build_dir\" \"dummy\" $osextra"))
        {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # Report the sub that actually failed (the message used to
            # name build_index, copied from there).
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    if (!$self->{'debug'})
    {
        # close() on a pipe waits for the child process and returns false
        # when the close itself fails or the child exits with non-zero
        # status; surface that instead of silently ignoring it.
        if (!close ($handle))
        {
            my $why = $! ? "error closing pipe: $!" : "exit status " . ($? >> 8);
            print STDERR "Warning: solrbuilder::compress_text - $solr_passes_exe did not finish cleanly ($why)\n";
        }
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
153
154#----
155
156
157
# Copies $in_filename to $out_filename line by line, applying replacement
# rules on the way.
#
# Arguments:
#   $in_filename   - file to read
#   $out_filename  - file to (over)write
#   $replace_rules - array ref of hash refs, each with
#                      'regexp' => pattern tested against the chomped line
#                      'insert' => text written INSTEAD of the line (written
#                                  verbatim, so it must carry its own newline)
#                    Only the first matching rule is applied to a line.
#                    undef is treated like an empty list.
#
# Lines that match no rule are written out followed by "\n".
#
# Returns 1 on success, 0 if either file could not be opened or the output
# could not be completely written (a message is printed to STDERR).
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    my @rules = @{ $replace_rules || [] };

    # Three-argument open with lexical handles: the file names are taken
    # literally (no whitespace stripping or mode characters interpreted),
    # and no global bareword handles are clobbered.
    my $in_fh;
    if (!open($in_fh, '<', $in_filename)) {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    my $out_fh;
    if (!open($out_fh, '>', $out_filename)) {
        print STDERR "Error: Failed to open $out_filename\n";
        print STDERR " $!\n";
        close($in_fh);
        return 0;
    }

    while (defined (my $line = <$in_fh>)) {
        chomp $line;

        my $done_insert = 0;
        foreach my $rule (@rules) {
            my $line_re = $rule->{'regexp'};

            if ($line =~ m/$line_re/) {
                print $out_fh $rule->{'insert'};
                $done_insert = 1;
                last;
            }
        }

        print $out_fh "$line\n" unless $done_insert;
    }

    close($in_fh);

    # Buffered write errors (e.g. disk full) only show up at close time.
    if (!close($out_fh)) {
        print STDERR "Error: Failed to write $out_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    return 1;
}
201
202# Generate solr schema.xml file based on 'indexfieldmap' and other associated
203# config files
204#
205# Unlike make_auxiliary_files(), this needs to be done up-front (rather
206# than at the end) so the data-types in schema.xml are correctly set up
207# prior to document content being pumped through solr_passes.pl
208
209
# Writes the Solr configuration files for the collection.
#
# schema.xml is produced from schema.xml.in (found under
# $ENV{'GEXT_SOLR'}/etc/conf) by replacing the marker line
#
#     <!-- ##GREENSTONE-FIELDS## -->
#
# with one line of the form
#
#     <field name="<field>" type="string" ... />
#
# for each entry of $self->{'build_cfg'}->{'indexfieldmap'}; <field> is the
# part of the entry after the last "->".
#
# solrconfig.xml, stopwords.txt, stopwords_en.txt, synonyms.txt and
# protwords.txt are copied from their ".in" templates without filtering.
#
# All output goes to $ENV{'GSDLCOLLECTDIR'}/etc/conf, which is created if
# it does not exist.
sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        my ($field) = ($ifm =~ m/^.*->(.*)$/);

        # An entry without "->" (or with nothing after it) has no usable
        # field name; emitting <field name=""> would produce a broken
        # schema, so skip it and say so.
        if (!defined $field || $field eq "") {
            print STDERR "Warning: ignoring malformed indexfieldmap entry '$ifm'\n";
            next;
        }

        # Need special case for Long/Lat
        # ... but for now treat everything as of type string

        $schema_insert_xml .= " "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";
        $schema_insert_xml .= "type=\"string\" indexed=\"true\" ";
        $schema_insert_xml .= "stored=\"false\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
    my $in_dirname = &util::filename_cat($solr_home,"etc","conf");
    my $schema_in_filename = &util::filename_cat($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &util::filename_cat($collect_home,"etc","conf");
    my $schema_out_filename = &util::filename_cat($out_dirname,"schema.xml");

    # make sure output conf directory exists; mk_all_dir (as used elsewhere
    # in this file) also creates a missing intermediate "etc" directory
    if (!-d $out_dirname) {
        &util::mk_all_dir($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &util::filename_cat($in_dirname,$file.".in");
        my $out_filename = &util::filename_cat($out_dirname,$file);
        filter_in_out_file($in_filename,$out_filename,[]);
    }
}
272
273
# Prepares Solr before any index is built.
#
# Arguments:
#   $indexname - optional; when it contains a word character only this
#                index is considered, otherwise all entries of
#                $self->{'collect_cfg'}->{'indexes'} are
#
# Steps:
#   1. create a solrserver object, start it, and remember it in
#      $self->{'solr_server'}
#   2. work out the index field mapping (a full pass over the documents is
#      needed when a 'metadata' index was requested)
#   3. write schema.xml and the related config files
#   4. for every level in $self->{'levels'}, reload the matching Solr core
#      if it already exists, otherwise create it
sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is "ready and listening"

    # Explicit class-method call instead of the indirect-object form
    # ("new solrserver()"), which relies on parser heuristics.
    my $solr_server = solrserver->new();
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $all_metadata_specified = 0; # has the user added a 'metadata' index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @all_fields = ();

    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {

            # get the parameters for the output
            # split on : just in case there is subcoll and lang stuff
            my ($fields) = split (/:/, $index);

            foreach my $field (split (/;/, $fields)) {
                if ($field eq "metadata") {
                    $all_metadata_specified = 1;
                }
                else {
                    push(@all_fields,$field);
                }
            }
        }
    }

    if ($all_metadata_specified) {

        # (Unfortunately) we need to process all the documents in the
        # collection to figure out what the metadata_field_mapping is

        # set up the document processor
        $self->{'buildproc'}->set_output_handle (undef);
        $self->{'buildproc'}->set_mode ('index_field_mapping');
        $self->{'buildproc'}->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});

    }

    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $buildproc = $self->{'buildproc'};

        foreach my $field (@all_fields) {
            # the defined-check also takes care of duplicate field names
            if (!defined $buildproc->{'indexfieldmap'}->{$field}) {
                my $shortname = $buildproc->create_shortname($field);
                $buildproc->{'indexfieldmap'}->{$field} = $shortname;
                $buildproc->{'indexfieldmap'}->{$shortname} = 1;
            }
        }
    }

    # Write out solr 'schema.xml' (and related) file
    #
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => one core per level, named
    #    <collection>-<first letter of level><idx>

    my $collection = $self->{'collection'};

    # my $idx = $self->{'index_mapping'}->{$index};
    my $idx = "idx";

    foreach my $level (keys %{$self->{'levels'}}) {

        my ($pindex) = $level =~ /^(.)/;

##      my $llevel = $mgppbuilder::level_map{$level};
##      my $core = $collection."-".lc($llevel);

        my $core = $collection."-".$pindex.$idx;

        # if collect==core already in solr.xml (check with STATUS)
        # => use RELOAD call to refresh fields now expressed in schema.xml
        #
        # else
        # => use CREATE API to add to solr.xml

        my $check_core_exists = $solr_server->admin_ping_core($core);

        if ($check_core_exists) {
            print $outhandle "Reloading Solr core: $core\n";
            $solr_server->admin_reload_core($core);
        }
        else {
            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);
        }
    }

}
399
400# Essentially the same as the lucenebuilder.pm version, only using solr_passes
401# => refactor and make better use of inheritance
402
# Builds one Solr index by piping the collection's documents through
# solr_passes.pl in "index" mode.
#
# Arguments:
#   $index  - index specification "fields[:subcollections[:languages]]",
#             where subcollections and languages are comma-separated lists
#   $llevel - level tag; matched against the values of
#             %mgppbuilder::level_map to find the level to index.  An
#             unrecognised tag falls back to "document" with a warning.
#
# Behaviour:
#   * in debug mode the build processor writes to STDOUT; otherwise its
#     output is piped into the solr_passes command
#   * passes "-removeold" to solr_passes unless $self->{'incremental'}
#   * dies if the solr_passes pipe cannot be opened
#   * prints a warning (non-fatal) if the pipe does not close cleanly
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    @languages = split /,/, $language if (defined $language);
    foreach my $lang (@languages) {
        # Entries are handed on exactly as written, including any leading
        # "!" (the earlier code stripped and then re-added it).
        push (@$langarr, $lang);
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (solr_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $collection = $self->{'collection'};
        my $ds_idx = $self->{'index_mapping'}->{$index};

        print STDERR "Cmd: $solr_passes_exe $opt_create_index $collection index $ds_idx \"$build_dir\" \"$indexdir\" $osextra\n";
        if (!open($handle, "| $solr_passes_exe $opt_create_index $collection index $ds_idx \"$build_dir\" \"$indexdir\" $osextra")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # $! carries the OS error.  (This used to read "!$\n", which
            # interpolated $\ -- the output record separator -- instead.)
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    if (!$self->{'debug'}) {
        # close() on a pipe waits for the child process and returns false
        # when the close itself fails or the child exits with non-zero
        # status; surface that instead of silently ignoring it.
        if (!close ($handle)) {
            my $why = $! ? "error closing pipe: $!" : "exit status " . ($? >> 8);
            print STDERR "Warning: solrbuilder::build_index - $solr_passes_exe did not finish cleanly ($why)\n";
        }
    }

    $self->print_stats();

    # restore the full set of levels for whatever runs next
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}
530
531
# Clean-up hook run after all indexes have been built.
#
# Deliberately overrides the inherited version to prevent the mgpp
# post_build_indexes() calling $self->make_final_field_list(), as this has
# already been done in our pre_build_indexes() phase for solr.
#
# Stops the Solr/Jetty server recorded in $self->{'solr_server'} if that
# object reports that it was explicitly started, then clears the slot.
# Safe to call when no server object was recorded.
sub post_build_indexes {
    my $self = shift(@_);

    my $solr_server = $self->{'solr_server'};

    # The slot is empty if pre_build_indexes() never ran (or failed before
    # storing the server); calling a method on undef would die here.
    if (defined $solr_server && $solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}
552
553
5541;
555
556
Note: See TracBrowser for help on using the repository browser.