source: gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm@ 24501

Last change on this file since 24501 was 24501, checked in by davidb, 13 years ago

Relocation of files to make solr.solr.home more natural. Also, control more carefully the order in which the build_dir/index_dir folder is deleted: for Solr this needs to happen earlier than for Lucene.

File size: 16.7 KB
Line 
1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
33use solrserver;
34use Config; # for getting the perlpath in the recommended way
35
# Establish the inheritance chain at compile time: solrbuilder is a
# specialisation of lucenebuilder.
BEGIN {
    @solrbuilder::ISA = qw(lucenebuilder);
}
39
40
# Constructor.
#
# The object is created by the lucenebuilder constructor (all arguments are
# passed straight through), re-blessed into the requested class and then
# given the Solr-specific settings:
#   buildtype       - "solr"
#   solr_passes     - name of the helper script that feeds Solr
#   solr_passes_exe - command used to launch that script through perl
#
# Returns the new builder object.
sub new {
    my $class = shift(@_);

    # Explicit method-call syntax.  The indirect form ("new lucenebuilder")
    # depends on parser heuristics, which is fragile inside a package that
    # defines its own new().
    my $self = lucenebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = $solr_passes_script;
    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";

    return $self;
}
55
56
# Name of the build-processor class this builder uses by default.
sub default_buildproc {
    my ($self) = @_;

    return 'solrbuildproc';
}
62
# This writes a nice version of the text docs, by running every document
# through the build processor in 'text' mode and piping the result into
# solr_passes.pl.
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Parameters:
#   $textindex - index specification; a leading "section:" is stripped
#
# Does nothing when $self->{'no_text'} is set.  When $self->{'debug'} is set
# the output goes to STDOUT instead of being piped to solr_passes.
# Dies if the pipe to solr_passes cannot be opened; a pipe that fails on
# close is reported on STDERR as a warning.
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i)
    {
        # NOTE(review): $text_dir is not used again after this point; the
        # path actually handed to solr_passes below is $build_dir - confirm
        # whether that is the one that was meant to be converted.
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $site = $self->{'site'};
        my $collect = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $core = $core_prefix; # unused in this call to solr_passes

        print STDERR "Executable: $solr_passes_exe\n";
        print STDERR "Sections: $solr_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $solr_passes_exe $core text \"$build_dir\" \"dummy\" $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core text \"$build_dir\" \"dummy\" $osextra"))
        {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    if (!$self->{'debug'} && !close ($handle))
    {
        # Closing a pipe fails when the child exits non-zero ($! is then 0
        # and $? holds the status) or when the close itself fails.
        my $why = $! ? "$!" : "exit status " . ($? >> 8);
        print STDERR "Warning: $solr_passes_exe did not finish cleanly ($why)\n";
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
156
157#----
158
159
160
# Copy $in_filename to $out_filename line by line, replacing whole lines
# according to $replace_rules.
#
# Parameters:
#   $in_filename   - file to read
#   $out_filename  - file to create/overwrite
#   $replace_rules - array ref of hash refs, each with
#                      'regexp' => pattern tested against the chomped line
#                      'insert' => text written verbatim instead of a
#                                  matching line (no newline is appended)
#                    Only the first matching rule is applied to a line.
#
# Lines that match no rule are written out followed by "\n".
#
# Returns 1 on success, 0 if either file could not be opened or the output
# could not be completely written; the reason is printed on STDERR.
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    # Three-argument open with lexical handles: the file names are never
    # interpreted as open modes, and no package-global handle is clobbered.
    my $in_fh;
    if (!open($in_fh, '<', $in_filename)) {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    my $out_fh;
    if (!open($out_fh, '>', $out_filename)) {
        print STDERR "Error: Failed to open $out_filename\n";
        print STDERR " $!\n";
        close($in_fh);
        return 0;
    }

    my $rules = (defined $replace_rules) ? $replace_rules : [];

  LINE:
    while (defined (my $line = <$in_fh>)) {
        chomp $line;

        foreach my $rule (@$rules) {
            my $line_re = $rule->{'regexp'};

            if ($line =~ m/$line_re/) {
                # the matching line is replaced wholesale by the insert text
                print $out_fh $rule->{'insert'};
                next LINE;
            }
        }

        print $out_fh "$line\n";
    }

    close($in_fh);

    # Buffered write errors only surface when the handle is closed.
    if (!close($out_fh)) {
        print STDERR "Error: Failed to write $out_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    return 1;
}
204
# Generate solr schema.xml file based on 'indexfieldmap' and other associated
# config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl
#
# Reads the "*.in" templates from $ENV{'GEXT_SOLR'}/conf and writes the
# generated files to $ENV{'GSDLCOLLECTDIR'}/etc/conf.  Entries of
# 'indexfieldmap' that do not have the form "<something>-><field>" are
# skipped with a warning on STDERR.

sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        my ($field) = ($ifm =~ m/^.*->(.*)$/);

        # An entry without "->" (or with nothing after it) would otherwise
        # produce <field name="" ...> in the schema.
        if (!defined $field || $field eq "") {
            print STDERR "Warning: ignoring malformed indexfieldmap entry '$ifm'\n";
            next;
        }

        # Need special case for Long/Lat
        # ... but for now treat everything as of type string

        $schema_insert_xml .= " "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";
        $schema_insert_xml .= "type=\"string\" indexed=\"true\" ";
        $schema_insert_xml .= "stored=\"false\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
##    my $in_dirname = &util::filename_cat($solr_home,"etc","conf");
    my $in_dirname = &util::filename_cat($solr_home,"conf");
    my $schema_in_filename = &util::filename_cat($in_dirname,"schema.xml.in");


    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &util::filename_cat($collect_home,"etc","conf");
    my $schema_out_filename = &util::filename_cat($out_dirname,"schema.xml");

    # make sure output conf directory exists; it is two levels below the
    # collect directory, so create any missing parent as well
    if (!-d $out_dirname) {
        &util::mk_all_dir($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &util::filename_cat($in_dirname,$file.".in");
        my $out_filename = &util::filename_cat($out_dirname,$file);
        filter_in_out_file($in_filename,$out_filename,[]);
    }
}
276
277
# Prepare Solr before any index is built.
#
# Steps, in order:
#   1. start the Solr/Jetty server and remember it in $self->{'solr_server'}
#   2. work out the index field mapping (from the documents themselves when
#      a 'metadata' index is requested, otherwise from the index names)
#   3. write schema.xml and the related Solr config files
#   4. for every level in $self->{'levels'}: unless building incrementally,
#      wipe and recreate its index directory; then reload the Solr core if
#      it already exists, or create it
#
# Parameters:
#   $indexname - optional; when it contains a word character only this
#                index is considered, otherwise all of
#                $self->{'collect_cfg'}->{'indexes'}
sub pre_build_indexes
{
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is "reading and listening"

    my $solr_server = solrserver->new($self->{'build_dir'});
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $wants_all_metadata = 0; # has the user added a 'metadata' index?

    # A hash would avoid duplicate entries, but duplicates are harmless
    # here: a field that is already mapped is skipped further down.
    my @index_fields = ();

    foreach my $index (@$indexes) {
        next unless $self->want_built($index);

        # get the parameters for the output
        # split on : just in case there is subcoll and lang stuff
        my ($fields) = split (/:/, $index);

        foreach my $field (split (/;/, $fields)) {
            if ($field eq "metadata") {
                $wants_all_metadata = 1;
            }
            else {
                push(@index_fields, $field);
            }
        }
    }

    if ($wants_all_metadata) {

        # (Unfortunately) we need to process all the documents in the
        # collection to figure out what the metadata_field_mapping is

        # set up the document processor
        my $buildproc = $self->{'buildproc'};
        $buildproc->set_output_handle (undef);
        $buildproc->set_mode ('index_field_mapping');
        $buildproc->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});
    }
    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $buildproc = $self->{'buildproc'};

        foreach my $field (@index_fields) {
            next if defined $buildproc->{'indexfieldmap'}->{$field};

            my $shortname = $buildproc->create_shortname($field);
            $buildproc->{'indexfieldmap'}->{$field} = $shortname;
            $buildproc->{'indexfieldmap'}->{$shortname} = 1;
        }
    }

    # Write out solr 'schema.xml' (and related) file
    #
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores <colname>-Doc and <colname>-Sec

    my $site = $self->{'site'};
    my $collect = $self->{'collection'};
    my $core_prefix = $collect;
    if (defined $site) {
        $core_prefix = "$site-$collect";
    }

    # my $idx = $self->{'index_mapping'}->{$index};
    my $idx = "idx";

    my $build_dir = $self->{'build_dir'};

    # a non-incremental build always starts from an empty index directory
    my $start_from_scratch = !$self->{'incremental'};

    foreach my $level (keys %{$self->{'levels'}}) {

        # the index directory is the first letter of the level plus "idx"
        my ($pindex) = $level =~ /^(.)/;

        my $index_dir = $pindex.$idx;
        my $core = "$core_prefix-$index_dir";

        if ($start_from_scratch) {
            print $outhandle "\n-removeold set (new index will be created)\n";

            my $full_index_dir = &util::filename_cat($build_dir,$index_dir);
            &util::rm_r($full_index_dir);
            &util::mk_dir($full_index_dir);
        }

        # if collect==core already in solr.xml (check with STATUS)
        # => use RELOAD call to refresh fields now expressed in schema.xml
        #
        # else
        # => use CREATE API to add to solr.xml

        if ($solr_server->admin_ping_core($core)) {
            print $outhandle "Reloading Solr core: $core\n";
            $solr_server->admin_reload_core($core);
        }
        else {
            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);
        }
    }

}
414
# Build one index by running every document through the build processor and
# piping the generated <update> stream into solr_passes.pl.
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Parameters:
#   $index  - index specification "fields[:subcollections[:languages]]"
#   $llevel - tag level to index; mapped back to a level name through
#             %mgppbuilder::level_map
#
# When $self->{'debug'} is set the output goes to STDOUT instead of being
# piped to solr_passes.  Dies if the pipe to solr_passes cannot be opened;
# a pipe that fails on close is reported on STDERR as a warning.

sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $osextra = "";
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    # Each language is passed on exactly as written, including any leading
    # "!" that marks a negated language.
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (solr_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $site = $self->{'site'};
        my $collect = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $core = "$core_prefix-$indexdir";

        print STDERR "Cmd: $solr_passes_exe $core index \"$build_dir\" \"$indexdir\" $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core index \"$build_dir\" \"$indexdir\" $osextra")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # report the OS error ($!)
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});


    print $handle "</update>\n";

    if (!$self->{'debug'} && !close ($handle)) {
        # Closing a pipe fails when the child exits non-zero ($! is then 0
        # and $? holds the status) or when the close itself fails.
        my $why = $! ? "$!" : "exit status " . ($? >> 8);
        print STDERR "Warning: $solr_passes_exe did not finish cleanly ($why)\n";
    }

    $self->print_stats();

    # put back the full set of levels for whatever runs next
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}
546
547
# Clean up after all indexes have been built.
#
# Stops the Solr/Jetty server, but only if the server object reports that
# it was explicitly started, and clears $self->{'solr_server'}.  Safe to
# call when no server object has been stored.
sub post_build_indexes {
    my $self = shift(@_);

    # deliberately override to prevent the mgpp post_build_index() calling
    #  $self->make_final_field_list()
    # as this has been done in our pre_build_indexes() phase for solr


    # Also need to stop the Solr/jetty server if it was explicitly started
    # in pre_build_indexes()

    my $solr_server = $self->{'solr_server'};

    # The slot is empty if pre_build_indexes() has not stored a server;
    # calling a method on undef would abort the build.
    if (defined $solr_server && $solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}
568
569
5701;
571
572
Note: See TracBrowser for help on using the repository browser.