source: gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm@ 25889

Last change on this file since 25889 was 25889, checked in by ak19, 12 years ago

Second set of commits for getting activate.pl to deal with solr cores when moving building to index. This time it uses the building- prefix and things still work. However, if the GS3 server is already running, an ant restart is required before searches return results, and it is not yet clear whether the incremental case is covered properly. The index directory is still being created for some reason when building.

File size: 17.6 KB
###########################################################################
#
# solrbuilder.pm -- perl wrapper for building index with Solr
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


package solrbuilder;

use strict;
no strict 'refs';

use lucenebuilder;
use solrserver;
use Config; # for getting the perlpath in the recommended way

sub BEGIN {
    @solrbuilder::ISA = ('lucenebuilder');
}


sub new {
    my $class = shift(@_);
    my $self = new lucenebuilder (@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = "$solr_passes_script";
    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";
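    # (Illustrative note: on a typical Unix-like install the line above
    # expands to something like
    #     "/usr/bin/perl" -S "solr_passes.pl"
    # where -S tells perl to locate solr_passes.pl via the PATH; the actual
    # perl binary path comes from $Config{perlpath}.)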
    return $self;
}


sub default_buildproc {
    my $self = shift (@_);

    return "solrbuildproc";
}

# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $site = $self->{'site'};
        my $collect = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $core = $core_prefix; # unused in this call to solr_passes

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Executable: $solr_passes_exe\n";
        print STDERR "Sections: $solr_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $solr_passes_exe $core text \"$build_dir\" \"dummy\" $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core text \"$build_dir\" \"dummy\" $osextra"))
        {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}

#----



sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    if (open(SIN,"<$in_filename")) {

        if (open(SOUT,">$out_filename")) {

            my $line;
            while (defined ($line=<SIN>)) {
                chomp $line;

                my $done_insert = 0;
                foreach my $rule (@$replace_rules) {
                    my $line_re = $rule->{'regexp'};
                    my $insert = $rule->{'insert'};

                    if ($line =~ m/$line_re/) {
                        print SOUT $insert;
                        $done_insert = 1;
                        last;
                    }
                }
                if (!$done_insert) {
                    print SOUT "$line\n";
                }
            }

            close(SOUT);
        }
        else {
            print STDERR "Error: Failed to open $out_filename\n";
            print STDERR " $!\n";
        }

        close(SIN);
    }
    else {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR " $!\n";
    }

}

# Generate the solr schema.xml file based on 'indexfieldmap' and other
# associated config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl


sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        my ($field) = ($ifm =~ m/^.*->(.*)$/);

        $schema_insert_xml .= " "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";

        if ($field eq "LA" || $field eq "LO")
        {
            $schema_insert_xml .= "type=\"location\" ";
        }
        elsif ($field ne "ZZ" && $field ne "TX")
        {
            $schema_insert_xml .= "type=\"string\" ";
        }
        else
        {
            $schema_insert_xml .= "type=\"text_en_splitting\" ";
        }
        $schema_insert_xml .= "indexed=\"true\" stored=\"false\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];
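
    # For illustration only (hypothetical values): given an 'indexfieldmap'
    # entry such as "dc.Title->TI", the loop above produces a line like
    #     <field name="TI" type="string" indexed="true" stored="false" multiValued="true" />
    # and the single rule above substitutes it for the marker line
    #     <!-- ##GREENSTONE-FIELDS## -->
    # in schema.xml.in when filter_in_out_file() is called below.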

    my $solr_home = $ENV{'GEXT_SOLR'};
##    my $in_dirname = &util::filename_cat($solr_home,"etc","conf");
    my $in_dirname = &util::filename_cat($solr_home,"conf");
    my $schema_in_filename = &util::filename_cat($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &util::filename_cat($collect_home,"etc","conf");
    my $schema_out_filename = &util::filename_cat($out_dirname,"schema.xml");

    # make sure output conf directory exists
    if (!-d $out_dirname) {
        &util::mk_dir($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &util::filename_cat($in_dirname,$file.".in");
        my $out_filename = &util::filename_cat($out_dirname,$file);
        filter_in_out_file($in_filename,$out_filename,[]);
    }
}


sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is "ready and listening"

    my $solr_server = new solrserver($self->{'build_dir'});
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $all_metadata_specified = 0; # has the user added a 'metadata' index?
    my $allfields_index = 0; # do we have an allfields index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @all_fields = ();

    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {

            # get the parameters for the output
            # split on : just in case there is subcoll and lang stuff
            my ($fields) = split (/:/, $index);

            foreach my $field (split (/;/, $fields)) {
                if ($field eq "metadata") {
                    $all_metadata_specified = 1;
                }
                else {
                    push(@all_fields,$field);
                }
            }
        }
    }

    if ($all_metadata_specified) {

        # (Unfortunately) we need to process all the documents in the collection
        # to figure out what the metadata_field_mapping is

        # set up the document processor
        $self->{'buildproc'}->set_output_handle (undef);
        $self->{'buildproc'}->set_mode ('index_field_mapping');
        $self->{'buildproc'}->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});

    }

    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $buildproc = $self->{'buildproc'};

        foreach my $field (@all_fields) {
            if (!defined $buildproc->{'indexfieldmap'}->{$field}) {
                my $shortname = $buildproc->create_shortname($field);
                $buildproc->{'indexfieldmap'}->{$field} = $shortname;
                $buildproc->{'indexfieldmap'}->{$shortname} = 1;
            }
        }
    }

    # Write out the solr 'schema.xml' (and related) files
    #
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores per collection, one per index level (document and section)
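    #
    # For illustration only (hypothetical site "localsite" and collection
    # "demo"): with 'document' and 'section' levels, the loop below works
    # with cores named "localsite-demo-didx" and "localsite-demo-sidx".
    # When -removeold forces a full rebuild, the cores are instead created
    # under the temporary names "building-localsite-demo-didx" and
    # "building-localsite-demo-sidx", matching the building directory that
    # is later moved across to become the live index (see activate.pl).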

    my $site = $self->{'site'};
    my $collect = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collect" : $collect;

    # my $idx = $self->{'index_mapping'}->{$index};
    my $idx = "idx";

    my $build_dir = $self->{'build_dir'};

    foreach my $level (keys %{$self->{'levels'}}) {

        my ($pindex) = $level =~ /^(.)/;

        my $index_dir = $pindex.$idx;
        my $core = "$core_prefix-$index_dir";

        # force_removeold == opposite of being run in 'incremental' mode
        my $force_removeold = ($self->{'incremental'}) ? 0 : 1;

        if ($force_removeold) {
            print $outhandle "\n-removeold set (new index will be created)\n";

            # create cores under temporary core names, corresponding to building directory
            $core = "building-".$core;

            my $full_index_dir = &util::filename_cat($build_dir,$index_dir);
            &util::rm_r($full_index_dir);
            &util::mk_dir($full_index_dir);

            # Solr then wants an "index" folder within this general index area!
#           my $full_index_index_dir = &util::filename_cat($full_index_dir,"index");
#           &util::mk_dir($full_index_index_dir);


            # now go on and create new index
            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core);

        }
        else {
            # if collect==core already in solr.xml (check with STATUS)
            #   => use RELOAD call to refresh fields now expressed in schema.xml
            #
            # else
            #   => use CREATE API to add to solr.xml
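            #
            # (solrserver.pm wraps Solr's CoreAdmin HTTP interface for this;
            # the STATUS/RELOAD/CREATE actions referred to above presumably
            # correspond to requests of the form
            #     http://<host>:<port>/solr/admin/cores?action=RELOAD&core=<core>
            # with host and port taken from the Jetty server started in
            # pre_build_indexes(). Illustrative note only.)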

            my $check_core_exists = $solr_server->admin_ping_core($core);

            if ($check_core_exists) {
                print $outhandle "Reloading Solr core: $core\n";
                $solr_server->admin_reload_core($core);
            }
            else {
                print $outhandle "Creating Solr core: $core\n";
                $solr_server->admin_create_core($core);
            }
        }
    }

}

# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance

sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names (and possibly the doc name) for solr_passes
    my $solr_passes_sections = $llevel;

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach $subcollection (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put language expressions for the
    # ones we want in the index
    my @languages = ();
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    @languages = split /,/, $language if (defined $language);
    foreach my $language (@languages) {
        my $not=0;
        if ($language =~ s/^\!//) {
            $not = 1;
        }
        if ($not) {
            push (@$langarr, "!$language");
        } else {
            push (@$langarr, "$language");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (solr_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $site = $self->{'site'};
        my $collect = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $ds_idx = $self->{'index_mapping'}->{$index};
        my $core = "$core_prefix-$ds_idx";

        $core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Cmd: $solr_passes_exe $core index \"$build_dir\" \"$indexdir\" $osextra\n";
        if (!open($handle, "| $solr_passes_exe $core index \"$build_dir\" \"$indexdir\" $osextra")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; # always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}


sub post_build_indexes {
    my $self = shift(@_);

    # deliberately override to prevent the mgpp post_build_index() calling
    # $self->make_final_field_list()
    # as this has been done in our pre_build_indexes() phase for solr


    # Also need to stop the Solr/jetty server if it was explicitly started
    # in pre_build_indexes()

    my $solr_server = $self->{'solr_server'};

    if ($solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}


1;