source: gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm@ 24643

Last change on this file since 24643 was 24643, checked in by davidb, 13 years ago

Adjustments to code as a result of testing

File size: 17.1 KB
RevLine 
[24446]1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
[24453]33use solrserver;
[24446]34use Config; # for getting the perlpath in the recommended way
35
# Set up the inheritance chain at compile time: solrbuilder is a
# specialisation of lucenebuilder.
BEGIN {
    @solrbuilder::ISA = qw(lucenebuilder);
}
39
40
# Constructor.  The remaining arguments are handed unchanged to the
# lucenebuilder constructor; the resulting object is re-blessed into
# $class and tagged with the Solr specific settings:
#   buildtype       - always "solr"
#   solr_passes     - name of the helper script
#   solr_passes_exe - command used to launch the helper script
sub new {
    my $class = shift(@_);

    # Let the parent class build the object, then make it one of ours
    my $self = bless(lucenebuilder->new(@_), $class);

    my $solr_passes_script = "solr_passes.pl";

    $self->{'buildtype'}   = "solr";
    $self->{'solr_passes'} = $solr_passes_script;

    # Run the script through the perl interpreter recorded in %Config
    # (with -S so that it is looked up on the PATH) to ensure execution
    $self->{'solr_passes_exe'}
        = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";

    return $self;
}
55
56
# Returns the name of the build processor class used when none is
# specified: always "solrbuildproc".
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "solrbuildproc";
    return $buildproc_name;
}
62
63# This writes a nice version of the text docs
64#
65# Essentially the same as the lucenebuilder.pm version, only using solr_passes
66# => refactor and make better use of inheritance
67#
# Pumps every document of the collection through solr_passes.pl in 'text'
# mode so that the stored text is written out.
#
#   $textindex - index specification; a leading "section:" is stripped
#
# Does nothing when $self->{'no_text'} is set.  In debug mode the generated
# data goes to STDOUT instead of being piped into solr_passes.pl.
# Dies if the pipe to solr_passes.pl cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir  = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'}, "");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if (($ENV{'GSDLOS'} || "") =~ /^windows$/i) {
        # NOTE(review): only $text_dir is converted here, yet it is
        # $build_dir that is passed to solr_passes below -- confirm
        # whether $build_dir should be converted as well
        $text_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run solr
    my $solr_passes_exe      = $self->{'solr_passes_exe'};
    my $solr_passes_sections = "Doc";

    my $handle;

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $site    = $self->{'site'};
        my $collect = $self->{'collection'};
        # unused in this call to solr_passes
        my $core = (defined $site) ? "$site-$collect" : $collect;

        # build the command once so that what is logged is what is run
        my $cmd = "$solr_passes_exe $core text \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Executable: $solr_passes_exe\n";
        print STDERR "Sections: $solr_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $cmd\n";
        if (!open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($levels);
    $buildproc->set_db_level ($db_level);
    $buildproc->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Closing a piped handle waits for the child; a false return means the
    # child failed, so report it rather than silently carrying on
    if (!$self->{'debug'} && !close($handle)) {
        print STDERR "Warning: $solr_passes_exe did not finish cleanly (exit status $?)\n";
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
156
157#----
158
159
160
# Copies $in_filename to $out_filename line by line, substituting marker
# lines on the way.
#
#   $in_filename   - path of the template file to read
#   $out_filename  - path of the file to create/overwrite
#   $replace_rules - array ref of hashes { 'regexp' => ..., 'insert' => ... }
#
# Every input line is chomped and tested against the rules in order.  For
# the first rule whose 'regexp' matches, that rule's 'insert' text is
# written verbatim (no newline is added) in place of the line; lines that
# match no rule are written back followed by "\n".
#
# Returns 1 on success.  Returns 0, after reporting on STDERR, if either
# file cannot be opened or the output cannot be completely written.
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    # Three-argument open with lexical handles: the filename is never
    # interpreted as a mode/pipe specification, and no global bareword
    # handles are clobbered
    my $in_fh;
    if (!open($in_fh, "<", $in_filename)) {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    my $out_fh;
    if (!open($out_fh, ">", $out_filename)) {
        print STDERR "Error: Failed to open $out_filename\n";
        print STDERR " $!\n";
        close($in_fh);
        return 0;
    }

    # Compile each rule's pattern once rather than once per input line
    my @compiled_rules
        = map { [ qr/$_->{'regexp'}/, $_->{'insert'} ] } @{ $replace_rules || [] };

    while (defined (my $line = <$in_fh>)) {
        chomp $line;

        my $done_insert = 0;
        foreach my $rule (@compiled_rules) {
            my ($line_re, $insert) = @$rule;

            if ($line =~ $line_re) {
                print {$out_fh} $insert;
                $done_insert = 1;
                last;
            }
        }

        print {$out_fh} "$line\n" unless $done_insert;
    }

    close($in_fh);

    # Buffered write errors (e.g. disk full) only surface at close time
    if (!close($out_fh)) {
        print STDERR "Error: Failed to write $out_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    return 1;
}
204
205# Generate solr schema.xml file based on indexfieldmap and other associated
206# config files
207#
208# Unlike make_auxiliary_files(), this needs to be done up-front (rather
209# than at the end) so the data-types in schema.xml are correctly set up
210# prior to document content being pumped through solr_passes.pl
211
212
# Writes the Solr configuration files for the collection into
# $GSDLCOLLECTDIR/etc/conf, using the "*.in" templates found in
# $GEXT_SOLR/conf.
#
# schema.xml is produced from schema.xml.in by replacing the marker line
#
#     <!-- ##GREENSTONE-FIELDS## -->
#
# with one line of the form
#
#     <field name="<field>" type="text_en_splitting" ... />
#
# for each entry of $self->{'build_cfg'}->{'indexfieldmap'}, where <field>
# is the part of the entry following "->".  The remaining files
# (solrconfig.xml, stopwords, ...) are copied without any filtering.
sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        my ($field) = ($ifm =~ m/^.*->(.*)$/);

        # An entry without a "->shortname" part would otherwise yield a
        # <field> element with an empty name in schema.xml
        if (!defined $field || $field eq "") {
            my $shown = (defined $ifm) ? $ifm : "";
            print STDERR "Warning: ignoring malformed indexfieldmap entry '$shown'\n";
            next;
        }

        # Need special case for Long/Lat
        # ... but for now treat everything as of type text_en_splitting

        $schema_insert_xml .= " "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";
        $schema_insert_xml .= "type=\"text_en_splitting\" indexed=\"true\" ";
        $schema_insert_xml .= "stored=\"false\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
    my $in_dirname = &util::filename_cat($solr_home,"conf");
    my $schema_in_filename = &util::filename_cat($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &util::filename_cat($collect_home,"etc","conf");
    my $schema_out_filename = &util::filename_cat($out_dirname,"schema.xml");

    # make sure output conf directory exists, including any missing
    # parent directory
    if (!-d $out_dirname) {
        &util::mk_all_dir($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt" );

    foreach my $file ( @in_file_list ) {
        my $in_filename  = &util::filename_cat($in_dirname,$file.".in");
        my $out_filename = &util::filename_cat($out_dirname,$file);
        filter_in_out_file($in_filename,$out_filename,[]);
    }
}
275
276
# Prepares Solr before any index is built:
#   1. starts the Solr/Jetty server (remembered in $self->{'solr_server'})
#   2. works out the index field mapping, either from the entries listed
#      in 'indexes' or, when a 'metadata' index is requested, by running
#      all documents through the build processor
#   3. writes schema.xml and the related Solr configuration files
#   4. creates or reloads one Solr core per entry of $self->{'levels'}
#
#   $indexname - optional; restricts the work to this one index
sub pre_build_indexes
{
    my ($self, $indexname) = @_;
    my $out = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is "ready and listening"
    my $server = solrserver->new($self->{'build_dir'});
    $server->start();
    $self->{'solr_server'} = $server;

    my $index_list = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $wants_all_metadata = 0; # has the user added a 'metadata' index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @named_fields = ();

    foreach my $index (@$index_list) {
        next unless $self->want_built($index);

        # get the parameters for the output
        # split on : just in case there is subcoll and lang stuff
        my ($field_spec) = split (/:/, $index);

        foreach my $field (split (/;/, $field_spec)) {
            if ($field eq "metadata") {
                $wants_all_metadata = 1;
            }
            else {
                push(@named_fields, $field);
            }
        }
    }

    my $buildproc = $self->{'buildproc'};

    if ($wants_all_metadata) {
        # (Unfortunately) we need to process all the documents in the
        # collection to figure out what the metadata_field_mapping is

        # set up the document processor
        $buildproc->set_output_handle (undef);
        $buildproc->set_mode ('index_field_mapping');
        $buildproc->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $buildproc, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});
    }
    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map
        my $field_map = $buildproc->{'indexfieldmap'};

        foreach my $field (@named_fields) {
            next if defined $buildproc->{'indexfieldmap'}->{$field};

            my $shortname = $buildproc->create_shortname($field);
            $buildproc->{'indexfieldmap'}->{$field}     = $shortname;
            $buildproc->{'indexfieldmap'}->{$shortname} = 1;
        }
    }

    # Write out solr 'schema.xml' (and related) file
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores <colname>-Doc and <colname>-Sec

    my $site        = $self->{'site'};
    my $collect     = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collect" : $collect;

    my $idx_suffix = "idx";
    my $build_dir  = $self->{'build_dir'};

    # force_removeold == opposite of being run in 'incremental' mode
    my $force_removeold = ($self->{'incremental'}) ? 0 : 1;

    foreach my $level (keys %{$self->{'levels'}}) {

        # index directory is the first letter of the level plus "idx"
        my ($level_initial) = $level =~ /^(.)/;
        my $index_dir = $level_initial.$idx_suffix;
        my $core      = "$core_prefix-$index_dir";

        if ($force_removeold) {
            print $out "\n-removeold set (new index will be created)\n";

            # start from an empty index directory
            my $full_index_dir = &util::filename_cat($build_dir,$index_dir);
            &util::rm_r($full_index_dir);
            &util::mk_dir($full_index_dir);

            # now go on and create new index
            print $out "Creating Solr core: $core\n";
            $server->admin_create_core($core);
        }
        elsif ($server->admin_ping_core($core)) {
            # core already known to Solr
            # => use RELOAD call to refresh fields now expressed in schema.xml
            print $out "Reloading Solr core: $core\n";
            $server->admin_reload_core($core);
        }
        else {
            # core not yet known => use CREATE API to add to solr.xml
            print $out "Creating Solr core: $core\n";
            $server->admin_create_core($core);
        }
    }
}
426
427# Essentially the same as the lucenebuilder.pm version, only using solr_passes
428# => refactor and make better use of inheritance
429
# Builds one Solr index by streaming the collection's documents, wrapped in
# an <update> element, into solr_passes.pl.
#
#   $index  - index specification of the form
#             "fields[:subcollections[:languages]]", where subcollections
#             and languages are comma separated lists
#   $llevel - level tag to index at; it is looked up in
#             %mgppbuilder::level_map to find the matching level name,
#             falling back to "document" (with a warning) when unknown
#
# In debug mode the generated data goes to STDOUT instead of being piped
# into solr_passes.pl.  Dies if the pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $osextra = "";
    if (($ENV{'GSDLOS'} || "") =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my @subcollections = (defined $subcollection) ? split(/,/, $subcollection) : ();

    foreach my $subcoll (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
        push (@$indexexparr, $expression) if defined $expression;
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index.  Entries are passed on exactly as given,
    # including any leading "!" negation marker.
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my $langarr = (defined $language) ? [ split(/,/, $language) ] : [];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (solr_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    my $handle;

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $site        = $self->{'site'};
        my $collect     = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $core        = "$core_prefix-$indexdir";

        # build the command once so that what is logged is what is run
        my $cmd = "$solr_passes_exe $core index \"$build_dir\" \"$indexdir\" $osextra";

        print STDERR "Cmd: $cmd\n";
        if (!open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($local_levels);
    $buildproc->set_db_level($db_level);
    $buildproc->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    # Closing a piped handle waits for the child; a false return means the
    # child failed, so report it rather than silently carrying on
    if (!$self->{'debug'} && !close($handle)) {
        print STDERR "Warning: $solr_passes_exe did not finish cleanly (exit status $?)\n";
    }

    $self->print_stats();

    # restore the full set of levels for whoever uses buildproc next
    $buildproc->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
558
559
# Runs after all indexes have been built.
#
# Deliberately overrides the inherited version to prevent the mgpp
# post_build_index() calling $self->make_final_field_list(), as this has
# already been done in our pre_build_indexes() phase for solr.
#
# Stops the Solr/Jetty server if the server object stored in
# $self->{'solr_server'} reports that it was explicitly started, then
# clears that entry.  Safe to call when no server object is stored.
sub post_build_indexes {
    my $self = shift(@_);

    my $solr_server = $self->{'solr_server'};

    # There is no server object if pre_build_indexes() never ran (or this
    # method has already been called); calling a method on undef would die
    if (defined $solr_server && $solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;
}
580
581
5821;
583
584
Note: See TracBrowser for help on using the repository browser.