source: gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm@ 24456

Last change on this file since 24456 was 24456, checked in by davidb, 13 years ago

Improved error handling of solr-admin over 'wget'

File size: 18.3 KB
Line 
1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
33use solrserver;
34use Config; # for getting the perlpath in the recommended way
35
# Set up inheritance at compile time: solrbuilder is a subclass of
# lucenebuilder.
BEGIN {
    @solrbuilder::ISA = qw(lucenebuilder);
}
39
40
# Constructor.
#
# All arguments after the class name are passed unchanged to the
# lucenebuilder constructor; the resulting object is re-blessed into $class
# and then told how solr_passes.pl is to be invoked.
#
# Returns the new builder object.
sub new {
    my $class = shift(@_);

    # Direct method call: the indirect-object form ("new lucenebuilder (...)")
    # is parsed heuristically by perl and can be mis-resolved.
    my $self = lucenebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";
    $self->{'solr_passes'} = $solr_passes_script;

    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";

    return $self;
}
55
56
# Name of the document-processor class used by this builder when none is
# specified explicitly.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "solrbuildproc";
    return $buildproc_name;
}
62
# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Arguments:
#   $textindex - index specification; a leading "section:" prefix is removed
#
# Does nothing when $self->{'no_text'} is set.  When $self->{'debug'} is set
# the generated text is written to STDOUT instead of being piped into
# solr_passes.pl.  Dies if the pipe to solr_passes.pl cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $text_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $collection = $self->{'collection'};

        # Build the command line once so that what is logged is exactly what
        # is run.
        my $cmd = "$solr_passes_exe $collection text $solr_passes_sections \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Executable: $solr_passes_exe\n";
        print STDERR "Sections: $solr_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $cmd\n";
        if (!open($handle, "|-", $cmd)) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # Name this routine (not build_index) so the failure can be located
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($levels);
    $buildproc->set_db_level ($db_level);
    $buildproc->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Closing a pipe waits for the child; a false result means solr_passes
    # failed (or the pipe could not be flushed), so say so rather than
    # carrying on silently.
    if (!$self->{'debug'} && !close($handle)) {
        print STDERR "Warning: $solr_passes_exe did not finish cleanly (wait status $?)\n";
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
153
154#----
155
156
157
# Copy $in_filename to $out_filename line by line, replacing whole lines
# according to $replace_rules.
#
# Arguments:
#   $in_filename   - path of the file to read
#   $out_filename  - path of the file to create/overwrite
#   $replace_rules - array ref of hash refs, each holding
#                      'regexp' => pattern tested against the chomped line
#                      'insert' => text printed verbatim instead of a matching
#                                  line (no newline is appended to it)
#                    The first rule that matches wins.  A line matching no
#                    rule is copied through followed by a newline.
#
# Returns 1 on success.  On failure an error is printed to STDERR and 0 is
# returned.
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    # Three-argument open with lexical handles: the file names are taken
    # literally (no whitespace stripping or mode characters), and the
    # handles cannot clash with other users of global bareword handles.
    my $sin;
    if (!open($sin, "<", $in_filename)) {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    my $sout;
    if (!open($sout, ">", $out_filename)) {
        print STDERR "Error: Failed to open $out_filename\n";
        print STDERR " $!\n";
        close($sin);
        return 0;
    }

    while (defined (my $line = <$sin>)) {
        chomp $line;

        my $done_insert = 0;
        foreach my $rule (@{ $replace_rules || [] }) {
            my $line_re = $rule->{'regexp'};

            if ($line =~ m/$line_re/) {
                print {$sout} $rule->{'insert'};
                $done_insert = 1;
                last;
            }
        }

        if (!$done_insert) {
            print {$sout} "$line\n";
        }
    }

    close($sin);

    # Buffered write errors (e.g. disk full) only surface at close time
    if (!close($sout)) {
        print STDERR "Error: Failed to write $out_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    return 1;
}
201
# Generate solr schema.xml file based on indexfieldmap and other associated
# config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl
#
# Reads the templates "<file>.in" from $ENV{'GEXT_SOLR'}/etc/conf and writes
# the results to $ENV{'GSDLCOLLECTDIR'}/etc/conf.
sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    # <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    # <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        # entries have the form "<something>-><field>"; keep the part after
        # the (last) "->"
        my ($field) = ($ifm =~ m/^.*->(.*)$/);

        # An entry without "->" (or with nothing after it) would otherwise
        # put a <field name=""> element into schema.xml
        if (!defined $field || $field eq "") {
            print STDERR "Warning: ignoring malformed indexfieldmap entry '$ifm'\n";
            next;
        }

        # Need special case for Long/Lat
        # ... but for now treat everything as of type string

        $schema_insert_xml .= " "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";
        $schema_insert_xml .= "type=\"string\" indexed=\"true\" ";
        $schema_insert_xml .= "stored=\"false\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
    my $in_dirname = &util::filename_cat($solr_home,"etc","conf");
    my $schema_in_filename = &util::filename_cat($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &util::filename_cat($collect_home,"etc","conf");
    my $schema_out_filename = &util::filename_cat($out_dirname,"schema.xml");

    # make sure output conf directory exists; mk_all_dir (as used elsewhere
    # in this file) so that a missing parent "etc" directory is not a problem
    if (!-d $out_dirname) {
        &util::mk_all_dir($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &util::filename_cat($in_dirname,$file.".in");
        my $out_filename = &util::filename_cat($out_dirname,$file);
        filter_in_out_file($in_filename,$out_filename,[]);
    }
}
272
273
# Issue a Solr core-admin request by fetching $url with 'wget' and split what
# comes back into wget's own chatter and the XML response.
#
# Arguments:
#   $url - the full admin URL to fetch
#
# Returns a hash ref with the keys:
#   'preamble' - lines seen before the first line that looks like markup
#   'output'   - that first markup line and everything after it
#   'error'    - undef on success; otherwise the wget "ERROR nnn:" line, a
#                note that wget exited abnormally, or the reason the command
#                could not be started
sub solr_core_admin
{
    my $self = shift (@_);
    my ($url) = @_;

    # stderr is merged in so that wget's diagnostics can be inspected.
    # NOTE(review): $url is interpolated into a shell command line; callers
    # are expected to pass URLs built from trusted configuration only.
    my $cmd = "wget -O - \"$url\" 2>&1";

    my $preamble_output = "";
    my $xml_output = "";
    my $error_output = undef;

    my $in_preamble = 1;

    my $win;
    if (open($win, "-|", $cmd)) {

        while (defined (my $line = <$win>)) {

            if ($line =~ m/ERROR \d+:/) {
                chomp $line;
                $error_output = $line;
                last;
            }
            elsif ($in_preamble) {
                if ($line =~ m/<.*>/) {
                    # first line containing markup: the response starts here
                    $in_preamble = 0;
                }
                else {
                    $preamble_output .= $line;
                }
            }

            if (!$in_preamble) {
                $xml_output .= $line;
            }
        }

        # close() on a pipe waits for the command and is false when it
        # exited with a non-zero status.  Not every failure prints an
        # "ERROR nnn:" line (e.g. connection refused, wget not installed),
        # so use the exit status as a second line of defence.
        my $closed_ok = close($win);
        if (!$closed_ok && !defined $error_output) {
            my $status = ($? & 127) ? "signal " . ($? & 127)
                                    : "exit code " . ($? >> 8);
            $error_output = "Error: wget did not complete successfully ($status)";
        }
    }
    else {
        $error_output = "Error: failed to run $cmd\n";
        $error_output .= " $!\n";
    }

    my $output = { 'preamble' => $preamble_output,
                   'output' => $xml_output,
                   'error' => $error_output };

    return $output;
}
324
# Prepare Solr before any index is built:
#   1. start the Solr/Jetty server (remembered in $self->{'solr_server'})
#   2. work out the index field mapping, either from the 'indexes' entries
#      or, when a 'metadata' index is requested, by a pass over all documents
#   3. write schema.xml and the related Solr config files
#   4. for each level in $self->{'levels'}, RELOAD the matching Solr core if
#      it already exists or CREATE it otherwise
#
# Arguments:
#   $indexname - optional; when it contains a word character only this index
#                is considered, otherwise all of collect_cfg's 'indexes'
sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is "ready and listening"
    # (direct method call rather than the ambiguous indirect-object form)

    my $solr_server = solrserver->new();
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $all_metadata_specified = 0; # has the user added a 'metadata' index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @all_fields = ();

    foreach my $index (@$indexes) {
        next unless $self->want_built($index);

        # get the parameters for the output
        # split on : just in case there is subcoll and lang stuff
        my ($fields) = split (/:/, $index);

        foreach my $field (split (/;/, $fields)) {
            if ($field eq "metadata") {
                $all_metadata_specified = 1;
            }
            else {
                push(@all_fields,$field);
            }
        }
    }

    if ($all_metadata_specified) {

        # (Unfortunately) we need to process all the documents in the
        # collection to figure out what the metadata_field_mapping is

        # set up the document processor
        $self->{'buildproc'}->set_output_handle (undef);
        $self->{'buildproc'}->set_mode ('index_field_mapping');
        $self->{'buildproc'}->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});

    }
    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $buildproc = $self->{'buildproc'};

        foreach my $field (@all_fields) {
            if (!defined $buildproc->{'indexfieldmap'}->{$field}) {
                my $shortname = $buildproc->create_shortname($field);
                $buildproc->{'indexfieldmap'}->{$field} = $shortname;
                $buildproc->{'indexfieldmap'}->{$shortname} = 1;
            }
        }
    }

    # Write out solr 'schema.xml' (and related) file
    #
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores <colname>-Doc and <colname>-Sec

    my $jetty_server_port = $ENV{'SOLR_JETTY_PORT'};
    my $base_url = "http://localhost:$jetty_server_port/solr/admin/cores";

    my $collection = $self->{'collection'};

    # my $idx = $self->{'index_mapping'}->{$index};
    my $idx = "idx";

    # Shared reporting for any failed core-admin request
    my $report_admin_error = sub {
        my ($url, $output) = @_;

        my $preamble = $output->{'preamble'};
        my $error = $output->{'error'};

        print STDERR "----\n";
        print STDERR "Error: Failed to get XML response from:\n";
        print STDERR " $url\n";
        print STDERR "Output was:\n";
        print STDERR $preamble if ($preamble ne "");
        print STDERR "$error\n";
        print STDERR "----\n";
    };

    foreach my $level (keys %{$self->{'levels'}}) {

        my ($pindex) = $level =~ /^(.)/;

        my $llevel = $mgppbuilder::level_map{$level};
        my $core = $collection."-".lc($llevel);

        # if collect==core not already in solr.xml (check with STATUS)
        # => use CREATE API to add to solr.xml
        #
        # else
        # => use RELOAD call to refresh fields now expressed in schema.xml

        my $check_core_url = "$base_url?action=STATUS&core=$core";
        my $output = $self->solr_core_admin($check_core_url);

        if (defined $output->{'error'}) {
            $report_admin_error->($check_core_url, $output);
            next;
        }

        # If the collection doesn't exist yet, then there will be
        # an empty element of the form:
        # <lst name="collect-doc"/>
        # where 'collect' is the actual name of the collection,
        # such as demo

        my $xml_output = $output->{'output'};

        # \Q..\E: the core name is literal text, not part of the pattern
        my $empty_element="<lst\\s+name=\"\Q$core\E\"\\s*\\/>";

        my $check_core_exists = !($xml_output =~ m/$empty_element/s);

        if ($check_core_exists) {

            my $reload_core_url = "$base_url?action=RELOAD&core=$core";

            print $outhandle "Reloading Solr core: $core\n";
            my $reload_output = $self->solr_core_admin($reload_core_url);

            # don't let a failed reload pass unnoticed
            if (defined $reload_output->{'error'}) {
                $report_admin_error->($reload_core_url, $reload_output);
            }
        }
        else {

            my $collect_home = $ENV{'GSDLCOLLECTDIR'};
            my $etc_dirname = &util::filename_cat($collect_home,"etc");

            my $build_dir = $self->{'build_dir'};
            my $idx_dirname = &util::filename_cat($build_dir,$pindex.$idx);

            my $create_core_url = "$base_url?action=CREATE&name=$core";
            $create_core_url .= "&instanceDir=$etc_dirname";
            $create_core_url .= "&dataDir=$idx_dirname";

            print $outhandle "Creating Solr core: $core\n";
            my $create_output = $self->solr_core_admin($create_core_url);

            # don't let a failed create pass unnoticed
            if (defined $create_output->{'error'}) {
                $report_admin_error->($create_core_url, $create_output);
            }
        }
    }

}
496
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Build one index by piping the documents, wrapped in <update>...</update>,
# through solr_passes.pl in 'index' mode.
#
# Arguments:
#   $index  - index specification "fields[:subcollections[:languages]]",
#             where subcollections and languages are comma separated
#   $llevel - level tag compared against %mgppbuilder::level_map to select
#             the level to index; passed to solr_passes.pl as the section name
#
# When $self->{'debug'} is set the output goes to STDOUT instead of the pipe.
# Dies if the pipe to solr_passes.pl cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names and possibly the doc name for solrpasses
    my $solr_passes_sections = $llevel;

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        my $subcoll_expr = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
        push (@$indexexparr, $subcoll_expr) if (defined $subcoll_expr);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    # Each entry is passed on as written, including any leading "!" that
    # marks a negated language.
    push (@$langarr, @languages);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (solr_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $collection = $self->{'collection'};

        # Build the command line once so that what is logged is exactly what
        # is run.
        my $cmd = "$solr_passes_exe $opt_create_index $collection index $solr_passes_sections \"$build_dir\" \"$indexdir\" $osextra";

        print STDERR "Cmd: $cmd\n";
        if (!open($handle, "|-", $cmd)) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # report the system error ($!); the previous "!$\n" interpolated
            # $\ followed by a literal 'n' instead
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($local_levels);
    $buildproc->set_db_level($db_level);
    $buildproc->reset();

    print {$handle} "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    print {$handle} "</update>\n";

    # Closing a pipe waits for the child; a false result means solr_passes
    # failed (or the pipe could not be flushed), so say so rather than
    # carrying on silently.
    if (!$self->{'debug'} && !close($handle)) {
        print STDERR "Warning: $solr_passes_exe did not finish cleanly (wait status $?)\n";
    }

    $self->print_stats();

    # restore the full set of levels for whatever runs next
    $buildproc->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}
626
627
# Clean-up step run once all indexes have been built.
#
# Stops the Solr/Jetty server recorded in $self->{'solr_server'} if that
# object reports it was explicitly started, then clears the reference.
sub post_build_indexes {
    my $self = shift(@_);

    # deliberately override to prevent the mgpp post_build_index() calling
    # $self->make_final_field_list()
    # as this has been done in our pre_build_indexes() phase for solr

    # Also need to stop the Solr/jetty server if it was explicitly started
    # in pre_build_indexes()

    my $solr_server = $self->{'solr_server'};

    # No server object is recorded if pre_build_indexes() never ran (or
    # failed before storing it); there is nothing to stop in that case.
    if (defined $solr_server && $solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}
648
649
6501;
651
652
Note: See TracBrowser for help on using the repository browser.