source: gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm@ 24453

Last change on this file since 24453 was 24453, checked in by davidb, 13 years ago

Tidy up of code. Better structuring of classes

File size: 16.4 KB
Line 
1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
33use solrserver;
34use Config; # for getting the perlpath in the recommended way
35
# Set up inheritance at compile time: solrbuilder is a subclass of
# lucenebuilder.
BEGIN {
    @solrbuilder::ISA = qw(lucenebuilder);
}
39
40
# Constructor.
#
# Builds the object with the lucenebuilder constructor (all remaining
# arguments are passed through unchanged), re-blesses it into $class and
# records the Solr-specific settings on it:
#
#   buildtype       - always "solr"
#   solr_passes     - name of the script that feeds documents to Solr
#   solr_passes_exe - command prefix that runs that script through the perl
#                     interpreter named in %Config
#
# Returns the new object.
sub new {
    my $class = shift(@_);

    # Explicit method-call syntax.  The indirect-object form
    # ("new lucenebuilder (...)") is parsed differently depending on which
    # names perl already knows about, which is fragile inside a sub that is
    # itself called 'new'.
    my $self = lucenebuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = $solr_passes_script;
    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"$Config{perlpath}\" -S \"$solr_passes_script\"";

    return $self;
}
55
56
# Returns the name of the default build processor for this builder,
# "solrbuildproc".
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "solrbuildproc";
    return $buildproc_name;
}
62
# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Parameters:
#   $textindex - index specification; a leading "section:" is stripped
#                before it is handed to the build processor
#
# Does nothing when $self->{'no_text'} is set.  In debug mode the generated
# output goes to STDOUT; otherwise it is piped into the solr_passes script.
# Dies if that script cannot be started.
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir  = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'}, "");
    &util::mk_all_dir ($text_dir);

    # GSDLOS may be unset, so guard the match
    my $is_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);

    my $osextra = "";
    if (!$is_windows && $outhandle ne "STDERR") {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my $handle;

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $collection = $self->{'collection'};

        # Build the command once so the logged and executed text cannot drift
        my $cmd = "$solr_passes_exe $collection text $solr_passes_sections \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Executable: $solr_passes_exe\n";
        print STDERR "Sections: $solr_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $cmd\n";
        if (!open($handle, '|-', $cmd)) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($levels);
    $buildproc->set_db_level ($db_level);
    $buildproc->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Closing a pipe waits for the child; a false result means it failed or
    # exited non-zero.  Report it, but carry on as before.
    if (!$self->{'debug'} && !close($handle)) {
        my $reason = $! ? "$!" : "exit status $?";
        print STDERR "Warning: $solr_passes_exe (text pass) did not finish cleanly: $reason\n";
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
153
154#----
155
156
157
# Copy $in_filename to $out_filename line by line, applying replacement rules.
#
# Parameters:
#   $in_filename   - file to read
#   $out_filename  - file to (over)write
#   $replace_rules - reference to an array of hashes, each holding
#                      'regexp' - pattern tested against every chomped line
#                      'insert' - text written verbatim instead of a line
#                                 that matches
#
# For every input line the first matching rule wins; lines that match no rule
# are copied through followed by a newline.  The output file is only opened
# once the input file has been opened successfully.
#
# Failures are reported on STDERR.  Returns 1 on success and 0 if either
# file could not be opened or the output could not be flushed on close.
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    my @rules = @{ $replace_rules || [] };

    # Three-argument open: the filename is taken literally, so leading or
    # trailing whitespace and mode characters in it are not misinterpreted
    my $sin;
    if (!open($sin, '<', $in_filename)) {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    my $sout;
    if (!open($sout, '>', $out_filename)) {
        print STDERR "Error: Failed to open $out_filename\n";
        print STDERR " $!\n";
        close($sin);
        return 0;
    }

    while (defined (my $line = <$sin>)) {
        chomp $line;

        my $done_insert = 0;
        foreach my $rule (@rules) {
            my $line_re = $rule->{'regexp'};

            if ($line =~ m/$line_re/) {
                print {$sout} $rule->{'insert'};
                $done_insert = 1;
                last;
            }
        }
        if (!$done_insert) {
            print {$sout} "$line\n";
        }
    }

    close($sin);

    # Buffered write errors only surface when the handle is closed
    if (!close($sout)) {
        print STDERR "Error: Failed to write $out_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    return 1;
}
201
# Generate solr schema.xml file based on indexfieldmap and other associated
# config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl
#
# Reads the "*.in" templates from $GEXT_SOLR/etc/conf and writes the
# resulting files to $GSDLCOLLECTDIR/etc/conf, creating that directory if
# necessary.

sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        # entries have the form "<something>-><field>"
        my ($field) = ($ifm =~ m/^.*->(.*)$/);

        # An entry without a usable field name would otherwise produce
        # <field name="" .../> in the schema
        if (!defined $field || $field eq "") {
            print STDERR "Warning: skipping malformed indexfieldmap entry '$ifm'\n";
            next;
        }

        # Need special case for Long/Lat
        # ... but for now treat everything as of type string

        $schema_insert_xml .= " "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";
        $schema_insert_xml .= "type=\"string\" indexed=\"true\" ";
        $schema_insert_xml .= "stored=\"false\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
    my $in_dirname = &util::filename_cat($solr_home,"etc","conf");
    my $schema_in_filename = &util::filename_cat($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &util::filename_cat($collect_home,"etc","conf");
    my $schema_out_filename = &util::filename_cat($out_dirname,"schema.xml");

    # make sure output conf directory exists; the path is two levels deep
    # ("etc/conf"), so create any missing parent as well
    if (!-d $out_dirname) {
        &util::mk_all_dir($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &util::filename_cat($in_dirname,$file.".in");
        my $out_filename = &util::filename_cat($out_dirname,$file);
        filter_in_out_file($in_filename,$out_filename,[]);
    }
}
272
273
# Issue a request to the given Solr core-admin URL (via wget) and pull the
# numeric status out of the XML that comes back.
#
# Parameters:
#   $url - URL to fetch
#
# Returns the digits found in <int name="status">...</int>, or undef when
# wget could not be run or the response contains no such element.
sub solr_core_admin
{
    my ($self, $url) = @_;

    my $cmd = "wget -q -O - \"$url\"";

    my $status = undef;

    my $wget_out;
    if (!open($wget_out, '-|', $cmd)) {
        print STDERR "Warning: failed to run $cmd\n";
        print STDERR " $!\n";
    }
    else {
        # gather the complete response before looking for the status element
        my $xml_output = join("", <$wget_out>);
        close($wget_out);

        ## print $xml_output;

        ($status) = ($xml_output =~ m!<int name="status">(\d+)</int>!s);
    }

    return $status;
}
306
# Prepare for index building:
#   - start the Solr/Jetty server and remember it in $self->{'solr_server'}
#   - work out the index field mapping
#   - write schema.xml and the related Solr configuration files
#   - query the status of the Solr core for each level being built
#
# Parameters:
#   $indexname - optional; when it contains a word character only this index
#                is considered, otherwise all indexes listed in the
#                collection configuration are
sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is "ready and listening"

    my $solr_server = solrserver->new();
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $all_metadata_specified = 0; # has the user added a 'metadata' index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @all_fields = ();

    foreach my $index (@$indexes) {
        next unless $self->want_built($index);

        # get the parameters for the output
        # split on : just in case there is subcoll and lang stuff
        my ($fields) = split (/:/, $index);

        foreach my $field (split (/;/, $fields)) {
            if ($field eq "metadata") {
                $all_metadata_specified = 1;
            }
            else {
                push(@all_fields,$field);
            }
        }
    }

    if ($all_metadata_specified) {

        # (Unfortunately) we need to process all the documents in the collection
        # to figure out what the metadata_field_mapping is

        # set up the document processor
        $self->{'buildproc'}->set_output_handle (undef);
        $self->{'buildproc'}->set_mode ('index_field_mapping');
        $self->{'buildproc'}->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});

    }
    else {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $buildproc = $self->{'buildproc'};

        foreach my $field (@all_fields) {
            # the defined-check also takes care of repeated field names
            if (!defined $buildproc->{'indexfieldmap'}->{$field}) {
                my $shortname = $buildproc->create_shortname($field);
                $buildproc->{'indexfieldmap'}->{$field} = $shortname;
                $buildproc->{'indexfieldmap'}->{$shortname} = 1;
            }
        }
    }

    # Write out solr 'schema.xml' (and related) file
    #
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores <colname>-Doc and <colname>-Sec

    my $jetty_server_port = $ENV{'SOLR_JETTY_PORT'};
    my $base_url = "http://localhost:$jetty_server_port/solr/admin/cores";

    my $collection = $self->{'collection'};

    foreach my $level (keys %{$self->{'levels'}}) {

        my $llevel = $mgppbuilder::level_map{$level};
        if (!defined $llevel) {
            # without a mapping there is no meaningful core name to query
            print STDERR "Warning: no level mapping for '$level', skipping solr core check\n";
            next;
        }

        my $core = $collection."-".lc($llevel);
        my $check_core_url = "$base_url?action=STATUS&core=$core";

        my $check_status = $self->solr_core_admin($check_core_url);

        # solr_core_admin() returns undef when no status could be obtained
        my $shown_status = defined($check_status) ? $check_status : "unknown (no status returned)";
        print STDERR "*** check status = $shown_status\n";


        # if collect==core not already in solr.xml (check with STATUS)
        # => use CREATE API to add to solr.xml
        #
        # else
        # => use RELOAD call to refresh fields now expressed in schema.xml
    }

}
424
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Builds one index by streaming the collection's documents, wrapped in an
# <update> element, to the solr_passes script (or to STDOUT in debug mode).
#
# Parameters:
#   $index  - index specification "fields[:subcollections[:languages]]",
#             with subcollections and languages comma separated
#   $llevel - level name handed to solr_passes; it is also looked up in
#             %mgppbuilder::level_map to find the matching entry of
#             $self->{'levels'}
#
# Dies if the solr_passes script cannot be started.

sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names and possibly the doc name for solrpasses
    my $solr_passes_sections = $llevel;

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    # GSDLOS may be unset, so guard the match
    my $is_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);

    my $osextra = "";
    if ($is_windows) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
        push (@$indexexparr, $expression) if (defined $expression);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }

    # Entries may carry a leading "!" (presumably negation - interpreted by
    # the build processor); they are passed on exactly as written
    my $langarr = [];
    @$langarr = split /,/, $language if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (solr_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my $handle;

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $collection = $self->{'collection'};

        # Build the command once so the logged and executed text cannot drift
        my $cmd = "$solr_passes_exe $opt_create_index $collection index $solr_passes_sections \"$build_dir\" \"$indexdir\" $osextra";

        print STDERR "Cmd: $cmd\n";
        if (!open($handle, '|-', $cmd)) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # report the system error ($!) that made the open fail
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($local_levels);
    $buildproc->set_db_level($db_level);
    $buildproc->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    # Closing a pipe waits for the child; a false result means it failed or
    # exited non-zero.  Report it, but carry on as before.
    if (!$self->{'debug'} && !close($handle)) {
        my $reason = $! ? "$!" : "exit status $?";
        print STDERR "Warning: $solr_passes_exe (index pass) did not finish cleanly: $reason\n";
    }

    $self->print_stats();

    # restore the full set of levels for whoever uses the processor next
    $buildproc->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}
554
555
# Clean-up step run after the indexes have been built.
#
# Stops the Solr/Jetty server recorded in $self->{'solr_server'} if that
# object reports it was explicitly started, then clears the reference.
sub post_build_indexes {
    my $self = shift(@_);

    # deliberately override to prevent the mgpp post_build_index() calling
    #   $self->make_final_field_list()
    # as this has been done in our pre_build_indexes() phase for solr


    # Also need to stop the Solr/jetty server if it was explicitly started
    # in pre_build_indexes()

    my $solr_server = $self->{'solr_server'};

    # No server object is recorded if pre_build_indexes() never ran (or this
    # is called a second time); there is then nothing to stop
    if (defined $solr_server && $solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}
576
577
5781;
579
580
Note: See TracBrowser for help on using the repository browser.