source: gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm@ 27802

Last change on this file since 27802 was 27802, checked in by kjdon, 11 years ago

Adding in code for sort fields; just copied from the lucene build code.

File size: 19.9 KB
Line 
1###########################################################################
2#
3# solrbuilder.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27package solrbuilder;
28
29use strict;
30no strict 'refs';
31
32use lucenebuilder;
33use solrserver;
34use Config; # for getting the perlpath in the recommended way
35
36sub BEGIN {
37 @solrbuilder::ISA = ('lucenebuilder');
38}
39
40
# Constructor.
#
# Builds a lucenebuilder object from the given arguments, re-blesses it into
# the requested class, and records the build type plus the name and the full
# command line of the solr_passes.pl script.
sub new {
    my $class = shift(@_);

    my $self = lucenebuilder->new(@_);
    bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $script_name = "solr_passes.pl";
    $self->{'solr_passes'} = $script_name;

    # Prefix the script with the perl interpreter recorded in %Config so the
    # script is always executed through perl
    my $perl_exe = $Config{perlpath};
    $self->{'solr_passes_exe'} = "\"$perl_exe\" -S \"$script_name\"";

    return $self;
}
55
56
# Returns the name of the build processor used when none is specified.
sub default_buildproc {
    my ($self) = @_;

    return "solrbuildproc";
}
62
# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Every document is pushed, via the build processor in 'text' mode with text
# indexing switched off, down a pipe to
#   solr_passes.pl <core> text <build_dir> dummy
# In debug mode the output goes to STDOUT instead of the pipe.
#
# Arguments:
#   $textindex - the text index specification; a leading "section:" prefix
#                is stripped before it is handed to the build processor
#
# Does nothing when $self->{'no_text'} is set.
# Dies if the pipe to solr_passes cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://;

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $build_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"");
    &FileUtils::makeAllDirectories($text_dir);

    # (The original also rewrote the slashes of $text_dir on Windows, but
    # $text_dir is not used again after this point, so that was a no-op.)
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i && $outhandle ne "STDERR")
    {
        # so solr_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $site = $self->{'site'};
        my $collect = $self->{'collection'};
        # original note: the core is unused in this call to solr_passes
        my $core = (defined $site) ? "$site-$collect" : $collect;

        # core points to building only for force_removeold
        $core = "building-".$core unless $self->{'incremental'};

        my $cmd = "$solr_passes_exe $core text \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Executable: $solr_passes_exe\n";
        print STDERR "Sections: $solr_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $cmd\n";
        if (!open($handle, "| $cmd"))
        {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # Report the sub that actually failed (this used to say build_index)
            die "solrbuilder::compress_text - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # never close STDOUT, which is what $handle is in debug mode
    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
158
159#----
160
161
162
# Copy a text file line by line, replacing whole lines that match a rule.
#
# Arguments:
#   $in_filename   - file to read
#   $out_filename  - file to (over)write
#   $replace_rules - array ref of hash refs, each with:
#                      'regexp' => pattern tested against the chomped line
#                      'insert' => text printed verbatim INSTEAD of the line
#                                  (no newline is added, so the insert text
#                                  must carry its own line endings)
#                    Only the first matching rule is applied to a line.
#                    Lines matching no rule are copied with "\n" appended.
#
# Returns 1 on success, 0 if either file could not be opened or the output
# could not be flushed on close. Failures are reported on STDERR; this
# routine never dies.
sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    $replace_rules = [] unless defined $replace_rules;

    # Three-argument open with lexical handles: the filename can never be
    # misread as an open mode, and the handles cannot clash with globals
    my $in_fh;
    if (!open($in_fh, '<', $in_filename)) {
        print STDERR "Error: Failed to open $in_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    my $out_fh;
    if (!open($out_fh, '>', $out_filename)) {
        print STDERR "Error: Failed to open $out_filename\n";
        print STDERR " $!\n";
        close($in_fh);
        return 0;
    }

    while (defined (my $line = <$in_fh>)) {
        chomp $line;

        my $done_insert = 0;
        foreach my $rule (@$replace_rules) {
            my $line_re = $rule->{'regexp'};

            if ($line =~ m/$line_re/) {
                print $out_fh $rule->{'insert'};
                $done_insert = 1;
                last;
            }
        }

        print $out_fh "$line\n" unless $done_insert;
    }

    close($in_fh);

    # Buffered write errors (e.g. disk full) only surface when closing
    if (!close($out_fh)) {
        print STDERR "Error: Failed to write $out_filename\n";
        print STDERR " $!\n";
        return 0;
    }

    return 1;
}
206
# We need to push the list of indexfield to shortname mappings through to the
# build_cfg as, unlike in MGPP, we need these mappings in advance to configure
# Lucene/Solr. Unfortunately the original function found in mgbuilder.pm makes
# a mess of this - it only output fields that have been processed (none have)
# and it has a hardcoded renaming for 'text' so it becomes 'TX' according to
# the schema but 'TE' according to XML sent to lucene_passes.pl/solr_passes.pl
# This version is dumber - just copy them all across verbatim - but works. We
# do still need to support the special case of 'allfields'
#
# Resets $self->{'build_cfg'} and fills in, for every field of every entry
# in $self->{'collect_cfg'}->{'indexes'}:
#   build_cfg->{'indexfieldmap'} - list of "<field>-><shortname>" strings
#   build_cfg->{'indexfields'}   - list of the field names
# 'allfields' always maps to 'ZZ'; other short names are looked up in
# $self->{'buildproc'}->{'indexfieldmap'}. A field with no mapping is
# reported on STDERR and recorded with the short name 'ERROR'.
# Neither key is set when there are no fields.
sub make_final_field_list
{
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    # @todo support: $self->{'buildproc'}->{'extraindexfields'}
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}})
    {
        # Work on a copy: the loop variable aliases the collect_cfg entry, so
        # editing it directly would permanently strip the subcollection and
        # language parts from the collection configuration
        my $fields = $index;

        # remove subcoll stuff
        $fields =~ s/:.*$//;
        foreach my $field (split(';', $fields))
        {
            my $shortname = 'ERROR';
            if ($field eq 'allfields')
            {
                $shortname = 'ZZ';
            }
            elsif (defined $self->{'buildproc'}->{'indexfieldmap'}->{$field})
            {
                $shortname = $self->{'buildproc'}->{'indexfieldmap'}->{$field};
            }
            else
            {
                print STDERR 'Error! Couldn\'t find indexfieldmap for field: ' . $field . "\n";
            }
            push (@indexfieldmap, $field . '->' . $shortname);
            push (@indexfields, $field);
        }
    }

    if (scalar @indexfieldmap)
    {
        $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    }

    if (scalar @indexfields)
    {
        $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
    }
}
257
# Generate solr schema.xml file based on 'indexfieldmap' and other associated
# config files
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl
#
# Reads  : $self->{'build_cfg'}->{'indexfieldmap'} ("<field>-><shortname>"
#          strings), $ENV{'GEXT_SOLR'}, $ENV{'GSDLCOLLECTDIR'}
# Writes : schema.xml, solrconfig.xml, stopwords.txt, stopwords_en.txt,
#          synonyms.txt and protwords.txt under <GSDLCOLLECTDIR>/etc/conf,
#          each generated from the matching "<name>.in" template found in
#          <GEXT_SOLR>/conf
sub premake_solr_auxiliary_files
{
    my $self = shift (@_);

    # Replace the following marker:
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... />
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    # The same field can be listed by several indexes, in which case it
    # appears in 'indexfieldmap' more than once; declare each schema field
    # only once
    my %declared = ();

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {

        # the short name is whatever follows the last "->"
        my ($field) = ($ifm =~ m/^.*->(.*)$/);

        # skip malformed entries rather than emit a field with an empty name
        next unless defined $field;
        next if $declared{$field}++;

        $schema_insert_xml .= " "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";

        if($field eq "LA" || $field eq "LO")
        {
            $schema_insert_xml .= "type=\"location\" ";
        }
#       elsif ($field ne "ZZ" && $field ne "TX")
#       {
#           $schema_insert_xml .= "type=\"string\" ";
#       }
        else
        {
            $schema_insert_xml .= "type=\"text_en_splitting\" ";
        }
        $schema_insert_xml .= "indexed=\"true\" stored=\"false\" multiValued=\"true\" />\n";
        #$schema_insert_xml .= "indexed=\"true\" stored=\"true\" multiValued=\"true\" />\n";
    }

    # just the one rule to date
    my $insert_rules
        = [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
              'insert' => $schema_insert_xml } ];

    my $solr_home = $ENV{'GEXT_SOLR'};
##  my $in_dirname = &FileUtils::filenameConcatenate($solr_home,"etc","conf");
    my $in_dirname = &FileUtils::filenameConcatenate($solr_home,"conf");
    my $schema_in_filename = &FileUtils::filenameConcatenate($in_dirname,"schema.xml.in");

    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my $out_dirname = &FileUtils::filenameConcatenate($collect_home,"etc","conf");
    my $schema_out_filename = &FileUtils::filenameConcatenate($out_dirname,"schema.xml");

    # make sure output conf directory exists
    if (!FileUtils::directoryExists($out_dirname)) {
        &FileUtils::makeDirectory($out_dirname);
    }

    filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

    # now do the same for solrconfig.xml, stopwords, ...
    # these are simpler, as they currently do not need any filtering

    my @in_file_list = ( "solrconfig.xml", "stopwords.txt", "stopwords_en.txt",
                         "synonyms.txt", "protwords.txt" );

    foreach my $file ( @in_file_list ) {
        my $in_filename = &FileUtils::filenameConcatenate($in_dirname,$file.".in");
        my $out_filename = &FileUtils::filenameConcatenate($out_dirname,$file);
        # an empty rule list makes this a plain line-by-line copy
        filter_in_out_file($in_filename,$out_filename,[]);
    }
}
338
339
# Prepare Solr before any index is built.
#
# Steps, in order:
#   1. create a solrserver for the build directory, start it and keep it in
#      $self->{'solr_server'}
#   2. work out the index field -> short name mapping, either from a full
#      pass over the documents (when a 'metadata' index is requested) or
#      directly from the field names listed in the indexes
#   3. write the field list and the Solr config files (schema.xml etc.)
#   4. create or reload one Solr core per entry in $self->{'levels'}
#
# Arguments:
#   $indexname - optional; when it contains a word character only this index
#                is considered, otherwise every index in collect_cfg is
sub pre_build_indexes
{
    my ($self, $indexname) = @_;
    my $out = $self->{'outhandle'};

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is ready and listening

    my $solr_server = solrserver->new($self->{'build_dir'});
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # skip para-level check, as this is done in the main 'build_indexes'
    # routine

    my $wants_all_metadata = 0; # has the user added a 'metadata' index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @plain_fields = ();

    foreach my $index (@$indexes) {
        next unless $self->want_built($index);

        # get the parameters for the output
        # split on : just in case there is subcoll and lang stuff
        my ($field_spec) = split (/:/, $index);

        foreach my $name (split (/;/, $field_spec)) {
            if ($name ne "metadata") {
                push(@plain_fields, $name);
            }
            else {
                $wants_all_metadata = 1;
            }
        }
    }

    if (!$wants_all_metadata) {
        # Field mapping solely dependent on entries in 'indexes'

        # No need to explicitly handle "allfields" as create_shortname()
        # will get a fix on it through its static_indexfield_map

        my $proc = $self->{'buildproc'};

        foreach my $name (@plain_fields)
        {
            # already mapped, nothing to do
            next if defined $proc->{'indexfieldmap'}->{$name};

            my $short = (defined $proc->{'fieldnamemap'}->{$name})
                ? $proc->{'fieldnamemap'}->{$name}
                : $proc->create_shortname($name);

            $proc->{'indexfieldmap'}->{$name} = $short;
            # also flag the short name itself as taken
            $proc->{'indexfieldmap'}->{$short} = 1;
        }
    }
    else {
        # (Unfortunately) we need to process all the documents in the
        # collection to figure out what the metadata_field_mapping is

        # set up the document processor
        $self->{'buildproc'}->set_output_handle (undef);
        $self->{'buildproc'}->set_mode ('index_field_mapping');
        $self->{'buildproc'}->reset();

        &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                       $self->{'buildproc'}, $self->{'maxdocs'});
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
        &plugin::end($self->{'pluginfo'});
    }

    # Write out solr 'schema.xml' (and related) file
    #
    $self->make_final_field_list();
    $self->premake_solr_auxiliary_files();

    # Now update the solr-core information in solr.xml
    # => at most two cores <colname>-Doc and <colname>-Sec

    my $site = $self->{'site'};
    my $collect = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collect" : $collect;

    # my $idx = $self->{'index_mapping'}->{$index};
    my $idx = "idx";

    my $build_dir = $self->{'build_dir'};

    # force_removeold == opposite of being run in 'incremental' mode
    my $rebuild_from_scratch = !$self->{'incremental'};

    foreach my $level (keys %{$self->{'levels'}}) {

        # index directory is the first character of the level name + "idx"
        my ($first_char) = $level =~ /^(.)/;
        my $index_dir = $first_char.$idx;
        my $core = "$core_prefix-$index_dir";

        if (!$rebuild_from_scratch) {
            # if collect==core already in solr.xml (check with STATUS)
            # => use RELOAD call to refresh fields now expressed in schema.xml
            #
            # else
            # => use CREATE API to add to solr.xml

            my $core_exists = $solr_server->admin_ping_core($core);

            if (!$core_exists) {
                print $out "Creating Solr core: $core\n";
                $solr_server->admin_create_core($core);
            }
            else {
                print $out "Reloading Solr core: $core\n";
                $solr_server->admin_reload_core($core);
            }
            next;
        }

        print $out "\n-removeold set (new index will be created)\n";

        # create cores under temporary core names, corresponding to building directory
        $core = "building-".$core;

        # start from an empty index directory
        my $full_index_dir = &FileUtils::filenameConcatenate($build_dir,$index_dir);
        &FileUtils::removeFilesRecursive($full_index_dir);
        &FileUtils::makeDirectory($full_index_dir);

        # now go on and create new index
        print $out "Creating Solr core: $core\n";
        $solr_server->admin_create_core($core);
    }

}
502
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
# Builds one index at one level by piping the documents, wrapped in
# <update>...</update>, to
#   solr_passes.pl <core> index <build_dir> <indexdir>
# In debug mode the output goes to STDOUT instead of the pipe.
#
# Arguments:
#   $index  - index specification "fields[:subcollections[:languages]]",
#             with the subcollections and languages comma separated
#   $llevel - tag level to index; matched against the values of
#             %mgppbuilder::level_map for the keys of $self->{'levels'},
#             falling back to "document" (with a warning) if none matches
#
# Dies if the pipe to solr_passes cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so solr_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    # Each language (including any leading "!" negation) is passed on
    # unchanged; the original stripped the "!" and then put it straight back
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (solr_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $site = $self->{'site'};
        my $collect = $self->{'collection'};
        my $core_prefix = (defined $site) ? "$site-$collect" : $collect;
        my $ds_idx = $self->{'index_mapping'}->{$index};
        my $core = "$core_prefix-$ds_idx";

        # core points to building only for force_removeold
        $core = "building-".$core unless $self->{'incremental'};

        my $cmd = "$solr_passes_exe $core index \"$build_dir\" \"$indexdir\" $osextra";

        print STDERR "Cmd: $cmd\n";
        if (!open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
            # $! holds the OS error; this used to read "!$\n", which
            # never reported it
            die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    if (defined $self->{'collect_cfg'}->{'sortfields'}) {
        $self->{'buildproc'}->set_sortfields ($self->{'collect_cfg'}->{'sortfields'});
    }
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print $handle "</update>\n";

    # never close STDOUT, which is what $handle is in debug mode
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # restore the full set of levels for whatever runs next
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};

}
638
639
# Clean up after all indexes have been built.
#
# Stops the Solr/Jetty server held in $self->{'solr_server'} if that server
# reports it was explicitly started, then clears the reference. Safe to call
# when no server was ever recorded (e.g. pre_build_indexes() never ran).
sub post_build_indexes {
    my $self = shift(@_);

    # deliberately override to prevent the mgpp post_build_index() calling
    #  $self->make_final_field_list()
    # as this has been done in our pre_build_indexes() phase for solr


    # Also need to stop the Solr/jetty server if it was explicitly started
    # in pre_build_indexes()

    my $solr_server = $self->{'solr_server'};

    # Guard against a missing server object, which would otherwise die with
    # "Can't call method ... on an undefined value"
    if (defined $solr_server && $solr_server->explicitly_started()) {
        $solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}
660
661
6621;
663
664
Note: See TracBrowser for help on using the repository browser.