source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 24501

Last change on this file since 24501 was 24447, checked in by davidb, 13 years ago

Tidy up of code (removing commented out redundant code), plus tweaking of code that starts and stops jetty to cope with situation where the server is already running

File size: 15.9 KB
Line 
1###########################################################################
2#
3# solrbuildproc.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package solrbuildproc;
27
28# This document processor outputs a document for solr to process
29
30# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31# whose use was then extended to Lucene, Solr has its own XML syntax:
32#
33# http://wiki.apache.org/solr/UpdateXmlMessages
34#
35# Using this means we don't need to write SolrWrapper.jar, as had to be
36# done for Lucene, translating the XML syntax piped to it into appropriate
37# calls to the Lucene API
38
39
40use lucenebuildproc;
41use ghtml;
42use strict;
43no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46use IncrementalBuildUtils;
47
# Set up inheritance at compile time: solrbuildproc is a subclass of
# lucenebuildproc.
BEGIN {
    @solrbuildproc::ISA = ('lucenebuildproc');
}
51
52
# Constructor.
#
# Builds the object with the lucenebuildproc constructor, forwarding every
# remaining argument unchanged, then re-blesses the result into $class so
# that the methods defined in this package are the ones dispatched.
#
# Parameters:
#   $class - class name to bless the new object into
#   @_     - passed straight through to lucenebuildproc->new
# Returns: the blessed object.
sub new {
    my $class = shift @_;

    # Explicit class-method call instead of the indirect object form
    # ("new lucenebuildproc (...)"), whose parse depends on which subs and
    # packages happen to be known at compile time.
    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
60
61#----
62
# Register the index-field -> shortname mappings that one document needs,
# without producing any index text.
#
# Parameters:
#   $doc_obj   - document object; queried through get_doc_type(),
#                get_top_section(), get_next_section(),
#                get_metadata_element() and get_all_metadata()
#   $file      - accepted for interface compatibility; not used here
#   $edit_mode - only "add"/"update" reach this sub, and they are
#                not distinguished
#
# Side effects: may add entries to $self->{'indexfieldmap'} (field name ->
# shortname, and shortname -> 1) and to $self->{'indexfields'}.
# No meaningful return value.
sub index_field_mapping_edit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    # only study this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The field list and the two flags below do not change from section to
    # section, so work them out once rather than once per section.
    my @field_list = split (/;/, $fields);

    # do we have an allfields index?
    my $allfields_index = grep { $_ eq "allfields" } @field_list;

    # has the user added a 'metadata' index?
    my $all_metadata_specified = grep { $_ eq "metadata" } @field_list;

    my $doc_section = 0; # just for this document

    my $section = $doc_obj->get_top_section();

    while (defined $section)
    {
        $doc_section++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";

        if (($indexed_doc == 0)
            || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # which fields have been explicitly named for this section?
        # (same as fields, but in a map)
        my $specified_fields = {};

        foreach my $field (@field_list) {

            # only deal with this field if it doesn't start with top or
            # this is the first section.  Work on a copy so the shared
            # field list keeps its "top" prefixes for later sections.
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled separately below
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified
            # -- could be a comma separated list
            $specified_fields->{$real_field} = 1;

            if (!defined $self->{'indexfieldmap'}->{$real_field}) {
                my $shortname = $self->create_shortname($real_field);
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        } # foreach field

        if ($all_metadata_specified) {

            my $metadata = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # already covered by an explicitly named field
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;
        }

        $section = $doc_obj->get_next_section($section);

    } # while defined section
}
192
# Record index field mappings for a document that is being added.
# Delegates to index_field_mapping_edit with the edit mode fixed to "add".
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
199
# Record index field mappings for a document that is being reindexed.
# Delegates to index_field_mapping_edit with the edit mode fixed to "update".
sub index_field_mappingreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
207
# Deleting a document requires no change to the index field mappings, so
# this is deliberately a no-op.  The arguments are accepted only to keep
# the signature parallel with the add/reindex variants.
sub index_field_mappingdelete {
    my ($self, $doc_obj, $file) = @_;

    return; # nothing to be done
}
215
216
217#----
218
# Build the opening and closing Solr update-XML wrappers for one record.
#
# "delete" yields a <delete><id>..</id></delete> wrapper; every other mode
# ("add" and "update") yields <add><doc>..</doc></add> with a docOID field,
# because field text is emitted for both of those modes.
# Returns: ($start, $end)
sub _solr_envelope {
    my ($edit_mode, $oid) = @_;

    if ($edit_mode eq "delete") {
        my $start = " <delete>\n";
        $start .= " <id>$oid</id>\n";
        return ($start, " </delete>\n");
    }

    my $start = " <add>\n";
    $start .= " <doc>\n";
    $start .= " <field name=\"docOID\">$oid</field>\n";

    my $end = " </doc>\n";
    $end .= " </add>\n";

    return ($start, $end);
}

# Emit the Solr update XML for one document to $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj   - document object to be indexed
#   $file      - not used directly; forwarded to the superclass when
#                get_indexing_text() is false
#   $edit_mode - "add", "update" or "delete"
#
# When get_indexing_text() is false the whole call is handed to
# SUPER::textedit.  Otherwise one record is written per document, or one
# per section when section-level indexing is configured.
#
# "update" is treated like "add" (the comment history of this module states
# that Solr replaces an existing document with the same id), except that
# the num_docs / num_bytes / num_processed_bytes statistics are left
# untouched.  "delete" adjusts those statistics downwards and emits only
# the <delete> wrapper.
#
# Side effects: updates the statistics above plus num_sections, and may add
# entries to $self->{'indexfieldmap'} and $self->{'indexfields'}.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        # => want document to be output in the simple <Doc>..</Doc> as is
        # done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The field list and the two flags below are the same for every
    # section, so compute them once.
    my @field_list = split (/;/, $fields);

    # do we have an allfields index?
    my $allfields_index = grep { $_ eq "allfields" } @field_list;

    # has the user added a 'metadata' index?
    my $all_metadata_specified = grep { $_ eq "metadata" } @field_list;

    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    my ($start_doc, $end_doc) = _solr_envelope($edit_mode, $gs2_docOID);

    my $sec_tag_name = "";
    if ($lsec_level)
    {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $doc_section = 0; # just for this document

    # Declare unconditionally, then assign conditionally.  Combining "my"
    # with a statement modifier leaves the variable's state undefined when
    # the condition is false, so text could carry over between calls.
    my $text = "";

    # only output the document wrapper if working with doc level
    $text = $start_doc if ($sec_tag_name eq "");

    # Shared bookkeeping for each generated chunk of field XML:
    #   add    - count the bytes and emit the chunk
    #   update - emit the chunk, leave the statistics alone
    #   delete - uncount the bytes, emit nothing
    my $append_output = sub {
        my ($chunk) = @_;

        if ($edit_mode eq "add") {
            $self->{'num_processed_bytes'} += length ($chunk);
            $text .= $chunk;
        }
        elsif ($edit_mode eq "update") {
            $text .= $chunk;
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_processed_bytes'} -= length ($chunk);
        }
    };

    # get the text for this document
    my $section = $doc_obj->get_top_section();

    while (defined $section)
    {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my ($start_sec, $end_sec) = _solr_envelope($edit_mode, $sec_gs2_docOID);

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($sec_tag_name ne "");

        if ($edit_mode eq "add") {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@field_list) {

            # only deal with this field if it doesn't start with top or
            # this is the first section.  Work on a copy so the shared
            # field list keeps its "top" prefixes for later sections.
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;

            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; # strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        # optionally fold the top-level (document) metadata
                        # into this sub-section as well
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($allfields_index) {
                    $allfields_text .= $new_text;
                }

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
                }
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);

                $append_output->($new_text);

                if ($self->{'indexing_text'} && $new_field) {
                    # we need to add to the list in indexfields
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $shortname = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $append_output->($new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $append_output->($new_text);
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    # only output the document wrapper if working with doc level
    $text .= $end_doc if ($sec_tag_name eq "");

## $text .= "<commit/>\n";

    print $solrhandle $text;
}
563
564
565
566
# Re-emit a document that is being reindexed.
# Delegates to textedit with the edit mode fixed to "update".
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
574
575
5761;
577
578
Note: See TracBrowser for help on using the repository browser.