source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 29945

Last change on this file since 29945 was 29945, checked in by ak19, 9 years ago

Incremental building now works (again) for solr. The changes were 1. deleting a document should only contain the delete xml tags and an inner tag of the doc id, not the entire text and metadata as is necessary when adding a document. 2. Reindexing in solrbuildproc used to call textedit with the mode set to update, but update is not implemented in solrbuildproc, instead it now calls textedit first with the mode set to delete and then with the mode set to add.

File size: 19.3 KB
Line 
1###########################################################################
2#
3# solrbuildproc.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package solrbuildproc;
27
28# This document processor outputs a document for solr to process
29
30# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31# whose use was then extended to Lucene, Solr has its own XML syntax:
32#
33# http://wiki.apache.org/solr/UpdateXmlMessages
34#
35# Using this means we don't need to write SolrWrapper.jar, as had to be
36# done for Lucene, translating the XML syntax piped to it into appropriate
37# calls to the Lucene API
38
39
40use lucenebuildproc;
41use ghtml;
42use strict;
43no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46use IncrementalBuildUtils;
47
# Set up inheritance at compile time: solrbuildproc derives from
# lucenebuildproc.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# Constructor.  Builds the object with the lucenebuildproc constructor,
# forwarding every remaining argument unchanged, and then re-blesses the
# result into the class this constructor was invoked on.
#
# Returns: the newly constructed object, blessed into $class.
sub new {
    my $class = shift @_;

    # Explicit arrow call rather than the indirect-object form
    # "new lucenebuildproc (@_)": how perl parses the indirect form depends
    # on which subs and packages happen to be known at compile time.
    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
# Stores the list of facet fields on the object.
#
# Parameters:
#   $facetfields - reference to an array of field names (may be undef)
#
# The names "text", "allfields" and "metadata" are only valid for indexes,
# not for facet fields, so they are dropped.  $self->{'facetfields'} is
# always left holding an array reference, which is empty when nothing was
# passed in or when every name was filtered out.
sub set_facetfields {
    my $self = shift (@_);
    my ($facetfields) = @_;

    # The earlier "= ()" assignment stored undef rather than an empty array,
    # so the slot stayed undefined whenever no field survived the filter.
    my @accepted = grep { $_ !~ /^(text|allfields|metadata)$/ }
                        @{ $facetfields || [] };

    $self->{'facetfields'} = \@accepted;

    return;
}
72
73#----
74
# Registers index field short names for a document without producing any
# output.
#
# Parameters:
#   $doc_obj   - the document object to examine
#   $file      - accepted for interface compatibility; not used here
#   $edit_mode - accepted for interface compatibility; only add/update get
#                here and the two are not distinguished
#
# For every indexed section of the document this updates:
#   $self->{'indexfieldmap'} - field name => short name, plus
#                              short name => 1, for each field named in the
#                              index specification, for each metadata field
#                              picked up by a "metadata" index, and
#                              "allfields" => "ZZ" for an "allfields" index
#   $self->{'indexfields'}   - metadata field name => 1 for each metadata
#                              field picked up by a "metadata" index
#
# Returns nothing useful.  Documents whose type is not "indexed_doc" are
# ignored.
sub index_field_mapping_edit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    # only study this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The field list is the same for every section, so parse it once.
    my @field_list = defined $fields ? split (/;/, $fields) : ();

    # has the user added a 'metadata' index?
    my $all_metadata_specified = 0;
    # do we have an allfields index??
    my $allfields_index = 0;

    foreach my $field (@field_list) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # just for this document

    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section))
    {
        $doc_section++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";

        next if ($indexed_doc == 0);
        next if ($indexed_section ne "indexed_section"
                 && $indexed_section ne "indexed_doc");

        # which fields have already been indexed?
        # (same as fields, but in a map)
        my $specified_fields = {};

        foreach my $field (@field_list) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled further down
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified
            # -- could be a comma separated list
            $specified_fields->{$real_field} = 1;

            if (!defined $self->{'indexfieldmap'}->{$real_field}) {
                my $shortname = $self->create_shortname($real_field);
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        } # foreach field

        if ($all_metadata_specified) {
            my $metadata = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;
        }
    } # for each section

    return;
}
204
# Records the index field mappings for a document that is being added.
# Delegates to index_field_mapping_edit with the edit mode "add" and hands
# back whatever that call returns.
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
211
# Records the index field mappings for a document that is being reindexed.
# Delegates to index_field_mapping_edit with the edit mode "update" and
# hands back whatever that call returns.
sub index_field_mappingreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
219
# Counterpart of index_field_mapping for a document that is being deleted.
# Deleting needs no field mappings, so this does nothing and returns an
# empty list (undef in scalar context).
sub index_field_mappingdelete {
    my ($self, $doc_obj, $file) = @_;

    # nothing to be done
    return;
}
227
228
229#----
230
# Writes the Solr update XML for one document to $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj   - the document object to process
#   $file      - accepted for interface compatibility; not used here
#   $edit_mode - "add" emits <add><doc>...</doc></add> messages; any other
#                value emits <delete><id>...</id></delete> messages.
#                There is no separate "update" path: textreindex issues a
#                "delete" followed by an "add".
#
# Behaviour:
#   * When get_indexing_text() is false (text-compress mode) the call is
#     handed straight to the super-class textedit.
#   * Documents whose type is not "indexed_doc" are ignored.
#   * Delete: the document OID is always deleted.  When indexing at section
#     level, every sub-section is stored under "<OID>.<section>", so one
#     extra delete is emitted per sub-section.
#   * Add: at document level a single <add><doc> wraps the fields of all
#     sections; at section level each section gets its own <add><doc>
#     keyed by "<OID>.<section>" (the top section uses the plain OID).
#
# Statistics touched: num_docs (add: +1, delete: -1) and, for adds only,
# num_sections, num_bytes and num_processed_bytes.  Adds also extend
# indexfieldmap, indexfields, sortfieldnamemap and actualsortfields.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        # => want document to be output in the simple <Doc>..</Doc> as is
        # done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    # a non-empty section tag name means we are indexing at section level
    my $sec_tag_name = "";
    if ($lsec_level)
    {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    if ($edit_mode ne "add") {
        # Delete mode: only the ids are needed, never text or metadata.
        # The document-level id always comes first.
        my @delete_ids = ($gs2_docOID);

        if ($sec_tag_name ne "") {
            # Section-level entries are keyed "<OID>.<section>"; without
            # these extra deletes they would be left behind.  The top
            # section ("") shares the document id and is already listed.
            for (my $section = $doc_obj->get_top_section();
                 defined $section;
                 $section = $doc_obj->get_next_section($section))
            {
                push (@delete_ids, "$gs2_docOID.$section") if ($section ne "");
            }
        }

        foreach my $delete_id (@delete_ids) {
            my $delete_xml = " <delete>\n";
            $delete_xml .= " <id>$delete_id</id>\n";
            $delete_xml .= " </delete>\n";
            print $solrhandle $delete_xml;
        }
        return;
    }

    # ---- everything below runs in "add" mode only ----

    my $start_doc = " <add>\n";
    $start_doc .= " <doc>\n";
    $start_doc .= " <field name=\"docOID\">$gs2_docOID</field>\n";

    my $end_doc = " </doc>\n";
    $end_doc .= " </add>\n";

    # The field list is the same for every section, so parse it once.
    my @field_list = defined $fields ? split (/;/, $fields) : ();

    # has the user added a 'metadata' index?
    my $all_metadata_specified = 0;
    # do we have an allfields index??
    my $allfields_index = 0;
    foreach my $field (@field_list) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # just for this document

    # the document wrapper is only output when working at doc level
    my $text = ($sec_tag_name eq "") ? $start_doc : "";

    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section))
    {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my $start_sec = " <add>\n";
        $start_sec .= " <doc>\n";
        $start_sec .= " <field name=\"docOID\">$sec_gs2_docOID</field>\n";

        my $end_sec = " </doc>\n";
        $end_sec .= " </add>\n";

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($sec_tag_name ne "");

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@field_list) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;
            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; # strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # nothing found for this index in this section
            next unless ($section_text ne "" || scalar(@metadata_list));

            # now we add the text and/or metadata into new_text
            my $new_text = "";

            if ($section_text ne "") {
                $new_text .= "$section_text ";
            }

            foreach my $item (@metadata_list) {
                &ghtml::htmlsafe($item);
                $new_text .= "$item ";
            }

            if ($allfields_index) {
                $allfields_text .= $new_text;
            }

            # Remove any leading or trailing white space
            $new_text =~ s/\s+$//;
            $new_text =~ s/^\s+//;

            if ($self->{'indexing_text'}) {
                # add the tag
                $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
            }
            # filter the text
            $new_text = $self->filter_text ($field, $new_text);

            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= "$new_text";

            if ($self->{'indexing_text'} && $new_field) {
                # we need to add to the list in indexfields
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                my $shortname;
                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= "$new_text";
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= "$new_text";
        }

        # only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
        if ($self->{'indexing_text'} && ($sec_tag_name ne "" || $doc_section == 1 )) {
            # add sort fields if there are any
            my $seenfields = {};
            foreach my $sfield (@{$self->{'sortfields'}}, @{$self->{'facetfields'}}) {
                # ignore special field rank/none
                next if $sfield eq "rank" || $sfield eq "none";
                # ignore any we have already done - we may have duplicates in the sort and facet lists
                next if (defined $seenfields->{$sfield});
                $seenfields->{$sfield} = 1;

                my $sf_shortname;
                if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
                    $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
                }
                else {
                    $sf_shortname = $self->create_sortfield_shortname($sfield);
                    $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
                    $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
                }

                my @metadata_list = (); # put any metadata values in here
                foreach my $submeta (split /,/, $sfield) {
                    $submeta =~ s/^ex\.([^.]+)$/$1/; # strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)

                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && defined ($self->{'sections_sort_on_document_metadata'})) {
                        if ($self->{'sections_sort_on_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }

                my $new_text = "";
                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }
                if ($new_text =~ /\S/) {
                    $new_text = "<field name=\"$sf_shortname\">$new_text</field>\n";
                    # filter the text???
                    $text .= "$new_text"; # add it to the main text block
                    $self->{'actualsortfields'}->{$sfield} = 1;
                }
            }
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($sec_tag_name ne "");

    } # for each section

    # only output if working with doc level
    $text .= $end_doc if ($sec_tag_name eq "");

    print $solrhandle $text;
}
648
649
650
651
# Reindexes a document.  solrbuildproc has no "update" edit mode, so the
# document is first removed and then added again through textedit.
# Returns whatever the final "add" call to textedit returns.
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    $self->textedit($doc_obj, $file, "delete");

    return $self->textedit($doc_obj, $file, "add");
}
663
664
6651;
666
667
Note: See TracBrowser for help on using the repository browser.