source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 28127

Last change on this file since 28127 was 28127, checked in by davidb, 11 years ago

Commenting out code that looks like it was meant only for debugging

File size: 18.7 KB
Line 
1###########################################################################
2#
3# solrbuildproc.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package solrbuildproc;
27
28# This document processor outputs a document for solr to process
29
30# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31# whose use was then extended to Lucene, Solr has its own XML syntax:
32#
33# http://wiki.apache.org/solr/UpdateXmlMessages
34#
35# Using this means we don't need to write SolrWrapper.jar, as had to be
36# done for Lucene, translating the XML syntax piped to it into appropriate
37# calls to the Lucene API
38
39
40use lucenebuildproc;
41use ghtml;
42use strict;
43no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46use IncrementalBuildUtils;
47
# Establish the inheritance chain at compile time:
# solrbuildproc is a specialisation of lucenebuildproc.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# Constructor.
#
# Builds the object with the lucenebuildproc constructor, forwarding every
# argument after the class name unchanged, and then re-blesses the result
# into $class so that the methods in this package take precedence.
#
# Uses the explicit Class->new(...) form rather than indirect object
# syntax ("new Class (...)"), which is ambiguous to the parser.
sub new {
    my $class = shift @_;
    my $self  = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
# Record the list of facet fields on this build processor.
#
#   $facetfields - reference to an array of field names (may be undef)
#
# The names "text", "allfields" and "metadata" are only valid as index
# names, not as facet fields, so they are dropped.
#
# $self->{'facetfields'} is always left as an array reference, possibly
# empty.  (Assigning "()" to the hash slot, as was done previously, stores
# undef rather than an empty array.)
sub set_facetfields {
    my $self = shift (@_);

    my ($facetfields) = @_;

    $self->{'facetfields'} = [
        grep { $_ !~ /^(text|allfields|metadata)$/ } @{ $facetfields || [] }
    ];

    return;
}
72
73#----
74
# Walk every section of a document and make sure each field that would be
# indexed has a shortname registered in $self->{'indexfieldmap'}.  No output
# is produced; only the mapping tables on $self are updated.
#
#   $doc_obj   - document object (queried for type, sections and metadata)
#   $file      - not used here
#   $edit_mode - "add" or "update"; the two are not distinguished here
#
# Side effects on $self:
#   indexfieldmap - gains  field => shortname  and  shortname => 1  entries,
#                   plus  allfields => "ZZ" / ZZ => 1  when an "allfields"
#                   index is configured
#   indexfields   - gains  field => 1  for every metadata field picked up
#                   through the catch-all "metadata" index
sub index_field_mapping_edit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    # only study this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The index specification is the same for every section, so parse it
    # once up front rather than once per section.
    my @index_fields = defined $fields ? split (/;/, $fields) : ();

    # do we have an allfields index?
    my $allfields_index = grep { $_ eq "allfields" } @index_fields;

    # has the user added a 'metadata' index?
    my $all_metadata_specified = grep { $_ eq "metadata" } @index_fields;

    my $doc_section = 0; # just for this document

    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section))
    {
        $doc_section++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";

        next if ($indexed_doc == 0);
        next if ($indexed_section ne "indexed_section"
                 && $indexed_section ne "indexed_doc");

        # which fields have been explicitly specified for this section?
        my %specified_fields = ();

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled separately below
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified
            # -- could be a comma separated list
            $specified_fields{$real_field} = 1;

            if (!defined $self->{'indexfieldmap'}->{$real_field}) {
                my $shortname = $self->create_shortname($real_field);
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        } # foreach field

        if ($all_metadata_specified) {

            my $metadata = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # already covered by an explicitly specified field
                next if defined ($specified_fields{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;
        }

    } # for each section

    return;
}
204
# Field-mapping pass for a newly added document: hands the document on to
# index_field_mapping_edit with the edit mode fixed to "add".
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
211
# Field-mapping pass for a re-indexed document: hands the document on to
# index_field_mapping_edit with the edit mode fixed to "update".
sub index_field_mappingreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
219
# Field-mapping pass for a deleted document.  Deliberately a no-op: the
# arguments are accepted for interface symmetry and nothing is returned.
sub index_field_mappingdelete {
    my ($self, $doc_obj, $file) = @_;

    return; # nothing to be done
}
227
228
229#----
230
# textedit -- write the Solr update XML for one document to the output handle.
#
#   $doc_obj   - document object; queried for its type, OID, sections,
#                text and metadata
#   $file      - not used by this implementation (it is forwarded to the
#                super-class when running in text-compress mode)
#   $edit_mode - "add", "update" or "delete"
#
# When get_indexing_text() is false (text-compress mode) the call is handed
# straight to the super-class.  Otherwise:
#
#   * "add" and "update" both emit <add><doc>...</doc></add> messages.
#     "update" relies on Solr replacing an existing document that has the
#     same id, so it must use the same <add> wrapper as "add".  The only
#     difference is that "update" leaves the num_docs / num_bytes /
#     num_processed_bytes statistics untouched.
#   * "delete" emits <delete><id>...</id></delete> and nothing else; no
#     <field> elements are written, and the statistics are decremented.
#
# With a 'section' level configured one message is emitted per section
# (id = OID, or OID.section for sub-sections); otherwise a single message
# wraps the whole document.
#
# Side effects on $self: num_docs, num_sections, num_bytes,
# num_processed_bytes, indexfieldmap, indexfields, sortfieldnamemap and
# actualsortfields are updated as fields are encountered.
#
# NOTE(review): the document OID is interpolated into the XML unescaped, as
# before -- presumably OIDs never contain XML metacharacters; confirm.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        #  => want document to be output in the simple <Doc>..</Doc> as is
        #     done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The index specification does not change from section to section, so
    # parse it once here.
    my @index_fields = defined $fields ? split (/;/, $fields) : ();

    # do we have an allfields index?
    my $allfields_index = grep { $_ eq "allfields" } @index_fields;

    # has the user added a 'metadata' index?
    my $all_metadata_specified = grep { $_ eq "metadata" } @index_fields;

    my $levels = $self->{'levels'};

    my $sec_tag_name = "";
    if ($levels->{'section'}) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }
    # true when one Solr message is written per section
    my $section_level = ($sec_tag_name ne "");

    my $gs2_docOID  = $doc_obj->get_OID();
    my $top_section = $doc_obj->get_top_section();

    # "add" and "update" both (re)submit the document; anything else is
    # wrapped as a delete request, as before.
    my $is_upsert = ($edit_mode eq "add" || $edit_mode eq "update");

    # Returns the (opening, closing) XML wrapper for the given id.
    my $wrapper_for = sub {
        my ($oid) = @_;

        if ($is_upsert) {
            my $open = " <add>\n";
            $open .= " <doc>\n";
            $open .= " <field name=\"docOID\">$oid</field>\n";

            my $close = " </doc>\n";
            $close .= " </add>\n";

            return ($open, $close);
        }

        my $open = " <delete>\n";
        $open .= " <id>$oid</id>\n";

        return ($open, " </delete>\n");
    };

    my ($start_doc, $end_doc) = $wrapper_for->($gs2_docOID);

    # the document-level wrapper is only used when not indexing by section
    my $text = $section_level ? "" : $start_doc;

    # Account for a chunk of generated XML and, unless deleting, append it
    # to the output.  "update" appends without touching the byte count.
    my $emit = sub {
        my ($chunk) = @_;

        if ($edit_mode eq "add") {
            $self->{'num_processed_bytes'} += length ($chunk);
            $text .= $chunk;
        }
        elsif ($edit_mode eq "update") {
            $text .= $chunk;
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_processed_bytes'} -= length ($chunk);
        }
    };

    my $doc_section = 0; # just for this document

    for (my $section = $top_section;
         defined $section;
         $section = $doc_obj->get_next_section($section))
    {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my ($start_sec, $end_sec) = $wrapper_for->($sec_gs2_docOID);

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($section_level) {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($section_level);

        if ($edit_mode eq "add") {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have been explicitly specified for this section?
        my %specified_fields = ();

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled separately below
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields{$real_field} = 1;

            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; #strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $top_section && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($top_section, $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # nothing found for this index in this section
            next unless ($section_text ne "" || scalar(@metadata_list));

            # now we add the text and/or metadata into new_text
            my $new_text = "";

            if ($section_text ne "") {
                $new_text .= "$section_text ";
            }

            foreach my $item (@metadata_list) {
                &ghtml::htmlsafe($item);
                $new_text .= "$item ";
            }

            if ($allfields_index) {
                $allfields_text .= $new_text;
            }

            # Remove any leading or trailing white space
            $new_text =~ s/\s+$//;
            $new_text =~ s/^\s+//;

            if ($self->{'indexing_text'}) {
                # add the tag
                $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
            }
            # filter the text
            $new_text = $self->filter_text ($field, $new_text);

            $emit->($new_text);

            if ($self->{'indexing_text'} && $new_field) {
                # a shortname is only registered once the field has
                # actually produced some content
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                my $shortname;
                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $emit->($new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $emit->($new_text);
        }

        # only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
        if ($self->{'indexing_text'} && ($section_level || $doc_section == 1 )) {
            # add sort fields if there are any
            my %seenfields = ();
            foreach my $sfield (@{$self->{'sortfields'}}, @{$self->{'facetfields'}}) {
                # ignore special field rank/none
                next if $sfield eq "rank" || $sfield eq "none";
                # ignore any we have already done - we may have duplicates in the sort and facet lists
                next if (defined $seenfields{$sfield});
                $seenfields{$sfield} = 1;

                my $sf_shortname;
                if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
                    $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
                }
                else {
                    $sf_shortname = $self->create_sortfield_shortname($sfield);
                    $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
                    $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
                }

                my @metadata_list = (); # put any metadata values in here
                foreach my $submeta (split /,/, $sfield) {
                    $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)

                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $top_section && defined ($self->{'sections_sort_on_document_metadata'})) {
                        if ($self->{'sections_sort_on_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($top_section, $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }

                my $new_text = "";
                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }
                if ($new_text =~ /\S/) {
                    $new_text = "<field name=\"$sf_shortname\">$new_text</field>\n";
                    # <field> elements only belong inside <add><doc>; a
                    # <delete> message must carry nothing but the id
                    $text .= "$new_text" if ($is_upsert);
                    $self->{'actualsortfields'}->{$sfield} = 1;
                }
            }
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($section_level);

    } # for each section

    # only output if working with doc level
    $text .= $end_doc if (!$section_level);

    print $solrhandle $text;
}
635
636
637
638
# Re-index entry point: hands the document on to textedit with the edit
# mode fixed to "update".
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
646
647
6481;
649
650
Note: See TracBrowser for help on using the repository browser.