root/gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm @ 32098

Revision 32098, 19.8 KB (checked in by ak19, 3 years ago)

Fix to solr facets: if there are 2 values assigned for a metadata of a doc, e.g. a doc's subjects meta is set to youth, and another subjects meta for that doc is set to food, it used to appear under solr's subjects facet as a new, single term called 'youth food'. But now each distinct value for that meta is indexed as a separate field with a separate value, despite the same field name (the meta, e.g. subjects). In order for this to work, the schema.xml.in, which gets copied into a solr col's etc/conf folder, needed to specify that the dynamic field for by* is multivalued (previously set to false, now true) to allow many field elements for the same field name. Tested solr-jdbm-demo (and my simple test case). Nothing that wasn't already broken in faceted searching of solr-jdbm-demo appears broken here, but results are still different, which may be expected since there's no accumulated meta value for a doc's metadata anymore.

Line 
1###########################################################################
2#
3# solrbuildproc.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package solrbuildproc;
27
28# This document processor outputs a document for solr to process
29
30# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31# whose use was then extended to Lucene, Solr has its own XML syntax:
32#
33#  http://wiki.apache.org/solr/UpdateXmlMessages
34#
35# Using this means we don't need to write SolrWrapper.jar, as had to be
36# done for Lucene, translating the XML syntax piped to it into appropriate
37# calls to the Lucene API
38
39
40use lucenebuildproc;
41use ghtml;
42use strict;
43no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46use IncrementalBuildUtils;
47
# Establish the inheritance chain at compile time: solrbuildproc is a
# specialisation of lucenebuildproc.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# new(@args)
#
# Constructor.  Builds the object with the lucenebuildproc constructor,
# passing every argument through unchanged, then re-blesses the result
# into the class this constructor was invoked on.
#
# Returns the blessed object.
sub new {
    my $class = shift @_;

    # Explicit class-method call rather than the indirect-object form
    # ("new lucenebuildproc (...)"), whose parsing depends on which names
    # perl happens to know about at compile time.
    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
# set_facetfields($facetfields)
#
# Stores the facet field names on the object under 'facetfields'.
#   $facetfields - reference to an array of field names (an undefined
#                  value is treated as an empty list)
#
# The names "text", "allfields" and "metadata" are only valid as index
# names, not as facet fields, so they are dropped.
#
# The stored value is always an array reference, even when nothing is
# left after filtering.  (Assigning "()" to the hash slot, as was done
# before, stores undef rather than an empty array.)
sub set_facetfields {
    my $self = shift (@_);
    my ($facetfields) = @_;

    $self->{'facetfields'} = [
        grep { $_ !~ /^(text|allfields|metadata)$/ } @{ $facetfields || [] }
    ];

    return;
}
72
73#----
74
# index_field_mapping_edit($doc_obj, $file, $edit_mode)
#
# Walks every section of $doc_obj and makes sure each index field that
# applies to it has an entry in $self->{'indexfieldmap'} (field name ->
# shortname, plus shortname -> 1).  When the index specification includes
# "metadata", every remaining metadata field of the section is mapped as
# well and noted in $self->{'indexfields'}.  When it includes "allfields",
# the fixed mapping allfields -> "ZZ" is recorded.
#
# Only add/update calls arrive here; $edit_mode (like $file) is accepted
# but makes no difference to the processing.
sub index_field_mapping_edit {
    my ($self, $doc_obj, $file, $edit_mode) = @_;

    # Documents of any other type contribute nothing to the mapping.
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $in_subcollection = $self->is_subcollection_doc($doc_obj);

    # The index specification may carry subcollection/language parts after
    # a ':'; only the leading ';'-separated field list is of interest.
    my ($field_spec) = split (/:/, $self->{'index'});
    my @index_fields = split (/;/, $field_spec);

    # Has the user asked for the catch-all 'allfields' / 'metadata' indexes?
    my $wants_allfields    = grep { $_ eq "allfields" } @index_fields;
    my $wants_all_metadata = grep { $_ eq "metadata" } @index_fields;

    my $section_count = 0;    # position of the current section in this document
    my $section       = $doc_obj->get_top_section();

    while (defined $section) {
        $section_count++;

        # With subcollections in play, some documents/sections are not
        # considered for indexing at all.
        my $section_type = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";

        my $is_indexable = ($in_subcollection != 0)
            && ($section_type eq "indexed_section" || $section_type eq "indexed_doc");

        if ($is_indexable) {

            # Fields named explicitly in the index specification for this
            # section (entries may themselves be comma separated lists).
            my %explicit_field = ();

            foreach my $index_field (@index_fields) {
                my $name = $index_field;

                # A "top" prefix restricts the field to the first section.
                my $top_only = ($name =~ s/^top//);
                next if ($top_only && $section_count != 1);

                # These two are dealt with separately below.
                next if ($name eq "allfields" || $name eq "metadata");

                $explicit_field{$name} = 1;

                next if (defined $self->{'indexfieldmap'}->{$name});

                my $short = $self->create_shortname($name);
                $self->{'indexfieldmap'}->{$name}  = $short;
                $self->{'indexfieldmap'}->{$short} = 1;
            }

            if ($wants_all_metadata) {
                my $metadata = $doc_obj->get_all_metadata ($section);

                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);

                    # nothing to index without a value
                    next unless (defined $mvalue && $mvalue ne "");

                    # already covered by an explicitly named field
                    next if (defined $explicit_field{$mfield});

                    # fields that are never indexed via 'metadata'
                    next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                    next if ($mfield =~ /^gsdl/);

                    if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                        my $short = $self->create_shortname($mfield);
                        $self->{'indexfieldmap'}->{$mfield} = $short;
                        $self->{'indexfieldmap'}->{$short}  = 1;
                    }

                    $self->{'indexfields'}->{$mfield} = 1
                        unless (defined $self->{'indexfields'}->{$mfield});
                }
            }

            if ($wants_allfields) {
                # the combined index always uses the fixed shortname ZZ
                $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
                $self->{'indexfieldmap'}->{"ZZ"}        = 1;
            }
        }

        $section = $doc_obj->get_next_section($section);
    } # while defined section
}
204
# index_field_mapping($doc_obj, $file)
#
# Entry point for a newly added document: runs the shared mapping pass
# (index_field_mapping_edit) in "add" mode and returns its result.
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
211
# index_field_mappingreindex($doc_obj, $file)
#
# Entry point for a document being reindexed: runs the shared mapping
# pass (index_field_mapping_edit) in "update" mode and returns its result.
sub index_field_mappingreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
219
# index_field_mappingdelete($doc_obj, $file)
#
# Deleting a document requires no change to the field mapping, so this
# is deliberately a no-op that returns nothing.
sub index_field_mappingdelete {
    my ($self, $doc_obj, $file) = @_;

    return; # nothing to be done
}
227
228
229#----
230
# textedit($doc_obj, $file, $edit_mode)
#
# Writes the Solr update XML for one document to $self->{'output_handle'}.
#   $doc_obj   - document object; queried below for its type, OID,
#                sections, text and metadata
#   $file      - not referenced directly in this sub (it is forwarded to
#                SUPER::textedit through @_ in the text-compress case)
#   $edit_mode - "add" produces <add><doc>...</doc></add> output; any
#                other value takes the <delete> branch further down
#
# When get_indexing_text() is false the whole call is delegated to the
# superclass implementation.
#
# Counters updated along the way: num_docs, num_sections, num_bytes and
# num_processed_bytes.  New entries may also be added to indexfieldmap,
# indexfields, sortfieldnamemap and actualsortfields.
231sub textedit {
232    my $self = shift (@_);
233    my ($doc_obj,$file,$edit_mode) = @_;
234
235
236    if (!$self->get_indexing_text()) {
237    # In text-compress mode:
238    # => want document to be output in the simple <Doc>..</Doc> as is
239    # done by its super-class
240    return $self->SUPER::textedit(@_);
241    }
242
243    # "update" for $edit_mode near identical to "add" as we use Solr in its
244    # default mode of replacing an existing document if the new document
245    # has the same doc id.  Main area of difference between "add" and "update"
246    # is that we do not update our 'stats' for number of documents or number
247    # of bytes processed.  The latter is inaccurate, but considered better
248    # than allowing the value to steadily climb.
    # NOTE(review): as the code below stands, any mode other than "add"
    # (so "update" too) writes a <delete> element and returns early, which
    # does not match the description above - confirm which is intended.
249
250
251    my $solrhandle = $self->{'output_handle'};
252    my $outhandle = $self->{'outhandle'};
253
254    # only output this document if it is one to be indexed
255    return if ($doc_obj->get_doc_type() ne "indexed_doc");
256
    # NOTE(review): get_indexing_text() was already tested at the top of
    # this sub (we returned when it was false), so this guard looks
    # unreachable unless that call can change its answer - confirm.
257    # skip this document if in "compress-text" mode and asked to delete it
258    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
259
260    my $indexed_doc = $self->is_subcollection_doc($doc_obj);
261
262    # this is another document
263    if ($edit_mode eq "add") {
264    $self->{'num_docs'} += 1;
265    }
266    elsif ($edit_mode eq "delete") {
267    $self->{'num_docs'} -= 1;
268    }
269
270    # get the parameters for the output
271    # split on : just in case there is subcoll and lang stuff
272    my ($fields) = split (/:/, $self->{'index'});
273
274    my $levels = $self->{'levels'};
275    my $ldoc_level = $levels->{'document'};
276    my $lsec_level = $levels->{'section'};
277
278    my $gs2_docOID = $doc_obj->get_OID();
279
    # opening/closing XML used when the whole document is written as a
    # single Solr <doc> (document-level indexing)
280    my $start_doc;
281    my $end_doc;
282
283    if ($edit_mode eq "add") {
284    $start_doc  = "  <add>\n";
285    $start_doc .= "    <doc>\n";
286    $start_doc .= "      <field name=\"docOID\">$gs2_docOID</field>\n";
287   
288    $end_doc    = "    </doc>\n";
289    $end_doc   .= "  </add>\n";
290    }
291    else {
292    $start_doc  = "  <delete>\n";
293    $start_doc .= "    <id>$gs2_docOID</id>\n";
294
295    $end_doc    = "  </delete>\n";
296
297    # for delete mode, we need to specify just the docOID to delete and we're done
298    my $text = $start_doc;
299    $text .= $end_doc;
300    print $solrhandle $text;
301    return;
302    }
303
    # NOTE(review): the else branch above returns for every mode other
    # than "add", so from here on $edit_mode is always "add"; the
    # "update"/"delete" branches further down look unreachable.
304    # add/update, delete
305
    # $sec_tag_name is non-empty only when a 'section' level is configured;
    # in that case every section is written as its own <add><doc> element
    # rather than one <doc> for the whole document.
306    my $sec_tag_name = "";
307    if ($lsec_level)
308    {
309    $sec_tag_name = $mgppbuildproc::level_map{'section'};
310    }
311
312    my $doc_section = 0; # just for this document
313
314    # only output if working with doc level
315    # my $text = undef;
316   
317    my $text = ($sec_tag_name eq "") ? $start_doc : "";
318
319#     my $text = $start_doc if ($sec_tag_name eq "");
320     
321    # get the text for this document
322    my $section = $doc_obj->get_top_section();
323
324    while (defined $section)
325    {
326    # update a few statistics
327    $doc_section++;
328    $self->{'num_sections'}++;
329
    # NOTE(review): $sec_gs2_id is assigned here but not used again in
    # this sub.
330    my $sec_gs2_id = $self->{'num_sections'};
    # the section's own id: the document OID, with ".<section>" appended
    # for every section whose identifier is not the empty string
331    my $sec_gs2_docOID = $gs2_docOID;
332    $sec_gs2_docOID .= ".$section" if ($section ne "");
333   
334    my $start_sec;
335    my $end_sec;
336
337    if ($edit_mode eq "add") {
338        $start_sec  = "  <add>\n";
339        $start_sec .= "    <doc>\n";
340        $start_sec .= "      <field name=\"docOID\">$sec_gs2_docOID</field>\n";
341   
342        $end_sec    = "    </doc>\n";
343        $end_sec   .= "  </add>\n";
344    }
345    else {
        # NOTE(review): looks unreachable - see the note after the
        # document-level <delete> branch above.
346        $start_sec  = "  <delete>\n";
347        $start_sec .= "    <id>$sec_gs2_docOID</id>\n";
348
349        $end_sec    = "  </delete>\n";
350
351        # for delete mode, should specify only this section's docOID to delete, then move on to the next section
352        my $text = $start_sec;
353        $text .= $end_sec;
354        print $solrhandle $text;
355        $section = $doc_obj->get_next_section($section);
356        next;
357    }
358
359
360    # if we are doing subcollections, then some docs shouldn't be indexed.
361    # but we need to put the section tag placeholders in there so the
362    # sections match up with database
363    my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
364    if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
365        if ($sec_tag_name ne "") {
366        $text .= $start_sec;
367        $text .= $end_sec;
368        }
369            $section = $doc_obj->get_next_section($section);
370        next;
371          }
372
373    # add in start section tag if indexing at the section level
374    $text .= $start_sec if ($sec_tag_name ne "");
375
376    if ($edit_mode eq "add") {
377        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
378    }
379    elsif ($edit_mode eq "delete") {
380        $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
381    }
382
383
384    # has the user added a 'metadata' index?
385    my $all_metadata_specified = 0;
386    # which fields have already been indexed? (same as fields, but in a map)
387    my $specified_fields = {};
388   
389    # do we have an allfields index??
390    my $allfields_index = 0;
391    # collect up all the text for it in here
392    my $allfields_text = "";
393    foreach my $field (split (/;/, $fields)) {
394        if ($field eq "allfields") {
395        $allfields_index = 1;
396        } elsif ($field eq "metadata") {
397        $all_metadata_specified = 1;
398        }
399    }
400   
    # first pass: the fields named explicitly in the index specification
401    foreach my $field (split (/;/, $fields)) {
402       
403        # only deal with this field if it doesn't start with top or
404        # this is the first section
405        my $real_field = $field;
406        next if (($real_field =~ s/^top//) && ($doc_section != 1));
407       
408        # process these two later
409        next if ($real_field eq "allfields" || $real_field eq "metadata");
410       
411        #individual metadata and or text specified - could be a comma separated list
412        $specified_fields->{$real_field} = 1;
413        my $shortname="";
414        my $new_field = 0; # have we found a new field name?
415        if (defined $self->{'indexfieldmap'}->{$real_field}) {
416        $shortname = $self->{'indexfieldmap'}->{$real_field};
417        }
418        else {
419        $shortname = $self->create_shortname($real_field);
420        $new_field = 1;
421        }
422
423        my @metadata_list = (); # put any metadata values in here
424        my $section_text = ""; # put the text in here
425        foreach my $submeta (split /,/, $real_field) {
426        if ($submeta eq "text") {
427            # no point in indexing text more than once
428            if ($section_text eq "") {
429            $section_text = $doc_obj->get_text($section);
            # NOTE(review): both branches below make the same
            # htmlsafe call, so the test on 'indexing_text' currently
            # makes no difference here.
430            if ($self->{'indexing_text'}) {
431                # we always strip html
432                &ghtml::htmlsafe($section_text);
433                #$section_text = $self->preprocess_text($section_text, 1, "");
434            }
435            else {
436                # leave html stuff in, but escape the tags
437                &ghtml::htmlsafe($section_text);
438            }
439            }
440        }
441        else {
442            $submeta =~ s/^ex\.//; #strip off ex.
443
444            # its a metadata element
445            my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
            # for sections below the top one, optionally add the
            # top section's values for this metadata: always, or
            # only when the section has none of its own, depending
            # on 'sections_index_document_metadata'
446            if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
447            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
448                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
449            }
450            }
451            push (@metadata_list, @section_metadata);
452        }
453        } # for each field in this one index
454       
455        # now we add the text and/or metadata into new_text
456        if ($section_text ne "" || scalar(@metadata_list)) {
457        my $new_text = "";
458       
459        if ($section_text ne "") {
460            $new_text .= "$section_text ";
461        }
462       
        # all values for this index field are joined, space separated,
        # into one string (and later one <field> element)
463        foreach my $item (@metadata_list) {
464            &ghtml::htmlsafe($item);
465            $new_text .= "$item ";
466        }
467
468        if ($allfields_index) {
469            $allfields_text .= $new_text;
470        }
471
472        # Remove any leading or trailing white space
473        $new_text =~ s/\s+$//;
474        $new_text =~ s/^\s+//;
475   
476       
477        if ($self->{'indexing_text'}) {
478            # add the tag
479            $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
480        }
481        # filter the text
482        $new_text = $self->filter_text ($field, $new_text);
483
484        if ($edit_mode eq "add") {
485            $self->{'num_processed_bytes'} += length ($new_text);
486            $text .= "$new_text";
487        }
488        elsif ($edit_mode eq "update") {
489            $text .= "$new_text";
490        }
491        elsif ($edit_mode eq "delete") {
492            $self->{'num_processed_bytes'} -= length ($new_text);
493        }
494       
495
        # a shortname created above is only recorded in the map once
        # the field has actually produced some output
496        if ($self->{'indexing_text'} && $new_field) {
497            # we need to add to the list in indexfields
498           
499            $self->{'indexfieldmap'}->{$real_field} = $shortname;
500            $self->{'indexfieldmap'}->{$shortname} = 1;
501        }
502       
503        }
504       
505    } # foreach field
506
507
    # second pass: with a 'metadata' index, every remaining metadata
    # value of the section is written as its own <field> element
508    if ($all_metadata_specified) {
509       
510        my $new_text = "";
511        my $shortname = "";
512        my $metadata = $doc_obj->get_all_metadata ($section);
513        foreach my $pair (@$metadata) {
514        my ($mfield, $mvalue) = (@$pair);
515
516        # no value
517        next unless defined $mvalue && $mvalue ne "";
518
519        # we have already indexed this
520        next if defined ($specified_fields->{$mfield});
521
522        # check fields here, maybe others dont want - change to use dontindex!!
523        next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
524        next if ($mfield =~ /^gsdl/);
525       
526        &ghtml::htmlsafe($mvalue);
527       
528        if (defined $self->{'indexfieldmap'}->{$mfield}) {
529            $shortname = $self->{'indexfieldmap'}->{$mfield};
530        }
531        else {
532            $shortname = $self->create_shortname($mfield);
533            $self->{'indexfieldmap'}->{$mfield} = $shortname;
534            $self->{'indexfieldmap'}->{$shortname} = 1;
535        }     
536        $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
537        if ($allfields_index) {
538            $allfields_text .= "$mvalue ";
539        }
540
541        if (!defined $self->{'indexfields'}->{$mfield}) {
542            $self->{'indexfields'}->{$mfield} = 1;
543        }                   
544       
545        }
546        # filter the text
547        $new_text = $self->filter_text ("metadata", $new_text);
548       
549        if ($edit_mode eq "add") {
550        $self->{'num_processed_bytes'} += length ($new_text);
551        $text .= "$new_text";
552        }
553        elsif ($edit_mode eq "update") {
554        $text .= "$new_text";
555        }
556        elsif ($edit_mode eq "delete") {
557        $self->{'num_processed_bytes'} -= length ($new_text);
558        }       
559    }
560
    # the combined 'allfields' index: everything gathered in
    # $allfields_text above goes into the single field named ZZ
561    if ($allfields_index) {
562        # add the index name mapping
563        $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
564        $self->{'indexfieldmap'}->{"ZZ"} = 1;
565       
566        my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
567        # filter the text
568        $new_text = $self->filter_text ("allfields", $new_text);
569       
570        if ($edit_mode eq "add") {
571        $self->{'num_processed_bytes'} += length ($new_text);
572        $text .= "$new_text";
573        }
574        elsif ($edit_mode eq "update") {
575        $text .= "$new_text";
576        }
577        elsif ($edit_mode eq "delete") {
578        $self->{'num_processed_bytes'} -= length ($new_text);
579        }
580    }
581       
582    # only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
583    if ($self->{'indexing_text'} && ($sec_tag_name ne "" || $doc_section == 1 )) {
584    # add sort fields if there are any
    # (sort fields and facet fields are handled by this one loop)
585        my $seenfields = {};
586    foreach my $sfield (@{$self->{'sortfields'}}, @{$self->{'facetfields'}}) {
587        # ignore special field rank/none
588        next if $sfield eq "rank" || $sfield eq "none";
589        # ignore any we have already done - we may have duplicates in the sort and facet lists
590        next if (defined $seenfields->{$sfield});
591        $seenfields->{$sfield} = 1;
592        my $sf_shortname;
593        if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
594        $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
595        }
596        else {
597        $sf_shortname = $self->create_sortfield_shortname($sfield);
598        $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
599        $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
600        }
601        my @metadata_list = (); # put any metadata values in here
602        foreach my $submeta (split /,/, $sfield) {
603        $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
604       
605        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
606            if ($section ne $doc_obj->get_top_section() && defined ($self->{'sections_sort_on_document_metadata'})) {
607            if ($self->{'sections_sort_on_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
608                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
609            }
610            }
611        push (@metadata_list, @section_metadata);
612        }
613        # my $new_text = "";
614        # foreach my $item (@metadata_list) {
615        #   &ghtml::htmlsafe($item);
616        #   $new_text .= "$item ";
617        # }
618        # if ($new_text =~ /\S/) {
619        #   $new_text = "<field name=\"$sf_shortname\">$new_text</field>\n";
620        #   # filter the text???
621        #   $text .= "$new_text"; # add it to the main text block
622        #   print "#### new_text: $new_text\n";
623
624        #   $self->{'actualsortfields'}->{$sfield} = 1;
625        # }
626        # print "#### TEXT: $text\n";
627
        # unlike the (commented out) code above, which joined all the
        # values into a single field, each value is written as its own
        # <field> element under the same field name
628        foreach my $item (@metadata_list) {
629        &ghtml::htmlsafe($item);
630       
631        $item = "<field name=\"$sf_shortname\">$item</field>\n";
632        # filter the text???
633        $text .= "$item"; # add it to the main text block
634        #print "#### new_text: $item\n";
635        }
636        if(scalar @metadata_list > 0) {
637            $self->{'actualsortfields'}->{$sfield} = 1;
638        }
639       
640    }
641    }
642
643    # add in end tag if at top-level doc root, or indexing at the section level
644    $text .= $end_sec if ($sec_tag_name ne "");
645
646        $section = $doc_obj->get_next_section($section);
647    } # while defined section
648
649   
650    # only output if working with doc level
651    $text .= $end_doc if ($sec_tag_name eq "");
652
653##    $text .= "<commit/>\n";
654
655# The following code looks like it's for debugging purposes, but
656# committed by accident.  Commenting out for now ...
657
658#    open(TEXTOUT, '>:utf8', "text.out");
659#    print TEXTOUT "$text";
660#    close TEXTOUT;
661
    # everything built up for this document goes out in a single write
662    print $solrhandle $text;
663
664}
665
666
667
668
# textreindex($doc_obj, $file)
#
# Reindexes a document.  solrbuildproc has no "update" command of its
# own, so reindexing is carried out as a delete of the document followed
# by an add of the same document.  Returns the result of the add.
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    $self->textedit($doc_obj, $file, "delete");

    return $self->textedit($doc_obj, $file, "add");
}
680
681
6821;
683
684
Note: See TracBrowser for help on using the browser.