root/gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm @ 27815

Revision 27815, 18.7 KB (checked in by kjdon, 6 years ago)

adding in facets

Line 
1###########################################################################
2#
3# solrbuildproc.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package solrbuildproc;
27
28# This document processor outputs a document for solr to process
29
30# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31# whose use was then extended to Lucene, Solr has its own XML syntax:
32#
33#  http://wiki.apache.org/solr/UpdateXmlMessages
34#
35# Using this means we don't need to write SolrWrapper.jar, as had to be
36# done for Lucene, translating the XML syntax piped to it into appropriate
37# calls to the Lucene API
38
39
40use lucenebuildproc;
41use ghtml;
42use strict;
43no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46use IncrementalBuildUtils;
47
# Establish the inheritance chain at compile time: solrbuildproc is a
# specialisation of lucenebuildproc.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# Constructor.
#
# Builds the object with the lucenebuildproc constructor, forwarding every
# remaining argument unchanged, then re-blesses the result into the class
# this constructor was invoked on.
#
# The parent constructor is called with explicit method syntax; the indirect
# object form ("new lucenebuildproc (...)") is ambiguous to the parser,
# particularly inside a sub that is itself called "new".
sub new {
    my $class = shift @_;
    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
# Records the list of facet fields on this object.
#
# Parameters:
#   $facetfields - reference to an array of field names (undef is treated as
#                  an empty list).
#
# The names "text", "allfields" and "metadata" are only valid as index
# names, not as facet fields, so they are dropped.  After the call
# $self->{'facetfields'} is always an array reference (possibly empty),
# replacing whatever was stored there before.
sub set_facetfields {
    my $self = shift (@_);
    my ($facetfields) = @_;

    my @requested = @{ $facetfields || [] };
    $self->{'facetfields'} = [ grep { $_ !~ /^(text|allfields|metadata)$/ } @requested ];
}
72
73#----
74
# Walks every section of a document and registers the index field names it
# would use, without emitting any text.
#
# Parameters:
#   $doc_obj   - the document object to inspect
#   $file      - accepted for signature compatibility, not used here
#   $edit_mode - "add" or "update"; the two are not distinguished here
#
# Updates on $self:
#   indexfieldmap - field name => short name, plus short name => 1
#   indexfields   - metadata field name => 1, for fields picked up through
#                   the catch-all "metadata" index
sub index_field_mapping_edit {
    my ($self, $doc_obj, $file, $edit_mode) = @_;

    # Only documents flagged for indexing contribute field mappings.
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $doc_in_subcollection = $self->is_subcollection_doc($doc_obj);

    # The index spec may be followed by ":subcollection:language" parts;
    # only the leading ';'-separated field list is of interest.
    my ($field_spec) = split (/:/, $self->{'index'});
    my @index_fields = split (/;/, $field_spec);

    # Is there an 'allfields' index, and has the user asked for a
    # 'metadata' (index everything) index?
    my $wants_allfields    = grep { $_ eq "allfields" } @index_fields;
    my $wants_all_metadata = grep { $_ eq "metadata" } @index_fields;

    my $section_count = 0; # position of the current section in this document
    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        $section_count++;

        # When subcollections are in play some documents/sections are not
        # to be considered for indexing.
        my $section_type = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";
        my $is_indexable = ($doc_in_subcollection != 0)
            && ($section_type eq "indexed_section" || $section_type eq "indexed_doc");

        if ($is_indexable) {

            # Fields named explicitly in the index spec, for this section.
            my %explicit_field = ();

            foreach my $field (@index_fields) {
                # A "top" prefix restricts the field to the first section.
                my $real_field = $field;
                my $had_top_prefix = ($real_field =~ s/^top//);
                next if ($had_top_prefix && $section_count != 1);

                # The two special indexes are handled further down.
                next if ($real_field eq "allfields" || $real_field eq "metadata");

                # An individual field (possibly a comma separated list).
                $explicit_field{$real_field} = 1;

                unless (defined $self->{'indexfieldmap'}->{$real_field}) {
                    my $shortname = $self->create_shortname($real_field);
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
            }

            if ($wants_all_metadata) {
                my $metadata = $doc_obj->get_all_metadata ($section);

                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);

                    # Skip empty values and fields already named explicitly.
                    next unless defined $mvalue && $mvalue ne "";
                    next if defined ($explicit_field{$mfield});

                    # Bookkeeping metadata is never indexed.
                    next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                    next if ($mfield =~ /^gsdl/);

                    if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                        my $shortname = $self->create_shortname($mfield);
                        $self->{'indexfieldmap'}->{$mfield} = $shortname;
                        $self->{'indexfieldmap'}->{$shortname} = 1;
                    }

                    $self->{'indexfields'}->{$mfield} = 1
                        if (!defined $self->{'indexfields'}->{$mfield});
                }
            }

            if ($wants_allfields) {
                # The combined index always uses the fixed short name "ZZ".
                $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
                $self->{'indexfieldmap'}->{"ZZ"} = 1;
            }
        }

        $section = $doc_obj->get_next_section($section);
    } # while defined section
}
204
# Registers the index field mappings for a document, delegating to
# index_field_mapping_edit with the "add" edit mode.
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
211
# Registers the index field mappings for a document that is being
# re-indexed, delegating to index_field_mapping_edit with the "update"
# edit mode.
sub index_field_mappingreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
219
# Counterpart of index_field_mapping for a document that is being deleted.
# Deliberately a no-op: it takes ($self, $doc_obj, $file), ignores them and
# returns nothing.
sub index_field_mappingdelete
{
    return; # nothing to be done
}
227
228
229#----
230
# Returns the (opening, closing) Solr update-XML fragments that wrap one
# record identified by $oid.
#
# "add" and "update" both produce an <add><doc> envelope whose first field
# is the docOID.  Any other edit mode produces a <delete> envelope that
# names the id.
sub _solr_envelope {
    my ($edit_mode, $oid) = @_;

    if ($edit_mode eq "add" || $edit_mode eq "update") {
        my $start  = "  <add>\n";
        $start    .= "    <doc>\n";
        $start    .= "      <field name=\"docOID\">$oid</field>\n";

        my $end    = "    </doc>\n";
        $end      .= "  </add>\n";

        return ($start, $end);
    }

    my $start  = "  <delete>\n";
    $start    .= "    <id>$oid</id>\n";

    return ($start, "  </delete>\n");
}

# Applies the per-edit-mode bookkeeping for one generated fragment and
# returns the text that should be appended to the output:
#   add    - counts the bytes and emits the fragment
#   update - emits the fragment, leaves the byte count alone
#   delete - subtracts the bytes and emits nothing
# Any other mode emits nothing and changes nothing.
sub _solr_emit {
    my ($self, $edit_mode, $fragment) = @_;

    if ($edit_mode eq "add") {
        $self->{'num_processed_bytes'} += length ($fragment);
        return $fragment;
    }
    if ($edit_mode eq "update") {
        return $fragment;
    }
    if ($edit_mode eq "delete") {
        $self->{'num_processed_bytes'} -= length ($fragment);
    }
    return "";
}

# Writes the Solr update XML for one document to $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj   - the document object to process
#   $file      - only forwarded to the super-class; otherwise unused here
#   $edit_mode - "add", "update" or "delete"
#
# When not indexing text (text-compress mode) the call is handed to the
# super-class unchanged.  Documents whose doc type is not "indexed_doc" are
# skipped.
#
# Output shape:
#   - section level configured: one <add>/<delete> record per section, with
#     docOID "<doc OID>.<section>" (just the doc OID for the top section);
#     sections that are not to be indexed still get an empty placeholder
#     record so that sections keep matching up with the database
#   - otherwise: a single record for the whole document
#
# "update" is handled like "add" (Solr is used in its default mode of
# replacing a document that has the same id), except that the num_docs,
# num_bytes and num_processed_bytes statistics are left alone.  "delete"
# emits only the <delete> envelope: no index, sort or facet fields.
#
# Returns the result of the final print to the output handle.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        # => want document to be output in the simple <Doc>..</Doc> as is
        # done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @index_fields = split (/;/, $fields);

    # do we have an allfields index, and has the user added a 'metadata'
    # index?  Neither depends on the section, so work it out once.
    my $allfields_index        = grep { $_ eq "allfields" } @index_fields;
    my $all_metadata_specified = grep { $_ eq "metadata" } @index_fields;

    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    my ($start_doc, $end_doc) = _solr_envelope($edit_mode, $gs2_docOID);

    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $doc_section = 0; # just for this document

    # the document-wide envelope is only used when working at doc level
    my $text = ($sec_tag_name eq "") ? $start_doc : "";

    # get the text for this document
    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my ($start_sec, $end_sec) = _solr_envelope($edit_mode, $sec_gs2_docOID);

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($sec_tag_name ne "");

        if ($edit_mode eq "add") {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;
            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; #strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($allfields_index) {
                    $allfields_text .= $new_text;
                }

                # Remove any leading or trailing white space
                $new_text =~ s/\s+$//;
                $new_text =~ s/^\s+//;

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
                }
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);

                $text .= _solr_emit($self, $edit_mode, $new_text);

                if ($self->{'indexing_text'} && $new_field) {
                    # we need to add to the list in indexfields
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
            }
        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $shortname = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                ghtml::htmlsafe($mvalue);

                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $text .= _solr_emit($self, $edit_mode, $new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $text .= _solr_emit($self, $edit_mode, $new_text);
        }

        # only add sort/facet fields for this section if we are indexing this
        # section, and we are doing section level indexing or this is the
        # top section.  A delete record carries only the id, so it gets no
        # sort/facet fields at all.
        if ($self->{'indexing_text'}
            && $edit_mode ne "delete"
            && ($sec_tag_name ne "" || $doc_section == 1)) {

            my $seenfields = {};
            foreach my $sfield (@{$self->{'sortfields'}}, @{$self->{'facetfields'}}) {
                # ignore special field rank
                next if $sfield eq "rank";
                # ignore any we have already done - we may have duplicates in the sort and facet lists
                next if (defined $seenfields->{$sfield});
                $seenfields->{$sfield} = 1;

                my $sf_shortname;
                if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
                    $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
                }
                else {
                    $sf_shortname = $self->create_sortfield_shortname($sfield);
                    $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
                    $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
                }

                my @metadata_list = (); # put any metadata values in here
                foreach my $submeta (split /,/, $sfield) {
                    #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
                    $submeta =~ s/^ex\.([^.]+)$/$1/;

                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && defined ($self->{'sections_sort_on_document_metadata'})) {
                        if ($self->{'sections_sort_on_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }

                my $new_text = "";
                foreach my $item (@metadata_list) {
                    ghtml::htmlsafe($item);
                    $new_text .= "$item";
                }
                if ($new_text =~ /\S/) {
                    $new_text = "<field name=\"$sf_shortname\">$new_text</field>\n";
                    $text .= "$new_text"; # add it to the main text block
                    $self->{'actualsortfields'}->{$sfield} = 1;
                }
            }
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    # only output if working with doc level
    $text .= $end_doc if ($sec_tag_name eq "");

    print {$solrhandle} $text;
}
635
636
637
638
# Re-indexes a document by running textedit in "update" mode.
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
646
647
6481;
649
650
Note: See TracBrowser for help on using the browser.