root/gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm @ 24447

Revision 24447, 15.9 KB (checked in by davidb, 8 years ago)

Tidy-up of code (removing commented-out redundant code), plus tweaking of the code that starts and stops Jetty to cope with the situation where the server is already running.

Line 
1###########################################################################
2#
3# solrbuildproc.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package solrbuildproc;
27
28# This document processor outputs a document for solr to process
29
30# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31# whose use was then extended to Lucene, Solr has its own XML syntax:
32#
33#  http://wiki.apache.org/solr/UpdateXmlMessages
34#
35# Using this means we don't need to write SolrWrapper.jar, as had to be
36# done for Lucene, translating the XML syntax piped to it into appropriate
37# calls to the Lucene API
38
39
40use lucenebuildproc;
41use ghtml;
42use strict;
43no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46use IncrementalBuildUtils;
47
# Set up inheritance at compile time: this package derives from
# lucenebuildproc, so any method not overridden here is looked up there.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# Constructor.
#
# All arguments are handed, untouched, to the lucenebuildproc constructor;
# the object it returns is then re-blessed into $class so that the
# overrides defined in this package are the ones that get called.
#
# Returns the object blessed into $class.
sub new {
    my $class = shift @_;

    # Explicit Class->method call rather than the indirect-object form
    # ("new lucenebuildproc (...)"), which perl can parse ambiguously.
    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
60
61#----
62
# Walk every section of a document and register, in
# $self->{'indexfieldmap'} (and, for the 'metadata' index,
# $self->{'indexfields'}), the index fields this document contributes.
# No output is written by this routine.
#
# Arguments:
#   $doc_obj   - document object; only documents whose get_doc_type() is
#                "indexed_doc" are examined
#   $file      - accepted for interface compatibility, not used here
#   $edit_mode - "add" or "update"; both are currently treated identically
#
# Returns nothing.
sub index_field_mapping_edit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    # only study this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # The index spec may carry ":subcoll:lang" suffixes; only the part
    # before the first ':' is the ';'-separated field list.
    my ($fields) = split (/:/, $self->{'index'});

    # The field list and the two special-index flags do not depend on the
    # section being visited, so they are worked out once, up front.
    my @field_list = split (/;/, $fields);

    my $all_metadata_specified = 0;   # has the user added a 'metadata' index?
    my $allfields_index = 0;          # do we have an 'allfields' index?
    foreach my $field (@field_list) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # section counter, just for this document

    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        $doc_section++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
              || "indexed_section";

        if (($indexed_doc == 0)
            || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # fields explicitly named in the index spec for this section
        # (same as the field list, but as a map for quick lookup)
        my $specified_fields = {};

        foreach my $field (@field_list) {
            # A "top" prefix restricts the field to the first section;
            # the prefix is stripped from the working copy either way.
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled separately below
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and/or text specified
            # -- could be a comma separated list
            $specified_fields->{$real_field} = 1;

            if (!defined $self->{'indexfieldmap'}->{$real_field}) {
                my $shortname = $self->create_shortname($real_field);
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        }

        if ($all_metadata_specified) {
            my $metadata = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # already covered by an explicitly specified field
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;
        }

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    return;
}
192
# Register index-field mappings for a document that is being added.
# Thin wrapper: forwards to index_field_mapping_edit with mode "add".
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
199
# Register index-field mappings for a document that is being re-indexed.
# Thin wrapper: forwards to index_field_mapping_edit with mode "update".
sub index_field_mappingreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
207
# Deleting a document requires no change to the index-field mappings,
# so this is deliberately a no-op that returns nothing.
sub index_field_mappingdelete {
    my ($self, $doc_obj, $file) = @_;

    return;
}
215
216
217#----
218
# Emit the Solr update XML for one document and print it to
# $self->{'output_handle'}.
#
# Arguments:
#   $doc_obj   - document object to be written out
#   $file      - passed through to the super-class when delegating;
#                otherwise unused here
#   $edit_mode - "add", "update" or "delete"
#
# Behaviour:
#   * When get_indexing_text() is false the whole job is delegated to the
#     super-class textedit().
#   * Documents whose get_doc_type() is not "indexed_doc" are skipped.
#   * "add" and "update" both produce <add><doc>...</doc></add> messages
#     (Solr replaces an existing document that has the same id), while
#     "delete" produces <delete><id>...</id></delete>.
#   * When a 'section' level is configured, one message is produced per
#     section; otherwise one message is produced for the whole document.
#   * The num_docs / num_bytes / num_processed_bytes statistics are only
#     adjusted for "add" (incremented) and "delete" (decremented); they are
#     intentionally left alone for "update".
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        # => want document to be output in the simple <Doc>..</Doc> as is
        # done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # (No separate "compress-text + delete" check is needed at this point:
    # the non-indexing case has already returned above.)

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @field_list = split (/;/, $fields);

    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    my ($start_doc, $end_doc) = _solr_wrapper_tags($edit_mode, $gs2_docOID);

    # A non-empty section tag name means we are indexing at section level.
    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    # has the user added a 'metadata' index / an 'allfields' index?
    # (independent of the section, so determined once)
    my $all_metadata_specified = 0;
    my $allfields_index = 0;
    foreach my $field (@field_list) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # just for this document

    # The document-level wrapper is only used when working at doc level.
    # Always initialise $text: a conditional "my ... if ..." declaration
    # has undefined behaviour in perl.
    my $text = ($sec_tag_name eq "") ? $start_doc : "";

    # get the text for this document
    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my ($start_sec, $end_sec) = _solr_wrapper_tags($edit_mode, $sec_gs2_docOID);

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($sec_tag_name ne "");

        if ($edit_mode eq "add") {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the 'allfields' index in here
        my $allfields_text = "";

        foreach my $field (@field_list) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;

            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; #strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        # optionally pull the top-level (document) metadata
                        # into the section as well
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($allfields_index) {
                    $allfields_text .= $new_text;
                }

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
                }
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);

                $text .= _solr_account_text($self, $edit_mode, $new_text);

                if ($self->{'indexing_text'} && $new_field) {
                    # we need to add to the list in indexfields
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $shortname = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $text .= _solr_account_text($self, $edit_mode, $new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $text .= _solr_account_text($self, $edit_mode, $new_text);
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    # only output if working with doc level
    $text .= $end_doc if ($sec_tag_name eq "");

    # (no <commit/> is emitted here)

    print $solrhandle $text;
}

# Private helper: build the opening and closing XML for one Solr update
# message about the item identified by $oid.
#
# "add" and "update" yield an <add><doc> wrapper carrying a docOID field
# (an update is sent to Solr as an add of a document with the same id);
# any other mode yields a <delete><id> wrapper.
#
# Returns the two-element list ($start, $end).
sub _solr_wrapper_tags {
    my ($edit_mode, $oid) = @_;

    if ($edit_mode eq "add" || $edit_mode eq "update") {
        my $start = "  <add>\n";
        $start   .= "    <doc>\n";
        $start   .= "      <field name=\"docOID\">$oid</field>\n";

        my $end = "    </doc>\n";
        $end   .= "  </add>\n";

        return ($start, $end);
    }

    my $start = "  <delete>\n";
    $start   .= "    <id>$oid</id>\n";

    return ($start, "  </delete>\n");
}

# Private helper: apply the per-mode bookkeeping for one chunk of field
# text and return what should be appended to the output.
#
#   "add"    - num_processed_bytes grows by the chunk length; chunk is output
#   "update" - statistics untouched; chunk is output
#   "delete" - num_processed_bytes shrinks by the chunk length; nothing output
#   other    - nothing happens, nothing output
sub _solr_account_text {
    my ($self, $edit_mode, $new_text) = @_;

    if ($edit_mode eq "add") {
        $self->{'num_processed_bytes'} += length ($new_text);
        return $new_text;
    }
    if ($edit_mode eq "update") {
        return $new_text;
    }
    if ($edit_mode eq "delete") {
        $self->{'num_processed_bytes'} -= length ($new_text);
    }

    return "";
}
563
564
565
566
# Re-index a document.  Thin wrapper: forwards to textedit with mode
# "update".
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
574
575
5761;
577
578
Note: See TracBrowser for help on using the browser.