root/gsdl/trunk/perllib/lucenebuildproc.pm @ 17797

Revision 17797, 16.2 KB (checked in by kjdon, 12 years ago)

my previous changes to the text method meant that field tags were being output for text as well as for index. Don't want this - they end up in the HTML and give errors. So now tags are only output if ->'indexing_text'}

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
41sub BEGIN {
42    @lucenebuildproc::ISA = ('mgppbuildproc');
43}
44
45
46sub new {
47    my $class = shift @_;
48    my $self = new mgppbuildproc (@_);
49
50    $self->{'numincdocs'} = 0;
51
52    return bless $self, $class;
53}
54
55
56sub is_incremental_capable
57{
58    my $self = shift (@_);
59
60    # Unlike MG and MGPP, Lucene supports incremental building
61    return 1;
62}
63
64
65sub text {
66    my $self = shift (@_);
67    my ($doc_obj,$file) = @_;
68    my $handle = $self->{'output_handle'};
69    my $outhandle = $self->{'outhandle'};
70
71    # only output this document if it is one to be indexed
72    return if ($doc_obj->get_doc_type() ne "indexed_doc");
73
74    my $indexed_doc = $self->is_subcollection_doc($doc_obj);
75
76    # this is another document
77    $self->{'num_docs'} += 1;
78
79    # get the parameters for the output
80    # split on : just in case there is subcoll and lang stuff
81    my ($fields) = split (/:/, $self->{'index'});
82
83    my $doc_tag_name = $mgppbuildproc::level_map{'document'};
84
85    my $levels = $self->{'levels'};
86    my $ldoc_level = $levels->{'document'};
87    my $lsec_level = $levels->{'section'};
88    #my $lpar_level = $levels->{'paragraph'};
89
90    my $gs2_id = "";
91    if ($ldoc_level)
92    {
93    if ($self->{'db_level'} eq 'document')
94    {
95        $gs2_id = $self->{'num_docs'};
96    }
97        else
98    {
99        # default is section level
100        $gs2_id = $self->{'num_sections'} + 1;
101    }
102    }
103    my $gs2_docOID = $doc_obj->get_OID();
104    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n";
105    my $documentendtag = "\n</$doc_tag_name>\n";
106
107    my $sec_tag_name = "";
108    if ($lsec_level)
109    {
110    $sec_tag_name = $mgppbuildproc::level_map{'section'};
111    }
112
113    my $doc_section = 0; # just for this document
114
115    my $text = "";
116    $text .= $documenttag;
117    # get the text for this document
118    my $section = $doc_obj->get_top_section();
119    while (defined $section)
120    {
121    # update a few statistics
122    $doc_section++;
123    $self->{'num_sections'}++;
124
125    if ($sec_tag_name ne "")
126    {
127        my $sec_gs2_id = $self->{'num_sections'};
128        my $sec_gs2_docOID = $gs2_docOID . "." . $section;
129        $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n";
130    }
131
132    # if we are doing subcollections, then some docs shouldn't be indexed.
133    # but we need to put the section tag placeholders in there so the
134    # sections match up with database
135    my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
136    if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
137        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
138            $section = $doc_obj->get_next_section($section);
139        next;
140          }
141
142    $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
143
144    # has the user added a 'metadata' index?
145    my $all_metadata_specified = 0;
146    # which fields have already been indexed? (same as fields, but in a map)
147    my $specified_fields = {};
148   
149    # do we have an allfields index??
150    my $allfields_index = 0;
151    # collect up all the text for it in here
152    my $allfields_text = "";
153    foreach my $field (split (/;/, $fields)) {
154        if ($field eq "allfields") {
155        $allfields_index = 1;
156        } elsif ($field eq "metadata") {
157        $all_metadata_specified = 1;
158        }
159    }
160   
161    foreach my $field (split (/;/, $fields)) {
162       
163        # only deal with this field if it doesn't start with top or
164        # this is the first section
165        my $real_field = $field;
166        next if (($real_field =~ s/^top//) && ($doc_section != 1));
167       
168        # process these two later
169        next if ($real_field eq "allfields" || $real_field eq "metadata");
170       
171        #individual metadata and or text specified - could be a comma separated list
172        $specified_fields->{$real_field} = 1;
173        my $shortname="";
174        my $new_field = 0; # have we found a new field name?
175        if (defined $self->{'indexfieldmap'}->{$real_field}) {
176        $shortname = $self->{'indexfieldmap'}->{$real_field};
177        }
178        else {
179        $shortname = $self->create_shortname($real_field);
180        $new_field = 1;
181        }
182
183        my @metadata_list = (); # put any metadata values in here
184        my $section_text = ""; # put the text in here
185        foreach my $submeta (split /,/, $real_field) {
186        if ($submeta eq "text") {
187            # no point in indexing text more than once
188            if ($section_text eq "") {
189            $section_text = $doc_obj->get_text($section);
190            if ($self->{'indexing_text'}) {
191                # we always strip html
192                $section_text = $self->preprocess_text($section_text, 1, "");
193            }
194            else {
195                # leave html stuff in, but escape the tags
196                &ghtml::htmlsafe($section_text);
197            }
198            }
199        }
200        else {
201            # its a metadata element
202            my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
203            if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
204            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
205                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
206            }
207            }
208            push (@metadata_list, @section_metadata);
209        }
210        } # for each field in this one index
211       
212        # now we add the text and/or metadata into new_text
213        if ($section_text ne "" || scalar(@metadata_list)) {
214        my $new_text = "";
215       
216        if ($section_text ne "") {
217            $new_text .= "$section_text ";
218        }
219       
220        foreach my $item (@metadata_list) {
221            &ghtml::htmlsafe($item);
222            $new_text .= "$item ";
223        }
224
225        if ($allfields_index) {
226            $allfields_text .= $new_text;
227        }
228
229        if ($self->{'indexing_text'}) {
230            # add the tag
231            $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
232        }
233        # filter the text
234        $new_text = $self->filter_text ($field, $new_text);
235        $self->{'num_processed_bytes'} += length ($new_text);
236       
237        $text .= "$new_text";
238
239        if ($self->{'indexing_text'} && $new_field) {
240            # we need to add to the list in indexfields
241           
242            $self->{'indexfieldmap'}->{$real_field} = $shortname;
243            $self->{'indexfieldmap'}->{$shortname} = 1;
244        }
245       
246        }
247       
248    } # foreach field
249
250
251    if ($all_metadata_specified) {
252       
253        my $new_text = "";
254        my $shortname = "";
255        my $metadata = $doc_obj->get_all_metadata ($section);
256        foreach my $pair (@$metadata) {
257        my ($mfield, $mvalue) = (@$pair);
258        # no value
259        next unless defined $mvalue && $mvalue ne "";
260        # we have already indexed this
261        next if defined ($specified_fields->{$mfield});
262        # check fields here, maybe others dont want - change to use dontindex!!
263        next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
264        next if ($mfield =~ /^gsdl/);
265       
266        &ghtml::htmlsafe($mvalue);
267       
268        if (defined $self->{'indexfieldmap'}->{$mfield}) {
269            $shortname = $self->{'indexfieldmap'}->{$mfield};
270        }
271        else {
272            $shortname = $self->create_shortname($mfield);
273            $self->{'indexfieldmap'}->{$mfield} = $shortname;
274            $self->{'indexfieldmap'}->{$shortname} = 1;
275        }     
276        $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
277        if ($allfields_index) {
278            $allfields_text .= "$mvalue ";
279        }
280
281        if (!defined $self->{'indexfields'}->{$mfield}) {
282            $self->{'indexfields'}->{$mfield} = 1;
283        }                   
284       
285        }
286        # filter the text
287        $new_text = $self->filter_text ("metadata", $new_text);
288       
289        $self->{'num_processed_bytes'} += length ($new_text);
290        $text .= "$new_text";
291
292       
293    }
294
295    if ($allfields_index) {
296        # add the index name mapping
297        $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
298        $self->{'indexfieldmap'}->{"ZZ"} = 1;
299       
300        my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
301        # filter the text
302        $new_text = $self->filter_text ("allfields", $new_text);
303       
304        $self->{'num_processed_bytes'} += length ($new_text);
305        $text .= "$new_text";
306    }
307   
308    $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
309
310        $section = $doc_obj->get_next_section($section);
311    } #while defined section
312    print $handle "$text\n$documentendtag";
313    #print STDOUT "$text\n$documentendtag";
314}
315
316# /** We make this builder pretend to be a document processor so we can get
317#  *  information back from the plugins.
318#  *
319#  *  @param  $self    A reference to this Lucene builder
320#  *  @param  $doc_obj A reference to a document object representing what was
321#  *                   parsed by the GAPlug
322#  *  @param  $file    The name of the file parsed as a string
323#  *
324#  *  @author John Thompson, DL Consulting Ltd
325#  */
326sub process()
327  {
328    my $self = shift (@_);
329    my ($doc_obj, $file) = @_;
330
331    # If this is called from any stage other than an incremental infodb we want
332    # to pass through to the superclass of build
333    if ($self->get_mode() eq "incinfodb")
334      {
335        print STDERR "*** Processing a document added using INCINFODB ***\n" if ($self->{'verbosity'} > 3);
336        my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
337        $archivedir = "" unless defined $archivedir;
338        $archivedir =~ s/\\/\//g;
339        $archivedir =~ s/^\/+//;
340        $archivedir =~ s/\/+$//;
341
342        # Number of files
343        print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($self->{'verbosity'} > 3);
344
345        # resolve the final filenames of the files associated with this document
346        $self->assoc_files ($doc_obj, $archivedir);
347
348        # is this a paged or a hierarchical document
349        my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
350
351        # Determine the actual docnum by checking if we've processed any
352        # previous incrementally added documents. If so, carry on from there.
353        # Otherwise we set the counter to be the same as the number of
354        # sections encountered during the previous build
355        if ($self->{'numincdocs'} == 0)
356          {
357            $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
358          }
359
360        my $section = $doc_obj->get_top_section ();
361        print STDERR "+ top section: '$section'\n" if ($self->{'verbosity'} > 3);
362        my $doc_OID = $doc_obj->get_OID();
363        my $url = "";
364        while (defined $section)
365          {
366            print STDERR "+ processing section: '$section'\n" if ($self->{'verbosity'} > 3);
367            # Attach all the other metadata to this document
368            # output the fact that this document is a document (unless doctype
369            # has been set to something else from within a plugin
370            my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
371            if (!defined $dtype || $dtype !~ /\w/)
372              {
373                #$doc_obj->add_utf8_metadata($section, "doctype", $dtype);
374          $doc_obj->add_utf8_metadata($section, "doctype", "doc");
375              }
376            # output whether this node contains text
377            if ($doc_obj->get_text_length($section) > 0)
378              {
379                $doc_obj->add_utf8_metadata($section, "hastxt", 1);
380              }
381            else
382              {
383                $doc_obj->add_utf8_metadata($section, "hastxt", 0);
384              }
385
386            # output archivedir if at top level
387            if ($section eq $doc_obj->get_top_section())
388              {
389                $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
390        $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
391              }
392
393            # output a list of children
394            my $children = $doc_obj->get_children ($section);
395            if (scalar(@$children) > 0)
396              {
397                $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
398                my @contains = ();
399                foreach my $child (@$children)
400                  {
401                    if ($child =~ /^.*?\.(\d+)$/)
402                      {
403                        push (@contains, "\".$1");
404                      }
405                    else
406                      {
407                        push (@contains, "\".$child");
408                      }
409                  }
410                $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
411              }
412            #output the matching doc number
413            print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($self->{'verbosity'} > 3);
414            $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
415
416            $self->{'numincdocs'}++;
417            $section = $doc_obj->get_next_section($section);
418            # if no sections wanted, only add the docs
419            last if ($self->{'db_level'} eq "document");
420          }
421        print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($self->{'verbosity'} > 3);
422        &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
423      }
424    else
425      {
426        $self->mgppbuildproc::process(@_);
427      }
428  }
429# /** process() **/
430
431
432# Following methods seem to be no different to those defined in basebuildproc.pm
433# From inspection, it looks like these ones can be removed
434
435
436sub get_num_docs {
437    my $self = shift (@_);
438    #rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
439    return $self->{'num_docs'};
440}
441
442sub get_num_sections {
443    my $self = shift (@_);
444    #rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
445    return $self->{'num_sections'};
446}
447
448# num_bytes is the actual number of bytes in the collection
449# this is normally the same as what's processed during text compression
450sub get_num_bytes {
451    my $self = shift (@_);
452    #rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
453    return $self->{'num_bytes'};
454}
455
456
457# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
458# Otherwise the removal of tags below might lead to Lucene turning
459#   "...farming</p>\n<p>EDWARD.." into "farmingedward"
460#     (example from demo collection b20cre)
461# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
462sub preprocess_text
463{
464    my $self = shift (@_);
465    my ($text, $strip_html, $para) = @_;
466    # at this stage, we do not do paragraph tags unless have strip_html -
467    # it will result in a huge mess of non-xml
468    return unless $strip_html;
469
470    my $new_text = $text;
471
472    # if we have <pre> tags, we can have < > inside them, need to delete
473    # the <> before stripping tags
474    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
475
476    if ($para eq "") {
477       # just remove all tags
478       $new_text =~ s/<[^>]*>/ /gs;
479    } else {
480       # strip all tags except <p> tags which get turned into $para
481       $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
482    }
483
484    # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
485    #   may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
486    $new_text =~ s/&\w{1,10};//g;
487    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
488    $new_text =~ s/&([^\#])/ $1/g;
489
490    return $new_text;
491}
492
493
4941;
495
Note: See TracBrowser for help on using the browser.