root/gsdl/trunk/perllib/lucenebuildproc.pm @ 17110

Revision 17110, 16.3 KB (checked in by kjdon, 11 years ago)

changed way cjk separation is done. Not done in plugins any more, but is now an indexoption. cnseg called from filter_text method. generate_index_options sets up the field in buildproc

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
# Establish the inheritance chain at compile time: this build processor
# is a specialisation of mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = ('mgppbuildproc');
}
44
45
# Constructor.
#
# Builds the object through the mgppbuildproc constructor (all arguments
# are forwarded untouched), initialises the counter used to number
# incrementally added documents, and re-blesses the result into $class.
#
# Returns the new object.
sub new {
    my $class = shift @_;

    # Use the arrow form rather than "new mgppbuildproc (...)": indirect
    # object syntax is resolved by parser heuristics and is fragile inside a
    # package that defines its own new().
    my $self = mgppbuildproc->new(@_);

    # 0 means "no document has been added incrementally yet"; process()
    # replaces it with a real starting number on first use.
    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Reports whether this build processor can build incrementally.
# Always returns 1: unlike MG and MGPP, Lucene supports incremental building.
sub is_incremental_capable {
    my ($self) = @_;

    return 1;
}
63
64
# Decides whether a metadata pair is written to the index.
# Identifier, gsdl*, classifytype and assocfilepath fields are skipped, as
# are undefined or empty values.
sub _lucene_is_indexable_metadata {
    my ($mfield, $mvalue) = @_;

    return ($mfield ne "Identifier"
	    && $mfield !~ /^gsdl/
	    && $mfield ne "classifytype"
	    && $mfield ne "assocfilepath"
	    && defined $mvalue && $mvalue ne "");
}

# Returns the short index name for $field, creating one with
# create_shortname() and recording it in indexfieldmap on first use.
sub _lucene_shortname_for {
    my ($self, $field) = @_;

    my $shortname = $self->{'indexfieldmap'}->{$field};
    return $shortname if defined $shortname;

    $shortname = $self->create_shortname($field);
    $self->{'indexfieldmap'}->{$field} = $shortname;
    $self->{'indexfieldmap'}->{$shortname} = 1;
    return $shortname;
}

# Writes the XML representation of one document to the output handle.
#
#   $doc_obj  document object to index; ignored unless its doc type is
#             "indexed_doc"
#   $file     file name recorded in the "file" attribute of the document tag
#
# Updates the num_docs, num_sections, num_bytes and num_processed_bytes
# counters, and records the fields seen in indexfields / indexfieldmap.
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};
    my $levels = $self->{'levels'};

    my $gs2_id = "";
    if ($levels->{'document'}) {
	# numbering follows the database level; section level is the default
	$gs2_id = ($self->{'db_level'} eq 'document')
	    ? $self->{'num_docs'}
	    : $self->{'num_sections'} + 1;
    }
    my $gs2_docOID = $doc_obj->get_OID();

    # The file name ends up inside an XML attribute, so escape it the same
    # way the indexed text is escaped. htmlsafe edits its argument in place,
    # hence the copy.
    # NOTE(review): assumes ghtml::htmlsafe also escapes double quotes - confirm.
    my $safe_file = defined $file ? $file : "";
    &ghtml::htmlsafe($safe_file);

    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$safe_file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    my $sec_tag_name = "";
    if ($levels->{'section'}) {
	$sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $parastarttag = "";
    my $paraendtag = "";
    if ($self->{'levels'}->{'paragraph'}) {
	if ($self->{'strip_html'}) {
	    $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
	    $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
	}
	else {
	    print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
	}
    }

    my $doc_section = 0; # just for this document

    my $text = $documenttag;

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
	# update a few statistics
	$doc_section++;
	$self->{'num_sections'}++;

	if ($sec_tag_name ne "") {
	    my $sec_gs2_id = $self->{'num_sections'};
	    my $sec_gs2_docOID = $gs2_docOID . "." . $section;
	    $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n";
	}

	# if we are doing subcollections, then some docs shouldn't be indexed.
	# but we need to put the section tag placeholders in there so the
	# sections match up with database
	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
	if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
	    $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
	    $section = $doc_obj->get_next_section($section);
	    next;
	}

	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);

	foreach my $field (split (/;/, $fields)) {
	    # only deal with this field if it doesn't start with top or
	    # this is the first section
	    my $real_field = $field;
	    next if (($real_field =~ s/^top//) && ($doc_section != 1));

	    my $new_text = "";

	    if ($real_field eq "allfields") {
		# everything - text and metadata - goes into the ZZ field

		# Text first - no html nor paragraph tags
		$new_text .= "$parastarttag<ZZ index=\"1\">\n";
		my $all_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
		&ghtml::htmlsafe($all_text);
		$new_text .= "$all_text</ZZ>$paraendtag\n";

		# Then Metadata
		my $metadata = $doc_obj->get_all_metadata ($section);
		foreach my $pair (@$metadata) {
		    my ($mfield, $mvalue) = (@$pair);
		    &ghtml::htmlsafe($mvalue);
		    # check fields here, maybe others dont want - change to use dontindex!!
		    if (_lucene_is_indexable_metadata($mfield, $mvalue)) {
			$new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
		    }
		    # every field seen is recorded, even ones that were not output
		    if (!defined $self->{'indexfields'}->{$mfield}) {
			$self->{'indexfields'}->{$mfield} = 1;
		    }
		}
	    }
	    elsif ($real_field eq "metadata") {
		# metadata - output all metadata we know about except gsdl stuff,
		# each value under its own field's short name
		my $metadata = $doc_obj->get_all_metadata ($section);
		foreach my $pair (@$metadata) {
		    my ($mfield, $mvalue) = (@$pair);
		    &ghtml::htmlsafe($mvalue);
		    # check fields here, maybe others dont want - change to use dontindex!!
		    next unless _lucene_is_indexable_metadata($mfield, $mvalue);

		    my $shortname = _lucene_shortname_for($self, $mfield);
		    $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
		    if (!defined $self->{'indexfields'}->{$mfield}) {
			$self->{'indexfields'}->{$mfield} = 1;
		    }
		}
	    }
	    else {
		# individual metadata and or text specified - could be a comma
		# separated list; the whole list shares one short name
		my $shortname = _lucene_shortname_for($self, $real_field);

		my $raw_text = ""; # accumulates unstripped text for this field
		my @metadata_list = ();
		foreach my $submeta (split /,/, $real_field) {
		    if ($submeta eq "text") {
			if ($self->{'indexing_text'}) {
			    # tag the text with the field's short name, add the
			    # paragraph tags and always strip out HTML
			    my $section_text = $doc_obj->get_text($section);
			    $new_text .= "$parastarttag<$shortname index=\"1\">\n";
			    if ($parastarttag ne "") {
				$section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
			    }
			    else {
				# we don't want to individually tag each paragraph if not doing para indexing
				$section_text = $self->preprocess_text($section_text, 1, "");
			    }
			    $new_text .= "$section_text</$shortname>$paraendtag\n";
			}
			else {
			    # leave html stuff in, but escape the tags, and dont add
			    # Paragraph tags - never retrieve paras at the moment
			    $raw_text .= $doc_obj->get_text ($section);
			    &ghtml::htmlsafe($raw_text);
			    $new_text .= $raw_text;
			}
		    }
		    else {
			my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
			# optionally let subsections inherit the top section's metadata
			if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
			    if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
				push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
			    }
			}
			push (@metadata_list, @section_metadata);
		    }
		}
		foreach my $item (@metadata_list) {
		    &ghtml::htmlsafe($item);
		    $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
		}
	    }

	    # filter the text (note: keyed on the field as written, "top" prefix included)
	    $new_text = $self->filter_text ($field, $new_text);
	    $self->{'num_processed_bytes'} += length ($new_text);

	    $text .= "$new_text";
	} # foreach field

	$text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

	$section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
}
285
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  In "incinfodb" mode each section of the document is given the
#  *  database metadata (doctype, hastxt, archivedir, thistype, childtype,
#  *  contains, docnum) and the document is then handed to
#  *  IncrementalBuildUtils::addDocument. In any other mode the call is
#  *  passed straight through to mgppbuildproc::process.
#  *
#  *  @param  $self    A reference to this Lucene builder
#  *  @param  $doc_obj A reference to a document object representing what was
#  *                   parsed by the GAPlug
#  *  @param  $file    The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
  {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb")
      {
        return $self->mgppbuildproc::process(@_);
      }

    print STDERR "*** Processing a document added using INCINFODB ***\n";

    # The archive directory is everything before the last path separator,
    # normalised to forward slashes with no leading or trailing slash.
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files
    # NOTE(review): get_assoc_files() may return an array reference rather
    # than a count; both shapes are handled so the message shows a number.
    my $assoc_files = $doc_obj->get_assoc_files();
    my $num_assoc_files = (ref($assoc_files) eq 'ARRAY') ? scalar(@$assoc_files) : $assoc_files;
    print STDERR "There are " . $num_assoc_files . " associated documents...\n";

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0)
      {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
      }

    my $section = $doc_obj->get_top_section ();
    print STDERR "+ top section: '$section'\n";
    while (defined $section)
      {
        print STDERR "+ processing section: '$section'\n";

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin). The default
        # must be the literal "doc": writing back the undefined or blank
        # value that was just read would leave the section without a type.
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/)
          {
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
          }

        # output whether this node contains text
        my $hastxt = ($doc_obj->get_text_length($section) > 0) ? 1 : 0;
        $doc_obj->add_utf8_metadata($section, "hastxt", $hastxt);

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section())
          {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
          }

        # output a list of children, each written as ".N" relative to this
        # section and prefixed with a literal double quote
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0)
          {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
            my @contains = ();
            foreach my $child (@$children)
              {
                if ($child =~ /^.*?\.(\d+)$/)
                  {
                    push (@contains, "\".$1");
                  }
                else
                  {
                    push (@contains, "\".$child");
                  }
              }
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
          }

        # output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);
        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
      }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
    &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
  }
# /** process() **/
399
400
401# Following methods seem to be no different to those defined in basebuildproc.pm
402# From inspection, it looks like these ones can be removed
403
404
# Returns the number of documents counted so far (the 'num_docs' counter).
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
410
# Returns the number of sections counted so far (the 'num_sections' counter).
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}
416
# Returns the 'num_bytes' counter: the actual number of bytes in the
# collection, which is normally the same as what's processed during text
# compression.
sub get_num_bytes {
    my ($self) = @_;

    return $self->{'num_bytes'};
}
424
425
# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
# Otherwise the removal of tags below might lead to Lucene turning
#   "...farming</p>\n<p>EDWARD.." into "farmingedward"
#     (example from demo collection b20cre)
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
#   $text        the text to clean up
#   $strip_html  must be true; when false nothing is done and nothing is
#                returned
#   $para        replacement passed to process_tags(); when it is the empty
#                string every tag is simply replaced by a space
#
# Returns the text with tags removed, named entities deleted and stray
# ampersands replaced by spaces, so that what is left can be embedded in XML.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;
    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
       # just remove all tags
       $new_text =~ s/<[^>]*>/ /gs;
    } else {
       # strip all tags except <p> tags which get turned into $para
       $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
    #   may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Replace every remaining '&' with a space unless it starts a well-formed
    # &#nnnn; or &#xhhhh; character reference (which is valid XML). A
    # lookahead is used so that nothing after the '&' is consumed: this also
    # catches a '&' at the very end of the text, each '&' in a run such as
    # "&&", and malformed references like "&#foo".
    $new_text =~ s/&(?!\#(?:[0-9]+|x[0-9a-fA-F]+);)/ /g;

    return $new_text;
}
461
462
4631;
464
Note: See TracBrowser for help on using the browser.