Context Navigation

← Previous Change
Next Change →

lucenebuildproc.pm

Timestamp:

2006-09-25T14:17:10+12:00 (18 years ago)

Author:

mdewsnip

Message:

Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.

File:

: 1 edited

trunk/gsdl/perllib/lucenebuildproc.pm (modified) (11 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/lucenebuildproc.pm

-              r12426
+              r12844
 # Use same basic XML structure setup by mgppbuilder/mgppbuildproc
 use mgppbuildproc;
+use mgppbuildproc;
 use ghtml;
 use strict;
 …
+use IncrementalBuildUtils;
 sub BEGIN {
     @lucenebuildproc::ISA = ('mgppbuildproc');
 …
     my $class = shift @_;
     my $self = new mgppbuildproc (@_);
+    $self->{'numincdocs'} = 0;
     return bless $self, $class;
 …
     my $docid="";
+    if ($ldoc_level) {
+    if ($self->{'gdbm_level'} eq 'document') {
+        my $doc_sec_num = $self->{'num_docs'};
+    if ($ldoc_level)
+      {
+    if ($self->{'gdbm_level'} eq 'document')
+          {
+        my $doc_sec_num = $self->{'num_docs'};
         $docid = "gs2:id=\"$doc_sec_num\"";
+    } else  {
+          }
+        else
+          {
         # default is section level
         my $doc_sec_num = $self->{'num_sections'}+1;
+        my $doc_sec_num = $self->{'num_sections'} + 1;
         $docid = "gs2:id=\"$doc_sec_num\"";
+    }
+    }
+          }
+      }
     my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
     my $documentendtag = "\n</$doc_level>\n";
+    my ($sectiontag) = "";
+    if ($lsec_level) {
+    my ($sectiontag) = "";
+    if ($lsec_level)
+      {
     $sectiontag = $mgppbuildproc::level_map{'section'};
+    }
+      }
     my ($parastarttag) = "";
     my ($paraendtag) = "";
+    if ($self->{'levels'}->{'paragraph'}) {
+    if ($self->{'strip_html'}) {
+    if ($self->{'levels'}->{'paragraph'})
+      {
+    if ($self->{'strip_html'})
+          {
         $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
         $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
+    } else {
+          }
+        else
+          {
         print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
+    }
+    }
+          }
+      }
     my $doc_section = 0; # just for this document
 …
     # get the text for this document
     my $section = $doc_obj->get_top_section();
+    while (defined $section) {
+    while (defined $section)
+      {
     # update a few statistics
     $doc_section++;
+    $self->{'num_sections'} += 1;
+    if ($sectiontag ne "") {
+        my $secid = "gs2:id=\"".$self->{'num_sections'}."\"";
+    $self->{'num_sections'}++;
+    if ($sectiontag ne "")
+          {
+        my $secid = "gs2:id=\"".$self->{'num_sections'}."\"";
         $text .= "\n<$sectiontag $secid >\n";
+    }
+          }
     # if we are doing subcollections, then some docs shouldn't be indexed.
     # but we need to put the section tag placeholders in there so the
+    # but we need to put the section tag placeholders in there so the
     # sections match up with gdbm db
     my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
     if (!$indexed_doc || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
         $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
         $section = $doc_obj->get_next_section($section);
+            $section = $doc_obj->get_next_section($section);
         next;
+    }
+          }
     $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
+    foreach my $field (split (/;/, $fields)) {
+    foreach my $field (split (/;/, $fields))
+          {
         # only deal with this field if it doesn't start with top or
         # this is the first section
 …
         next if (($real_field =~ s/^top//) && ($doc_section != 1));
+        my $new_text = "";
+        # we get allfields by default - do nothing
+        if ($real_field eq "allfields") {
+        }
+        my $new_text = "";
+        my $tmp_text = "";
+        # If allfields is requested add all metadata fields and text as
+            # belonging to the ZZ field
+        if ($real_field eq "allfields") {
+              # Text first - no html nor paragraph tags
+              $new_text .= "$parastarttag<ZZ index=\"1\">\n";
+              $tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
+              &ghtml::htmlsafe($tmp_text);
+              $new_text .= "$tmp_text</ZZ>$paraendtag\n";
+              # Then Metadata
+              my $metadata = $doc_obj->get_all_metadata ($section);
+              foreach my $pair (@$metadata) {
+                my ($mfield, $mvalue) = (@$pair);
+                &ghtml::htmlsafe($mvalue);
+                # check fields here, maybe others dont want - change to use dontindex!!
+                if ($mfield ne "Identifier"
+                    && $mfield !~ /^gsdl/
+                    && $mfield ne "classifytype"
+                    && $mfield ne "assocfilepath"
+                    && defined $mvalue && $mvalue ne "") {
+                  $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
+                }
+                if (!defined $self->{'indexfields'}->{$mfield}) {
+                  $self->{'indexfields'}->{$mfield} = 1;
+                }
+              }
+        }
         # metadata - output all metadata we know about except gsdl stuff
         elsif ($real_field eq "metadata") {
+        elsif ($real_field eq "metadata" || $real_field eq "allfields") {
         my $shortname = "";
         my $metadata = $doc_obj->get_all_metadata ($section);
         foreach my $pair (@$metadata) {
             my ($mfield, $mvalue) = (@$pair);
+                    &ghtml::htmlsafe($mvalue);
             # check fields here, maybe others dont want - change to use dontindex!!
             if ($mfield ne "Identifier"
 …
             && $mfield ne "assocfilepath"
             && defined $mvalue && $mvalue ne "") {
             if (defined $self->{'indexfieldmap'}->{$mfield}) {
                 $shortname = $self->{'indexfieldmap'}->{$mfield};
 …
                 $self->{'indexfieldmap'}->{$mfield} = $shortname;
                 $self->{'indexfieldmap'}->{$shortname} = 1;
+            }
+            }
             $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
             if (!defined $self->{'indexfields'}->{$mfield}) {
                 $self->{'indexfields'}->{$mfield} = 1;
+            }
+            }
+            }
+        }
+        }
         else {
+        else {
         #individual metadata and or text specified - could be a comma separated list
         my $shortname="";
 …
             $self->{'indexfieldmap'}->{$shortname} = 1;
+        }
         my @metadata_list = ();
         foreach my $submeta (split /,/, $real_field) {
 …
                 $new_text .= "$section_text</$shortname>$paraendtag\n";
+            }
             else {
+                            # leave html stuff in, but escape the tags, and don't add Paragraph tags - never retrieve paras at the moment
                 &ghtml::htmlsafe($section_text);
                 $new_text .= $section_text;
+            else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
+                $tmp_text .= $doc_obj->get_text ($section);
+                &ghtml::htmlsafe($tmp_text);
+                $new_text .= $tmp_text;
+            }
+            }
 …
+        }
+        }
         # filter the text
         $self->filter_text ($field, $new_text);
         $self->{'num_processed_bytes'} += length ($new_text);
         $text .= "$new_text";
     } # foreach field
     $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
     $section = $doc_obj->get_next_section($section);
+        $section = $doc_obj->get_next_section($section);
     } #while defined section
     print $handle "$text\n$documentendtag";
+    print $handle "$text\n$documentendtag";
     #print STDOUT "$text\n$documentendtag";
+}
+# /** We make this builder pretend to be a document processor so we can get
+#  *  information back from the plugins.
+#  *
+#  *  When get_mode() returns "incinfodb", every section of the document is
+#  *  annotated with bookkeeping metadata (doctype, hastxt, archivedir,
+#  *  thistype, childtype, contains, docnum) and the document is then handed
+#  *  to IncrementalBuildUtils::addDocument().  In every other mode the call
+#  *  is forwarded unchanged to mgppbuildproc::process().
+#  *
+#  *  @param  $self    A reference to this Lucene builder
+#  *  @param  $doc_obj A reference to a document object representing what was
+#  *                   parsed by the GAPlug
+#  *  @param  $file    The name of the file parsed as a string
+#  *
+#  *  @author John Thompson, DL Consulting Ltd
+#  */
+sub process()
+  {
+    my $self = shift (@_);
+    my ($doc_obj, $file) = @_;
+    # If this is called from any stage other than an incremental infodb we want
+    # to pass through to the superclass of build
+    if ($self->get_mode() eq "incinfodb")
+      {
+        print STDERR "*** Processing a document added using INCINFODB ***\n";
+        # The archive directory is everything before the last path separator
+        # of $file, normalised to forward slashes with no leading or trailing
+        # slash.  A bare filename yields the empty string.
+        my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
+        $archivedir = "" unless defined $archivedir;
+        $archivedir =~ s/\\/\//g;
+        $archivedir =~ s/^\/+//;
+        $archivedir =~ s/\/+$//;
+        # Number of files
+        # NOTE(review): if get_assoc_files() returns an array reference,
+        # scalar() prints the reference itself rather than a count - confirm
+        # against the document class before relying on this message.
+        print STDERR "There are " . scalar($doc_obj->get_assoc_files()) . " associated documents...\n";
+        # resolve the final filenames of the files associated with this document
+        $self->assoc_files ($doc_obj, $archivedir);
+        # is this a paged or a hierarchical document
+        my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
+        # Determine the actual docnum by checking if we've processed any
+        # previous incrementally added documents. If so, carry on from there.
+        # Otherwise we set the counter to be the same as the number of
+        # sections encountered during the previous build
+        if ($self->{'numincdocs'} == 0)
+          {
+            $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
+          }
+        my $section = $doc_obj->get_top_section ();
+        print STDERR "+ top section: '$section'\n";
+        my $doc_OID = $doc_obj->get_OID();
+        my $url = "";
+        while (defined $section)
+          {
+            print STDERR "+ processing section: '$section'\n";
+            # Attach all the other metadata to this document
+            # output the fact that this document is a document (unless doctype
+            # has been set to something else from within a plugin
+            my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
+            if (!defined $dtype || $dtype !~ /\w/)
+              {
+                # Store the literal "doc" marker.  Writing $dtype back here
+                # would only re-add the undefined or blank value that failed
+                # the test above.
+                $doc_obj->add_utf8_metadata($section, "doctype", "doc");
+              }
+            # output whether this node contains text
+            if ($doc_obj->get_text_length($section) > 0)
+              {
+                $doc_obj->add_utf8_metadata($section, "hastxt", 1);
+              }
+            else
+              {
+                $doc_obj->add_utf8_metadata($section, "hastxt", 0);
+              }
+            # output archivedir if at top level
+            if ($section eq $doc_obj->get_top_section())
+              {
+                $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
+                $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
+              }
+            # output a list of children
+            my $children = $doc_obj->get_children ($section);
+            if (scalar(@$children) > 0)
+              {
+                $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
+                my @contains = ();
+                foreach my $child (@$children)
+                  {
+                    # Each entry is a literal double quote followed by
+                    # ".<n>", where <n> is the trailing numeric component of
+                    # the child identifier when it has one, otherwise the
+                    # whole identifier.
+                    if ($child =~ /^.*?\.(\d+)$/)
+                      {
+                        push (@contains, "\".$1");
+                      }
+                    else
+                      {
+                        push (@contains, "\".$child");
+                      }
+                  }
+                $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
+              }
+            #output the matching doc number
+            print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
+            $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
+            $self->{'numincdocs'}++;
+            $section = $doc_obj->get_next_section($section);
+            # if no sections wanted, only gdbm the docs
+            last if ($self->{'gdbm_level'} eq "document");
+          }
+        print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
+        &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
+      }
+    else
+      {
+        # @_ still holds ($doc_obj, $file) because only $self was shifted off.
+        $self->mgppbuildproc::process(@_);
+      }
+  }
+# /** process() **/
+sub get_num_docs {    # Accessor: returns the 'num_docs' value stored on this builder.
+    my $self = shift (@_);
+    # Disabled debug trace: print STDERR "get_num_docs(): $self->{'num_docs'}\n";
+    return $self->{'num_docs'};
+}
+sub get_num_sections {    # Accessor: returns the 'num_sections' value stored on this builder.
+    my $self = shift (@_);
+    # Disabled debug trace: print STDERR "get_num_sections(): $self->{'num_sections'}\n";
+    return $self->{'num_sections'};
+}
+# num_bytes is the actual number of bytes in the collection;
+# this is normally the same as what is processed during text compression.
+sub get_num_bytes {    # Accessor: returns the 'num_bytes' value stored on this builder.
+    my $self = shift (@_);
+    # Disabled debug trace: print STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
+    return $self->{'num_bytes'};
+}
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 12844 for trunk/gsdl/perllib/lucenebuildproc.pm

Legend:

trunk/gsdl/perllib/lucenebuildproc.pm

Download in other formats: