Context Navigation

← Previous Change
Next Change →

lucenebuildproc.pm

Timestamp:

2005-11-30T15:29:23+13:00 (18 years ago)

Author:

kjdon

Message:

Modified the mgpp/lucene indexes description so that you can have multiple metadata values in a single field, e.g. dc.Creator,dc.Contributor. The indexes list now looks more like the mg one, e.g. 'dc.Subject text dc.Creator,dc.Contributor'. Also made preprocess_text faster (I think), and did a little bit of tidying up. If no_text is true, then don't run the compress text pass at all - it's a waste of time reading through the documents - so store_text is no longer used in these two buildprocs.

File:

: 1 edited

trunk/gsdl/perllib/lucenebuildproc.pm (modified) (4 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/lucenebuildproc.pm

-              r10474
+              r10961
     my ($text, $strip_html, $para) = @_;
+    my ($outtext) = "";
+    if ($strip_html) {
+    while ($text =~ /<([^>]*)>/ && $text ne "") {
+        my $tag = $1;
+        $outtext .= $`." "; #add everything before the matched tag
+        $text = $'; #'everything after the matched tag
+        if ($para && $tag =~ /^\s*p\s/i) {
+        $outtext .= $para;
+        }
+        elsif ($tag =~ /^pre$/) { # a pre tag
+        $text =~ /<\/pre>/; # find the closing pre tag
+        my $tmp_text = $`; #everything before the closing pre tag
+        $text = $'; #'everything after the </pre>
+        $tmp_text =~ s/[<>]//g; # remove all < and >
+        $outtext.= $tmp_text . " ";
+        }
+    }
+    # call the mgpp method first
+    my ($new_text) = $self->SUPER::preprocess_text($text, $strip_html, $para);
+    # remove entities
+    $new_text =~ s/&\w{1,10};//g;
+    # remove &
+    $new_text =~ s/&//g;
+    $outtext .= $text; # add any remaining text
+    } #if strip_html
+    else {
+    $outtext = $text;
+    }
+    #if ($para) {
+    #$text =~ s/(<p\b)/$para$1/gi;
+    #return $text;
+    #}
+    # remove entities
+    $outtext =~ s/&\w{1,10};//g;
+    return $outtext;
+}
+    return $new_text;
+}
 sub text {
 …
     my $handle = $self->{'output_handle'};
     my $outhandle = $self->{'outhandle'};
-    my $indexed_doc = 1;
     # only output this document if it is one to be indexed
     return if ($doc_obj->get_doc_type() ne "indexed_doc");
+    # see if this document belongs to this subcollection
+    foreach my $indexexp (@{$self->{'indexexparr'}}) {
+    $indexed_doc = 0;
+    my ($field, $exp, $options) = split /\//, $indexexp;
+    if (defined ($field) && defined ($exp)) {
+        my ($bool) = $field =~ /^(.)/;
+        $field =~ s/^.// if $bool eq '!';
+        if ($field =~ /^filename$/i) {
+        $field = $doc_obj->get_source_filename();
+        } else {
+        $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
+        }
+        next unless defined $field;
+        if ($bool eq '!') {
+        if ($options =~ /^i$/i) {
+            if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
+        } else {
+            if ($field !~ /$exp/) {$indexed_doc = 1; last;}
+        }
+        } else {
+        if ($options =~ /^i$/i) {
+            if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
+        } else {
+            if ($field =~ /$exp/) {$indexed_doc = 1; last;}
+        }
+        }
+    }
+    }
+    # if this doc is so far in the sub collection, and we have lang info,
+    # now we check the languages to see if it matches
+    if($indexed_doc && defined $self->{'lang_meta'}) {
+    $indexed_doc = 0;
+    my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
+    if (defined $field) {
+        foreach my $lang (@{$self->{'langarr'}}) {
+        my ($bool) = $lang =~ /^(.)/;
+        if ($bool eq '!') {
+            $lang =~ s/^.//;
+            if ($field !~ /$lang/) {
+            $indexed_doc = 1; last;
+            }
+        } else {
+            if ($field =~ /$lang/) {
+            $indexed_doc = 1; last;
+            }
+        }
+        }
+    }
+    }
+    my $indexed_doc = $self->is_subcollection_doc($doc_obj);
     # this is another document
 …
     # but we need to put the section tag placeholders in there so the
     # sections match up with gdbm db
+    if ($indexed_doc) {
+        #if ($self->{'indexing_text'}) {
+    #   $text .= "$parastarttag"; # only add para tags for indexing
+        # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
+       # }
+        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
+        foreach my $field (split (/,/, $fields)) {
+        # only deal with this field if it doesn't start with top or
+        # this is the first section
+        my $real_field = $field;
+        if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
+            my $new_text = "";
+            my $tmp_text = "";
+            if ($real_field eq "text") {
+    if (!$indexed_doc) {
+        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
+        $section = $doc_obj->get_next_section($section);
+        next;
+    }
+    $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
+    foreach my $field (split (/;/, $fields)) {
+        # only deal with this field if it doesn't start with top or
+        # this is the first section
+        my $real_field = $field;
+        next if (($real_field =~ s/^top//) && ($doc_section != 1));
+        my $new_text = "";
+        my $tmp_text = "";
+        # we get allfields by default - do nothing
+        if ($real_field eq "allfields") {
+        }
+        # metadata - output all metadata we know about except gsdl stuff
+        elsif ($real_field eq "metadata") {
+        my $shortname = "";
+        my $metadata = $doc_obj->get_all_metadata ($section);
+        foreach $pair (@$metadata) {
+            my ($mfield, $mvalue) = (@$pair);
+            # check fields here, maybe others dont want - change to use dontindex!!
+            if ($mfield ne "Identifier"
+            && $mfield !~ /^gsdl/
+            && $mfield ne "classifytype"
+            && $mfield ne "assocfilepath"
+            && defined $mvalue && $mvalue ne "") {
+            if (defined $self->{'indexfieldmap'}->{$mfield}) {
+                $shortname = $self->{'indexfieldmap'}->{$mfield};
+            }
+            else {
+                $shortname = $self->create_shortname($mfield);
+                $self->{'indexfieldmap'}->{$mfield} = $shortname;
+                $self->{'indexfieldmap'}->{$shortname} = 1;
+            }
+            $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
+            if (!defined $self->{'indexfields'}->{$mfield}) {
+                $self->{'indexfields'}->{$mfield} = 1;
+            }
+            }
+        }
+        }
+        else {
+        #individual metadata and or text specified - could be a comma separated list
+        my $shortname="";
+        if (defined $self->{'indexfieldmap'}->{$real_field}) {
+            $shortname = $self->{'indexfieldmap'}->{$real_field};
+        }
+        else {
+            $shortname = $self->create_shortname($real_field);
+            $self->{'indexfieldmap'}->{$real_field} = $shortname;
+            $self->{'indexfieldmap'}->{$shortname} = 1;
+        }
+        my @metadata_list = ();
+        foreach $submeta (split /,/, $real_field) {
+            if ($submeta eq "text") {
             if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
                 $new_text .= "$parastarttag<TX index=\"1\">\n";
+                $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                 $tmp_text .= $doc_obj->get_text ($section);
+                if ($parastarttag =~ "") {
+                if ($parastarttag ne "") {
+                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
+                } else {
                 # we don't want to individually tag each paragraph if not doing para indexing
                 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "");
+                } else {
+                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paraendtag$parastarttag<TX index=\"1\">");
+                }
+                $new_text .= "$tmp_text</$shortname>$paraendtag\n";
+            }
+            else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
+                $tmp_text .= $doc_obj->get_text ($section);
+                &ghtml::htmlsafe($tmp_text);
+                $new_text .= $tmp_text;
+            }
+            }
+            else {
+            my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
+            if ($self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
+                if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
+                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
+                }
+                $new_text .= "$tmp_text</TX>$paraendtag\n";
+                #if (!defined $self->{'indexfields'}->{'TextOnly'}) {
+                #$self->{'indexfields'}->{'TextOnly'} = 1;
+                #}
+            }
+            else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
+                if ($self->{'store_text'}) {
+                $tmp_text .= $doc_obj->get_text ($section);
+                &ghtml::htmlsafe($tmp_text);
+                $new_text .= $tmp_text;
+                }
+            }
+            } else { # metadata field
+            if ($real_field eq "allfields") { #ignore
+            }
+            elsif ($real_field eq "metadata") { # insert all metadata
+                #except gsdl stuff
+                my $shortname = "";
+                my $metadata = $doc_obj->get_all_metadata ($section);
+                foreach $pair (@$metadata) {
+                my ($mfield, $mvalue) = (@$pair);
+                # check fields here, maybe others dont want - change to use dontindex!!
+                if ($mfield ne "Identifier"
+                    && $mfield !~ /^gsdl/
+                    && $mfield ne "classifytype"
+                    && $mfield ne "assocfilepath"
+                    && defined $mvalue && $mvalue ne "") {
+                    if (defined $self->{'indexfieldmap'}->{$mfield}) {
+                    $shortname = $self->{'indexfieldmap'}->{$mfield};
+                    }
+                    else {
+                    $shortname = $self->create_shortname($mfield);
+                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
+                    $self->{'indexfieldmap'}->{$shortname} = 1;
+                    }
+                    $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
+                    if (!defined $self->{'indexfields'}->{$mfield}) {
+                    $self->{'indexfields'}->{$mfield} = 1;
+                    }
+                }
+                }
+            }
+            else { #individual metadata specified
+                my $shortname="";
+                #if (!defined $self->{'indexfields'}->{$real_field}) {
+                #$self->{'indexfields'}->{$real_field} = 1;
+                #}
+                if (defined $self->{'indexfieldmap'}->{$real_field}) {
+                $shortname = $self->{'indexfieldmap'}->{$real_field};
+                }
+                else {
+                $shortname = $self->create_shortname($real_field);
+                $self->{'indexfieldmap'}->{$real_field} = $shortname;
+                $self->{'indexfieldmap'}->{$shortname} = 1;
+                }
+                my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};
+                if ($self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
+                if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
+                    push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
+                }
+                }
+                foreach $item (@section_metadata) {
+                $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
+                # remove entities
+                $new_text =~ s/&\w{1,10};//g;
+                }
+            }
+            }
+            push (@metadata_list, @section_metadata);
+            }
+            # filter the text
+            $self->filter_text ($field, $new_text);
+            $self->{'num_processed_bytes'} += length ($new_text);
+            $text .= "$new_text";
+        }
+        foreach my $item (@metadata_list) {
+            $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
+            # remove entities
+            $new_text =~ s/&\w{1,10};//g;
+            # remove &
+            $new_text =~ s/&//g;
+        }
+        }
+    } # if (indexed_doc)
+        # filter the text
+        $self->filter_text ($field, $new_text);
+        $self->{'num_processed_bytes'} += length ($new_text);
+        $text .= "$new_text";
+    } # foreach field
     $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
 …
     } #while defined section
     print $handle "$text\n$documentendtag";
+    #print STDOUT "$text\n$documentendtag";
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 10961 for trunk/gsdl/perllib/lucenebuildproc.pm

Legend:

trunk/gsdl/perllib/lucenebuildproc.pm

Download in other formats: