Context Navigation

← Previous Change
Next Change →

Changeset 10961 for trunk/gsdl

Timestamp:

2005-11-30T15:29:23+13:00 (18 years ago)

Author:

kjdon

Message:

Modified the mgpp/lucene indexes description so that you can have multiple metadata values in a single field, e.g. dc.Creator,dc.Contributor. The indexes list now looks more like the mg one, e.g. 'dc.Subject text dc.Creator,dc.Contributor'. Also made preprocess_text faster (I think), and did a little bit of tidying up. If no_text is true, then don't run the compress text pass at all - it's a waste to read through the documents. So store_text is no longer used in these two buildprocs.

Location:

trunk/gsdl/perllib

Files:

4 edited

lucenebuilder.pm (modified) (3 diffs)
lucenebuildproc.pm (modified) (4 diffs)
mgppbuilder.pm (modified) (6 diffs)
mgppbuildproc.pm (modified) (5 diffs)

Legend:

(leading space): Unmodified
+: Added
-: Removed

trunk/gsdl/perllib/lucenebuilder.pm

-              r10468
+              r10961
     my $self = shift (@_);
+    # we don't do anything if we don't want compressed text
+    return if $self->{'no_text'};
     my ($textindex) = @_;
     my $outhandle = $self->{'outhandle'};
 …
     $self->{'buildproc'}->set_index ($textindex);
     $self->{'buildproc'}->set_indexing_text (0);
-    $self->{'buildproc'}->set_store_text(1);
     $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     $self->{'buildproc'}->set_levels ($levels);
 …
     $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
     $self->{'buildproc'}->set_indexing_text (1);
-    $self->{'buildproc'}->set_store_text(1);
     $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     $self->{'buildproc'}->set_levels ($local_levels);

trunk/gsdl/perllib/lucenebuildproc.pm

-              r10474
+              r10961
     my ($text, $strip_html, $para) = @_;
+    my ($outtext) = "";
+    if ($strip_html) {
+    while ($text =~ /<([^>]*)>/ && $text ne "") {
+        my $tag = $1;
+        $outtext .= $`." "; #add everything before the matched tag
+        $text = $'; #'everything after the matched tag
+        if ($para && $tag =~ /^\s*p\s/i) {
+        $outtext .= $para;
+        }
+        elsif ($tag =~ /^pre$/) { # a pre tag
+        $text =~ /<\/pre>/; # find the closing pre tag
+        my $tmp_text = $`; #everything before the closing pre tag
+        $text = $'; #'everything after the </pre>
+        $tmp_text =~ s/[<>]//g; # remove all < and >
+        $outtext.= $tmp_text . " ";
+        }
+    }
+    # call the mgpp method first
+    my ($new_text) = $self->SUPER::preprocess_text($text, $strip_html, $para);
+    # remove entities
+    $new_text =~ s/&\w{1,10};//g;
+    # remove &
+    $new_text =~ s/&//g;
+    $outtext .= $text; # add any remaining text
+    } #if strip_html
+    else {
+    $outtext = $text;
+    }
+    #if ($para) {
+    #$text =~ s/(<p\b)/$para$1/gi;
+    #return $text;
+    #}
+    # remove entities
+    $outtext =~ s/&\w{1,10};//g;
+    return $outtext;
+}
+    return $new_text;
+}
 sub text {
 …
     my $handle = $self->{'output_handle'};
     my $outhandle = $self->{'outhandle'};
-    my $indexed_doc = 1;
     # only output this document if it is one to be indexed
     return if ($doc_obj->get_doc_type() ne "indexed_doc");
+    # see if this document belongs to this subcollection
+    foreach my $indexexp (@{$self->{'indexexparr'}}) {
+    $indexed_doc = 0;
+    my ($field, $exp, $options) = split /\//, $indexexp;
+    if (defined ($field) && defined ($exp)) {
+        my ($bool) = $field =~ /^(.)/;
+        $field =~ s/^.// if $bool eq '!';
+        if ($field =~ /^filename$/i) {
+        $field = $doc_obj->get_source_filename();
+        } else {
+        $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
+        }
+        next unless defined $field;
+        if ($bool eq '!') {
+        if ($options =~ /^i$/i) {
+            if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
+        } else {
+            if ($field !~ /$exp/) {$indexed_doc = 1; last;}
+        }
+        } else {
+        if ($options =~ /^i$/i) {
+            if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
+        } else {
+            if ($field =~ /$exp/) {$indexed_doc = 1; last;}
+        }
+        }
+    }
+    }
+    # if this doc is so far in the sub collection, and we have lang info,
+    # now we check the languages to see if it matches
+    if($indexed_doc && defined $self->{'lang_meta'}) {
+    $indexed_doc = 0;
+    my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
+    if (defined $field) {
+        foreach my $lang (@{$self->{'langarr'}}) {
+        my ($bool) = $lang =~ /^(.)/;
+        if ($bool eq '!') {
+            $lang =~ s/^.//;
+            if ($field !~ /$lang/) {
+            $indexed_doc = 1; last;
+            }
+        } else {
+            if ($field =~ /$lang/) {
+            $indexed_doc = 1; last;
+            }
+        }
+        }
+    }
+    }
+    my $indexed_doc = $self->is_subcollection_doc($doc_obj);
     # this is another document
 …
     # but we need to put the section tag placeholders in there so the
     # sections match up with gdbm db
+    if ($indexed_doc) {
+        #if ($self->{'indexing_text'}) {
+    #   $text .= "$parastarttag"; # only add para tags for indexing
+        # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
+       # }
+        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
+        foreach my $field (split (/,/, $fields)) {
+        # only deal with this field if it doesn't start with top or
+        # this is the first section
+        my $real_field = $field;
+        if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
+            my $new_text = "";
+            my $tmp_text = "";
+            if ($real_field eq "text") {
+    if (!$indexed_doc) {
+        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
+        $section = $doc_obj->get_next_section($section);
+        next;
+    }
+    $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
+    foreach my $field (split (/;/, $fields)) {
+        # only deal with this field if it doesn't start with top or
+        # this is the first section
+        my $real_field = $field;
+        next if (($real_field =~ s/^top//) && ($doc_section != 1));
+        my $new_text = "";
+        my $tmp_text = "";
+        # we get allfields by default - do nothing
+        if ($real_field eq "allfields") {
+        }
+        # metadata - output all metadata we know about except gsdl stuff
+        elsif ($real_field eq "metadata") {
+        my $shortname = "";
+        my $metadata = $doc_obj->get_all_metadata ($section);
+        foreach $pair (@$metadata) {
+            my ($mfield, $mvalue) = (@$pair);
+            # check fields here, maybe others dont want - change to use dontindex!!
+            if ($mfield ne "Identifier"
+            && $mfield !~ /^gsdl/
+            && $mfield ne "classifytype"
+            && $mfield ne "assocfilepath"
+            && defined $mvalue && $mvalue ne "") {
+            if (defined $self->{'indexfieldmap'}->{$mfield}) {
+                $shortname = $self->{'indexfieldmap'}->{$mfield};
+            }
+            else {
+                $shortname = $self->create_shortname($mfield);
+                $self->{'indexfieldmap'}->{$mfield} = $shortname;
+                $self->{'indexfieldmap'}->{$shortname} = 1;
+            }
+            $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
+            if (!defined $self->{'indexfields'}->{$mfield}) {
+                $self->{'indexfields'}->{$mfield} = 1;
+            }
+            }
+        }
+        }
+        else {
+        #individual metadata and or text specified - could be a comma separated list
+        my $shortname="";
+        if (defined $self->{'indexfieldmap'}->{$real_field}) {
+            $shortname = $self->{'indexfieldmap'}->{$real_field};
+        }
+        else {
+            $shortname = $self->create_shortname($real_field);
+            $self->{'indexfieldmap'}->{$real_field} = $shortname;
+            $self->{'indexfieldmap'}->{$shortname} = 1;
+        }
+        my @metadata_list = ();
+        foreach $submeta (split /,/, $real_field) {
+            if ($submeta eq "text") {
             if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
                 $new_text .= "$parastarttag<TX index=\"1\">\n";
+                $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                 $tmp_text .= $doc_obj->get_text ($section);
+                if ($parastarttag =~ "") {
+                if ($parastarttag ne "") {
+                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
+                } else {
                 # we don't want to individually tag each paragraph if not doing para indexing
                 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "");
+                } else {
+                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paraendtag$parastarttag<TX index=\"1\">");
+                }
+                $new_text .= "$tmp_text</$shortname>$paraendtag\n";
+            }
+            else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
+                $tmp_text .= $doc_obj->get_text ($section);
+                &ghtml::htmlsafe($tmp_text);
+                $new_text .= $tmp_text;
+            }
+            }
+            else {
+            my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
+            if ($self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
+                if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
+                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
+                }
+                $new_text .= "$tmp_text</TX>$paraendtag\n";
+                #if (!defined $self->{'indexfields'}->{'TextOnly'}) {
+                #$self->{'indexfields'}->{'TextOnly'} = 1;
+                #}
+            }
+            else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
+                if ($self->{'store_text'}) {
+                $tmp_text .= $doc_obj->get_text ($section);
+                &ghtml::htmlsafe($tmp_text);
+                $new_text .= $tmp_text;
+                }
+            }
+            } else { # metadata field
+            if ($real_field eq "allfields") { #ignore
+            }
+            elsif ($real_field eq "metadata") { # insert all metadata
+                #except gsdl stuff
+                my $shortname = "";
+                my $metadata = $doc_obj->get_all_metadata ($section);
+                foreach $pair (@$metadata) {
+                my ($mfield, $mvalue) = (@$pair);
+                # check fields here, maybe others dont want - change to use dontindex!!
+                if ($mfield ne "Identifier"
+                    && $mfield !~ /^gsdl/
+                    && $mfield ne "classifytype"
+                    && $mfield ne "assocfilepath"
+                    && defined $mvalue && $mvalue ne "") {
+                    if (defined $self->{'indexfieldmap'}->{$mfield}) {
+                    $shortname = $self->{'indexfieldmap'}->{$mfield};
+                    }
+                    else {
+                    $shortname = $self->create_shortname($mfield);
+                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
+                    $self->{'indexfieldmap'}->{$shortname} = 1;
+                    }
+                    $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
+                    if (!defined $self->{'indexfields'}->{$mfield}) {
+                    $self->{'indexfields'}->{$mfield} = 1;
+                    }
+                }
+                }
+            }
+            else { #individual metadata specified
+                my $shortname="";
+                #if (!defined $self->{'indexfields'}->{$real_field}) {
+                #$self->{'indexfields'}->{$real_field} = 1;
+                #}
+                if (defined $self->{'indexfieldmap'}->{$real_field}) {
+                $shortname = $self->{'indexfieldmap'}->{$real_field};
+                }
+                else {
+                $shortname = $self->create_shortname($real_field);
+                $self->{'indexfieldmap'}->{$real_field} = $shortname;
+                $self->{'indexfieldmap'}->{$shortname} = 1;
+                }
+                my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};
+                if ($self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
+                if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
+                    push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
+                }
+                }
+                foreach $item (@section_metadata) {
+                $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
+                # remove entities
+                $new_text =~ s/&\w{1,10};//g;
+                }
+            }
+            }
+            push (@metadata_list, @section_metadata);
+            }
+            # filter the text
+            $self->filter_text ($field, $new_text);
+            $self->{'num_processed_bytes'} += length ($new_text);
+            $text .= "$new_text";
+        }
+        foreach my $item (@metadata_list) {
+            $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
+            # remove entities
+            $new_text =~ s/&\w{1,10};//g;
+            # remove &
+            $new_text =~ s/&//g;
+        }
+        }
+    } # if (indexed_doc)
+        # filter the text
+        $self->filter_text ($field, $new_text);
+        $self->{'num_processed_bytes'} += length ($new_text);
+        $text .= "$new_text";
+    } # foreach field
     $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
 …
     } #while defined section
     print $handle "$text\n$documentendtag";
+    #print STDOUT "$text\n$documentendtag";
+}

trunk/gsdl/perllib/mgppbuilder.pm

-              r10477
+              r10961
     my $indexes = $self->{'collect_cfg'}->{'indexes'};
     $self->{'collect_cfg'}->{'indexes'} = [];
     push (@{$self->{'collect_cfg'}->{'indexes'}}, join(',', @$indexes));
+    push (@{$self->{'collect_cfg'}->{'indexes'}}, join(';', @$indexes));
+}
 …
     my $self = shift (@_);
+    # we don't do anything if we don't want compressed text
+    return if $self->{'no_text'};
     my ($textindex) = @_;
 …
     $self->{'buildproc'}->set_index ($textindex);
     $self->{'buildproc'}->set_indexing_text (0);
-    if ($self->{'no_text'}) {
-    $self->{'buildproc'}->set_store_text(0);
-    } else {
-    $self->{'buildproc'}->set_store_text(1);
+    }
     $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     $self->{'buildproc'}->set_levels ($self->{'levels'});
 …
     $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
     $self->{'buildproc'}->set_indexing_text (1);
-    $self->{'buildproc'}->set_store_text(1);
     $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     $self->{'buildproc'}->set_levels ($self->{'levels'});
 …
     my $parts = $field;
     $parts =~ s/:.*$//;
+    my @fs = split(',', $parts);
+    # *************
+    my @fs = split(';', $parts);
     foreach my $f(@fs) {
         if (!defined $specifiedfields->{$f}) {
 …
     $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
     $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
+}

trunk/gsdl/perllib/mgppbuildproc.pm

-              r10474
+              r10961
 #}
+sub remove_gtlt {
+    my $self =shift(@_);
+    my ($text, $para) = @_;
+    $text =~s/[<>]//g;
+    return "$para$text$para";
+}
+sub process_tags {
+    my $self = shift(@_);
+    my ($text, $para) = @_;
+    if ($text =~ /^p\b/i) {
+    return $para;
+    }
+    return "";
+}
+sub preprocess_text {
+    my $self = shift (@_);
+    my ($text, $strip_html, $para) = @_;
+    # at this stage, we do not do paragraph tags unless have strip_html -
+    # it will result in a huge mess of non-xml
+    return unless $strip_html;
+    my $new_text = $text;
+    # if we have <pre> tags, we can have < > inside them, need to delete
+    # the <> before stripping tags
+    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
+    if ($para eq "") {
+    # just remove all tags
+    $new_text =~ s/<[^>]*>//gs;
+    } else {
+    # strip all tags except <p> tags which get turned into $para
+    $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
+    }
+    return $new_text;
+}
 #this function strips the html tags from the doc if ($strip_html) and
 # if ($para) replaces <p> with <Paragraph> tags.
 …
 #assumes that <pre> and </pre> have no spaces, and removes all < and > inside
 #these tags
 sub preprocess_text {
+sub preprocess_text_old_and_slow {
     my $self = shift (@_);
     my ($text, $strip_html, $para) = @_;
 …
     my ($paratag) = "";
+    if ($self->{'levels'}->{'paragraph'}) {
+    # paragraph tags will only be used for indexing (can't retrieve
+    # paragraphs), and can ony be used if we are stripping HTML tags
+    if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
     if ($self->{'strip_html'}) {
         $paratag = "<". $level_map{'paragraph'} . ">";
 …
     # get the text for this document
     my $section = $doc_obj->get_top_section();
     while (defined $section) {
     # update a few statistics
 …
     $text .= "$sectiontag";
+    if ($indexed_doc) {
+        if ($self->{'indexing_text'}) {
+        $text .= "$paratag"; # only add para tags for indexing
+        # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
+        }
+        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
+        foreach my $field (split (/,/, $fields)) {
+        # only deal with this field if it doesn't start with top or
+        # this is the first section
+        my $real_field = $field;
+        if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
+            my $new_text = "";
+            my $tmp_text = "";
+            if ($real_field eq "text") {
+    if (!$indexed_doc) {
+        # we are not actually indexing anything for this document,
+        # but we want to keep the section numbers the same, so we just
+        # output section tags for each section (which is done above)
+        $section = $doc_obj->get_next_section($section);
+        next;
+    }
+    $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
+    foreach my $field (split (/;/, $fields)) {
+        # only deal with this field if it doesn't start with top or
+        # this is the first section
+        my $real_field = $field;
+        next if (($real_field =~ s/^top//) && ($doc_section != 1));
+        my $new_text = "";
+        my $tmp_text = "";
+        # we get allfields by default - do nothing
+        if ($real_field eq "allfields") {
+        }
+        # metadata - output all metadata we know about except gsdl stuff
+        elsif ($real_field eq "metadata") {
+        my $shortname = "";
+        my $metadata = $doc_obj->get_all_metadata ($section);
+        foreach my $pair (@$metadata) {
+            my ($mfield, $mvalue) = (@$pair);
+            # check fields here, maybe others dont want - change to use dontindex!!
+            if ($mfield ne "Identifier"
+            && $mfield !~ /^gsdl/
+            && $mfield ne "classifytype"
+            && $mfield ne "assocfilepath"
+            && defined $mvalue && $mvalue ne "") {
+            if (defined $self->{'indexfieldmap'}->{$mfield}) {
+                $shortname = $self->{'indexfieldmap'}->{$mfield};
+            }
+            else {
+                $shortname = $self->create_shortname($mfield);
+                $self->{'indexfieldmap'}->{$mfield} = $shortname;
+                $self->{'indexfieldmap'}->{$shortname} = 1;
+            }
+            $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
+            if (!defined $self->{'indexfields'}->{$mfield}) {
+                $self->{'indexfields'}->{$mfield} = 1;
+            }
+            }
+        }
+        }
+        else {
+        #individual metadata and or text specified - could be
+        # a comma separated list
+        my $shortname="";
+        if (defined $self->{'indexfieldmap'}->{$real_field}) {
+            $shortname = $self->{'indexfieldmap'}->{$real_field};
+        }
+        else {
+            $shortname = $self->create_shortname($real_field);
+            $self->{'indexfieldmap'}->{$real_field} = $shortname;
+            $self->{'indexfieldmap'}->{$shortname} = 1;
+        }
+        my @metadata_list = ();
+        foreach $submeta (split /,/, $real_field) {
+            if ($submeta eq "text") {
             if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
                 $new_text .= "$paratag<TX>\n";
+                $new_text .= "$paratag<$shortname>\n";
                 $tmp_text .= $doc_obj->get_text ($section);
                 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX>");
                 $new_text .= "$tmp_text</TX>\n";
                 #if (!defined $self->{'indexfields'}->{'TextOnly'}) {
                 #$self->{'indexfields'}->{'TextOnly'} = 1;
                 #}
+                if ($paratag ne "") {
+                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
+                } else {
+                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "");
+                }
+                $new_text .= "$tmp_text</$shortname>\n";
+            }
             else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
                 $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
+                $new_text .= $doc_obj->get_text ($section);
+            }
+            } else { # metadata field
+            if ($real_field eq "allfields") { #ignore
+            }
+            elsif ($real_field eq "metadata") { # insert all metadata
+                #except gsdl stuff
+                my $shortname = "";
+                my $metadata = $doc_obj->get_all_metadata ($section);
+                foreach my $pair (@$metadata) {
+                my ($mfield, $mvalue) = (@$pair);
+                # check fields here, maybe others dont want - change to use dontindex!!
+                if ($mfield ne "Identifier"
+                    && $mfield !~ /^gsdl/
+                    && $mfield ne "classifytype"
+                    && $mfield ne "assocfilepath"
+                    && defined $mvalue && $mvalue ne "") {
+                    if (defined $self->{'indexfieldmap'}->{$mfield}) {
+                    $shortname = $self->{'indexfieldmap'}->{$mfield};
+                    }
+                    else {
+                    $shortname = $self->create_shortname($mfield);
+                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
+                    $self->{'indexfieldmap'}->{$shortname} = 1;
+                    }
+                    $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
+                    if (!defined $self->{'indexfields'}->{$mfield}) {
+                    $self->{'indexfields'}->{$mfield} = 1;
+                    }
+                }
+                }
+            }
+            else { #individual metadata specified
+                my $shortname="";
+                #if (!defined $self->{'indexfields'}->{$real_field}) {
+                #$self->{'indexfields'}->{$real_field} = 1;
+                #}
+                if (defined $self->{'indexfieldmap'}->{$real_field}) {
+                $shortname = $self->{'indexfieldmap'}->{$real_field};
+                }
+                else {
+                $shortname = $self->create_shortname($real_field);
+                $self->{'indexfieldmap'}->{$real_field} = $shortname;
+                $self->{'indexfieldmap'}->{$shortname} = 1;
+                }
+                my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};
+                if ($self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
+                if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
+                    push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
+                }
+                }
+                foreach my $item (@section_metadata) {
+                #foreach my $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
+                $new_text .= "$paratag<$shortname>$item</$shortname>\n";
+            } else {
+            my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
+            if ($self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
+                if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
+                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
+                }
+            }
+            push (@metadata_list, @section_metadata);
+            }
+            # filter the text
+            $self->filter_text ($field, $new_text);
+            $self->{'num_processed_bytes'} += length ($new_text);
+            $text .= "$new_text";
+        }
+        }
+    } # if (indexed_doc)
+        }
+        foreach my $item (@metadata_list) {
+            $new_text .= "$paratag<$shortname>$item</$shortname>\n";
+        }
+        }
+        # filter the text
+        $self->filter_text ($field, $new_text);
+        $self->{'num_processed_bytes'} += length ($new_text);
+        $text .= "$new_text";
+    } # foreach field
     $section = $doc_obj->get_next_section($section);
     } #while defined section
+    } # while defined section
     print $handle "$text\n$documentendtag";

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: