Changeset 4769


Ignore:
Timestamp:
2003-06-23T15:01:38+12:00 (21 years ago)
Author:
kjdon
Message:

Paragraph indexing should work now. Also slightly changed some other internal stuff to do with remembering which fields were found when indexing 'metadata'.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/mgppbuildproc.pm

    r3834 r4769  
    7979    $self->{'dontindex'} = {};
    8080    $self->{'indexfieldmap'} = {};
    81     $self->{'indexfields'} = {};
     81    $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
    8282    $self->{'strip_html'}=1;
    8383
     
    444444}
    445445
    446 sub find_paragraphs {
    447     $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
    448 }
     446#sub find_paragraphs {
     447#    $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
     448#}
    449449
    450450#this function strips the html tags from the doc if ($strip_html) and
     
    456456    my $self = shift (@_);
    457457    my ($text, $strip_html, $para) = @_;
    458 
    459458    my ($outtext) = "";
    460459    if ($strip_html) {
     
    464463        $outtext .= $`." "; #add everything before the matched tag
    465464        $text = $'; #everything after the matched tag
    466         if ($para && $tag =~ /^\s*p\s/) {
    467         $outtext .= "<Paragraph> ";
     465        if ($para && $tag =~ /^\s*p\s/i) {
     466        $outtext .= $para;
    468467        }
    469468        elsif ($tag =~ /^pre$/) { # a pre tag
     
    480479    } #if strip_html
    481480
    482     if ($para) {
    483     $text =~ s/(<p\b)/<Paragraph>$1/gi;
    484     return $text;
    485     }
     481    #if ($para) {
     482    #$text =~ s/(<p\b)/$para$1/gi;
     483    #return $text;
     484   # }
    486485    return $text;
    487486}
     
    501500    my ($doc_obj) = @_;
    502501    my $handle = $self->{'output_handle'};
     502    my $outhandle = $self->{'outhandle'};
    503503    my $indexed_doc = 1;
    504504
     
    541541    my ($fields) = $self->{'index'};
    542542
     543    my ($documenttag) = "";
     544    my($documentendtag) = "";
     545    #if ($self->{'levels'}->{'Document'}) {
     546    $documenttag = "\n<Document>\n";
     547    $documentendtag = "</Document>\n";
     548    #}
    543549    my ($sectiontag) = "";
    544550    if ($self->{'levels'}->{'Section'}) {
     
    547553    my ($paratag) = "";
    548554    if ($self->{'levels'}->{'Paragraph'}) {
    549     $paratag = "<Paragraph>";
     555    if ($self->{'strip_html'}) {
     556        $paratag = "<Paragraph>";
     557    } else {
     558        print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
     559    }
    550560    }
    551561    my $doc_section = 0; # just for this document
    552     my $text = "<Document>\n";
     562   
     563    my $text = $documenttag;
    553564   
    554565    # get the text for this document
     
    558569    $doc_section++;
    559570    $self->{'num_sections'} += 1;
    560     $text .= $sectiontag;
     571    $text .= "$sectiontag";
    561572
    562573    if ($indexed_doc) {
     574        if ($self->{'indexing_text'}) {
     575        $text .= "$paratag"; # only add para tags for indexing
     576        # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
     577        }
    563578        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
    564579        foreach my $field (split (/,/, $fields)) {
     
    571586            if ($real_field eq "text") {
    572587            if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
    573                 $new_text .= "<TX>\n";
    574                 $tmp_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
    575                 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, $self->{'levels'}->{'Paragraph'});
    576 
     588                $new_text .= "$paratag<TX>\n";
     589                $tmp_text .= $doc_obj->get_text ($section);
     590                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX>");
     591               
    577592                $new_text .= "$tmp_text</TX>\n";
    578                 if (!defined $self->{'indexfields'}->{'TextOnly'}) {
    579                 $self->{'indexfields'}->{'TextOnly'} = 1;   
    580                 }
     593                #if (!defined $self->{'indexfields'}->{'TextOnly'}) {
     594                #$self->{'indexfields'}->{'TextOnly'} = 1;   
     595                #}
    581596            }
    582597            else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
    583598                $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
    584                             #if ($self->{'levels'}->{'Paragraph'}) {
    585                 #$self->find_paragraphs($new_text);
    586                 #}               
    587599            }
    588600            } else { # metadata field
    589             if ($real_field eq "metadata") { # insert all metadata
     601            if ($real_field eq "allfields") { #ignore
     602            }
     603            elsif ($real_field eq "metadata") { # insert all metadata
    590604                #except gsdl stuff
    591605                my $shortname = "";
     
    617631            else { #individual metadata specified
    618632                my $shortname="";
    619                 if (!defined $self->{'indexfields'}->{$real_field}) {
    620                 $self->{'indexfields'}->{$real_field} = 1;
    621                 }
     633                #if (!defined $self->{'indexfields'}->{$real_field}) {
     634                #$self->{'indexfields'}->{$real_field} = 1;
     635                #}
    622636                if (defined $self->{'indexfieldmap'}->{$real_field}) {
    623637                $shortname = $self->{'indexfieldmap'}->{$real_field};
     
    646660    $section = $doc_obj->get_next_section($section);
    647661    } #while defined section
    648     print $handle "$text\n</Document>\n";
     662    print $handle "$text\n$documentendtag";
     663   
    649664}
    650665
Note: See TracChangeset for help on using the changeset viewer.