Changeset 9178


Ignore:
Timestamp:
2005-02-24T16:56:48+13:00 (19 years ago)
Author:
kjdon
Message:

Tried to tidy up the paragraph tagging stuff, but it's still not working - it needs a gs2:id attribute in the para tag to work properly with the indexing as is. This needs to be thought about more.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/lucenebuildproc.pm

    r8716 r9178  
    100100    my $lpar_level = $levels->{'paragraph'};
    101101
    102     my $doc_idx_att = ($ldoc_level) ? "index=\"1\"" : "";
    103     my $sec_idx_att = ($lsec_level) ? "index=\"1\"" : "";
    104     my $par_idx_att = ($lpar_level) ? "index=\"1\"" : "";
    105 
    106102    my $docid="";
    107103    if ($ldoc_level) {
     
    117113    $sectiontag = $mgppbuildproc::level_map{'section'};
    118114    }
    119     my ($paratag) = "";
     115    my ($parastarttag) = "";
     116    my ($paraendtag) = "";
    120117    if ($self->{'levels'}->{'paragraph'}) {
    121118    if ($self->{'strip_html'}) {
    122         $paratag = "<". $mgppbuildproc::level_map{'paragraph'} . ">";
     119        $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
     120        $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
    123121    } else {
    124122        print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
    125123    }
    126124    }
    127 
     125   
    128126    my $doc_section = 0; # just for this document
    129127
    130 ##    my $text = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'; ## ****
    131128    my $text = "";
    132129    $text .= $documenttag;
    133    
    134130    # get the text for this document
    135131    my $section = $doc_obj->get_top_section();
     
    144140    }
    145141
     142    # if we are doing subcollections, then some docs shouldn't be indexed.
     143    # but we need to put the section tag placeholders in there so the
     144    # sections match up with gdbm db
    146145    if ($indexed_doc) {
    147         if ($self->{'indexing_text'}) {
    148         $text .= "$paratag"; # only add para tags for indexing
     146        #if ($self->{'indexing_text'}) {
     147    #   $text .= "$parastarttag"; # only add para tags for indexing
    149148        # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
    150         }
     149       # }
    151150        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
    152151        foreach my $field (split (/,/, $fields)) {
     
    159158            if ($real_field eq "text") {
    160159            if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
    161                 $new_text .= "$paratag<TX index=\"1\">\n";
     160                $new_text .= "$parastarttag<TX index=\"1\">\n";
    162161                $tmp_text .= $doc_obj->get_text ($section);
    163                 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX index=\"1\">");
     162                if ($parastarttag =~ "") {
     163                # we don't want to individually tag each paragraph if not doing para indexing
     164                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "");
     165                } else {
     166                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paraendtag$parastarttag<TX index=\"1\">");
     167                }
    164168               
    165                 $new_text .= "$tmp_text</TX>\n";
     169                $new_text .= "$tmp_text</TX>$paraendtag\n";
    166170                #if (!defined $self->{'indexfields'}->{'TextOnly'}) {
    167171                #$self->{'indexfields'}->{'TextOnly'} = 1;   
    168172                #}
    169173            }
    170             else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
    171                 $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
     174            else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
     175                if ($self->{'store_text'}) {
     176                $tmp_text .= $doc_obj->get_text ($section);
     177                $tmp_text =~ s/</&lt;/g;
     178                $tmp_text =~ s/>/&gt;/g;
     179                $new_text .= $tmp_text;
     180                }
    172181            }
    173182            } else { # metadata field
     
    195204                    $self->{'indexfieldmap'}->{$shortname} = 1;
    196205                    }     
    197                     $new_text .= "$paratag<$shortname index=\"1\">$mvalue</$shortname>\n";
     206                    $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
    198207                    if (!defined $self->{'indexfields'}->{$mfield}) {
    199208                    $self->{'indexfields'}->{$mfield} = 1;
     
    217226                }
    218227                foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
    219                 $new_text .= "$paratag<$shortname index=\"1\">$item</$shortname>\n";
     228                $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
    220229                }
    221230            }
Note: See TracChangeset for help on using the changeset viewer.