Changeset 9178
- Timestamp:
- 2005-02-24T16:56:48+13:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/lucenebuildproc.pm
r8716 r9178 100 100 my $lpar_level = $levels->{'paragraph'}; 101 101 102 my $doc_idx_att = ($ldoc_level) ? "index=\"1\"" : "";103 my $sec_idx_att = ($lsec_level) ? "index=\"1\"" : "";104 my $par_idx_att = ($lpar_level) ? "index=\"1\"" : "";105 106 102 my $docid=""; 107 103 if ($ldoc_level) { … … 117 113 $sectiontag = $mgppbuildproc::level_map{'section'}; 118 114 } 119 my ($paratag) = ""; 115 my ($parastarttag) = ""; 116 my ($paraendtag) = ""; 120 117 if ($self->{'levels'}->{'paragraph'}) { 121 118 if ($self->{'strip_html'}) { 122 $paratag = "<". $mgppbuildproc::level_map{'paragraph'} . ">"; 119 $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">"; 120 $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">"; 123 121 } else { 124 122 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n"; 125 123 } 126 124 } 127 125 128 126 my $doc_section = 0; # just for this document 129 127 130 ## my $text = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'; ## ****131 128 my $text = ""; 132 129 $text .= $documenttag; 133 134 130 # get the text for this document 135 131 my $section = $doc_obj->get_top_section(); … … 144 140 } 145 141 142 # if we are doing subcollections, then some docs shouldn't be indexed. 
143 # but we need to put the section tag placeholders in there so the 144 # sections match up with gdbm db 146 145 if ($indexed_doc) { 147 if ($self->{'indexing_text'}) {148 $text .= "$paratag"; # only add para tags for indexing146 #if ($self->{'indexing_text'}) { 147 # $text .= "$parastarttag"; # only add para tags for indexing 149 148 # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text 150 }149 # } 151 150 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 152 151 foreach my $field (split (/,/, $fields)) { … … 159 158 if ($real_field eq "text") { 160 159 if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed 161 $new_text .= "$paratag<TX index=\"1\">\n";160 $new_text .= "$parastarttag<TX index=\"1\">\n"; 162 161 $tmp_text .= $doc_obj->get_text ($section); 163 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX index=\"1\">"); 162 if ($parastarttag =~ "") { 163 # we don't want to individually tag each paragraph if not doing para indexing 164 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, ""); 165 } else { 166 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paraendtag$parastarttag<TX index=\"1\">"); 167 } 164 168 165 $new_text .= "$tmp_text</TX> \n";169 $new_text .= "$tmp_text</TX>$paraendtag\n"; 166 170 #if (!defined $self->{'indexfields'}->{'TextOnly'}) { 167 171 #$self->{'indexfields'}->{'TextOnly'} = 1; 168 172 #} 169 173 } 170 else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment 171 $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'}; 174 else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment 175 if ($self->{'store_text'}) { 176 $tmp_text .= $doc_obj->get_text ($section); 177 $tmp_text =~ s/</&lt;/g; 
178 $tmp_text =~ s/>/&gt;/g; 179 $new_text .= $tmp_text; 180 } 172 181 } 173 182 } else { # metadata field … … 195 204 $self->{'indexfieldmap'}->{$shortname} = 1; 196 205 } 197 $new_text .= "$paratag<$shortname index=\"1\">$mvalue</$shortname>\n";206 $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n"; 198 207 if (!defined $self->{'indexfields'}->{$mfield}) { 199 208 $self->{'indexfields'}->{$mfield} = 1; … … 217 226 } 218 227 foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) { 219 $new_text .= "$paratag<$shortname index=\"1\">$item</$shortname>\n";228 $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n"; 220 229 } 221 230 }
Note:
See TracChangeset
for help on using the changeset viewer.