Changeset 10961
- Timestamp: 2005-11-30T15:29:23+13:00 (18 years ago)
- Location: trunk/gsdl/perllib
- Files: 4 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/perllib/lucenebuilder.pm
r10468 r10961 59 59 60 60 my $self = shift (@_); 61 62 # we don't do anything if we don't want compressed text 63 return if $self->{'no_text'}; 64 61 65 my ($textindex) = @_; 62 66 my $outhandle = $self->{'outhandle'}; … … 112 116 $self->{'buildproc'}->set_index ($textindex); 113 117 $self->{'buildproc'}->set_indexing_text (0); 114 $self->{'buildproc'}->set_store_text(1);115 118 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 116 119 $self->{'buildproc'}->set_levels ($levels); … … 293 296 $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language); 294 297 $self->{'buildproc'}->set_indexing_text (1); 295 $self->{'buildproc'}->set_store_text(1);296 298 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 297 299 $self->{'buildproc'}->set_levels ($local_levels); -
trunk/gsdl/perllib/lucenebuildproc.pm
r10474 r10961 60 60 my ($text, $strip_html, $para) = @_; 61 61 62 my ($outtext) = ""; 63 if ($strip_html) { 64 while ($text =~ /<([^>]*)>/ && $text ne "") { 65 66 my $tag = $1; 67 $outtext .= $`." "; #add everything before the matched tag 68 $text = $'; #'everything after the matched tag 69 if ($para && $tag =~ /^\s*p\s/i) { 70 $outtext .= $para; 71 } 72 elsif ($tag =~ /^pre$/) { # a pre tag 73 $text =~ /<\/pre>/; # find the closing pre tag 74 my $tmp_text = $`; #everything before the closing pre tag 75 $text = $'; #'everything after the </pre> 76 $tmp_text =~ s/[<>]//g; # remove all < and > 77 $outtext.= $tmp_text . " "; 78 } 79 } 62 # call the mgpp method first 63 my ($new_text) = $self->SUPER::preprocess_text($text, $strip_html, $para); 64 65 # remove entities 66 $new_text =~ s/&\w{1,10};//g; 67 # remove & 68 $new_text =~ s/&//g; 80 69 81 $outtext .= $text; # add any remaining text 82 } #if strip_html 83 else { 84 $outtext = $text; 85 } 86 #if ($para) { 87 #$text =~ s/(<p\b)/$para$1/gi; 88 #return $text; 89 #} 90 91 # remove entities 92 $outtext =~ s/&\w{1,10};//g; 93 94 return $outtext; 95 } 70 return $new_text; 71 } 72 96 73 97 74 sub text { … … 100 77 my $handle = $self->{'output_handle'}; 101 78 my $outhandle = $self->{'outhandle'}; 102 my $indexed_doc = 1;103 79 104 80 # only output this document if it is one to be indexed 105 81 return if ($doc_obj->get_doc_type() ne "indexed_doc"); 106 82 107 # see if this document belongs to this subcollection 108 foreach my $indexexp (@{$self->{'indexexparr'}}) { 109 $indexed_doc = 0; 110 my ($field, $exp, $options) = split /\//, $indexexp; 111 if (defined ($field) && defined ($exp)) { 112 my ($bool) = $field =~ /^(.)/; 113 $field =~ s/^.// if $bool eq '!'; 114 if ($field =~ /^filename$/i) { 115 $field = $doc_obj->get_source_filename(); 116 } else { 117 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field); 118 } 119 next unless defined $field; 120 if ($bool eq '!') { 121 if ($options =~ /^i$/i) { 
122 if ($field !~ /$exp/i) {$indexed_doc = 1; last;} 123 } else { 124 if ($field !~ /$exp/) {$indexed_doc = 1; last;} 125 } 126 } else { 127 if ($options =~ /^i$/i) { 128 if ($field =~ /$exp/i) {$indexed_doc = 1; last;} 129 } else { 130 if ($field =~ /$exp/) {$indexed_doc = 1; last;} 131 } 132 } 133 } 134 } 135 136 # if this doc is so far in the sub collection, and we have lang info, 137 # now we check the languages to see if it matches 138 if($indexed_doc && defined $self->{'lang_meta'}) { 139 $indexed_doc = 0; 140 my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'}); 141 if (defined $field) { 142 foreach my $lang (@{$self->{'langarr'}}) { 143 my ($bool) = $lang =~ /^(.)/; 144 if ($bool eq '!') { 145 $lang =~ s/^.//; 146 if ($field !~ /$lang/) { 147 $indexed_doc = 1; last; 148 } 149 } else { 150 if ($field =~ /$lang/) { 151 $indexed_doc = 1; last; 152 } 153 } 154 } 155 } 156 } 83 my $indexed_doc = $self->is_subcollection_doc($doc_obj); 157 84 158 85 # this is another document … … 219 146 # but we need to put the section tag placeholders in there so the 220 147 # sections match up with gdbm db 221 if ($indexed_doc) { 222 #if ($self->{'indexing_text'}) { 223 # $text .= "$parastarttag"; # only add para tags for indexing 224 # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text 225 # } 226 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 227 foreach my $field (split (/,/, $fields)) { 228 # only deal with this field if it doesn't start with top or 229 # this is the first section 230 my $real_field = $field; 231 if (!($real_field =~ s/^top//) || ($doc_section == 1)) { 232 my $new_text = ""; 233 my $tmp_text = ""; 234 if ($real_field eq "text") { 148 if (!$indexed_doc) { 149 $text .= "\n</$sectiontag>\n" if ($sectiontag ne ""); 150 $section = $doc_obj->get_next_section($section); 151 next; 152 } 153 154 $self->{'num_bytes'} += 
$doc_obj->get_text_length ($section); 155 foreach my $field (split (/;/, $fields)) { 156 # only deal with this field if it doesn't start with top or 157 # this is the first section 158 my $real_field = $field; 159 next if (($real_field =~ s/^top//) && ($doc_section != 1)); 160 161 my $new_text = ""; 162 my $tmp_text = ""; 163 164 # we get allfields by default - do nothing 165 if ($real_field eq "allfields") { 166 167 } 168 # metadata - output all metadata we know about except gsdl stuff 169 elsif ($real_field eq "metadata") { 170 my $shortname = ""; 171 my $metadata = $doc_obj->get_all_metadata ($section); 172 foreach $pair (@$metadata) { 173 my ($mfield, $mvalue) = (@$pair); 174 # check fields here, maybe others dont want - change to use dontindex!! 175 if ($mfield ne "Identifier" 176 && $mfield !~ /^gsdl/ 177 && $mfield ne "classifytype" 178 && $mfield ne "assocfilepath" 179 && defined $mvalue && $mvalue ne "") { 180 181 if (defined $self->{'indexfieldmap'}->{$mfield}) { 182 $shortname = $self->{'indexfieldmap'}->{$mfield}; 183 } 184 else { 185 $shortname = $self->create_shortname($mfield); 186 $self->{'indexfieldmap'}->{$mfield} = $shortname; 187 $self->{'indexfieldmap'}->{$shortname} = 1; 188 } 189 $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n"; 190 if (!defined $self->{'indexfields'}->{$mfield}) { 191 $self->{'indexfields'}->{$mfield} = 1; 192 } 193 } 194 } 195 196 } 197 else { 198 #individual metadata and or text specified - could be a comma separated list 199 my $shortname=""; 200 if (defined $self->{'indexfieldmap'}->{$real_field}) { 201 $shortname = $self->{'indexfieldmap'}->{$real_field}; 202 } 203 else { 204 $shortname = $self->create_shortname($real_field); 205 $self->{'indexfieldmap'}->{$real_field} = $shortname; 206 $self->{'indexfieldmap'}->{$shortname} = 1; 207 } 208 209 my @metadata_list = (); 210 foreach $submeta (split /,/, $real_field) { 211 if ($submeta eq "text") { 235 212 if ($self->{'indexing_text'}) { 
#tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed 236 $new_text .= "$parastarttag< TXindex=\"1\">\n";213 $new_text .= "$parastarttag<$shortname index=\"1\">\n"; 237 214 $tmp_text .= $doc_obj->get_text ($section); 238 if ($parastarttag =~ "") { 215 if ($parastarttag ne "") { 216 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">"); 217 } else { 239 218 # we don't want to individually tag each paragraph if not doing para indexing 240 219 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, ""); 241 } else { 242 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paraendtag$parastarttag<TX index=\"1\">"); 220 } 221 $new_text .= "$tmp_text</$shortname>$paraendtag\n"; 222 } 223 else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment 224 $tmp_text .= $doc_obj->get_text ($section); 225 &ghtml::htmlsafe($tmp_text); 226 $new_text .= $tmp_text; 227 228 } 229 } 230 else { 231 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)}; 232 if ($self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) { 233 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) { 234 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)}); 243 235 } 244 245 $new_text .= "$tmp_text</TX>$paraendtag\n"; 246 #if (!defined $self->{'indexfields'}->{'TextOnly'}) { 247 #$self->{'indexfields'}->{'TextOnly'} = 1; 248 #} 249 } 250 else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment 251 if ($self->{'store_text'}) { 252 $tmp_text .= $doc_obj->get_text ($section); 253 &ghtml::htmlsafe($tmp_text); 254 $new_text .= $tmp_text; 255 } 256 
} 257 } else { # metadata field 258 if ($real_field eq "allfields") { #ignore 259 } 260 elsif ($real_field eq "metadata") { # insert all metadata 261 #except gsdl stuff 262 my $shortname = ""; 263 my $metadata = $doc_obj->get_all_metadata ($section); 264 foreach $pair (@$metadata) { 265 my ($mfield, $mvalue) = (@$pair); 266 # check fields here, maybe others dont want - change to use dontindex!! 267 if ($mfield ne "Identifier" 268 && $mfield !~ /^gsdl/ 269 && $mfield ne "classifytype" 270 && $mfield ne "assocfilepath" 271 && defined $mvalue && $mvalue ne "") { 272 273 if (defined $self->{'indexfieldmap'}->{$mfield}) { 274 $shortname = $self->{'indexfieldmap'}->{$mfield}; 275 } 276 else { 277 $shortname = $self->create_shortname($mfield); 278 $self->{'indexfieldmap'}->{$mfield} = $shortname; 279 $self->{'indexfieldmap'}->{$shortname} = 1; 280 } 281 $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n"; 282 if (!defined $self->{'indexfields'}->{$mfield}) { 283 $self->{'indexfields'}->{$mfield} = 1; 284 } 285 } 286 } 287 288 } 289 else { #individual metadata specified 290 my $shortname=""; 291 #if (!defined $self->{'indexfields'}->{$real_field}) { 292 #$self->{'indexfields'}->{$real_field} = 1; 293 #} 294 if (defined $self->{'indexfieldmap'}->{$real_field}) { 295 $shortname = $self->{'indexfieldmap'}->{$real_field}; 296 } 297 else { 298 $shortname = $self->create_shortname($real_field); 299 $self->{'indexfieldmap'}->{$real_field} = $shortname; 300 $self->{'indexfieldmap'}->{$shortname} = 1; 301 } 302 my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)}; 303 if ($self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) { 304 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) { 305 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), 
$real_field)}); 306 } 307 } 308 foreach $item (@section_metadata) { 309 $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n"; 310 # remove entities 311 $new_text =~ s/&\w{1,10};//g; 312 } 313 } 314 236 } 237 push (@metadata_list, @section_metadata); 315 238 } 316 317 # filter the text 318 $self->filter_text ($field, $new_text); 319 $self->{'num_processed_bytes'} += length ($new_text); 320 $text .= "$new_text"; 239 } 240 foreach my $item (@metadata_list) { 241 $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n"; 242 # remove entities 243 $new_text =~ s/&\w{1,10};//g; 244 # remove & 245 $new_text =~ s/&//g; 321 246 } 322 247 } 323 } # if (indexed_doc) 324 248 249 # filter the text 250 $self->filter_text ($field, $new_text); 251 $self->{'num_processed_bytes'} += length ($new_text); 252 $text .= "$new_text"; 253 } # foreach field 254 325 255 $text .= "\n</$sectiontag>\n" if ($sectiontag ne ""); 326 256 … … 328 258 } #while defined section 329 259 print $handle "$text\n$documentendtag"; 260 #print STDOUT "$text\n$documentendtag"; 330 261 } 331 262 -
trunk/gsdl/perllib/mgppbuilder.pm
r10477 r10961 148 148 my $indexes = $self->{'collect_cfg'}->{'indexes'}; 149 149 $self->{'collect_cfg'}->{'indexes'} = []; 150 push (@{$self->{'collect_cfg'}->{'indexes'}}, join(' ,', @$indexes));150 push (@{$self->{'collect_cfg'}->{'indexes'}}, join(';', @$indexes)); 151 151 } 152 152 … … 160 160 161 161 my $self = shift (@_); 162 163 # we don't do anything if we don't want compressed text 164 return if $self->{'no_text'}; 165 162 166 my ($textindex) = @_; 163 167 … … 225 229 $self->{'buildproc'}->set_index ($textindex); 226 230 $self->{'buildproc'}->set_indexing_text (0); 227 if ($self->{'no_text'}) {228 $self->{'buildproc'}->set_store_text(0);229 } else {230 $self->{'buildproc'}->set_store_text(1);231 }232 231 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 233 232 $self->{'buildproc'}->set_levels ($self->{'levels'}); … … 497 496 $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language); 498 497 $self->{'buildproc'}->set_indexing_text (1); 499 $self->{'buildproc'}->set_store_text(1);500 498 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 501 499 $self->{'buildproc'}->set_levels ($self->{'levels'}); … … 765 763 my $parts = $field; 766 764 $parts =~ s/:.*$//; 767 my @fs = split(',', $parts); 765 # ************* 766 my @fs = split(';', $parts); 768 767 foreach my $f(@fs) { 769 768 if (!defined $specifiedfields->{$f}) { … … 799 798 $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap; 800 799 $self->{'build_cfg'}->{'indexfields'} = \@indexfields; 800 801 801 } 802 802 -
trunk/gsdl/perllib/mgppbuildproc.pm
r10474 r10961 100 100 #} 101 101 102 sub remove_gtlt { 103 my $self =shift(@_); 104 my ($text, $para) = @_; 105 $text =~s/[<>]//g; 106 return "$para$text$para"; 107 } 108 109 sub process_tags { 110 my $self = shift(@_); 111 my ($text, $para) = @_; 112 if ($text =~ /^p\b/i) { 113 return $para; 114 } 115 return ""; 116 } 117 118 sub preprocess_text { 119 my $self = shift (@_); 120 my ($text, $strip_html, $para) = @_; 121 # at this stage, we do not do paragraph tags unless have strip_html - 122 # it will result in a huge mess of non-xml 123 return unless $strip_html; 124 125 my $new_text = $text; 126 127 # if we have <pre> tags, we can have < > inside them, need to delete 128 # the <> before stripping tags 129 $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse; 130 131 if ($para eq "") { 132 # just remove all tags 133 $new_text =~ s/<[^>]*>//gs; 134 } else { 135 # strip all tags except <p> tags which get turned into $para 136 $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse; 137 138 } 139 return $new_text; 140 } 102 141 #this function strips the html tags from the doc if ($strip_html) and 103 142 # if ($para) replaces <p> with <Paragraph> tags. … … 105 144 #assumes that <pre> and </pre> have no spaces, and removes all < and > inside 106 145 #these tags 107 sub preprocess_text {146 sub preprocess_text_old_and_slow { 108 147 my $self = shift (@_); 109 148 my ($text, $strip_html, $para) = @_; … … 178 217 my ($paratag) = ""; 179 218 180 if ($self->{'levels'}->{'paragraph'}) { 219 # paragraph tags will only be used for indexing (can't retrieve 220 # paragraphs), and can ony be used if we are stripping HTML tags 221 if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) { 181 222 if ($self->{'strip_html'}) { 182 223 $paratag = "<". $level_map{'paragraph'} . 
">"; … … 192 233 # get the text for this document 193 234 my $section = $doc_obj->get_top_section(); 235 194 236 while (defined $section) { 195 237 # update a few statistics … … 198 240 $text .= "$sectiontag"; 199 241 200 if ($indexed_doc) { 201 if ($self->{'indexing_text'}) { 202 $text .= "$paratag"; # only add para tags for indexing 203 # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text 204 } 205 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 206 foreach my $field (split (/,/, $fields)) { 207 # only deal with this field if it doesn't start with top or 208 # this is the first section 209 my $real_field = $field; 210 if (!($real_field =~ s/^top//) || ($doc_section == 1)) { 211 my $new_text = ""; 212 my $tmp_text = ""; 213 if ($real_field eq "text") { 242 if (!$indexed_doc) { 243 # we are not actually indexing anything for this document, 244 # but we want to keep the section numbers the same, so we just 245 # output section tags for each section (which is done above) 246 $section = $doc_obj->get_next_section($section); 247 next; 248 } 249 250 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 251 252 foreach my $field (split (/;/, $fields)) { 253 # only deal with this field if it doesn't start with top or 254 # this is the first section 255 my $real_field = $field; 256 next if (($real_field =~ s/^top//) && ($doc_section != 1)); 257 258 my $new_text = ""; 259 my $tmp_text = ""; 260 261 # we get allfields by default - do nothing 262 if ($real_field eq "allfields") { 263 264 } 265 266 # metadata - output all metadata we know about except gsdl stuff 267 elsif ($real_field eq "metadata") { 268 my $shortname = ""; 269 my $metadata = $doc_obj->get_all_metadata ($section); 270 foreach my $pair (@$metadata) { 271 my ($mfield, $mvalue) = (@$pair); 272 # check fields here, maybe others dont want - change to use dontindex!! 
273 if ($mfield ne "Identifier" 274 && $mfield !~ /^gsdl/ 275 && $mfield ne "classifytype" 276 && $mfield ne "assocfilepath" 277 && defined $mvalue && $mvalue ne "") { 278 279 if (defined $self->{'indexfieldmap'}->{$mfield}) { 280 $shortname = $self->{'indexfieldmap'}->{$mfield}; 281 } 282 else { 283 $shortname = $self->create_shortname($mfield); 284 $self->{'indexfieldmap'}->{$mfield} = $shortname; 285 $self->{'indexfieldmap'}->{$shortname} = 1; 286 } 287 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n"; 288 if (!defined $self->{'indexfields'}->{$mfield}) { 289 $self->{'indexfields'}->{$mfield} = 1; 290 } 291 } 292 } 293 } 294 else { 295 #individual metadata and or text specified - could be 296 # a comma separated list 297 my $shortname=""; 298 if (defined $self->{'indexfieldmap'}->{$real_field}) { 299 $shortname = $self->{'indexfieldmap'}->{$real_field}; 300 } 301 else { 302 $shortname = $self->create_shortname($real_field); 303 $self->{'indexfieldmap'}->{$real_field} = $shortname; 304 $self->{'indexfieldmap'}->{$shortname} = 1; 305 } 306 my @metadata_list = (); 307 foreach $submeta (split /,/, $real_field) { 308 if ($submeta eq "text") { 214 309 if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed 215 $new_text .= "$paratag< TX>\n";310 $new_text .= "$paratag<$shortname>\n"; 216 311 $tmp_text .= $doc_obj->get_text ($section); 217 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX>");218 219 $new_text .= "$tmp_text</TX>\n";220 #if (!defined $self->{'indexfields'}->{'TextOnly'}) {221 #$self->{'indexfields'}->{'TextOnly'} = 1;222 #}312 if ($paratag ne "") { 313 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>"); 314 } else { 315 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, ""); 316 } 317 $new_text .= "$tmp_text</$shortname>\n"; 223 318 } 224 319 else { # leave html stuff in, 
and dont add Paragraph tags - never retrieve paras at the moment 225 $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};320 $new_text .= $doc_obj->get_text ($section); 226 321 } 227 } else { # metadata field 228 if ($real_field eq "allfields") { #ignore 229 } 230 elsif ($real_field eq "metadata") { # insert all metadata 231 #except gsdl stuff 232 my $shortname = ""; 233 my $metadata = $doc_obj->get_all_metadata ($section); 234 foreach my $pair (@$metadata) { 235 my ($mfield, $mvalue) = (@$pair); 236 # check fields here, maybe others dont want - change to use dontindex!! 237 if ($mfield ne "Identifier" 238 && $mfield !~ /^gsdl/ 239 && $mfield ne "classifytype" 240 && $mfield ne "assocfilepath" 241 && defined $mvalue && $mvalue ne "") { 242 243 if (defined $self->{'indexfieldmap'}->{$mfield}) { 244 $shortname = $self->{'indexfieldmap'}->{$mfield}; 245 } 246 else { 247 $shortname = $self->create_shortname($mfield); 248 $self->{'indexfieldmap'}->{$mfield} = $shortname; 249 $self->{'indexfieldmap'}->{$shortname} = 1; 250 } 251 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n"; 252 if (!defined $self->{'indexfields'}->{$mfield}) { 253 $self->{'indexfields'}->{$mfield} = 1; 254 } 255 } 256 } 257 258 } 259 else { #individual metadata specified 260 my $shortname=""; 261 #if (!defined $self->{'indexfields'}->{$real_field}) { 262 #$self->{'indexfields'}->{$real_field} = 1; 263 #} 264 if (defined $self->{'indexfieldmap'}->{$real_field}) { 265 $shortname = $self->{'indexfieldmap'}->{$real_field}; 266 } 267 else { 268 $shortname = $self->create_shortname($real_field); 269 $self->{'indexfieldmap'}->{$real_field} = $shortname; 270 $self->{'indexfieldmap'}->{$shortname} = 1; 271 } 272 my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)}; 273 if ($self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) { 274 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && 
$self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) { 275 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)}); 276 } 277 } 278 foreach my $item (@section_metadata) { 279 #foreach my $item (@{$doc_obj->get_metadata ($section, $real_field)}) { 280 $new_text .= "$paratag<$shortname>$item</$shortname>\n"; 322 } else { 323 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)}; 324 if ($self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) { 325 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) { 326 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)}); 281 327 } 282 328 } 283 329 push (@metadata_list, @section_metadata); 284 330 } 285 286 # filter the text 287 $self->filter_text ($field, $new_text); 288 289 $self->{'num_processed_bytes'} += length ($new_text); 290 $text .= "$new_text"; 291 } 292 } 293 } # if (indexed_doc) 294 331 } 332 foreach my $item (@metadata_list) { 333 $new_text .= "$paratag<$shortname>$item</$shortname>\n"; 334 } 335 } 336 337 # filter the text 338 $self->filter_text ($field, $new_text); 339 340 $self->{'num_processed_bytes'} += length ($new_text); 341 $text .= "$new_text"; 342 } # foreach field 343 295 344 $section = $doc_obj->get_next_section($section); 296 } # while defined section345 } # while defined section 297 346 print $handle "$text\n$documentendtag"; 298 347
Note: See TracChangeset for help on using the changeset viewer.