- Timestamp:
- 2008-10-22T13:27:39+13:00 (16 years ago)
- File:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/lucenebuildproc.pm
r17287 r17568 86 86 my $ldoc_level = $levels->{'document'}; 87 87 my $lsec_level = $levels->{'section'}; 88 my $lpar_level = $levels->{'paragraph'};88 #my $lpar_level = $levels->{'paragraph'}; 89 89 90 90 my $gs2_id = ""; … … 107 107 my $sec_tag_name = ""; 108 108 if ($lsec_level) 109 109 { 110 110 $sec_tag_name = $mgppbuildproc::level_map{'section'}; 111 } 112 my ($parastarttag) = ""; 113 my ($paraendtag) = ""; 114 if ($self->{'levels'}->{'paragraph'}) 115 { 116 if ($self->{'strip_html'}) 117 { 118 $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">"; 119 $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">"; 120 } 121 else 122 { 123 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n"; 124 } 125 } 111 } 126 112 127 113 my $doc_section = 0; # just for this document … … 132 118 my $section = $doc_obj->get_top_section(); 133 119 while (defined $section) 134 120 { 135 121 # update a few statistics 136 122 $doc_section++; … … 155 141 156 142 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 157 foreach my $field (split (/;/, $fields)) 158 { 143 144 # has the user added a 'metadata' index? 145 my $all_metadata_specified = 0; 146 # which fields have already been indexed? (same as fields, but in a map) 147 my $specified_fields = {}; 148 149 # do we have an allfields index?? 
150 my $allfields_index = 0; 151 # collect up all the text for it in here 152 my $allfields_text = ""; 153 foreach my $field (split (/;/, $fields)) { 154 if ($field eq "allfields") { 155 $allfields_index = 1; 156 } elsif ($field eq "metadata") { 157 $all_metadata_specified = 1; 158 } 159 } 160 161 foreach my $field (split (/;/, $fields)) { 162 159 163 # only deal with this field if it doesn't start with top or 160 164 # this is the first section 161 165 my $real_field = $field; 162 166 next if (($real_field =~ s/^top//) && ($doc_section != 1)); 163 164 my $new_text = ""; 165 my $tmp_text = ""; 166 167 # If allfields is requested add all metadata fields and text as 168 # belonging to the ZZ field 169 if ($real_field eq "allfields") { 170 # Text first - no html nor paragraph tags 171 $new_text .= "$parastarttag<ZZ index=\"1\">\n"; 172 $tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, ""); 173 &ghtml::htmlsafe($tmp_text); 174 $new_text .= "$tmp_text</ZZ>$paraendtag\n"; 175 # Then Metadata 176 my $metadata = $doc_obj->get_all_metadata ($section); 177 foreach my $pair (@$metadata) { 178 my ($mfield, $mvalue) = (@$pair); 179 &ghtml::htmlsafe($mvalue); 180 # check fields here, maybe others dont want - change to use dontindex!! 181 if ($mfield ne "Identifier" 182 && $mfield !~ /^gsdl/ 183 && $mfield ne "classifytype" 184 && $mfield ne "assocfilepath" 185 && defined $mvalue && $mvalue ne "") { 186 $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n"; 187 } 188 if (!defined $self->{'indexfields'}->{$mfield}) { 189 $self->{'indexfields'}->{$mfield} = 1; 190 } 191 } 167 168 # process these two later 169 next if ($real_field eq "allfields" || $real_field eq "metadata"); 170 171 #individual metadata and or text specified - could be a comma separated list 172 $specified_fields->{$real_field} = 1; 173 my $shortname=""; 174 my $new_field = 0; # have we found a new field name? 
175 if (defined $self->{'indexfieldmap'}->{$real_field}) { 176 $shortname = $self->{'indexfieldmap'}->{$real_field}; 192 177 } 193 # metadata - output all metadata we know about except gsdl stuff 194 elsif ($real_field eq "metadata" || $real_field eq "allfields") { 195 my $shortname = ""; 196 my $metadata = $doc_obj->get_all_metadata ($section); 197 foreach my $pair (@$metadata) { 198 my ($mfield, $mvalue) = (@$pair); 199 &ghtml::htmlsafe($mvalue); 200 # check fields here, maybe others dont want - change to use dontindex!! 201 if ($mfield ne "Identifier" 202 && $mfield !~ /^gsdl/ 203 && $mfield ne "classifytype" 204 && $mfield ne "assocfilepath" 205 && defined $mvalue && $mvalue ne "") { 206 207 if (defined $self->{'indexfieldmap'}->{$mfield}) { 208 $shortname = $self->{'indexfieldmap'}->{$mfield}; 178 else { 179 $shortname = $self->create_shortname($real_field); 180 $new_field = 1; 181 } 182 183 my @metadata_list = (); # put any metadata values in here 184 my $section_text = ""; # put the text in here 185 foreach my $submeta (split /,/, $real_field) { 186 if ($submeta eq "text") { 187 # no point in indexing text more than once 188 if ($section_text eq "") { 189 $section_text = $doc_obj->get_text($section); 190 if ($self->{'indexing_text'}) { 191 # we always strip html 192 $section_text = $self->preprocess_text($section_text, 1, ""); 209 193 } 210 else { 211 $shortname = $self->create_shortname($mfield); 212 $self->{'indexfieldmap'}->{$mfield} = $shortname; 213 $self->{'indexfieldmap'}->{$shortname} = 1; 214 } 215 $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n"; 216 if (!defined $self->{'indexfields'}->{$mfield}) { 217 $self->{'indexfields'}->{$mfield} = 1; 194 else { 195 # leave html stuff in, but escape the tags 196 &ghtml::htmlsafe($section_text); 218 197 } 219 198 } 220 199 } 221 }222 else {223 #individual metadata and or text specified - could be a comma separated list224 my $shortname="";225 if (defined 
$self->{'indexfieldmap'}->{$real_field}) {226 $shortname = $self->{'indexfieldmap'}->{$real_field};227 }228 200 else { 229 $shortname = $self->create_shortname($real_field); 201 # its a metadata element 202 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)}; 203 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) { 204 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) { 205 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)}); 206 } 207 } 208 push (@metadata_list, @section_metadata); 209 } 210 } # for each field in this one index 211 212 # now we add the text and/or metadata into new_text 213 if ($section_text ne "" || scalar(@metadata_list)) { 214 my $new_text = ""; 215 216 if ($section_text ne "") { 217 $new_text .= "$section_text "; 218 } 219 220 foreach my $item (@metadata_list) { 221 &ghtml::htmlsafe($item); 222 $new_text .= "$item "; 223 } 224 225 if ($allfields_index) { 226 $allfields_text .= $new_text; 227 } 228 229 $new_text = "<$shortname index=\"1\">$new_text</$shortname>"; 230 # filter the text 231 $new_text = $self->filter_text ($field, $new_text); 232 $self->{'num_processed_bytes'} += length ($new_text); 233 234 $text .= "$new_text"; 235 236 if ($new_field) { 237 # we need to add to the list in indexfields 238 230 239 $self->{'indexfieldmap'}->{$real_field} = $shortname; 231 240 $self->{'indexfieldmap'}->{$shortname} = 1; 232 241 } 233 234 my @metadata_list = (); 235 foreach my $submeta (split /,/, $real_field) { 236 if ($submeta eq "text") { 237 my $section_text = $doc_obj->get_text($section); 238 if ($self->{'indexing_text'}) { 239 # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML 240 $new_text .= "$parastarttag<$shortname index=\"1\">\n"; 241 if 
($parastarttag ne "") { 242 $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">"); 243 } 244 else { 245 # we don't want to individually tag each paragraph if not doing para indexing 246 $section_text = $self->preprocess_text($section_text, 1, ""); 247 } 248 $new_text .= "$section_text</$shortname>$paraendtag\n"; 249 } 250 else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment 251 $tmp_text .= $doc_obj->get_text ($section); 252 &ghtml::htmlsafe($tmp_text); 253 $new_text .= $tmp_text; 254 } 255 } 256 else { 257 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)}; 258 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) { 259 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) { 260 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)}); 261 } 262 } 263 push (@metadata_list, @section_metadata); 264 } 265 } 266 foreach my $item (@metadata_list) { 267 &ghtml::htmlsafe($item); 268 $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n"; 269 } 242 243 } 244 245 } # foreach field 246 247 248 if ($all_metadata_specified) { 249 250 my $new_text = ""; 251 my $shortname = ""; 252 my $metadata = $doc_obj->get_all_metadata ($section); 253 foreach my $pair (@$metadata) { 254 my ($mfield, $mvalue) = (@$pair); 255 # no value 256 next unless defined $mvalue && $mvalue ne ""; 257 # we have already indexed this 258 next if defined ($specified_fields->{$mfield}); 259 # check fields here, maybe others dont want - change to use dontindex!! 
260 next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath"); 261 next if ($mfield =~ /^gsdl/); 262 263 &ghtml::htmlsafe($mvalue); 264 265 if (defined $self->{'indexfieldmap'}->{$mfield}) { 266 $shortname = $self->{'indexfieldmap'}->{$mfield}; 267 } 268 else { 269 $shortname = $self->create_shortname($mfield); 270 $self->{'indexfieldmap'}->{$mfield} = $shortname; 271 $self->{'indexfieldmap'}->{$shortname} = 1; 272 } 273 $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n"; 274 if ($allfields_index) { 275 $allfields_text .= "$mvalue "; 276 } 277 278 if (!defined $self->{'indexfields'}->{$mfield}) { 279 $self->{'indexfields'}->{$mfield} = 1; 280 } 281 270 282 } 271 283 # filter the text 272 $new_text = $self->filter_text ($field, $new_text); 284 $new_text = $self->filter_text ("metadata", $new_text); 285 273 286 $self->{'num_processed_bytes'} += length ($new_text); 274 275 287 $text .= "$new_text"; 276 } # foreach field 277 288 289 290 } 291 292 if ($allfields_index) { 293 # add the index name mapping 294 $self->{'indexfieldmap'}->{"allfields"} = "ZZ"; 295 $self->{'indexfieldmap'}->{"ZZ"} = 1; 296 297 my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n"; 298 # filter the text 299 $new_text = $self->filter_text ("allfields", $new_text); 300 301 $self->{'num_processed_bytes'} += length ($new_text); 302 $text .= "$new_text"; 303 } 304 278 305 $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne ""); 279 306
Note: See TracChangeset for help on using the changeset viewer.