Changeset 18456 for gsdl/trunk/perllib/lucenebuildproc.pm
- Timestamp:
- 2009-02-03T09:48:19+13:00 (15 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/lucenebuildproc.pm
r17797 r18456 63 63 64 64 65 sub text { 66 my $self = shift (@_); 67 my ($doc_obj,$file) = @_; 68 my $handle = $self->{'output_handle'}; 65 sub textedit { 66 my $self = shift (@_); 67 my ($doc_obj,$file,$edit_mode) = @_; 68 69 my $lucenehandle = $self->{'output_handle'}; 69 70 my $outhandle = $self->{'outhandle'}; 70 71 … … 72 73 return if ($doc_obj->get_doc_type() ne "indexed_doc"); 73 74 75 # skip this document if in "compress-text" mode and asked to delete it 76 return if (!$self->get_indexing_text() && ($edit_mode eq "delete")); 77 74 78 my $indexed_doc = $self->is_subcollection_doc($doc_obj); 75 79 76 80 # this is another document 77 $self->{'num_docs'} += 1; 81 if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 82 $self->{'num_docs'} += 1; 83 } 84 else { 85 $self->{'num_docs'} -= 1; 86 } 78 87 79 88 # get the parameters for the output … … 86 95 my $ldoc_level = $levels->{'document'}; 87 96 my $lsec_level = $levels->{'section'}; 88 #my $lpar_level = $levels->{'paragraph'}; 89 97 98 # gs2_id should be depricated ##### 90 99 my $gs2_id = ""; 91 100 if ($ldoc_level) … … 102 111 } 103 112 my $gs2_docOID = $doc_obj->get_OID(); 104 my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n"; 105 my $documentendtag = "\n</$doc_tag_name>\n"; 113 my $documenttag = undef; 114 my $documentendtag = undef; 115 116 $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n"; 117 $documentendtag = "\n</$doc_tag_name>\n"; 106 118 107 119 my $sec_tag_name = ""; … … 123 135 $self->{'num_sections'}++; 124 136 125 if ($sec_tag_name ne "") 126 { 127 my $sec_gs2_id = $self->{'num_sections'}; 128 my $sec_gs2_docOID = $gs2_docOID . "." . $section; 129 $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n"; 130 } 137 my $sec_gs2_id = $self->{'num_sections'}; 138 my $sec_gs2_docOID = $gs2_docOID; 139 $sec_gs2_docOID .= ".$section" if ($section ne ""); 131 140 132 141 # if we are doing subcollections, then some docs shouldn't be indexed. … … 135 144 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section"; 136 145 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) { 137 $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne ""); 146 if ($sec_tag_name ne "") { 147 $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n"; 148 $text .= "\n</$sec_tag_name>\n" 149 } 138 150 $section = $doc_obj->get_next_section($section); 139 151 next; 140 152 } 141 153 142 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 154 if ($sec_tag_name ne "") 155 { 156 $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n"; 157 } 158 159 if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 160 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 161 } 162 else { 163 # delete 164 $self->{'num_bytes'} -= $doc_obj->get_text_length ($section); 165 } 166 143 167 144 168 # has the user added a 'metadata' index? … … 233 257 # filter the text 234 258 $new_text = $self->filter_text ($field, $new_text); 235 $self->{'num_processed_bytes'} += length ($new_text); 259 260 if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 261 $self->{'num_processed_bytes'} += length ($new_text); 262 $text .= "$new_text"; 263 } 264 else { 265 # delete 266 $self->{'num_processed_bytes'} -= length ($new_text); 267 } 236 268 237 $text .= "$new_text";238 269 239 270 if ($self->{'indexing_text'} && $new_field) { … … 287 318 $new_text = $self->filter_text ("metadata", $new_text); 288 319 289 $self->{'num_processed_bytes'} += length ($new_text); 290 $text .= "$new_text"; 291 292 320 if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 321 $self->{'num_processed_bytes'} += length ($new_text); 322 $text .= "$new_text"; 323 } 324 else { 325 # delete 326 $self->{'num_processed_bytes'} -= length ($new_text); 327 } 293 328 } 294 329 … … 302 337 $new_text = $self->filter_text ("allfields", $new_text); 303 338 304 $self->{'num_processed_bytes'} += length ($new_text); 305 $text .= "$new_text"; 306 } 307 339 if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 340 $self->{'num_processed_bytes'} += length ($new_text); 341 $text .= "$new_text"; 342 } 343 else { 344 # delete 345 $self->{'num_processed_bytes'} -= length ($new_text); 346 } 347 } 348 308 349 $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne ""); 309 350 310 351 $section = $doc_obj->get_next_section($section); 311 } #while defined section 312 print $handle "$text\n$documentendtag"; 313 #print STDOUT "$text\n$documentendtag"; 314 } 352 } # while defined section 353 354 print $lucenehandle "$text\n$documentendtag"; 355 356 ## if ($edit_mode eq "delete") { 357 ## print STDERR "$text\n$documentendtag"; 358 ## } 359 360 } 361 362 sub text { 363 my $self = shift (@_); 364 my ($doc_obj,$file) = @_; 365 366 $self->textedit($doc_obj,$file,"add"); 367 } 368 369 sub textreindex 370 { 371 my $self = shift (@_); 372 my ($doc_obj,$file) = @_; 373 374 $self->textedit($doc_obj,$file,"reindex"); 375 } 376 377 sub textdelete 378 { 379 my $self = shift (@_); 380 my ($doc_obj,$file) = @_; 381 382 $self->textedit($doc_obj,$file,"delete"); 383 } 384 385 386 387 315 388 316 389 # /** We make this builder pretend to be a document processor so we can get … … 494 567 1; 495 568 569
Note:
See TracChangeset
for help on using the changeset viewer.