Changeset 23133 for main/trunk/greenstone2/perllib/basebuildproc.pm
- Timestamp: 2010-10-13T14:25:54+13:00 (14 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/basebuildproc.pm
r23121 r23133 148 148 # reconstructed docs have no text, just metadata, so we need to 149 149 # remember how many bytes we had initially 150 $self->{'num_bytes'} = $self->{'starting_num_bytes'};151 150 #$self->{'num_bytes'} = $self->{'starting_num_bytes'}; 151 $self->{'num_bytes'} = 0; # we'll store num bytes in db for reconstructed docs. 152 152 $self->{'num_processed_bytes'} = 0; 153 153 } … … 389 389 my $self = shift (@_); 390 390 my ($doc_obj, $filename, $edit_mode) = @_; 391 391 392 392 # only output this document if it is a "indexed_doc" or "info_doc" (database only) document 393 393 my $doctype = $doc_obj->get_doc_type(); 394 394 return if ($doctype ne "indexed_doc" && $doctype ne "info_doc"); 395 395 print STDERR "infodbedit, mode=$edit_mode, ".$doc_obj->get_OID()."\n"; 396 396 my $archivedir = ""; 397 397 if (defined $filename) … … 416 416 } 417 417 418 if (($edit_mode eq "add") || ($edit_mode eq "update")) { 419 #add this document to the browse structure 420 push(@{$self->{'doclist'}},$doc_obj->get_OID()) 421 unless ($doctype eq "classification"); 422 if ($edit_mode eq "update") { 423 $self->{'dont_reconstruct'}->{$doc_obj->get_OID()} = 1; 424 } 425 &classify::classify_doc ($self->{'classifiers'}, $doc_obj); 426 $self->{'num_docs'} += 1 unless ($doctype eq "classification"); 427 } 428 elsif ($edit_mode eq "delete") { 418 if ($edit_mode eq "delete") { 429 419 # record this doc so we don't process the reconstructed doc later 430 my $del_doc_oid = $doc_obj->get_OID();431 $self->{'dont_reconstruct'}->{$del_doc_oid} = 1;420 $self->{'dont_reconstruct'}->{$doc_obj->get_OID()} = 1; 421 # we don't need to do anything else for the info database for a deleted document. The infodb starts from scratch each time, so no deletion is necessary 432 422 # do we need this??? where did num_docs come from, from reconstruction?? 
433 $self->{'num_docs'} -= 1 unless ($doctype eq "classification"); 434 } 435 423 #$self->{'num_docs'} -= 1 unless ($doctype eq "classification"); 424 return; 425 } 426 427 if ($edit_mode eq "update") { 428 # we don't want to process the reconstructed doc later, but we will process this version now. 429 $self->{'dont_reconstruct'}->{$doc_obj->get_OID()} = 1; 430 } 431 432 # rest of code used for add and update. In both cases, we add to the classifiers and to the info database. 433 434 #add this document to the browse structure 435 push(@{$self->{'doclist'}},$doc_obj->get_OID()) 436 unless ($doctype eq "classification"); 437 $self->{'num_docs'} += 1 unless ($doctype eq "classification"); 438 439 if (!defined $filename) { 440 # a reconstructed doc 441 $self->{'num_bytes'} += $doc_obj->get_metadata_element ($doc_obj->get_top_section (), "total_numbytes"); 442 print STDERR "new numbytes = $self->{'num_bytes'}\n"; 443 } 444 # classify the document 445 &classify::classify_doc ($self->{'classifiers'}, $doc_obj); 446 447 # now add all the section to the infodb. 448 436 449 # is this a paged or a hierarchical document 437 450 my ($thistype, $childtype) = $self->get_document_type ($doc_obj); … … 453 466 my %section_infodb = (); 454 467 455 # update a few statistics - how do we do update???? 
456 if (($edit_mode eq "add")) {# || ($edit_mode eq "update")) { 457 458 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 459 $self->{'num_sections'} += 1 unless ($doctype eq "classification"); 460 } 461 elsif ($edit_mode eq "delete") { 462 # delete 463 $self->{'num_bytes'} -= $doc_obj->get_text_length ($section); 464 $self->{'num_sections'} -= 1 unless ($doctype eq "classification"); 465 } 466 468 # update a few statistics 469 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 470 $self->{'num_sections'} += 1 unless ($doctype eq "classification"); 471 467 472 # output the fact that this document is a document (unless doctype 468 473 # has been set to something else from within a plugin … … 472 477 } 473 478 479 if ($first && defined $filename) { 480 # if we are at the top level of the document, and we are not a reconstructed document, set the total_text_length - used to count bytes when we reconstruct later 481 my $length = $doc_obj->get_total_text_length(); 482 $section_infodb{"total_numbytes"} = [ $length ]; 483 } 474 484 # Output whether this node contains text 475 485 # … … 504 514 # special case for URL metadata 505 515 if ($field =~ /^URL$/i) { 506 if (($edit_mode eq "add") || ($edit_mode eq "update")) { 507 508 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] }); 509 } 510 else { 511 # delete 512 &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value); 513 } 514 515 516 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] }); 516 517 } 517 518 518 519 if (!defined $self->{'dontdb'}->{$field}) { 519 520 push(@{$section_infodb{$field}}, $value); … … 590 591 } 591 592 592 if (($edit_mode eq "add") || ($edit_mode eq "update")) { 593 # in case of update, this will overwrite old entry?? 
594 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb); 595 } 596 else { 597 # delete 598 &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID); 599 } 600 601 602 # output a database entry for the document number, except for Lucene (which no longer needs this information) 603 unless (ref($self) eq "lucenebuildproc") 593 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb); 594 595 # output a database entry for the document number, unless we are incremental 596 unless ($self->is_incremental_capable()) 604 597 { 605 if (($edit_mode eq "add") || ($edit_mode eq "update")) { 606 607 if ($self->{'db_level'} eq "document") { 608 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] }); 609 } 610 else { 611 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] }); 612 } 598 print STDERR "outputting db entry for doc number\n"; 599 if ($self->{'db_level'} eq "document") { 600 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] }); 613 601 } 614 602 else { 615 616 if ($self->{'db_level'} eq "document") { 617 &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}); 618 } 619 else { 620 &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}); 621 } 622 623 } 624 } 625 603 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] }); 604 } 605 } 606 626 607 $first = 0; 627 608 $section = $doc_obj->get_next_section($section); 628 609 last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs 629 } 610 } # while defined section 611 612 print STDERR "end of infodb edit\n"; 630 613 } 631 614
Note: See TracChangeset for help on using the changeset viewer.