Changeset 18456
- Timestamp:
- 2009-02-03T09:48:19+13:00 (14 years ago)
- Location:
- gsdl/trunk
- Files:
-
- 7 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/bin/script/import.pl
r18440 r18456 63 63 use plugout; 64 64 use manifest; 65 use inexport; 65 66 use util; 66 67 use scriptutil; … … 534 535 if ($manifest eq "") { 535 536 # Load in list of files in import folder from last import (if present) 536 $archive_info->load_ import_filelist ($arcinfo_src_filename);537 $archive_info->load_prev_import_filelist ($arcinfo_src_filename); 537 538 } 538 539 … … 571 572 # gobal blocking pass may set up some metadata 572 573 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli); 573 # Can now work out which files were deleted 574 575 # First convert all files to absolute form 576 # This is to support the situation where the import folder is not 577 # the default 578 579 my $prev_all_files = $archive_info->{'import_filelist'}; 580 foreach my $prev_file (keys %$prev_all_files) { 581 582 if (!&util::filename_is_absolute($prev_file)) { 583 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file); 584 delete $prev_all_files->{$prev_file}; 585 $prev_all_files->{$full_prev_file} = 1; 586 } 587 } 588 589 # Figure out which are the new files, existing files and so 590 # by implication the files from the previous import that are not 591 # there any more => mark them for deletion 592 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) { 593 594 my $full_curr_file = $curr_file; 595 596 if (!&util::filename_is_absolute($curr_file)) { 597 # add in import dir to make absolute 598 $full_curr_file = &util::filename_cat($importdir,$curr_file); 599 } 600 601 ## print STDERR "**** Checking $curr_file\n"; 602 603 # figure of if new file or not 604 if (defined $prev_all_files->{$full_curr_file}) { 605 # had it before 606 $block_hash->{'existing_files'}->{$curr_file} = 1; 607 # Now remove it, so by end of loop only the files 608 # that need deleting are left 609 610 delete $prev_all_files->{$full_curr_file} 611 } 612 else { 613 $block_hash->{'new_files'}->{$curr_file} = 1; 614 } 615 616 delete $block_hash->{'all_files'}->{$curr_file}; 617 } 618 619 print STDERR "Delete files:\n "; 620 621 my @delete_files = keys %$prev_all_files; 622 print STDERR join("\n ",@delete_files), "\n"; 574 # Can now work out which files were new, already existed, and have 575 # been deleted 576 577 &inexport::new_vs_old_import_diff($archive_info,$block_hash,$importdir); 578 579 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}}; 580 if (scalar(@deleted_files>0)) { 581 print STDERR "Delete files:\n "; 582 print STDERR join("\n ",@deleted_files), "\n"; 583 } 584 585 my @new_files = sort keys %{$block_hash->{'new_files'}}; 586 if (scalar(@new_files>0)) { 587 print STDERR "New files:\n "; 588 print STDERR join("\n ",@new_files), "\n"; 589 } 590 591 &inexport::mark_docs_for_deletion($archive_info,\@deleted_files,$archivedir, 592 $verbosity); 623 593 624 594 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 625 595 } 626 596 else 627 597 { 628 598 # process any files marked for importing … … 631 601 } 632 602 633 # record files marked for deletion in arcinfo 634 foreach my $file (keys %{$manifest_lookup->{'delete'}}) { 635 # use 'archiveinf-src' GDBM file to look up all the OIDs 636 # this file is used in (note in most cases, it's just one OID) 637 638 # An improvement would be to have the record read 639 # into a hash array 640 # gdbmRecordToHash 641 642 my $gdbm_val 643 = &GDBMUtil::gdbmDatabaseGet($arcinfo_src_filename,$file); 644 645 my @oids = ($gdbm_val =~ m/^<oid>(.*)$/gm); 646 foreach my $oid (@oids) { 647 648 # find out if it's an assoc file or main doc 649 650 # archiveinf-doc, lookup $oid 651 # if "doc-file" 652 # mark it for deletion 653 654 # else (assoc file) 655 # mark all for re-indexing 656 657 # Now delete file 658 } 659 660 # $archive_info->add_info($OID,$doc_xml_file,"D"); 661 } 603 my @deleted_files = keys %{$manifest_lookup->{'delete'}}; 604 605 &inexport::mark_docs_for_deletion($archive_info,\@deleted_files,$archivedir); 662 606 } 663 607 -
gsdl/trunk/bin/script/lucene_passes.pl
r18440 r18456 132 132 $output_filename = ""; 133 133 } 134 elsif ($line =~ m/<\/Delete>\s*$/) {135 if ($mode eq "index") {136 $doc_xml =~ s/\n+/\n/g;137 138 # notify lucene indexer139 print PIPEOUT "$doc_xml";140 }141 $doc_xml = "";142 }143 134 } 144 135 } … … 150 141 # * the command line of the java wrapper. 151 142 # * 152 # * @author John Rowe, DL Consulting153 143 # */ 154 144 sub main -
gsdl/trunk/perllib/GDBMUtils.pm
r17285 r18456 17 17 18 18 sub gdbmDatabaseGet 19 19 { 20 20 my ($database, $oid) = @_; 21 21 … … 29 29 # Done 30 30 return $value; 31 } 31 } 32 33 sub gdbmRecordToHash 34 { 35 my ($database, $oid) = @_; 36 37 my $val = gdbmDatabaseGet($database,$oid); 38 39 my $rec = {}; 40 41 while ($val =~ m/^<(.*?)>(.*)$/mg) { 42 my $metaname = $1; 43 my $metavalue = $2; 44 45 if (!defined $rec->{$metaname}) { 46 $rec->{$metaname} = [ $metavalue ]; 47 } 48 else { 49 push(@{$rec->{$metaname}},$metavalue); 50 } 51 } 52 53 return $rec; 54 } 55 32 56 33 57 sub gdbmDatabaseAppend 34 58 { 35 59 my ($database, $oid, $value) = @_; 36 60 … … 71 95 print STDERR "#Set document\ncmd: gdbmset$exe \"$database\" \"$oid\"\n" if $debug; 72 96 73 # Think it would be clearer if this funcctionality was done 74 # by a separate executable, e.g. gdbmremove 75 `gdbmset$exe "$database" "$oid"`; 97 `gdbmdel$exe "$database" "$oid"`; 76 98 } 77 99 -
gsdl/trunk/perllib/arcinfo.pm
r18441 r18456 129 129 130 130 foreach my $file ( keys %$infodb_map ) { 131 $self->{' import_filelist'}->{$file} = 1;132 } 133 } 134 135 136 sub load_ import_filelist {131 $self->{'prev_import_filelist'}->{$file} = 1; 132 } 133 } 134 135 136 sub load_prev_import_filelist { 137 137 my $self = shift (@_); 138 138 my ($filename) = @_; -
gsdl/trunk/perllib/basebuildproc.pm
r17579 r18456 35 35 use doc; 36 36 use docproc; 37 use strict; no strict 'subs'; 37 use strict; 38 no strict 'subs'; 39 no strict 'refs'; 38 40 use util; 39 41 … … 366 368 367 369 368 sub infodb {369 my $self = shift (@_); 370 my ($doc_obj, $filename ) = @_;370 sub infodbedit { 371 my $self = shift (@_); 372 my ($doc_obj, $filename, $edit_mode) = @_; 371 373 372 374 # only output this document if it is a "indexed_doc" or "info_doc" (database only) document … … 396 398 } 397 399 398 #add this document to the browse structure 399 push(@{$self->{'doclist'}},$doc_obj->get_OID()) 400 unless ($doctype eq "classification"); 400 if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 401 #add this document to the browse structure 402 push(@{$self->{'doclist'}},$doc_obj->get_OID()) 403 unless ($doctype eq "classification"); 404 } 405 else { 406 # delete => remove this doc from browse structure 407 my $del_doc_oid = $doc_obj->get_OID(); 408 409 my @filtered_doc_list = (); 410 foreach my $oid (@{$self->{'doclist'}}) { 411 push(@filtered_doc_list,$oid) if ($oid ne $del_doc_oid); 412 } 413 $self->{'doclist'} = \@filtered_doc_list; 414 } 415 401 416 402 417 # classify this document 403 &classify::classify_doc ($self->{'classifiers'}, $doc_obj); 404 405 # this is another document 406 $self->{'num_docs'} += 1 unless ($doctype eq "classification"); 418 &classify::classify_doc ($self->{'classifiers'}, $doc_obj, $edit_mode); 419 420 if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 421 # this is another document 422 $self->{'num_docs'} += 1 unless ($doctype eq "classification"); 423 } 424 else { 425 # delete 426 $self->{'num_docs'} -= 1 unless ($doctype eq "classification"); 427 return; 428 } 407 429 408 430 # is this a paged or a hierarchical document … … 563 585 564 586 587 588 589 sub infodb { 590 my $self = shift (@_); 591 my ($doc_obj, $filename) = @_; 592 593 $self->infodbedit($doc_obj,$filename,"add"); 594 } 595 596 sub infodbreindex { 597 my $self = shift (@_); 598 my ($doc_obj, $filename) = @_; 599 600 $self->infodbedit($doc_obj,$filename,"reindex"); 601 } 602 603 sub infodbdelete { 604 my $self = shift (@_); 605 my ($doc_obj, $filename) = @_; 606 607 $self->infodbedit($doc_obj,$filename,"delete"); 608 } 609 610 565 611 sub text { 566 612 my $self = shift (@_); … … 571 617 die "\n"; 572 618 } 619 620 sub textreindex 621 { 622 my $self = shift @_; 623 624 my $outhandle = $self->{'outhandle'}; 625 print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n"; 626 if (!$self->is_incremental_capable()) { 627 628 print $outhandle " This operation is only possible with indexing tools with that support\n"; 629 print $outhandle " incremental building\n"; 630 } 631 die "\n"; 632 } 633 634 sub textdelete 635 { 636 my $self = shift @_; 637 638 my $outhandle = $self->{'outhandle'}; 639 print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n"; 640 if (!$self->is_incremental_capable()) { 641 642 print $outhandle " This operation is only possible with indexing tools with that support\n"; 643 print $outhandle " incremental building\n"; 644 } 645 die "\n"; 646 } 647 573 648 574 649 # should the document be indexed - according to the subcollection and language … … 692 767 } 693 768 694 sub assoc_files() { 769 sub assoc_files 770 { 695 771 my $self = shift (@_); 696 772 my ($doc_obj, $archivedir) = @_; -
gsdl/trunk/perllib/lucenebuildproc.pm
r17797 r18456 63 63 64 64 65 sub text { 66 my $self = shift (@_); 67 my ($doc_obj,$file) = @_; 68 my $handle = $self->{'output_handle'}; 65 sub textedit { 66 my $self = shift (@_); 67 my ($doc_obj,$file,$edit_mode) = @_; 68 69 my $lucenehandle = $self->{'output_handle'}; 69 70 my $outhandle = $self->{'outhandle'}; 70 71 … … 72 73 return if ($doc_obj->get_doc_type() ne "indexed_doc"); 73 74 75 # skip this document if in "compress-text" mode and asked to delete it 76 return if (!$self->get_indexing_text() && ($edit_mode eq "delete")); 77 74 78 my $indexed_doc = $self->is_subcollection_doc($doc_obj); 75 79 76 80 # this is another document 77 $self->{'num_docs'} += 1; 81 if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 82 $self->{'num_docs'} += 1; 83 } 84 else { 85 $self->{'num_docs'} -= 1; 86 } 78 87 79 88 # get the parameters for the output … … 86 95 my $ldoc_level = $levels->{'document'}; 87 96 my $lsec_level = $levels->{'section'}; 88 #my $lpar_level = $levels->{'paragraph'}; 89 97 98 # gs2_id should be depricated ##### 90 99 my $gs2_id = ""; 91 100 if ($ldoc_level) … … 102 111 } 103 112 my $gs2_docOID = $doc_obj->get_OID(); 104 my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n"; 105 my $documentendtag = "\n</$doc_tag_name>\n"; 113 my $documenttag = undef; 114 my $documentendtag = undef; 115 116 $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n"; 117 $documentendtag = "\n</$doc_tag_name>\n"; 106 118 107 119 my $sec_tag_name = ""; … … 123 135 $self->{'num_sections'}++; 124 136 125 if ($sec_tag_name ne "") 126 { 127 my $sec_gs2_id = $self->{'num_sections'}; 128 my $sec_gs2_docOID = $gs2_docOID . "." . $section; 129 $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n"; 130 } 137 my $sec_gs2_id = $self->{'num_sections'}; 138 my $sec_gs2_docOID = $gs2_docOID; 139 $sec_gs2_docOID .= ".$section" if ($section ne ""); 131 140 132 141 # if we are doing subcollections, then some docs shouldn't be indexed. … … 135 144 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section"; 136 145 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) { 137 $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne ""); 146 if ($sec_tag_name ne "") { 147 $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n"; 148 $text .= "\n</$sec_tag_name>\n" 149 } 138 150 $section = $doc_obj->get_next_section($section); 139 151 next; 140 152 } 141 153 142 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 154 if ($sec_tag_name ne "") 155 { 156 $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n"; 157 } 158 159 if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 160 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 161 } 162 else { 163 # delete 164 $self->{'num_bytes'} -= $doc_obj->get_text_length ($section); 165 } 166 143 167 144 168 # has the user added a 'metadata' index? … … 233 257 # filter the text 234 258 $new_text = $self->filter_text ($field, $new_text); 235 $self->{'num_processed_bytes'} += length ($new_text); 259 260 if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 261 $self->{'num_processed_bytes'} += length ($new_text); 262 $text .= "$new_text"; 263 } 264 else { 265 # delete 266 $self->{'num_processed_bytes'} -= length ($new_text); 267 } 236 268 237 $text .= "$new_text";238 269 239 270 if ($self->{'indexing_text'} && $new_field) { … … 287 318 $new_text = $self->filter_text ("metadata", $new_text); 288 319 289 $self->{'num_processed_bytes'} += length ($new_text); 290 $text .= "$new_text"; 291 292 320 if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 321 $self->{'num_processed_bytes'} += length ($new_text); 322 $text .= "$new_text"; 323 } 324 else { 325 # delete 326 $self->{'num_processed_bytes'} -= length ($new_text); 327 } 293 328 } 294 329 … … 302 337 $new_text = $self->filter_text ("allfields", $new_text); 303 338 304 $self->{'num_processed_bytes'} += length ($new_text); 305 $text .= "$new_text"; 306 } 307 339 if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 340 $self->{'num_processed_bytes'} += length ($new_text); 341 $text .= "$new_text"; 342 } 343 else { 344 # delete 345 $self->{'num_processed_bytes'} -= length ($new_text); 346 } 347 } 348 308 349 $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne ""); 309 350 310 351 $section = $doc_obj->get_next_section($section); 311 } #while defined section 312 print $handle "$text\n$documentendtag"; 313 #print STDOUT "$text\n$documentendtag"; 314 } 352 } # while defined section 353 354 print $lucenehandle "$text\n$documentendtag"; 355 356 ## if ($edit_mode eq "delete") { 357 ## print STDERR "$text\n$documentendtag"; 358 ## } 359 360 } 361 362 sub text { 363 my $self = shift (@_); 364 my ($doc_obj,$file) = @_; 365 366 $self->textedit($doc_obj,$file,"add"); 367 } 368 369 sub textreindex 370 { 371 my $self = shift (@_); 372 my ($doc_obj,$file) = @_; 373 374 $self->textedit($doc_obj,$file,"reindex"); 375 } 376 377 sub textdelete 378 { 379 my $self = shift (@_); 380 my ($doc_obj,$file) = @_; 381 382 $self->textedit($doc_obj,$file,"delete"); 383 } 384 385 386 387 315 388 316 389 # /** We make this builder pretend to be a document processor so we can get … … 494 567 1; 495 568 569 -
gsdl/trunk/perllib/plugins/ArchivesInfPlugin.pm
r18441 r18456 87 87 my ($self) = @_; 88 88 89 print STDERR "*** Running ArchivesInf deinit\n";90 91 89 my $archive_info = $self->{'archive_info'}; 92 90 93 91 if (defined $archive_info) { 92 print STDERR "********* have parsed and processed an archive info file\n"; 93 94 94 my $archive_info_filename = $self->{'archive_info_filename'}; 95 95 96 96 my $file_list = $archive_info->get_file_list(); 97 97 98 foreach my $subfile (@$file_list) { 98 foreach my $subfile (@$file_list) { 99 99 my $doc_oid = $subfile->[1]; 100 100 101 101 my $index_status = $archive_info->get_status_info($doc_oid); 102 print STDERR "*** Updating $doc_oid $index_status\n"; 103 102 104 if ($index_status eq "D") { 103 105 # delete … … 203 205 my $tmp = &util::filename_cat ($file, $subfile->[0]); 204 206 next if $tmp eq $file; 205 206 # We always process the file... 207 208 my $doc_oid = $subfile->[1]; 209 my $index_status = $archive_info->get_status_info($doc_oid); 210 211 my $curr_mode = $processor->get_mode(); 212 my $new_mode = $curr_mode; 213 214 # Start by assuming we want to process the file... 207 215 my $process_file = 1; 208 216 … … 211 219 { 212 220 # Check to see if the file needs indexing 213 my $doc_oid = $subfile->[1];214 my $index_status = $archive_info->get_status_info($doc_oid);215 221 if ($index_status eq "B") 216 222 { … … 218 224 $process_file = 0; 219 225 } 226 elsif ($index_status eq "D") { 227 # Need to be delete it from the index. 228 $new_mode = $curr_mode."delete"; 229 $process_file = 1; 230 } 231 elsif ($index_status eq "R") { 232 # Need to be delete it from the index. 233 $new_mode = $curr_mode."reindex"; 234 $process_file = 1; 235 } 236 } 237 # ... or we're being asked to delete it (in which case skip it) 238 elsif ($index_status eq "D") { 239 # Delete it somehow from archives dir!! 240 # => get short name, lop off filename, concat archivedir 241 # move to recyle bin 242 243 $process_file = 0; 220 244 } 221 245 222 246 if ($process_file) { 223 247 # note: metadata is not carried on to the next level 248 249 $processor->set_mode($new_mode) if ($new_mode ne $curr_mode); 250 224 251 $count += &plugin::read ($pluginfo, $base_dir, $tmp, $block_hash, {}, $processor, $maxdocs, ($total_count+$count), $gli); 225 } 226 252 253 $processor->set_mode($curr_mode) if ($new_mode ne $curr_mode); 254 } 227 255 } 228 256
Note:
See TracChangeset
for help on using the changeset viewer.