Changeset 18456 for gsdl

Show
Ignore:
Timestamp:
03.02.2009 09:48:19 (11 years ago)
Author:
davidb
Message:

Additions to support deleting documents from the index. This only works for indexers that support incremental building, e.g. Lucene.

Location:
gsdl/trunk
Files:
7 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/bin/script/import.pl

    r18440 r18456  
    6363use plugout; 
    6464use manifest; 
     65use inexport; 
    6566use util; 
    6667use scriptutil; 
     
    534535    if ($manifest eq "") { 
    535536    # Load in list of files in import folder from last import (if present) 
    536     $archive_info->load_import_filelist ($arcinfo_src_filename); 
     537    $archive_info->load_prev_import_filelist ($arcinfo_src_filename); 
    537538    } 
    538539 
     
    571572    # global blocking pass may set up some metadata 
    572573    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli); 
    573     # Can now work out which files were deleted 
    574  
    575     # First convert all files to absolute form 
    576     # This is to support the situation where the import folder is not 
    577     # the default 
    578  
    579     my $prev_all_files = $archive_info->{'import_filelist'}; 
    580     foreach my $prev_file (keys %$prev_all_files) { 
    581  
    582         if (!&util::filename_is_absolute($prev_file)) { 
    583         my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file); 
    584         delete $prev_all_files->{$prev_file}; 
    585         $prev_all_files->{$full_prev_file} = 1; 
    586         } 
    587     } 
    588  
    589     # Figure out which are the new files, existing files and so 
    590     # by implication the files from the previous import that are not 
    591     # there any more => mark them for deletion 
    592     foreach my $curr_file (keys %{$block_hash->{'all_files'}}) { 
    593  
    594         my $full_curr_file = $curr_file; 
    595  
    596         if (!&util::filename_is_absolute($curr_file)) { 
    597         # add in import dir to make absolute 
    598         $full_curr_file = &util::filename_cat($importdir,$curr_file); 
    599         } 
    600  
    601 ##      print STDERR "**** Checking $curr_file\n"; 
    602  
    603         # figure of if new file or not 
    604         if (defined $prev_all_files->{$full_curr_file}) { 
    605         # had it before 
    606         $block_hash->{'existing_files'}->{$curr_file} = 1; 
    607         # Now remove it, so by end of loop only the files 
    608         # that need deleting are left 
    609  
    610         delete $prev_all_files->{$full_curr_file} 
    611         } 
    612         else { 
    613         $block_hash->{'new_files'}->{$curr_file} = 1; 
    614         } 
    615  
    616         delete $block_hash->{'all_files'}->{$curr_file}; 
    617     } 
    618  
    619     print STDERR "Delete files:\n  "; 
    620  
    621     my @delete_files = keys %$prev_all_files; 
    622     print STDERR join("\n  ",@delete_files), "\n"; 
     574    # Can now work out which files were new, already existed, and have 
     575    # been deleted 
     576 
     577    &inexport::new_vs_old_import_diff($archive_info,$block_hash,$importdir); 
     578     
     579    my @deleted_files = sort keys %{$block_hash->{'deleted_files'}}; 
     580    if (scalar(@deleted_files>0)) { 
     581        print STDERR "Delete files:\n  "; 
     582        print STDERR join("\n  ",@deleted_files), "\n"; 
     583    } 
     584 
     585    my @new_files = sort keys %{$block_hash->{'new_files'}}; 
     586    if (scalar(@new_files>0)) { 
     587        print STDERR "New files:\n  "; 
     588        print STDERR join("\n  ",@new_files), "\n"; 
     589    } 
     590 
     591    &inexport::mark_docs_for_deletion($archive_info,\@deleted_files,$archivedir, 
     592        $verbosity); 
    623593 
    624594    &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 
    625595    } 
    626     else 
     596                   else 
    627597    { 
    628598    # process any files marked for importing 
     
    631601    } 
    632602 
    633     # record files marked for deletion in arcinfo 
    634     foreach my $file (keys %{$manifest_lookup->{'delete'}}) { 
    635         # use 'archiveinf-src' GDBM file to look up all the OIDs 
    636         # this file is used in (note in most cases, it's just one OID) 
    637  
    638         # An improvement would be to have the record read 
    639         # into a hash array 
    640         # gdbmRecordToHash 
    641  
    642         my $gdbm_val  
    643         = &GDBMUtil::gdbmDatabaseGet($arcinfo_src_filename,$file); 
    644  
    645         my @oids = ($gdbm_val =~ m/^<oid>(.*)$/gm); 
    646         foreach my $oid (@oids) { 
    647  
    648         # find out if it's an assoc file or main doc 
    649  
    650         # archiveinf-doc, lookup $oid 
    651         # if "doc-file"  
    652         #   mark it for deletion 
    653  
    654         # else (assoc file) 
    655         #  mark all for re-indexing 
    656  
    657         # Now delete file  
    658         } 
    659  
    660         # $archive_info->add_info($OID,$doc_xml_file,"D"); 
    661     } 
     603    my @deleted_files = keys %{$manifest_lookup->{'delete'}}; 
     604 
     605    &inexport::mark_docs_for_deletion($archive_info,\@deleted_files,$archivedir); 
    662606    } 
    663607 
  • gsdl/trunk/bin/script/lucene_passes.pl

    r18440 r18456  
    132132        $output_filename = ""; 
    133133    } 
    134     elsif ($line =~ m/<\/Delete>\s*$/) { 
    135         if ($mode eq "index") { 
    136         $doc_xml =~ s/\n+/\n/g; 
    137  
    138         # notify lucene indexer 
    139         print PIPEOUT "$doc_xml"; 
    140         } 
    141         $doc_xml = ""; 
    142     } 
    143134    } 
    144135} 
     
    150141#  *  the command line of the java wrapper. 
    151142#  * 
    152 #  *  @author John Rowe, DL Consulting 
    153143#  */ 
    154144sub main 
  • gsdl/trunk/perllib/GDBMUtils.pm

    r17285 r18456  
    1717 
    1818sub gdbmDatabaseGet 
    19   { 
     19{ 
    2020    my ($database, $oid) = @_; 
    2121 
     
    2929    # Done 
    3030    return $value; 
    31   } 
     31} 
     32 
     33sub gdbmRecordToHash 
     34{ 
     35    my ($database, $oid) = @_; 
     36 
     37    my $val = gdbmDatabaseGet($database,$oid); 
     38 
     39    my $rec = {}; 
     40 
     41    while ($val =~ m/^<(.*?)>(.*)$/mg) { 
     42    my $metaname = $1; 
     43    my $metavalue = $2; 
     44 
     45    if (!defined $rec->{$metaname}) { 
     46        $rec->{$metaname} = [ $metavalue ]; 
     47    } 
     48    else { 
     49        push(@{$rec->{$metaname}},$metavalue); 
     50    } 
     51    } 
     52 
     53    return $rec; 
     54} 
     55 
    3256 
    3357sub gdbmDatabaseAppend 
    34   { 
     58{ 
    3559    my ($database, $oid, $value) = @_; 
    3660 
     
    7195    print STDERR "#Set document\ncmd: gdbmset$exe \"$database\" \"$oid\"\n" if $debug; 
    7296 
    73     # Think it would be clearer if this funcctionality was done 
    74     # by a separate executable, e.g. gdbmremove 
    75     `gdbmset$exe "$database" "$oid"`; 
     97    `gdbmdel$exe "$database" "$oid"`; 
    7698} 
    7799 
  • gsdl/trunk/perllib/arcinfo.pm

    r18441 r18456  
    129129 
    130130    foreach my $file ( keys %$infodb_map ) { 
    131     $self->{'import_filelist'}->{$file} = 1; 
    132     } 
    133 } 
    134  
    135  
    136 sub load_import_filelist { 
     131    $self->{'prev_import_filelist'}->{$file} = 1; 
     132    } 
     133} 
     134 
     135 
     136sub load_prev_import_filelist { 
    137137    my $self = shift (@_); 
    138138    my ($filename) = @_; 
  • gsdl/trunk/perllib/basebuildproc.pm

    r17579 r18456  
    3535use doc; 
    3636use docproc; 
    37 use strict; no strict 'subs'; 
     37use strict;  
     38no strict 'subs'; 
     39no strict 'refs'; 
    3840use util; 
    3941 
     
    366368 
    367369 
    368 sub infodb { 
    369     my $self = shift (@_); 
    370     my ($doc_obj, $filename) = @_; 
     370sub infodbedit { 
     371    my $self = shift (@_); 
     372    my ($doc_obj, $filename, $edit_mode) = @_; 
    371373 
    372374    # only output this document if it is a "indexed_doc" or "info_doc" (database only) document 
     
    396398    } 
    397399 
    398     #add this document to the browse structure 
    399     push(@{$self->{'doclist'}},$doc_obj->get_OID())  
    400     unless ($doctype eq "classification"); 
     400    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 
     401    #add this document to the browse structure 
     402    push(@{$self->{'doclist'}},$doc_obj->get_OID())  
     403        unless ($doctype eq "classification"); 
     404    } 
     405    else { 
     406    # delete => remove this doc from browse structure 
     407    my $del_doc_oid = $doc_obj->get_OID(); 
     408 
     409    my @filtered_doc_list = (); 
     410    foreach my $oid (@{$self->{'doclist'}}) { 
     411        push(@filtered_doc_list,$oid) if ($oid ne $del_doc_oid); 
     412    } 
     413    $self->{'doclist'} = \@filtered_doc_list; 
     414    } 
     415 
    401416 
    402417    # classify this document 
    403     &classify::classify_doc ($self->{'classifiers'}, $doc_obj); 
    404  
    405     # this is another document 
    406     $self->{'num_docs'} += 1 unless ($doctype eq "classification"); 
     418    &classify::classify_doc ($self->{'classifiers'}, $doc_obj, $edit_mode); 
     419 
     420    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 
     421    # this is another document 
     422    $self->{'num_docs'} += 1 unless ($doctype eq "classification"); 
     423    } 
     424    else { 
     425    # delete 
     426    $self->{'num_docs'} -= 1 unless ($doctype eq "classification"); 
     427    return; 
     428    } 
    407429 
    408430    # is this a paged or a hierarchical document 
     
    563585 
    564586 
     587 
     588 
     589sub infodb { 
     590    my $self = shift (@_); 
     591    my ($doc_obj, $filename) = @_; 
     592 
     593    $self->infodbedit($doc_obj,$filename,"add"); 
     594} 
     595 
     596sub infodbreindex { 
     597    my $self = shift (@_); 
     598    my ($doc_obj, $filename) = @_; 
     599 
     600    $self->infodbedit($doc_obj,$filename,"reindex"); 
     601} 
     602 
     603sub infodbdelete { 
     604    my $self = shift (@_); 
     605    my ($doc_obj, $filename) = @_; 
     606 
     607    $self->infodbedit($doc_obj,$filename,"delete"); 
     608} 
     609 
     610 
    565611sub text { 
    566612    my $self = shift (@_); 
     
    571617    die "\n"; 
    572618} 
     619 
     620sub textreindex 
     621{ 
     622    my $self = shift @_; 
     623 
     624    my $outhandle = $self->{'outhandle'}; 
     625    print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n"; 
     626    if (!$self->is_incremental_capable()) { 
     627 
     628    print $outhandle "  This operation is only possible with indexing tools with that support\n"; 
     629    print $outhandle "  incremental building\n"; 
     630    } 
     631    die "\n"; 
     632} 
     633 
     634sub textdelete 
     635{ 
     636    my $self = shift @_; 
     637 
     638    my $outhandle = $self->{'outhandle'}; 
     639    print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n"; 
     640    if (!$self->is_incremental_capable()) { 
     641 
     642    print $outhandle "  This operation is only possible with indexing tools with that support\n"; 
     643    print $outhandle "  incremental building\n"; 
     644    } 
     645    die "\n"; 
     646} 
     647 
    573648 
    574649# should the document be indexed - according to the subcollection and language 
     
    692767} 
    693768 
    694 sub assoc_files() { 
     769sub assoc_files  
     770{ 
    695771    my $self = shift (@_); 
    696772    my ($doc_obj, $archivedir) = @_; 
  • gsdl/trunk/perllib/lucenebuildproc.pm

    r17797 r18456  
    6363 
    6464 
    65 sub text { 
    66     my $self = shift (@_); 
    67     my ($doc_obj,$file) = @_; 
    68     my $handle = $self->{'output_handle'}; 
     65sub textedit { 
     66    my $self = shift (@_); 
     67    my ($doc_obj,$file,$edit_mode) = @_; 
     68 
     69    my $lucenehandle = $self->{'output_handle'}; 
    6970    my $outhandle = $self->{'outhandle'}; 
    7071 
     
    7273    return if ($doc_obj->get_doc_type() ne "indexed_doc"); 
    7374 
     75    # skip this document if in "compress-text" mode and asked to delete it 
     76    return if (!$self->get_indexing_text() && ($edit_mode eq "delete")); 
     77 
    7478    my $indexed_doc = $self->is_subcollection_doc($doc_obj); 
    7579 
    7680    # this is another document 
    77     $self->{'num_docs'} += 1; 
     81    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 
     82    $self->{'num_docs'} += 1; 
     83    } 
     84    else { 
     85    $self->{'num_docs'} -= 1; 
     86    } 
    7887 
    7988    # get the parameters for the output 
     
    8695    my $ldoc_level = $levels->{'document'}; 
    8796    my $lsec_level = $levels->{'section'}; 
    88     #my $lpar_level = $levels->{'paragraph'}; 
    89  
     97 
      98    # gs2_id should be deprecated ##### 
    9099    my $gs2_id = ""; 
    91100    if ($ldoc_level) 
     
    102111    } 
    103112    my $gs2_docOID = $doc_obj->get_OID(); 
    104     my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n"; 
    105     my $documentendtag = "\n</$doc_tag_name>\n"; 
     113    my $documenttag = undef; 
     114    my $documentendtag = undef; 
     115 
     116    $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n"; 
     117    $documentendtag = "\n</$doc_tag_name>\n"; 
    106118 
    107119    my $sec_tag_name = ""; 
     
    123135    $self->{'num_sections'}++; 
    124136 
    125     if ($sec_tag_name ne "") 
    126     { 
    127         my $sec_gs2_id = $self->{'num_sections'}; 
    128         my $sec_gs2_docOID = $gs2_docOID . "." . $section; 
    129         $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n"; 
    130     } 
     137    my $sec_gs2_id = $self->{'num_sections'}; 
     138    my $sec_gs2_docOID = $gs2_docOID; 
     139    $sec_gs2_docOID .= ".$section" if ($section ne ""); 
    131140 
    132141    # if we are doing subcollections, then some docs shouldn't be indexed. 
     
    135144    my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section"; 
    136145    if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) { 
    137         $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne ""); 
     146        if ($sec_tag_name ne "") { 
     147        $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n"; 
     148        $text .= "\n</$sec_tag_name>\n"  
     149        } 
    138150            $section = $doc_obj->get_next_section($section); 
    139151        next; 
    140152          } 
    141153 
    142     $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 
     154    if ($sec_tag_name ne "") 
     155    { 
     156        $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n"; 
     157    } 
     158 
     159    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 
     160        $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 
     161    } 
     162    else { 
     163        # delete 
     164        $self->{'num_bytes'} -= $doc_obj->get_text_length ($section); 
     165    } 
     166 
    143167 
    144168    # has the user added a 'metadata' index? 
     
    233257        # filter the text 
    234258        $new_text = $self->filter_text ($field, $new_text); 
    235         $self->{'num_processed_bytes'} += length ($new_text); 
     259 
     260        if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 
     261            $self->{'num_processed_bytes'} += length ($new_text); 
     262            $text .= "$new_text"; 
     263        } 
     264        else { 
     265            # delete 
     266            $self->{'num_processed_bytes'} -= length ($new_text); 
     267        } 
    236268         
    237         $text .= "$new_text"; 
    238269 
    239270        if ($self->{'indexing_text'} && $new_field) { 
     
    287318        $new_text = $self->filter_text ("metadata", $new_text); 
    288319         
    289         $self->{'num_processed_bytes'} += length ($new_text); 
    290         $text .= "$new_text"; 
    291  
    292          
     320        if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 
     321        $self->{'num_processed_bytes'} += length ($new_text); 
     322        $text .= "$new_text"; 
     323        } 
     324        else { 
     325        # delete 
     326        $self->{'num_processed_bytes'} -= length ($new_text); 
     327        }        
    293328    } 
    294329 
     
    302337        $new_text = $self->filter_text ("allfields", $new_text); 
    303338         
    304         $self->{'num_processed_bytes'} += length ($new_text); 
    305         $text .= "$new_text"; 
    306     } 
    307      
     339        if (($edit_mode eq "add") || ($edit_mode eq "reindex")) { 
     340        $self->{'num_processed_bytes'} += length ($new_text); 
     341        $text .= "$new_text"; 
     342        } 
     343        else { 
     344        # delete 
     345        $self->{'num_processed_bytes'} -= length ($new_text); 
     346        } 
     347    } 
     348         
    308349    $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne ""); 
    309350 
    310351        $section = $doc_obj->get_next_section($section); 
    311     } #while defined section 
    312     print $handle "$text\n$documentendtag"; 
    313     #print STDOUT "$text\n$documentendtag"; 
    314 } 
     352    } # while defined section 
     353 
     354    print $lucenehandle "$text\n$documentendtag"; 
     355 
     356##    if ($edit_mode eq "delete") {  
     357##       print STDERR "$text\n$documentendtag"; 
     358##    } 
     359 
     360} 
     361 
     362sub text { 
     363    my $self = shift (@_); 
     364    my ($doc_obj,$file) = @_; 
     365 
     366    $self->textedit($doc_obj,$file,"add"); 
     367} 
     368 
     369sub textreindex 
     370{ 
     371    my $self = shift (@_); 
     372    my ($doc_obj,$file) = @_; 
     373 
     374    $self->textedit($doc_obj,$file,"reindex"); 
     375} 
     376 
     377sub textdelete 
     378{ 
     379    my $self = shift (@_); 
     380    my ($doc_obj,$file) = @_; 
     381 
     382    $self->textedit($doc_obj,$file,"delete"); 
     383} 
     384 
     385 
     386 
     387 
    315388 
    316389# /** We make this builder pretend to be a document processor so we can get 
     
    4945671; 
    495568 
     569 
  • gsdl/trunk/perllib/plugins/ArchivesInfPlugin.pm

    r18441 r18456  
    8787    my ($self) = @_; 
    8888 
    89     print STDERR "*** Running ArchivesInf deinit\n"; 
    90  
    9189    my $archive_info = $self->{'archive_info'}; 
    9290 
    9391    if (defined $archive_info) { 
     92    print STDERR "********* have parsed and processed an archive info file\n"; 
     93 
    9494    my $archive_info_filename = $self->{'archive_info_filename'}; 
    9595 
    9696        my $file_list = $archive_info->get_file_list(); 
    9797 
    98     foreach my $subfile (@$file_list) { 
     98    foreach my $subfile (@$file_list) {      
    9999        my $doc_oid = $subfile->[1]; 
    100100 
    101101        my $index_status = $archive_info->get_status_info($doc_oid); 
     102        print STDERR "*** Updating $doc_oid $index_status\n"; 
     103 
    102104        if ($index_status eq "D") { 
    103105        # delete 
     
    203205        my $tmp = &util::filename_cat ($file, $subfile->[0]); 
    204206        next if $tmp eq $file; 
    205          
    206         # We always process the file... 
     207 
     208        my $doc_oid = $subfile->[1]; 
     209        my $index_status = $archive_info->get_status_info($doc_oid); 
     210 
     211        my $curr_mode = $processor->get_mode(); 
     212        my $new_mode = $curr_mode; 
     213 
     214        # Start by assuming we want to process the file... 
    207215        my $process_file = 1; 
    208216 
     
    211219        { 
    212220            # Check to see if the file needs indexing 
    213         my $doc_oid = $subfile->[1]; 
    214         my $index_status = $archive_info->get_status_info($doc_oid); 
    215221        if ($index_status eq "B") 
    216222        { 
     
    218224            $process_file = 0; 
    219225        } 
     226        elsif ($index_status eq "D") { 
     227            # Needs to be deleted from the index. 
     228            $new_mode = $curr_mode."delete"; 
     229            $process_file = 1; 
     230        } 
     231        elsif ($index_status eq "R") { 
     232            # Needs to be reindexed. 
     233            $new_mode = $curr_mode."reindex"; 
     234            $process_file = 1; 
     235        } 
     236        } 
     237        # ... or we're being asked to delete it (in which case skip it) 
     238        elsif ($index_status eq "D") { 
     239        # Delete it somehow from archives dir!! 
     240        # => get short name, lop off filename, concat archivedir 
     241        # move to recycle bin 
     242 
     243        $process_file = 0; 
    220244        } 
    221245 
    222246        if ($process_file) { 
    223247        # note: metadata is not carried on to the next level 
     248         
     249        $processor->set_mode($new_mode) if ($new_mode ne $curr_mode); 
     250 
    224251        $count += &plugin::read ($pluginfo, $base_dir, $tmp, $block_hash, {}, $processor, $maxdocs, ($total_count+$count), $gli); 
    225         } 
    226  
     252 
     253        $processor->set_mode($curr_mode) if ($new_mode ne $curr_mode); 
     254        } 
    227255    } 
    228256