Changeset 18456


Ignore:
Timestamp:
2009-02-03T09:48:19+13:00 (15 years ago)
Author:
davidb
Message:

Additions to support the deleting of documents from the index. Only works for indexers that support incremental building, e.g. lucene

Location:
gsdl/trunk
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/bin/script/import.pl

    r18440 r18456  
    6363use plugout;
    6464use manifest;
     65use inexport;
    6566use util;
    6667use scriptutil;
     
    534535    if ($manifest eq "") {
    535536    # Load in list of files in import folder from last import (if present)
    536     $archive_info->load_import_filelist ($arcinfo_src_filename);
     537    $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    537538    }
    538539
     
    571572    # global blocking pass may set up some metadata
    572573    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
    573     # Can now work out which files were deleted
    574 
    575     # First convert all files to absolute form
    576     # This is to support the situation where the import folder is not
    577     # the default
    578 
    579     my $prev_all_files = $archive_info->{'import_filelist'};
    580     foreach my $prev_file (keys %$prev_all_files) {
    581 
    582         if (!&util::filename_is_absolute($prev_file)) {
    583         my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
    584         delete $prev_all_files->{$prev_file};
    585         $prev_all_files->{$full_prev_file} = 1;
    586         }
    587     }
    588 
    589     # Figure out which are the new files, existing files and so
    590     # by implication the files from the previous import that are not
    591     # there any more => mark them for deletion
    592     foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
    593 
    594         my $full_curr_file = $curr_file;
    595 
    596         if (!&util::filename_is_absolute($curr_file)) {
    597         # add in import dir to make absolute
    598         $full_curr_file = &util::filename_cat($importdir,$curr_file);
    599         }
    600 
    601 ##      print STDERR "**** Checking $curr_file\n";
    602 
    603         # figure of if new file or not
    604         if (defined $prev_all_files->{$full_curr_file}) {
    605         # had it before
    606         $block_hash->{'existing_files'}->{$curr_file} = 1;
    607         # Now remove it, so by end of loop only the files
    608         # that need deleting are left
    609 
    610         delete $prev_all_files->{$full_curr_file}
    611         }
    612         else {
    613         $block_hash->{'new_files'}->{$curr_file} = 1;
    614         }
    615 
    616         delete $block_hash->{'all_files'}->{$curr_file};
    617     }
    618 
    619     print STDERR "Delete files:\n  ";
    620 
    621     my @delete_files = keys %$prev_all_files;
    622     print STDERR join("\n  ",@delete_files), "\n";
     574    # Can now work out which files were new, already existed, and have
     575    # been deleted
     576
     577    &inexport::new_vs_old_import_diff($archive_info,$block_hash,$importdir);
     578   
     579    my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
     580    if (scalar(@deleted_files>0)) {
     581        print STDERR "Delete files:\n  ";
     582        print STDERR join("\n  ",@deleted_files), "\n";
     583    }
     584
     585    my @new_files = sort keys %{$block_hash->{'new_files'}};
     586    if (scalar(@new_files>0)) {
     587        print STDERR "New files:\n  ";
     588        print STDERR join("\n  ",@new_files), "\n";
     589    }
     590
     591    &inexport::mark_docs_for_deletion($archive_info,\@deleted_files,$archivedir,
     592        $verbosity);
    623593
    624594    &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    625595    }
    626     else
     596                   else
    627597    {
    628598    # process any files marked for importing
     
    631601    }
    632602
    633     # record files marked for deletion in arcinfo
    634     foreach my $file (keys %{$manifest_lookup->{'delete'}}) {
    635         # use 'archiveinf-src' GDBM file to look up all the OIDs
    636         # this file is used in (note in most cases, it's just one OID)
    637 
    638         # An improvement would be to have the record read
    639         # into a hash array
    640         # gdbmRecordToHash
    641 
    642         my $gdbm_val
    643         = &GDBMUtil::gdbmDatabaseGet($arcinfo_src_filename,$file);
    644 
    645         my @oids = ($gdbm_val =~ m/^<oid>(.*)$/gm);
    646         foreach my $oid (@oids) {
    647 
    648         # find out if it's an assoc file or main doc
    649 
    650         # archiveinf-doc, lookup $oid
    651         # if "doc-file"
    652         #   mark it for deletion
    653 
    654         # else (assoc file)
    655         #  mark all for re-indexing
    656 
    657         # Now delete file
    658         }
    659 
    660         # $archive_info->add_info($OID,$doc_xml_file,"D");
    661     }
     603    my @deleted_files = keys %{$manifest_lookup->{'delete'}};
     604
     605    &inexport::mark_docs_for_deletion($archive_info,\@deleted_files,$archivedir);
    662606    }
    663607
  • gsdl/trunk/bin/script/lucene_passes.pl

    r18440 r18456  
    132132        $output_filename = "";
    133133    }
    134     elsif ($line =~ m/<\/Delete>\s*$/) {
    135         if ($mode eq "index") {
    136         $doc_xml =~ s/\n+/\n/g;
    137 
    138         # notify lucene indexer
    139         print PIPEOUT "$doc_xml";
    140         }
    141         $doc_xml = "";
    142     }
    143134    }
    144135}
     
    150141#  *  the command line of the java wrapper.
    151142#  *
    152 #  *  @author John Rowe, DL Consulting
    153143#  */
    154144sub main
  • gsdl/trunk/perllib/GDBMUtils.pm

    r17285 r18456  
    1717
    1818sub gdbmDatabaseGet
    19   {
     19{
    2020    my ($database, $oid) = @_;
    2121
     
    2929    # Done
    3030    return $value;
    31   }
     31}
     32
     33sub gdbmRecordToHash
     34{
     35    my ($database, $oid) = @_;
     36
     37    my $val = gdbmDatabaseGet($database,$oid);
     38
     39    my $rec = {};
     40
     41    while ($val =~ m/^<(.*?)>(.*)$/mg) {
     42    my $metaname = $1;
     43    my $metavalue = $2;
     44
     45    if (!defined $rec->{$metaname}) {
     46        $rec->{$metaname} = [ $metavalue ];
     47    }
     48    else {
     49        push(@{$rec->{$metaname}},$metavalue);
     50    }
     51    }
     52
     53    return $rec;
     54}
     55
    3256
    3357sub gdbmDatabaseAppend
    34   {
     58{
    3559    my ($database, $oid, $value) = @_;
    3660
     
    7195    print STDERR "#Set document\ncmd: gdbmset$exe \"$database\" \"$oid\"\n" if $debug;
    7296
    73     # Think it would be clearer if this funcctionality was done
    74     # by a separate executable, e.g. gdbmremove
    75     `gdbmset$exe "$database" "$oid"`;
     97    `gdbmdel$exe "$database" "$oid"`;
    7698}
    7799
  • gsdl/trunk/perllib/arcinfo.pm

    r18441 r18456  
    129129
    130130    foreach my $file ( keys %$infodb_map ) {
    131     $self->{'import_filelist'}->{$file} = 1;
    132     }
    133 }
    134 
    135 
    136 sub load_import_filelist {
     131    $self->{'prev_import_filelist'}->{$file} = 1;
     132    }
     133}
     134
     135
     136sub load_prev_import_filelist {
    137137    my $self = shift (@_);
    138138    my ($filename) = @_;
  • gsdl/trunk/perllib/basebuildproc.pm

    r17579 r18456  
    3535use doc;
    3636use docproc;
    37 use strict; no strict 'subs';
     37use strict;
     38no strict 'subs';
     39no strict 'refs';
    3840use util;
    3941
     
    366368
    367369
    368 sub infodb {
    369     my $self = shift (@_);
    370     my ($doc_obj, $filename) = @_;
     370sub infodbedit {
     371    my $self = shift (@_);
     372    my ($doc_obj, $filename, $edit_mode) = @_;
    371373
    372374    # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
     
    396398    }
    397399
    398     #add this document to the browse structure
    399     push(@{$self->{'doclist'}},$doc_obj->get_OID())
    400     unless ($doctype eq "classification");
     400    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
     401    #add this document to the browse structure
     402    push(@{$self->{'doclist'}},$doc_obj->get_OID())
     403        unless ($doctype eq "classification");
     404    }
     405    else {
     406    # delete => remove this doc from browse structure
     407    my $del_doc_oid = $doc_obj->get_OID();
     408
     409    my @filtered_doc_list = ();
     410    foreach my $oid (@{$self->{'doclist'}}) {
     411        push(@filtered_doc_list,$oid) if ($oid ne $del_doc_oid);
     412    }
     413    $self->{'doclist'} = \@filtered_doc_list;
     414    }
     415
    401416
    402417    # classify this document
    403     &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
    404 
    405     # this is another document
    406     $self->{'num_docs'} += 1 unless ($doctype eq "classification");
     418    &classify::classify_doc ($self->{'classifiers'}, $doc_obj, $edit_mode);
     419
     420    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
     421    # this is another document
     422    $self->{'num_docs'} += 1 unless ($doctype eq "classification");
     423    }
     424    else {
     425    # delete
     426    $self->{'num_docs'} -= 1 unless ($doctype eq "classification");
     427    return;
     428    }
    407429
    408430    # is this a paged or a hierarchical document
     
    563585
    564586
     587
     588
     589sub infodb {
     590    my $self = shift (@_);
     591    my ($doc_obj, $filename) = @_;
     592
     593    $self->infodbedit($doc_obj,$filename,"add");
     594}
     595
     596sub infodbreindex {
     597    my $self = shift (@_);
     598    my ($doc_obj, $filename) = @_;
     599
     600    $self->infodbedit($doc_obj,$filename,"reindex");
     601}
     602
     603sub infodbdelete {
     604    my $self = shift (@_);
     605    my ($doc_obj, $filename) = @_;
     606
     607    $self->infodbedit($doc_obj,$filename,"delete");
     608}
     609
     610
    565611sub text {
    566612    my $self = shift (@_);
     
    571617    die "\n";
    572618}
     619
     620sub textreindex
     621{
     622    my $self = shift @_;
     623
     624    my $outhandle = $self->{'outhandle'};
     625    print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n";
     626    if (!$self->is_incremental_capable()) {
     627
     628    print $outhandle "  This operation is only possible with indexing tools with that support\n";
     629    print $outhandle "  incremental building\n";
     630    }
     631    die "\n";
     632}
     633
     634sub textdelete
     635{
     636    my $self = shift @_;
     637
     638    my $outhandle = $self->{'outhandle'};
     639    print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
     640    if (!$self->is_incremental_capable()) {
     641
     642    print $outhandle "  This operation is only possible with indexing tools with that support\n";
     643    print $outhandle "  incremental building\n";
     644    }
     645    die "\n";
     646}
     647
    573648
    574649# should the document be indexed - according to the subcollection and language
     
    692767}
    693768
    694 sub assoc_files() {
     769sub assoc_files
     770{
    695771    my $self = shift (@_);
    696772    my ($doc_obj, $archivedir) = @_;
  • gsdl/trunk/perllib/lucenebuildproc.pm

    r17797 r18456  
    6363
    6464
    65 sub text {
    66     my $self = shift (@_);
    67     my ($doc_obj,$file) = @_;
    68     my $handle = $self->{'output_handle'};
     65sub textedit {
     66    my $self = shift (@_);
     67    my ($doc_obj,$file,$edit_mode) = @_;
     68
     69    my $lucenehandle = $self->{'output_handle'};
    6970    my $outhandle = $self->{'outhandle'};
    7071
     
    7273    return if ($doc_obj->get_doc_type() ne "indexed_doc");
    7374
     75    # skip this document if in "compress-text" mode and asked to delete it
     76    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
     77
    7478    my $indexed_doc = $self->is_subcollection_doc($doc_obj);
    7579
    7680    # this is another document
    77     $self->{'num_docs'} += 1;
     81    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
     82    $self->{'num_docs'} += 1;
     83    }
     84    else {
     85    $self->{'num_docs'} -= 1;
     86    }
    7887
    7988    # get the parameters for the output
     
    8695    my $ldoc_level = $levels->{'document'};
    8796    my $lsec_level = $levels->{'section'};
    88     #my $lpar_level = $levels->{'paragraph'};
    89 
     97
     98    # gs2_id should be depricated #####
    9099    my $gs2_id = "";
    91100    if ($ldoc_level)
     
    102111    }
    103112    my $gs2_docOID = $doc_obj->get_OID();
    104     my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n";
    105     my $documentendtag = "\n</$doc_tag_name>\n";
     113    my $documenttag = undef;
     114    my $documentendtag = undef;
     115
     116    $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
     117    $documentendtag = "\n</$doc_tag_name>\n";
    106118
    107119    my $sec_tag_name = "";
     
    123135    $self->{'num_sections'}++;
    124136
    125     if ($sec_tag_name ne "")
    126     {
    127         my $sec_gs2_id = $self->{'num_sections'};
    128         my $sec_gs2_docOID = $gs2_docOID . "." . $section;
    129         $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n";
    130     }
     137    my $sec_gs2_id = $self->{'num_sections'};
     138    my $sec_gs2_docOID = $gs2_docOID;
     139    $sec_gs2_docOID .= ".$section" if ($section ne "");
    131140
    132141    # if we are doing subcollections, then some docs shouldn't be indexed.
     
    135144    my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
    136145    if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
    137         $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
     146        if ($sec_tag_name ne "") {
     147        $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
     148        $text .= "\n</$sec_tag_name>\n"
     149        }
    138150            $section = $doc_obj->get_next_section($section);
    139151        next;
    140152          }
    141153
    142     $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
     154    if ($sec_tag_name ne "")
     155    {
     156        $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
     157    }
     158
     159    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
     160        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
     161    }
     162    else {
     163        # delete
     164        $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
     165    }
     166
    143167
    144168    # has the user added a 'metadata' index?
     
    233257        # filter the text
    234258        $new_text = $self->filter_text ($field, $new_text);
    235         $self->{'num_processed_bytes'} += length ($new_text);
     259
     260        if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
     261            $self->{'num_processed_bytes'} += length ($new_text);
     262            $text .= "$new_text";
     263        }
     264        else {
     265            # delete
     266            $self->{'num_processed_bytes'} -= length ($new_text);
     267        }
    236268       
    237         $text .= "$new_text";
    238269
    239270        if ($self->{'indexing_text'} && $new_field) {
     
    287318        $new_text = $self->filter_text ("metadata", $new_text);
    288319       
    289         $self->{'num_processed_bytes'} += length ($new_text);
    290         $text .= "$new_text";
    291 
    292        
     320        if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
     321        $self->{'num_processed_bytes'} += length ($new_text);
     322        $text .= "$new_text";
     323        }
     324        else {
     325        # delete
     326        $self->{'num_processed_bytes'} -= length ($new_text);
     327        }       
    293328    }
    294329
     
    302337        $new_text = $self->filter_text ("allfields", $new_text);
    303338       
    304         $self->{'num_processed_bytes'} += length ($new_text);
    305         $text .= "$new_text";
    306     }
    307    
     339        if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
     340        $self->{'num_processed_bytes'} += length ($new_text);
     341        $text .= "$new_text";
     342        }
     343        else {
     344        # delete
     345        $self->{'num_processed_bytes'} -= length ($new_text);
     346        }
     347    }
     348       
    308349    $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
    309350
    310351        $section = $doc_obj->get_next_section($section);
    311     } #while defined section
    312     print $handle "$text\n$documentendtag";
    313     #print STDOUT "$text\n$documentendtag";
    314 }
     352    } # while defined section
     353
     354    print $lucenehandle "$text\n$documentendtag";
     355
     356##    if ($edit_mode eq "delete") {
     357##       print STDERR "$text\n$documentendtag";
     358##    }
     359
     360}
     361
     362sub text {
     363    my $self = shift (@_);
     364    my ($doc_obj,$file) = @_;
     365
     366    $self->textedit($doc_obj,$file,"add");
     367}
     368
     369sub textreindex
     370{
     371    my $self = shift (@_);
     372    my ($doc_obj,$file) = @_;
     373
     374    $self->textedit($doc_obj,$file,"reindex");
     375}
     376
     377sub textdelete
     378{
     379    my $self = shift (@_);
     380    my ($doc_obj,$file) = @_;
     381
     382    $self->textedit($doc_obj,$file,"delete");
     383}
     384
     385
     386
     387
    315388
    316389# /** We make this builder pretend to be a document processor so we can get
     
    4945671;
    495568
     569
  • gsdl/trunk/perllib/plugins/ArchivesInfPlugin.pm

    r18441 r18456  
    8787    my ($self) = @_;
    8888
    89     print STDERR "*** Running ArchivesInf deinit\n";
    90 
    9189    my $archive_info = $self->{'archive_info'};
    9290
    9391    if (defined $archive_info) {
     92    print STDERR "********* have parsed and processed an archive info file\n";
     93
    9494    my $archive_info_filename = $self->{'archive_info_filename'};
    9595
    9696        my $file_list = $archive_info->get_file_list();
    9797
    98     foreach my $subfile (@$file_list) {
     98    foreach my $subfile (@$file_list) {     
    9999        my $doc_oid = $subfile->[1];
    100100
    101101        my $index_status = $archive_info->get_status_info($doc_oid);
     102        print STDERR "*** Updating $doc_oid $index_status\n";
     103
    102104        if ($index_status eq "D") {
    103105        # delete
     
    203205        my $tmp = &util::filename_cat ($file, $subfile->[0]);
    204206        next if $tmp eq $file;
    205        
    206         # We always process the file...
     207
     208        my $doc_oid = $subfile->[1];
     209        my $index_status = $archive_info->get_status_info($doc_oid);
     210
     211        my $curr_mode = $processor->get_mode();
     212        my $new_mode = $curr_mode;
     213
     214        # Start by assuming we want to process the file...
    207215        my $process_file = 1;
    208216
     
    211219        {
    212220            # Check to see if the file needs indexing
    213         my $doc_oid = $subfile->[1];
    214         my $index_status = $archive_info->get_status_info($doc_oid);
    215221        if ($index_status eq "B")
    216222        {
     
    218224            $process_file = 0;
    219225        }
     226        elsif ($index_status eq "D") {
     227            # Need to be delete it from the index.
     228            $new_mode = $curr_mode."delete";
     229            $process_file = 1;
     230        }
     231        elsif ($index_status eq "R") {
     232            # Need to be delete it from the index.
     233            $new_mode = $curr_mode."reindex";
     234            $process_file = 1;
     235        }
     236        }
     237        # ... or we're being asked to delete it (in which case skip it)
     238        elsif ($index_status eq "D") {
     239        # Delete it somehow from archives dir!!
     240        # => get short name, lop off filename, concat archivedir
     241        # move to recyle bin
     242
     243        $process_file = 0;
    220244        }
    221245
    222246        if ($process_file) {
    223247        # note: metadata is not carried on to the next level
     248       
     249        $processor->set_mode($new_mode) if ($new_mode ne $curr_mode);
     250
    224251        $count += &plugin::read ($pluginfo, $base_dir, $tmp, $block_hash, {}, $processor, $maxdocs, ($total_count+$count), $gli);
    225         }
    226 
     252
     253        $processor->set_mode($curr_mode) if ($new_mode ne $curr_mode);
     254        }
    227255    }
    228256
Note: See TracChangeset for help on using the changeset viewer.