Context Navigation

← Previous Changeset
Next Changeset →

Changeset 18456

Timestamp:

2009-02-03T09:48:19+13:00 (15 years ago)

Author:

davidb

Message:

Additions to support the deleting of documents from the index. Only works for indexers that support incremental building, e.g. lucene

Location:

Files:

: 7 edited

bin/script/import.pl (modified) (4 diffs)
bin/script/lucene_passes.pl (modified) (2 diffs)
perllib/GDBMUtils.pm (modified) (3 diffs)
perllib/arcinfo.pm (modified) (1 diff)
perllib/basebuildproc.pm (modified) (6 diffs)
perllib/lucenebuildproc.pm (modified) (10 diffs)
perllib/plugins/ArchivesInfPlugin.pm (modified) (4 diffs)

Legend:

: Unmodified
: Added
: Removed

gsdl/trunk/bin/script/import.pl

-              r18440
+              r18456
 use plugout;
 use manifest;
+use inexport;
 use util;
 use scriptutil;
 …
     if ($manifest eq "") {
     # Load in list of files in import folder from last import (if present)
     $archive_info->load_import_filelist ($arcinfo_src_filename);
+    $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
+    }
 …
     # global blocking pass may set up some metadata
     &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
+    # Can now work out which files were deleted
+    # First convert all files to absolute form
+    # This is to support the situation where the import folder is not
+    # the default
+    my $prev_all_files = $archive_info->{'import_filelist'};
+    foreach my $prev_file (keys %$prev_all_files) {
+        if (!&util::filename_is_absolute($prev_file)) {
+        my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
+        delete $prev_all_files->{$prev_file};
+        $prev_all_files->{$full_prev_file} = 1;
+        }
+    }
+    # Figure out which are the new files, existing files and so
+    # by implication the files from the previous import that are not
+    # there any more => mark them for deletion
+    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
+        my $full_curr_file = $curr_file;
+        if (!&util::filename_is_absolute($curr_file)) {
+        # add in import dir to make absolute
+        $full_curr_file = &util::filename_cat($importdir,$curr_file);
+        }
+##      print STDERR "**** Checking $curr_file\n";
+        # figure out if new file or not
+        if (defined $prev_all_files->{$full_curr_file}) {
+        # had it before
+        $block_hash->{'existing_files'}->{$curr_file} = 1;
+        # Now remove it, so by end of loop only the files
+        # that need deleting are left
+        delete $prev_all_files->{$full_curr_file}
+        }
+        else {
+        $block_hash->{'new_files'}->{$curr_file} = 1;
+        }
+        delete $block_hash->{'all_files'}->{$curr_file};
+    }
+    print STDERR "Delete files:\n  ";
+    my @delete_files = keys %$prev_all_files;
+    print STDERR join("\n  ",@delete_files), "\n";
+    # Can now work out which files were new, already existed, and have
+    # been deleted
+    &inexport::new_vs_old_import_diff($archive_info,$block_hash,$importdir);
+    my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
+    if (scalar(@deleted_files>0)) {
+        print STDERR "Delete files:\n  ";
+        print STDERR join("\n  ",@deleted_files), "\n";
+    }
+    my @new_files = sort keys %{$block_hash->{'new_files'}};
+    if (scalar(@new_files>0)) {
+        print STDERR "New files:\n  ";
+        print STDERR join("\n  ",@new_files), "\n";
+    }
+    &inexport::mark_docs_for_deletion($archive_info,\@deleted_files,$archivedir,
+        $verbosity);
     &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
+    }
     else
+                   else
+    {
     # process any files marked for importing
 …
+    }
+    # record files marked for deletion in arcinfo
+    foreach my $file (keys %{$manifest_lookup->{'delete'}}) {
+        # use 'archiveinf-src' GDBM file to look up all the OIDs
+        # this file is used in (note in most cases, it's just one OID)
+        # An improvement would be to have the record read
+        # into a hash array
+        # gdbmRecordToHash
+        my $gdbm_val
+        = &GDBMUtil::gdbmDatabaseGet($arcinfo_src_filename,$file);
+        my @oids = ($gdbm_val =~ m/^<oid>(.*)$/gm);
+        foreach my $oid (@oids) {
+        # find out if it's an assoc file or main doc
+        # archiveinf-doc, lookup $oid
+        # if "doc-file"
+        #   mark it for deletion
+        # else (assoc file)
+        #  mark all for re-indexing
+        # Now delete file
+        }
+        # $archive_info->add_info($OID,$doc_xml_file,"D");
+    }
+    my @deleted_files = keys %{$manifest_lookup->{'delete'}};
+    &inexport::mark_docs_for_deletion($archive_info,\@deleted_files,$archivedir);
+    }

gsdl/trunk/bin/script/lucene_passes.pl

-              r18440
+              r18456
         $output_filename = "";
+    }
-    elsif ($line =~ m/<\/Delete>\s*$/) {
-        if ($mode eq "index") {
-        $doc_xml =~ s/\n+/\n/g;
-        # notify lucene indexer
-        print PIPEOUT "$doc_xml";
+        }
-        $doc_xml = "";
+    }
+    }
+}
 …
 #  *  the command line of the java wrapper.
 #  *
-#  *  @author John Rowe, DL Consulting
 #  */
 sub main

gsdl/trunk/perllib/GDBMUtils.pm

-              r17285
+              r18456
 sub gdbmDatabaseGet
+  {
+{
     my ($database, $oid) = @_;
 …
     # Done
     return $value;
+  }
+}
+# Fetch the record stored under $oid in $database (via gdbmDatabaseGet)
+# and split it into a hash reference.  Every line of the record shaped
+# like "<name>value" contributes 'value' to the array ref kept under
+# 'name'; repeated names accumulate in the order they are matched.
+sub gdbmRecordToHash
+{
+    my ($database, $oid) = @_;
+
+    my $record_text = gdbmDatabaseGet($database,$oid);
+
+    my %fields = ();
+    while ($record_text =~ m/^<(.*?)>(.*)$/mg) {
+        my ($field_name, $field_value) = ($1, $2);
+        # the array is autovivified the first time a field name is seen
+        push(@{$fields{$field_name}}, $field_value);
+    }
+
+    return \%fields;
+}
 sub gdbmDatabaseAppend
+  {
+{
     my ($database, $oid, $value) = @_;
 …
     print STDERR "#Set document\ncmd: gdbmset$exe \"$database\" \"$oid\"\n" if $debug;
+    # Think it would be clearer if this functionality was done
+    # by a separate executable, e.g. gdbmremove
+    `gdbmset$exe "$database" "$oid"`;
+    `gdbmdel$exe "$database" "$oid"`;
+}

gsdl/trunk/perllib/arcinfo.pm

-              r18441
+              r18456
     foreach my $file ( keys %$infodb_map ) {
     $self->{'import_filelist'}->{$file} = 1;
+    }
+}
 sub load_import_filelist {
+    $self->{'prev_import_filelist'}->{$file} = 1;
+    }
+}
+sub load_prev_import_filelist {
     my $self = shift (@_);
     my ($filename) = @_;

gsdl/trunk/perllib/basebuildproc.pm

-              r17579
+              r18456
 use doc;
 use docproc;
+use strict; no strict 'subs';
+use strict;
+no strict 'subs';
+no strict 'refs';
 use util;
 …
 sub infodb {
     my $self = shift (@_);
     my ($doc_obj, $filename) = @_;
+sub infodbedit {
+    my $self = shift (@_);
+    my ($doc_obj, $filename, $edit_mode) = @_;
     # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
 …
+    }
+    #add this document to the browse structure
+    push(@{$self->{'doclist'}},$doc_obj->get_OID())
+    unless ($doctype eq "classification");
+    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
+    #add this document to the browse structure
+    push(@{$self->{'doclist'}},$doc_obj->get_OID())
+        unless ($doctype eq "classification");
+    }
+    else {
+    # delete => remove this doc from browse structure
+    my $del_doc_oid = $doc_obj->get_OID();
+    my @filtered_doc_list = ();
+    foreach my $oid (@{$self->{'doclist'}}) {
+        push(@filtered_doc_list,$oid) if ($oid ne $del_doc_oid);
+    }
+    $self->{'doclist'} = \@filtered_doc_list;
+    }
     # classify this document
+    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
+    # this is another document
+    $self->{'num_docs'} += 1 unless ($doctype eq "classification");
+    &classify::classify_doc ($self->{'classifiers'}, $doc_obj, $edit_mode);
+    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
+    # this is another document
+    $self->{'num_docs'} += 1 unless ($doctype eq "classification");
+    }
+    else {
+    # delete
+    $self->{'num_docs'} -= 1 unless ($doctype eq "classification");
+    return;
+    }
     # is this a paged or a hierarchical document
 …
+# Hand $doc_obj and $filename to infodbedit in "add" mode.
+sub infodb {
+    my ($self, $doc_obj, $filename) = @_;
+    return $self->infodbedit($doc_obj, $filename, "add");
+}
+# Hand $doc_obj and $filename to infodbedit in "reindex" mode.
+sub infodbreindex {
+    my ($self, $doc_obj, $filename) = @_;
+    return $self->infodbedit($doc_obj, $filename, "reindex");
+}
+# Hand $doc_obj and $filename to infodbedit in "delete" mode.
+sub infodbdelete {
+    my ($self, $doc_obj, $filename) = @_;
+    return $self->infodbedit($doc_obj, $filename, "delete");
+}
 sub text {
     my $self = shift (@_);
 …
     die "\n";
+}
+# Placeholder for re-indexing a document's text.  This version only
+# reports, on the 'outhandle' stream, that sub classes must supply their
+# own implementation, and then dies.  When is_incremental_capable()
+# is false it also explains that incremental building is required.
+sub textreindex
+{
+    my $self = shift @_;
+
+    my $outhandle = $self->{'outhandle'};
+    print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n";
+    if (!$self->is_incremental_capable()) {
+        # message wording fixed: previously read "tools with that support"
+        print $outhandle "  This operation is only possible with indexing tools that support\n";
+        print $outhandle "  incremental building\n";
+    }
+    # the newline-terminated message keeps Perl from appending file/line
+    die "\n";
+}
+# Placeholder for removing a document's text from the index.  This
+# version only reports, on the 'outhandle' stream, that sub classes must
+# supply their own implementation, and then dies.  When
+# is_incremental_capable() is false it also explains that incremental
+# building is required.
+sub textdelete
+{
+    my $self = shift @_;
+
+    my $outhandle = $self->{'outhandle'};
+    print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
+    if (!$self->is_incremental_capable()) {
+        # message wording fixed: previously read "tools with that support"
+        print $outhandle "  This operation is only possible with indexing tools that support\n";
+        print $outhandle "  incremental building\n";
+    }
+    # the newline-terminated message keeps Perl from appending file/line
+    die "\n";
+}
 # should the document be indexed - according to the subcollection and language
 …
+}
+sub assoc_files() {
+sub assoc_files
+{
     my $self = shift (@_);
     my ($doc_obj, $archivedir) = @_;

gsdl/trunk/perllib/lucenebuildproc.pm

-              r17797
+              r18456
+sub text {
+    my $self = shift (@_);
+    my ($doc_obj,$file) = @_;
+    my $handle = $self->{'output_handle'};
+sub textedit {
+    my $self = shift (@_);
+    my ($doc_obj,$file,$edit_mode) = @_;
+    my $lucenehandle = $self->{'output_handle'};
     my $outhandle = $self->{'outhandle'};
 …
     return if ($doc_obj->get_doc_type() ne "indexed_doc");
+    # skip this document if in "compress-text" mode and asked to delete it
+    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
     my $indexed_doc = $self->is_subcollection_doc($doc_obj);
     # this is another document
+    $self->{'num_docs'} += 1;
+    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
+    $self->{'num_docs'} += 1;
+    }
+    else {
+    $self->{'num_docs'} -= 1;
+    }
     # get the parameters for the output
 …
     my $ldoc_level = $levels->{'document'};
     my $lsec_level = $levels->{'section'};
+    #my $lpar_level = $levels->{'paragraph'};
+    # gs2_id should be deprecated #####
     my $gs2_id = "";
     if ($ldoc_level)
 …
+    }
     my $gs2_docOID = $doc_obj->get_OID();
+    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n";
+    my $documentendtag = "\n</$doc_tag_name>\n";
+    my $documenttag = undef;
+    my $documentendtag = undef;
+    $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
+    $documentendtag = "\n</$doc_tag_name>\n";
     my $sec_tag_name = "";
 …
     $self->{'num_sections'}++;
+    if ($sec_tag_name ne "")
+    {
+        my $sec_gs2_id = $self->{'num_sections'};
+        my $sec_gs2_docOID = $gs2_docOID . "." . $section;
+        $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n";
+    }
+    my $sec_gs2_id = $self->{'num_sections'};
+    my $sec_gs2_docOID = $gs2_docOID;
+    $sec_gs2_docOID .= ".$section" if ($section ne "");
     # if we are doing subcollections, then some docs shouldn't be indexed.
 …
     my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
     if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
+        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
+        if ($sec_tag_name ne "") {
+        $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
+        $text .= "\n</$sec_tag_name>\n"
+        }
             $section = $doc_obj->get_next_section($section);
         next;
+          }
+    $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
+    if ($sec_tag_name ne "")
+    {
+        $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
+    }
+    if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
+        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
+    }
+    else {
+        # delete
+        $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
+    }
     # has the user added a 'metadata' index?
 …
         # filter the text
         $new_text = $self->filter_text ($field, $new_text);
+        $self->{'num_processed_bytes'} += length ($new_text);
+        if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
+            $self->{'num_processed_bytes'} += length ($new_text);
+            $text .= "$new_text";
+        }
+        else {
+            # delete
+            $self->{'num_processed_bytes'} -= length ($new_text);
+        }
-        $text .= "$new_text";
         if ($self->{'indexing_text'} && $new_field) {
 …
         $new_text = $self->filter_text ("metadata", $new_text);
+        $self->{'num_processed_bytes'} += length ($new_text);
+        $text .= "$new_text";
+        if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
+        $self->{'num_processed_bytes'} += length ($new_text);
+        $text .= "$new_text";
+        }
+        else {
+        # delete
+        $self->{'num_processed_bytes'} -= length ($new_text);
+        }
+    }
 …
         $new_text = $self->filter_text ("allfields", $new_text);
+        $self->{'num_processed_bytes'} += length ($new_text);
+        $text .= "$new_text";
+    }
+        if (($edit_mode eq "add") || ($edit_mode eq "reindex")) {
+        $self->{'num_processed_bytes'} += length ($new_text);
+        $text .= "$new_text";
+        }
+        else {
+        # delete
+        $self->{'num_processed_bytes'} -= length ($new_text);
+        }
+    }
     $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
         $section = $doc_obj->get_next_section($section);
+    } #while defined section
+    print $handle "$text\n$documentendtag";
+    #print STDOUT "$text\n$documentendtag";
+}
+    } # while defined section
+    print $lucenehandle "$text\n$documentendtag";
+##    if ($edit_mode eq "delete") {
+##       print STDERR "$text\n$documentendtag";
+##    }
+}
+# Pass $doc_obj and $file to textedit in "add" mode.
+sub text {
+    my ($self, $doc_obj, $file) = @_;
+    return $self->textedit($doc_obj, $file, "add");
+}
+# Pass $doc_obj and $file to textedit in "reindex" mode.
+sub textreindex
+{
+    my ($self, $doc_obj, $file) = @_;
+    return $self->textedit($doc_obj, $file, "reindex");
+}
+# Pass $doc_obj and $file to textedit in "delete" mode.
+sub textdelete
+{
+    my ($self, $doc_obj, $file) = @_;
+    return $self->textedit($doc_obj, $file, "delete");
+}
 # /** We make this builder pretend to be a document processor so we can get
 …
 ;

gsdl/trunk/perllib/plugins/ArchivesInfPlugin.pm

-              r18441
+              r18456
     my ($self) = @_;
-    print STDERR "*** Running ArchivesInf deinit\n";
     my $archive_info = $self->{'archive_info'};
     if (defined $archive_info) {
+    print STDERR "********* have parsed and processed an archive info file\n";
     my $archive_info_filename = $self->{'archive_info_filename'};
         my $file_list = $archive_info->get_file_list();
     foreach my $subfile (@$file_list) {
+    foreach my $subfile (@$file_list) {
         my $doc_oid = $subfile->[1];
         my $index_status = $archive_info->get_status_info($doc_oid);
+        print STDERR "*** Updating $doc_oid $index_status\n";
         if ($index_status eq "D") {
         # delete
 …
         my $tmp = &util::filename_cat ($file, $subfile->[0]);
         next if $tmp eq $file;
+        # We always process the file...
+        my $doc_oid = $subfile->[1];
+        my $index_status = $archive_info->get_status_info($doc_oid);
+        my $curr_mode = $processor->get_mode();
+        my $new_mode = $curr_mode;
+        # Start by assuming we want to process the file...
         my $process_file = 1;
 …
+        {
             # Check to see if the file needs indexing
-        my $doc_oid = $subfile->[1];
-        my $index_status = $archive_info->get_status_info($doc_oid);
         if ($index_status eq "B")
+        {
 …
             $process_file = 0;
+        }
+        elsif ($index_status eq "D") {
+            # Needs to be deleted from the index.
+            $new_mode = $curr_mode."delete";
+            $process_file = 1;
+        }
+        elsif ($index_status eq "R") {
+            # Needs to be reindexed.
+            $new_mode = $curr_mode."reindex";
+            $process_file = 1;
+        }
+        }
+        # ... or we're being asked to delete it (in which case skip it)
+        elsif ($index_status eq "D") {
+        # Delete it somehow from archives dir!!
+        # => get short name, lop off filename, concat archivedir
+        # move to recycle bin
+        $process_file = 0;
+        }
         if ($process_file) {
         # note: metadata is not carried on to the next level
+        $processor->set_mode($new_mode) if ($new_mode ne $curr_mode);
         $count += &plugin::read ($pluginfo, $base_dir, $tmp, $block_hash, {}, $processor, $maxdocs, ($total_count+$count), $gli);
+        }
+        $processor->set_mode($curr_mode) if ($new_mode ne $curr_mode);
+        }
+    }

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: