Changeset 22037


Ignore:
Timestamp:
2010-05-05T14:53:53+12:00 (14 years ago)
Author:
davidb
Message:

Manifest file processing upgraded to support OIDs. When a manifest entry specifies an OID, the code uses the archiveinf-doc database to look up the source and associated files used by that document.

Location:
main/trunk/greenstone2
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/export.pl

    r21664 r22037  
    488488    print STDERR "<export>\n" if $gli;
    489489   
    490     my $manifest_lookup = new manifest();
     490    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    491491    if ($manifest ne "") { 
    492492    my $manifest_filename = $manifest;
  • main/trunk/greenstone2/bin/script/import.pl

    r22011 r22037  
    491491    print STDERR "<Import>\n" if $gli;
    492492   
    493     my $manifest_lookup = new manifest();
     493    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    494494    if ($manifest ne "") { 
    495495    my $manifest_filename = $manifest;
     
    617617        $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
    618618        $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
    619        
    620        
     619                 
    621620        foreach my $df (@deleted_files) {
    622621            next if ($df =~ m/^$gsdl_tmp_area/);
     
    624623           
    625624            push(@filtered_deleted_files,$df);
    626         }
     625        }       
    627626       
    628        
     627
    629628        @deleted_files = @filtered_deleted_files;
    630629       
     
    661660    else
    662661    {
    663     # process any files marked for importing
    664     foreach my $file (keys %{$manifest_lookup->{'import'}}) {
    665         &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
    666     }
     662    #
     663    # 1. Process delete files first
     664    #
    667665
    668666    my @deleted_files = keys %{$manifest_lookup->{'delete'}};
    669667    my @full_deleted_files = ();
    670668
    671     foreach my $df (@deleted_files) {
    672         my $full_df = &util::filename_cat($importdir,$df);
     669    # ensure all filenames are absolute
     670    foreach my $df (@deleted_files) {       
     671        my $full_df =
     672        (&util::filename_is_absolute($df))
     673        ? $df
     674        : &util::filename_cat($importdir,$df);
     675
    673676        push(@full_deleted_files,$full_df);
    674677    }
    675 
     678   
     679    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
    676680    &inexport::mark_docs_for_deletion($archive_info,{},
    677681                      \@full_deleted_files,
    678682                      $archivedir, $verbosity, "delete");
     683
     684
     685    #
     686    # 2. Now files for reindexing
     687    #
     688
     689    my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
     690    my @full_reindex_files = ();
     691
     692    # ensure all filenames are absolute
     693    foreach my $rf (@reindex_files) {       
     694        my $full_rf =
     695        (&util::filename_is_absolute($rf))
     696        ? $rf
     697        : &util::filename_cat($importdir,$rf);
     698
     699        push(@full_reindex_files,$full_rf);
     700    }
     701   
     702    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
     703    &inexport::mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
     704
     705    # And now ensure the new version of the file is processed by the
     706    # appropriate plugin
     707    foreach my $full_rf (@full_reindex_files) {
     708        &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli);
     709    }
     710
     711
     712    #
     713    # 3. Now finally any new files
     714    #
     715
     716    foreach my $file (keys %{$manifest_lookup->{'index'}}) {
     717        &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
     718    }
     719
     720
    679721    }
    680722
  • main/trunk/greenstone2/perllib/manifest.pm

    r18441 r22037  
    11package manifest;
    22
    3 use XMLParser;
    43use strict;
    54no strict 'refs'; # allow filehandles to be variables and viceversa
    65
     6use XMLParser;
     7use dbutil;
     8
    79our $self;
    810
    911sub new {
    1012    my ($class) = shift (@_);
     13    my ($infodbtype,$archivedir) = @_;
    1114
    1215    $self = {} ;
     
    1518    $self->{'reindex'} = {};
    1619    $self->{'delete'} = {};
     20
     21    my $arcinfo_doc_filename
     22    = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
     23
     24    if (-e $arcinfo_doc_filename) {
     25    # Only store the infodb-doc filename if it exists
     26    # If it doesn't exist then this means the collection has not been
     27    #   built yet (or else the archives folder has been deleted).
     28    #   Either way we have no way to look up which files
     29    #   are associated with an OID.  If we encounter an OID
     30    #   tag later on, we will use the fact that this field is
     31    #   not defined to issue a warning
     32
     33    $self->{'_arcinfo-doc-filename'} = $arcinfo_doc_filename;
     34    $self->{'_infodbtype'} = $infodbtype;
     35    }
    1736
    1837    return bless $self, $class;
     
    89108    my ($expat, $element) = @_;
    90109
    91     if ($element eq "Filename")
    92     {
    93     $self->{'filename'} = "";
     110    if (($element eq "Filename") || ($element eq "OID"))
     111    {
     112    $self->{'item-val'} = "";
    94113    }
    95114    elsif ($element eq "Manifest") {
     
    99118    if (defined($self->{'file-type'}))
    100119    {
    101         print STDERR "Warning: Malformed XML manifest ($element nested inside " . $self->{'file-type'} . ")\n";
    102     }
    103 
    104     $self->{'file-type'} = $element;
     120        print STDERR "Warning: Malformed XML manifest\n";
     121        print STDERR "         Unrecognized element $element nested inside " . $self->{'file-type'} . ".\n";
     122    }
     123    else {
     124        my $filetype = lc($element);
     125        $self->{'file-type'} = $filetype;
     126        if (!defined $self->{$filetype}) {
     127        print STDERR "Warning: <$element> is not one of the registered tags for manifest format.\n";
     128        }
     129    }
     130
    105131    }
    106132}
     
    114140    if ($element eq "Filename")
    115141    {
    116     $self->{lc($self->{'file-type'})}->{$self->{'filename'}} = 1;
    117     $self->{'filename'} = undef;
     142    my $filetype = $self->{'file-type'};
     143    my $filename  = $self->{'item-val'};
     144
     145    $self->{$filetype}->{$filename} = 1;
     146    $self->{'item-val'} = undef;
     147    }
     148    elsif ($element eq "OID") {
     149    # look up src and assoc filenames used by this doc oid
     150
     151    my $filetype = $self->{'file-type'};
     152    my $oid  = $self->{'item-val'};
     153
     154    if (defined $self->{'_infodbtype'}) {
     155       
     156
     157        my $infodbtype = $self->{'_infodbtype'};
     158        my $arcinfo_doc_filename = $self->{'_arcinfo-doc-filename'};
     159       
     160        my $doc_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
     161
     162        my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string);
     163       
     164        my $doc_source_file = $doc_rec->{'src-file'}->[0];
     165        my $assoc_files = $doc_rec->{'assoc-file'};
     166        my @all_files = ($doc_source_file,@$assoc_files);
     167       
     168        foreach my $filename (@all_files) {
     169       
     170        if (!&util::filename_is_absolute($filename)) {
     171            $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
     172        }
     173
     174        $self->{$filetype}->{$filename} = 1;
     175        }
     176    }
     177    else {
     178        print STDERR "Warning: No archiveinf-doc database in archives directory.\n";
     179        print STDERR "         Unable to look up source files that constitute document $oid.\n";
     180    }
     181
     182    $self->{'item-val'} = undef;
    118183    }
    119184    else
     
    129194    my ($expat) = @_;
    130195
    131     if (defined $self->{'filename'}) {
     196    if (defined $self->{'item-val'}) {
    132197    my $text = $_;
    133198    chomp($text);
     
    136201    $text =~ s/\s+$//; 
    137202   
    138     $self->{'filename'} .= $text if ($text !~ m/^\s*$/);
     203    $self->{'item-val'} .= $text if ($text !~ m/^\s*$/);
    139204    }
    140205}
     
    152217    my ($expat) = @_;
    153218
     219    if (defined $self->{'import'}) {
     220    print STDERR "Warning: <Import> tag is deprecated.\n";
     221    print STDERR "         Processing data as if it were tagged as <Index>\n";
     222    $self->{'index'} = $self->{'import'};
     223    }
     224
    154225}
    155226
  • main/trunk/greenstone2/perllib/plugin.pm

    r21618 r22037  
    222222sub remove_some {
    223223    my ($pluginfo, $infodbtype, $archivedir, $deleted_files) = @_;
    224     print STDERR "in remove some\n";
    225224    return if (scalar(@$deleted_files)==0);
    226225    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);
Note: See TracChangeset for help on using the changeset viewer.