Changeset 22037

Show
Ignore:
Timestamp:
05.05.2010 14:53:53 (9 years ago)
Author:
davidb
Message:

Manifest file processing upgraded to support OIDs. The code then uses the archiveinf-doc database to look up which files it uses.

Location:
main/trunk/greenstone2
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/export.pl

    r21664 r22037  
    488488    print STDERR "<export>\n" if $gli; 
    489489     
    490     my $manifest_lookup = new manifest(); 
     490    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir); 
    491491    if ($manifest ne "") {   
    492492    my $manifest_filename = $manifest; 
  • main/trunk/greenstone2/bin/script/import.pl

    r22011 r22037  
    491491    print STDERR "<Import>\n" if $gli; 
    492492     
    493     my $manifest_lookup = new manifest(); 
     493    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir); 
    494494    if ($manifest ne "") {   
    495495    my $manifest_filename = $manifest; 
     
    617617        $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area); 
    618618        $collect_tmp_area = &util::filename_to_regex($collect_tmp_area); 
    619          
    620          
     619                   
    621620        foreach my $df (@deleted_files) { 
    622621            next if ($df =~ m/^$gsdl_tmp_area/); 
     
    624623             
    625624            push(@filtered_deleted_files,$df); 
    626         } 
     625        }        
    627626         
    628          
     627 
    629628        @deleted_files = @filtered_deleted_files; 
    630629         
     
    661660    else 
    662661    { 
    663     # process any files marked for importing 
    664     foreach my $file (keys %{$manifest_lookup->{'import'}}) { 
    665         &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli); 
    666     } 
     662    #  
     663    # 1. Process delete files first 
     664    #  
    667665 
    668666    my @deleted_files = keys %{$manifest_lookup->{'delete'}}; 
    669667    my @full_deleted_files = (); 
    670668 
    671     foreach my $df (@deleted_files) { 
    672         my $full_df = &util::filename_cat($importdir,$df); 
     669    # ensure all filenames are absolute 
     670    foreach my $df (@deleted_files) {        
     671        my $full_df = 
     672        (&util::filename_is_absolute($df))  
     673        ? $df 
     674        : &util::filename_cat($importdir,$df); 
     675 
    673676        push(@full_deleted_files,$full_df); 
    674677    } 
    675  
     678     
     679    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files); 
    676680    &inexport::mark_docs_for_deletion($archive_info,{}, 
    677681                      \@full_deleted_files, 
    678682                      $archivedir, $verbosity, "delete"); 
     683 
     684 
     685    #  
     686    # 2. Now files for reindexing 
     687    #  
     688 
     689    my @reindex_files = keys %{$manifest_lookup->{'reindex'}}; 
     690    my @full_reindex_files = (); 
     691 
     692    # ensure all filenames are absolute 
     693    foreach my $rf (@reindex_files) {        
     694        my $full_rf = 
     695        (&util::filename_is_absolute($rf))  
     696        ? $rf 
     697        : &util::filename_cat($importdir,$rf); 
     698 
     699        push(@full_reindex_files,$full_rf); 
     700    } 
     701     
     702    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files); 
     703    &inexport::mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex"); 
     704 
     705    # And now ensure the new version of the file is processed by the 
     706    # appropriate plugin 
     707    foreach my $full_rf (@full_reindex_files) { 
     708        &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli); 
     709    } 
     710 
     711 
     712    #  
     713    # 3. Now finally any new files 
     714    #  
     715 
     716    foreach my $file (keys %{$manifest_lookup->{'index'}}) { 
     717        &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli); 
     718    } 
     719 
     720 
    679721    } 
    680722 
  • main/trunk/greenstone2/perllib/manifest.pm

    r18441 r22037  
    11package manifest; 
    22 
    3 use XMLParser; 
    43use strict; 
    54no strict 'refs'; # allow filehandles to be variables and viceversa 
    65 
     6use XMLParser; 
     7use dbutil; 
     8 
    79our $self; 
    810 
    911sub new { 
    1012    my ($class) = shift (@_); 
     13    my ($infodbtype,$archivedir) = @_; 
    1114 
    1215    $self = {} ; 
     
    1518    $self->{'reindex'} = {}; 
    1619    $self->{'delete'} = {}; 
     20 
     21    my $arcinfo_doc_filename  
     22    = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir); 
     23 
     24    if (-e $arcinfo_doc_filename) { 
     25    # Only store the infodb-doc filename if it exists 
     26    # If it doesn't exist then this means the collection has not been 
     27    #   built yet (or else the archives folder has been deleted). 
     28    #   Either way we have no way to look up which files 
     29    #   are associated with an OID.  If we encounter an OID 
     30    #   tag later on, we will use the fact that this field is 
     31    #   not defined to issue a warning 
     32 
     33    $self->{'_arcinfo-doc-filename'} = $arcinfo_doc_filename; 
     34    $self->{'_infodbtype'} = $infodbtype; 
     35    } 
    1736 
    1837    return bless $self, $class; 
     
    89108    my ($expat, $element) = @_; 
    90109 
    91     if ($element eq "Filename") 
    92     { 
    93     $self->{'filename'} = ""; 
     110    if (($element eq "Filename") || ($element eq "OID")) 
     111    { 
     112    $self->{'item-val'} = ""; 
    94113    } 
    95114    elsif ($element eq "Manifest") { 
     
    99118    if (defined($self->{'file-type'})) 
    100119    { 
    101         print STDERR "Warning: Malformed XML manifest ($element nested inside " . $self->{'file-type'} . ")\n"; 
    102     } 
    103  
    104     $self->{'file-type'} = $element; 
     120        print STDERR "Warning: Malformed XML manifest\n"; 
     121        print STDERR "         Unrecognized element $element nested inside " . $self->{'file-type'} . ".\n"; 
     122    } 
     123    else { 
     124        my $filetype = lc($element); 
     125        $self->{'file-type'} = $filetype; 
     126        if (!defined $self->{$filetype}) { 
     127        print STDERR "Warning: <$element> is not one of the registered tags for manifest format.\n"; 
     128        } 
     129    } 
     130 
    105131    } 
    106132} 
     
    114140    if ($element eq "Filename") 
    115141    { 
    116     $self->{lc($self->{'file-type'})}->{$self->{'filename'}} = 1; 
    117     $self->{'filename'} = undef; 
     142    my $filetype = $self->{'file-type'}; 
     143    my $filename  = $self->{'item-val'}; 
     144 
     145    $self->{$filetype}->{$filename} = 1; 
     146    $self->{'item-val'} = undef; 
     147    } 
     148    elsif ($element eq "OID") { 
     149    # look up src and assoc filenames used by this doc oid 
     150 
     151    my $filetype = $self->{'file-type'}; 
     152    my $oid  = $self->{'item-val'}; 
     153 
     154    if (defined $self->{'_infodbtype'}) { 
     155         
     156 
     157        my $infodbtype = $self->{'_infodbtype'}; 
     158        my $arcinfo_doc_filename = $self->{'_arcinfo-doc-filename'}; 
     159         
     160        my $doc_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid); 
     161 
     162        my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string); 
     163         
     164        my $doc_source_file = $doc_rec->{'src-file'}->[0]; 
     165        my $assoc_files = $doc_rec->{'assoc-file'}; 
     166        my @all_files = ($doc_source_file,@$assoc_files); 
     167         
     168        foreach my $filename (@all_files) { 
     169         
     170        if (!&util::filename_is_absolute($filename)) { 
     171            $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename); 
     172        } 
     173 
     174        $self->{$filetype}->{$filename} = 1; 
     175        } 
     176    } 
     177    else { 
     178        print STDERR "Warning: No archiveinf-doc database in archives directory.\n"; 
     179        print STDERR "         Unable to look up source files that constitute document $oid.\n"; 
     180    } 
     181 
     182    $self->{'item-val'} = undef; 
    118183    } 
    119184    else 
     
    129194    my ($expat) = @_; 
    130195 
    131     if (defined $self->{'filename'}) { 
     196    if (defined $self->{'item-val'}) { 
    132197    my $text = $_; 
    133198    chomp($text); 
     
    136201    $text =~ s/\s+$//;   
    137202     
    138     $self->{'filename'} .= $text if ($text !~ m/^\s*$/); 
     203    $self->{'item-val'} .= $text if ($text !~ m/^\s*$/); 
    139204    } 
    140205} 
     
    152217    my ($expat) = @_; 
    153218 
     219    if (defined $self->{'import'}) { 
     220    print STDERR "Warning: <Import> tag is deprecated.\n"; 
     221    print STDERR "         Processing data as if it were tagged as <Index>\n"; 
     222    $self->{'index'} = $self->{'import'}; 
     223    } 
     224 
    154225} 
    155226 
  • main/trunk/greenstone2/perllib/plugin.pm

    r21618 r22037  
    222222sub remove_some { 
    223223    my ($pluginfo, $infodbtype, $archivedir, $deleted_files) = @_; 
    224     print STDERR "in remove some\n"; 
    225224    return if (scalar(@$deleted_files)==0); 
    226225    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);