Changeset 18441 for gsdl

Show
Ignore:
Timestamp:
01.02.2009 14:36:22 (11 years ago)
Author:
davidb
Message:

Modifications for incremental building to support files that need to be deleted

Location:
gsdl/trunk/perllib
Files:
12 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/arcinfo.pm

    r15889 r18441  
    4242use strict; 
    4343 
     44use dbutil; 
    4445 
    4546# File format read in: OID <tab> Filename <tab> Optional-Index-Status 
     
    6061} 
    6162 
    62 sub load_info { 
    63     my $self = shift (@_); 
    64     my ($filename) = @_; 
    65  
    66     $self->{'info'} = {}; 
    67   
     63sub _load_info_txt  
     64{ 
     65    my $self = shift (@_); 
     66    my ($filename) = @_; 
     67 
    6868    if (defined $filename && -e $filename) { 
    6969    open (INFILE, $filename) ||  
     
    8080    close (INFILE); 
    8181    } 
    82 } 
    83  
    84 sub save_info { 
     82 
     83 
     84} 
     85 
     86sub _load_info_gdbm 
     87{ 
     88    my $self = shift (@_); 
     89    my ($filename) = @_; 
     90 
     91    my $infodb_map = {}; 
     92 
     93    &dbutil::read_infodb_file_gdbm($filename,$infodb_map); 
     94 
     95    foreach my $oid ( keys %$infodb_map ) { 
     96    my $vals = $infodb_map->{$oid}; 
     97    # interested in doc-file and index-status 
     98 
     99    my ($doc_file) = ($vals=~/^<doc-file>(.*)$/m); 
     100    my ($index_status) = ($vals=~/^<index-status>(.*)$/m); 
     101    $self->add_info ($oid,$doc_file,$index_status); 
     102    } 
     103} 
     104 
     105sub load_info { 
     106    my $self = shift (@_); 
     107    my ($filename) = @_; 
     108 
     109    $self->{'info'} = {}; 
     110 
     111    if ((defined $filename) && (-e $filename)) { 
     112    if ($filename =~ m/\.inf$/) { 
     113        $self->_load_info_txt($filename); 
     114    } 
     115    else { 
     116        $self->_load_info_gdbm($filename); 
     117    } 
     118    } 
     119} 
     120 
     121sub _load_filelist_gdbm 
     122{ 
     123    my $self = shift (@_); 
     124    my ($filename) = @_; 
     125 
     126    my $infodb_map = {}; 
     127 
     128    &dbutil::read_infodb_keys_gdbm($filename,$infodb_map); 
     129 
     130    foreach my $file ( keys %$infodb_map ) { 
     131    $self->{'import_filelist'}->{$file} = 1; 
     132    } 
     133} 
     134 
     135 
     136sub load_import_filelist { 
     137    my $self = shift (@_); 
     138    my ($filename) = @_; 
     139 
     140    $self->{'import-filelist'} = {}; 
     141 
     142    if ((defined $filename) && (-e $filename)) { 
     143    if ($filename =~ m/\.inf$/) { 
     144        # e.g. 'archives-src.inf' (which includes complete list of file 
     145        # from last time import.pl was run) 
     146        $self->_load_info_txt($filename); 
     147    } 
     148    else { 
     149        $self->_load_filelist_gdbm($filename); 
     150    } 
     151    } 
     152} 
     153 
     154sub _save_info_txt { 
    85155    my $self = shift (@_); 
    86156    my ($filename) = @_; 
     
    97167    } 
    98168    close (OUTFILE); 
     169} 
     170 
     171sub _save_info_gdbm { 
     172    my $self = shift (@_); 
     173    my ($filename) = @_; 
     174 
     175    # Not the most efficient operation, but will do for now 
     176 
     177    # read it in 
     178    my $infodb_map = {}; 
     179    &dbutil::read_infodb_file_gdbm($filename,$infodb_map); 
     180 
     181    # change index-status values 
     182    foreach my $info (@{$self->get_OID_list()}) { 
     183    if (defined $info) { 
     184        my ($oid,$doc_file,$index_status) = @$info; 
     185        if (defined $infodb_map->{$oid}) { 
     186        my $vals_ref = \$infodb_map->{$oid}; 
     187        $$vals_ref =~ s/^<index-status>(.*)$/<index-status>$index_status/m; 
     188        } 
     189        else { 
     190        print STDERR "Warning: $filename does not have key $oid\n"; 
     191        } 
     192    } 
     193    } 
     194 
     195 
     196    # write out again 
     197    my $infodb_handle = &dbutil::open_infodb_write_handle_gdbm($filename); 
     198    foreach my $oid ( keys %$infodb_map ) { 
     199    # consider making the following a method in dbutil 
     200    # e.g. write_infodb_rawentry_gdbm($infodb_handle,$oid,$vals); 
     201 
     202    # no need to escape, as $infodb_map->{$oid} hasn't been unescaped 
     203    print $infodb_handle "[$oid]\n"; 
     204    print $infodb_handle $infodb_map->{$oid}; 
     205    print $infodb_handle '-' x 70, "\n"; 
     206    } 
     207    &dbutil::close_infodb_write_handle_gdbm($infodb_handle); 
     208 
     209} 
     210 
     211sub save_info { 
     212    my $self = shift (@_); 
     213    my ($filename) = @_; 
     214 
     215    if ($filename =~ m/\.inf$/) { 
     216    $self->_save_info_txt($filename); 
     217    } 
     218    else { 
     219    $self->_save_info_gdbm($filename); 
     220    } 
    99221} 
    100222 
  • gsdl/trunk/perllib/basebuilder.pm

    r17573 r18441  
    470470    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections(); 
    471471    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes(); 
    472      
     472 
    473473    # store the mapping between the index names and the directory names 
    474474    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map. 
  • gsdl/trunk/perllib/cfgread.pm

    r16929 r18441  
    141141    my ($filename, $data, $stringexp, $arrayexp, $hashexp, $arrayarrayexp, 
    142142    $hashhashexp) = @_; 
    143      
     143 
    144144    if (open (COLCFG, ">$filename")) { 
    145145    foreach my $key (sort(keys(%$data))) { 
  • gsdl/trunk/perllib/cpan/Image/Size.pm

    r13983 r18441  
    1818package Image::Size; 
    1919 
    20 require 5.6.0; 
     20# require 5.6.0; 
     21require 5.006_000; 
    2122 
    2223use strict; 
  • gsdl/trunk/perllib/dbutil.pm

    r17476 r18441  
    320320  my $infodb_map = shift(@_); 
    321321 
    322   open (PIPEIN, "db2txt \"$infodb_file_path\" |") || die "couldn't open pipe from db2txt\n"; 
     322  open (PIPEIN, "db2txt \"$infodb_file_path\" |") || die "couldn't open pipe from db2txt \$infodb_file_path\"\n"; 
    323323  my $infodb_line = ""; 
    324324  my $infodb_key = ""; 
     
    340340      $infodb_value .= $infodb_line; 
    341341    } 
     342  } 
     343 
     344  close (PIPEIN); 
     345} 
     346 
     347sub read_infodb_keys_gdbm 
     348{ 
     349  my $infodb_file_path = shift(@_); 
     350  my $infodb_map = shift(@_); 
     351 
     352  open (PIPEIN, "gdbmkeys \"$infodb_file_path\" |") || die "couldn't open pipe from gdbmkeys \$infodb_file_path\"\n"; 
     353  my $infodb_line = ""; 
     354  my $infodb_key = ""; 
     355  my $infodb_value = ""; 
     356  while (defined ($infodb_line = <PIPEIN>)) 
     357  { 
     358      chomp $infodb_line; # remove end of line  
     359 
     360      $infodb_map->{$infodb_line} = 1; 
    342361  } 
    343362 
  • gsdl/trunk/perllib/manifest.pm

    r17249 r18441  
    11package manifest; 
    2  
    32 
    43use XMLParser; 
    54use strict; 
    65no strict 'refs'; # allow filehandles to be variables and viceversa 
    7  
    86 
    97our $self; 
  • gsdl/trunk/perllib/plugin.pm

    r17746 r18441  
    301301    die "\n"; 
    302302    } 
    303      
     303 
    304304    my $had_error = 0; 
    305305    # pass this file by each of the plugins in turn until one 
  • gsdl/trunk/perllib/plugins/ArchivesInfPlugin.pm

    r17738 r18441  
    2424########################################################################### 
    2525 
    26 # plugin which reads through an archives.inf file 
    27 # (i.e. the file generated in the archives directory 
    28 # when an import is done), processing each file it finds  
     26# plugin which reads through an archives.inf (or GDBM equivalent, 
     27# archiveinf-doc.{ldb,bdb} file (i.e. the file generated in the 
     28# archives directory when an import is done), processing each file it 
     29# finds 
    2930 
    3031package ArchivesInfPlugin; 
     
    8687    my ($self) = @_; 
    8788 
     89    print STDERR "*** Running ArchivesInf deinit\n"; 
     90 
    8891    my $archive_info = $self->{'archive_info'}; 
    8992 
     
    9396        my $file_list = $archive_info->get_file_list(); 
    9497 
    95     # change each file to "Been Indexed" 
    96  
    9798    foreach my $subfile (@$file_list) { 
    9899        my $doc_oid = $subfile->[1]; 
    99         # why do we get this when it is not used??? 
     100 
    100101        my $index_status = $archive_info->get_status_info($doc_oid); 
    101         $archive_info->set_status_info($doc_oid,"B"); 
     102        if ($index_status eq "D") { 
     103        # delete 
     104        $archive_info->delete_info($doc_oid); 
     105        } 
     106        elsif ($index_status =~ m/^(I|R)$/) { 
     107        # mark as "been indexed" 
     108        $archive_info->set_status_info($doc_oid,"B"); 
     109        } 
    102110    } 
    103111 
     
    169177 
    170178    # see if this has a archives information file within it 
    171     my $archive_info_filename = &util::filename_cat($base_dir,$file,"archives.inf"); 
     179##    my $archive_info_filename = &util::filename_cat($base_dir,$file,"archives.inf"); 
     180    my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb"; 
     181    my $doc_db = "archiveinf-doc$db_ext"; 
     182    my $archive_info_filename = &util::filename_cat($base_dir,$file,$doc_db); 
    172183 
    173184    if (-e $archive_info_filename) { 
     
    187198    # process each file 
    188199    foreach my $subfile (@$file_list) { 
     200 
    189201        last if ($maxdocs != -1 && ($total_count + $count) >= $maxdocs); 
    190202 
     
    198210        if ($processor->is_incremental_capable() && $self->{'incremental'}) 
    199211        { 
    200             # We don't need to process the file if it has already been built 
     212            # Check to see if the file needs indexing 
    201213        my $doc_oid = $subfile->[1]; 
    202214        my $index_status = $archive_info->get_status_info($doc_oid); 
    203215        if ($index_status eq "B") 
    204216        { 
    205             # Don't process this file 
     217            # Don't process this file as it has already been indexed 
    206218            $process_file = 0; 
    207219        } 
  • gsdl/trunk/perllib/plugins/BasePlugin.pm

    r18404 r18441  
    433433    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
    434434 
     435    if (!-d $filename_full_path) { 
     436    $block_hash->{'all_files'}->{$file} = 1; 
     437    } 
     438 
    435439    my $associate_tail_re = $self->{'associate_tail_re'}; 
    436440    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) { 
  • gsdl/trunk/perllib/plugins/DirectoryPlugin.pm

    r17738 r18441  
    131131 
    132132    my $output_dir = $processor->getoutputdir(); 
    133     my $archives_inf = &util::filename_cat($output_dir,"archives.inf"); 
    134      
     133##  my $archives_inf = &util::filename_cat($output_dir,"archives.inf"); 
     134    my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb"; 
     135    my $doc_db = "archiveinf-doc$db_ext"; 
     136        my $archives_inf = &util::filename_cat($output_dir,$doc_db); 
     137 
    135138    if ( -e $archives_inf ) { 
    136139        $self->{'inf_timestamp'} = -M $archives_inf; 
     
    278281    return $directory_ok unless (defined $directory_ok && $directory_ok == 1); 
    279282 
     283    $block_hash->{'all_files'} = {} unless defined $block_hash->{'all_files'}; 
     284 
    280285    $block_hash->{'file_blocks'} = {} unless defined $block_hash->{'file_blocks'}; 
    281286    $block_hash->{'shared_fileroot'} = {} unless defined $block_hash->{'shared_fileroot'}; 
     
    576581        if ($filename_timestamp > $inf_timestamp) { 
    577582            # filename has been around for longer than inf 
    578 #####           print $outhandle "**** Skipping $subfile\n"; 
     583            print $outhandle "**** Skipping $subfile\n" if ($verbosity >3); 
    579584            next; 
    580585        } 
  • gsdl/trunk/perllib/plugouts/BasePlugout.pm

    r17884 r18441  
    404404    ############################## 
    405405    $self->saveas($doc_obj,$doc_dir); 
    406 ##    $self->archiveinf_gdbm($doc_obj,$doc_dir);  
     406    $self->archiveinf_gdbm($doc_obj,$doc_dir);  
    407407 
    408408} 
     
    660660    my $source_filename = $doc_obj->get_source_filename(); 
    661661 
    662     my $oid_files = { 'src-file' => $source_filename, 
     662    my $working_info = $self->{'output_info'};  
     663    my $doc_info = $working_info->get_info($oid); 
     664    my ($doc_file,$index_status) = @$doc_info; 
     665 
     666    my $oid_files = { 'doc-file' => $doc_file, 
     667              'index-status' => $index_status, 
     668              'src-file' => $source_filename, 
    663669              'assoc-files' => [] }; 
    664670     
     
    701707 
    702708    my $doc_db_text = ""; 
     709    $doc_db_text .= "<doc-file>$oid_files->{'doc-file'}\n"; 
     710    $doc_db_text .= "<index-status>$oid_files->{'index-status'}\n"; 
    703711    $doc_db_text .= "<src-file>$oid_files->{'src-file'}\n"; 
    704712    foreach my $af (@{$oid_files->{'assoc-files'}}) { 
  • gsdl/trunk/perllib/util.pm

    r18411 r18441  
    768768} 
    769769 
     770sub filename_is_absolute 
     771{ 
     772    my ($filename) = @_; 
     773 
     774    if ($ENV{'GSDLOS'} =~ /^windows$/i) { 
     775    return ($filename =~ m/^(\w:)?\\/); 
     776    } 
     777    else { 
     778    return ($filename =~ m/^\//); 
     779    } 
     780} 
     781 
     782 
    770783## @method make_absolute() 
    771784# 
     
    786799     
    787800    my ($base_dir, $dir) = @_; 
    788     print STDERR "dir = $dir\n"; 
     801###    print STDERR "dir = $dir\n"; 
    789802    $dir =~ s/[\\\/]+/\//g; 
    790803    $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);  
     
    793806    $dir =~ s|/[.][.]?/|/|g; 
    794807    $dir =~ tr|/|/|s; 
    795     print STDERR "dir = $dir\n"; 
     808###    print STDERR "dir = $dir\n"; 
    796809     
    797810    return $dir;