Changeset 16390

Show
Ignore:
Timestamp:
14.07.2008 14:54:58 (11 years ago)
Author:
kjdon
Message:

Global block pass: read_block is no more. Blocking is now done in a first pass, file_block_read. Use can_process_this_file instead of read_block at the start of read to see whether to process the file or not. The associate_tail_re handling is partly set up here, but DirectoryPlugin does the actual blocking of files.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/BasePlugin.pm

    r16022 r16390  
    7676    'deft' => "", 
    7777    'reqd' => "no" }, 
     78      { 'name' => "no_blocking", 
     79    'desc' => "{BasePlugin.no_blocking}", 
     80    'type' => "flag", 
     81    'reqd' => "no"}, 
    7882      { 'name' => "block_exp", 
    7983    'desc' => "{BasePlugin.block_exp}", 
    8084    'type' => "regexp", 
    8185    'deft' => "", 
    82     'reqd' => "no" }, 
    83       { 'name' => "smart_block", 
    84     'desc' => "{BasePlugin.smart_block}", 
    85     'type' => "flag", 
    8686    'reqd' => "no" }, 
    8787      { 'name' => "associate_ext", 
     
    107107    'deft' => "auto", 
    108108    'list' => $encoding_plus_auto_list, 
    109     'reqd' => "no" } 
     109    'reqd' => "no" }, 
     110      { 'name' => "smart_block", 
     111        'desc' => "{BasePlugin.smart_block}", 
     112        'type' => "flag", 
     113        'reqd' => "no", 
     114    'hiddengli' => "yes" } # deprecated, but leave in for old collections 
     115 
    110116       
    111117      ]; 
     
    129135 
    130136    my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists); 
     137     
     138    if ($self->{'info_only'}) { 
     139        # don't worry about any options etc 
     140        return bless $self, $class; 
     141    } 
     142 
     143    if ($self->{'smart_block'}) { 
     144    print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n"; 
     145    } 
     146    $self->{'smart_block'} = undef; 
    131147 
    132148    my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class; 
     
    161177    } 
    162178 
    163     $self->{'shared_fileroot'} = {}; 
    164     $self->{'file_blocks'} = {}; 
    165  
    166  
    167179    return bless $self, $class; 
    168180 
     
    181193    $self->{'outhandle'} = $outhandle if defined $outhandle; 
    182194    $self->{'failhandle'} = $failhandle; 
    183  
     195#    $self->SUPER::init(@_); 
     196     
    184197    # set process_exp and block_exp to defaults unless they were 
    185198    # explicitly set 
     
    245258} 
    246259 
    247 # default implementation is to do nothing.  
    248 sub store_block_files 
    249 { 
     260# default implementation is to do nothing 
     261sub store_block_files { 
     262     
    250263    my $self =shift (@_); 
    251     my ($filename) = @_; 
    252     return; 
     264    my ($filename_full_path, $block_hash) = @_; 
     265 
     266} 
     267 
     268# put files to block into hash  
     269sub use_block_expressions { 
     270 
     271    my $self =shift (@_); 
     272    my ($filename_full_path, $block_hash) = @_; 
     273 
     274    if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) { 
     275    $block_hash->{'file_blocks'}->{$filename_full_path} = 1; 
     276    } 
     277 
    253278} 
    254279 
     
    257282{ 
    258283    my $self =shift; 
    259     my $filename = shift; 
     284    my ($filename, $block_hash) = @_; 
    260285 
    261286    if ($self->{'cover_image'}) { 
     
    266291    }    
    267292    if (-e $coverfile) { 
    268         $self->{'file_blocks'}->{$coverfile} = 1; 
     293        $block_hash->{'file_blocks'}->{$coverfile} = 1; 
    269294    }  
    270295    } 
     
    273298} 
    274299 
    275 sub root_ext_split 
    276 { 
    277     my $self = shift (@_); 
    278     my ($filename,$tail_re) = @_; 
    279      
    280     my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/); 
    281  
    282     if ((!defined $file_prefix) || (!defined $file_ext)) { 
    283     ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/); 
    284     } 
    285  
    286     return ($file_prefix,$file_ext); 
    287 } 
    288  
    289 sub metadata_read { 
     300 
     301# discover all the files that should be blocked by this plugin 
     302# check the args ... 
     303sub file_block_read { 
     304 
    290305    my $self = shift (@_);   
    291     my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_; 
     306    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_; 
    292307    # Keep track of filenames with same root but different extensions 
    293308    # Used to support -associate_ext and the more generalised 
    294309    # -associate_tail_re 
     310    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
    295311 
    296312    my $associate_tail_re = $self->{'associate_tail_re'}; 
    297313    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) { 
    298  
     314     
    299315    my ($file_prefix,$file_ext)  
    300         = $self->root_ext_split($file,$associate_tail_re); 
    301  
     316        = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re); 
     317     
    302318    if ((defined $file_prefix) && (defined $file_ext)) { 
    303  
    304         my $shared_fileroot = $self->{'shared_fileroot'}; 
     319        my $shared_fileroot = $block_hash->{'shared_fileroot'}; 
    305320        if (!defined $shared_fileroot->{$file_prefix}) { 
    306321        my $file_prefix_rec = { 'tie_to'  => undef,  
     
    311326        my $file_prefix_rec = $shared_fileroot->{$file_prefix}; 
    312327 
    313         my $process_exp = $self->{'process_exp'}; 
    314  
    315         if ($file =~ m/$process_exp/) { 
     328        if ($self->can_process_this_file($filename_full_path)) { 
    316329        # This is the document the others should be tied to 
    317330        $file_prefix_rec->{'tie_to'} = $file_ext; 
     
    319332        else { 
    320333        if ($file_ext =~ m/$associate_tail_re$/) { 
     334            # this file should be associated to the main one 
    321335            $file_prefix_rec->{'exts'}->{$file_ext} = 1; 
    322336        } 
     
    326340    } 
    327341 
     342    # check block expressions 
     343    $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'}; 
     344 
    328345    # now check whether we are actually processing this 
    329     my $filename = $file; 
    330     $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 
    331     if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) { 
     346    if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) { 
    332347    return undef; # can't recognise 
    333348    } 
    334  
    335     # do smart blocking if appropriate 
    336     if ($self->{'smart_block'}) { 
    337     $self->store_block_files($filename); 
    338     } 
     349    
     350    $self->store_block_files($filename_full_path, $block_hash) unless $self->{'no_blocking'}; 
     351 
    339352    # block the cover image if there is one 
    340353    if ($self->{'cover_image'}) { 
    341     $self->block_cover_image($filename); 
     354    $self->block_cover_image($filename_full_path, $block_hash) unless $self->{'no_blocking'}; 
    342355    } 
    343356        
     
    345358} 
    346359 
    347 sub tie_to_filename 
    348 { 
    349     my $self = shift (@_);   
    350    
    351     my ($file_ext,$file_prefix_rec) = @_; 
    352  
    353     if (defined $file_prefix_rec) { 
    354     my $tie_to = $file_prefix_rec->{'tie_to'}; 
    355  
    356     if (defined $tie_to) { 
    357         if ($tie_to eq $file_ext) { 
    358         return 1; 
    359         } 
    360     } 
    361     } 
    362  
     360# plugins that rely on more than process_exp (eg XML plugins) can override this method 
     361sub can_process_this_file { 
     362    my $self = shift(@_); 
     363    my ($filename) = @_; 
     364 
     365    if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) { 
     366    return 1; 
     367    } 
    363368    return 0; 
    364 } 
    365  
    366 sub tie_to_assoc_file 
    367 { 
    368     my $self = shift (@_);     
    369     my ($file_ext,$file_prefix_rec) = @_; 
    370  
    371     if (defined $file_prefix_rec) { 
    372     my $tie_to = $file_prefix_rec->{'tie_to'}; 
    373     if (defined $tie_to) { 
    374  
    375         my $exts = $file_prefix_rec->{'exts'}; 
    376  
    377         my $has_file_ext = $exts->{$file_ext}; 
    378  
    379         if ($has_file_ext) { 
    380         return 1; 
    381         } 
    382     } 
    383     } 
    384  
    385     return 0; 
    386 } 
    387  
    388  
    389 sub associate_with 
    390 { 
    391     my $self = shift (@_);     
    392     my ($file, $filename, $metadata) = @_; 
    393  
    394     my $associate_tail_re = $self->{'associate_tail_re'}; 
    395     return 0 if (!$associate_tail_re); 
    396  
    397     # If file, see if matches with "tie_to" doc or is one of the 
    398     # associated filename extensions. 
    399  
    400     my ($file_prefix,$file_ext) = $self->root_ext_split($file,$associate_tail_re); 
    401  
    402     if ((defined $file_prefix) && (defined $file_ext)) { 
    403  
    404     my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix}; 
    405      
    406     if ($self->tie_to_filename($file_ext,$file_prefix_rec)) { 
    407  
    408         # Set up gsdlassocfile_tobe 
    409  
    410         my $exts = $file_prefix_rec->{'exts'}; 
    411          
    412         if (!defined $metadata->{'gsdlassocfile_tobe'}) { 
    413         $metadata->{'gsdlassocfile_tobe'} = []; 
    414         } 
    415  
    416         my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'}; 
    417              
    418         my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/); 
    419         foreach my $e (keys %$exts) {        
    420         my $assoc_file = "$full_prefix$e"; 
    421         print STDERR "  $self->{'plugin_type'}: Associating $file_prefix$e with $file_prefix_rec->{'tie_to'} version\n"; 
    422         my $mime_type = ""; # let system auto detect this 
    423         push(@$assoc_tobe,"$assoc_file:$mime_type:");  
    424         } 
    425  
    426     } 
    427     elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) { 
    428  
    429  
    430         # a form of smart block      
    431         return 1; 
    432     } 
    433     } 
    434  
    435     return 0; 
    436 } 
    437  
    438 sub get_full_filenames { 
    439     my $self = shift (@_); 
    440     my ($base_dir, $file) = @_; 
    441  
    442     my $filename_full_path = $file; 
    443     # add on directory if present 
    444     $filename_full_path = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 
    445     my $filename_no_path = $file; 
    446     # remove directory if present 
    447     $filename_no_path =~ s/^.*[\/\\]//; 
    448     return ($filename_full_path, $filename_no_path); 
    449 } 
    450  
    451 sub read_block { 
    452     my $self = shift (@_);   
    453    
    454     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
    455  
    456  
    457     my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file); 
    458  
    459     if ($self->associate_with($file,$filename_full_path,$metadata)) { 
    460     # a form of smart block 
    461     $self->{'num_blocked'} ++; 
    462     return (0,undef); # blocked 
    463     } 
    464  
    465     my $smart_block = $self->{'smart_block'}; 
    466     my $smart_block_BN = $self->{'smart_block_BN'}; 
    467     
    468     if ($smart_block || $smart_block_BN) { 
    469     if (defined $self->{'file_blocks'}->{$filename_full_path} && $self->{'file_blocks'}->{$filename_full_path} == 1){ 
    470         $self->{'num_blocked'} ++; 
    471         return (0,undef); # blocked 
    472     } 
    473     } else { 
    474     if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) { 
    475         $self->{'num_blocked'} ++; 
    476         return (0,undef); # blocked 
    477     } 
    478     if ($self->{'cover_image'}) { 
    479         if (defined $self->{'file_blocks'}->{$filename_full_path} && $self->{'file_blocks'}->{$filename_full_path} == 1){ 
    480         $self->{'num_blocked'} ++; 
    481         return (0,undef); # blocked 
    482         } 
    483     } 
    484     } 
    485  
    486     if ($filename_full_path !~ /$self->{'process_exp'}/ || !-f $filename_full_path) { 
    487     return (undef,undef); # can't recognise 
    488     } 
    489      
    490     ##why are we returning the full filename - do we need this?? 
    491     return (1,$filename_full_path); 
    492 } 
    493  
    494  
    495 #filename_encoding set by user 
    496 sub filename_to_utf8_metadata 
    497 { 
     369     
     370} 
     371 
     372# just converts path as is to utf8. 
     373sub filepath_to_utf8 { 
    498374    my $self = shift (@_);   
    499375    my ($file, $file_encoding) = @_; 
    500  
    501     my $outhandle = $self->{'outhandle'}; 
    502  
    503     my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end) 
     376    my $filemeta = $file; 
    504377 
    505378    my $filename_encoding = $self->{'filename_encoding'}; 
     
    529402    ); 
    530403    } 
     404 
     405    return $filemeta; 
     406} 
     407 
     408# gets the filename with no path, converts to utf8, and then dm safes it. 
     409#filename_encoding set by user 
     410sub filename_to_utf8_metadata 
     411{ 
     412    my $self = shift (@_);   
     413    my ($file, $file_encoding) = @_; 
     414 
     415    my $outhandle = $self->{'outhandle'}; 
     416 
     417    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end) 
     418    $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding); 
     419 
    531420    my $dmsafe_filemeta = &ghtml::dmsafe($filemeta); 
    532421 
     
    649538sub read_into_doc_obj { 
    650539    my $self = shift (@_);   
    651     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
     540    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
    652541 
    653542    my $outhandle = $self->{'outhandle'}; 
     
    658547        if $self->{'verbosity'} > 1; 
    659548 
    660     my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file); 
     549    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
    661550    # create a new document 
    662551    my $doc_obj = new doc ($filename_full_path, "indexed_doc"); 
     
    724613} 
    725614 
     615# implement this if you are extracting metadata for other documents 
     616sub metadata_read { 
     617    my $self = shift (@_); 
     618    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_; 
     619     
     620    # can we process this file?? 
     621    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
     622    return undef unless $self->can_process_this_file($filename_full_path); 
     623 
     624    return 1; # we recognise the file, but don't actually do anything with it 
     625} 
     626 
     627 
    726628# The BasePlugin read() function. This function calls read_into_doc_obj() 
    727629# to ensure all the right things to make general options work for a 
     
    741643sub read { 
    742644    my $self = shift (@_);   
    743     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
    744  
    745     # check that we are not blocked 
    746     my ($block_status,$filename) = $self->read_block(@_);     
    747     return $block_status if ((!defined $block_status) || ($block_status==0)); 
    748  
     645    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
     646 
     647    # can we process this file?? 
     648    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
     649    return undef unless $self->can_process_this_file($filename_full_path); 
     650     
    749651    my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_); 
    750652    
     
    771673 
    772674    gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n"; 
    773     # die "BasePlugin::process function must be implemented in sub-class\n"; 
    774675 
    775676    return undef; # never gets here 
     
    781682 
    782683} 
     684 
    783685# write_file -- used by ConvertToPlug, for example in post processing 
    784686# 
     
    848750        # need to be associated with a document, but the document hasn't 
    849751        # been formed yet. 
    850          
    851752        my $equiv_form = ""; 
    852753        foreach my $gaf (@{$metadata->{$field}}) { 
     
    854755        my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/); 
    855756        my $filename = $full_filename; 
    856                    
    857757        $doc_obj->associate_file($full_filename,$tail_filename,$mimetype); 
    858758 
     
    860760 
    861761        my ($file_prefix,$file_extended_ext)  
    862             = $self->root_ext_split($tail_filename,$associate_tail_re); 
     762            = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re); 
    863763        my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/); 
    864764 
     
    943843 
    944844 
     845 
    9458461;