Changeset 16390 for gsdl


Ignore:
Timestamp:
2008-07-14T14:54:58+12:00 (16 years ago)
Author:
kjdon
Message:

Global block pass: read_block is no more. Blocking is now done in a first pass, file_block_read. Use can_process_this_file instead of read_block at the start of read to see whether to process the file or not. The associate_tail_re stuff is partly set up here, but DirectoryPlugin does the actual blocking of files.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/BasePlugin.pm

    r16022 r16390  
    7676    'deft' => "",
    7777    'reqd' => "no" },
     78      { 'name' => "no_blocking",
     79    'desc' => "{BasePlugin.no_blocking}",
     80    'type' => "flag",
     81    'reqd' => "no"},
    7882      { 'name' => "block_exp",
    7983    'desc' => "{BasePlugin.block_exp}",
    8084    'type' => "regexp",
    8185    'deft' => "",
    82     'reqd' => "no" },
    83       { 'name' => "smart_block",
    84     'desc' => "{BasePlugin.smart_block}",
    85     'type' => "flag",
    8686    'reqd' => "no" },
    8787      { 'name' => "associate_ext",
     
    107107    'deft' => "auto",
    108108    'list' => $encoding_plus_auto_list,
    109     'reqd' => "no" }
     109    'reqd' => "no" },
     110      { 'name' => "smart_block",
     111        'desc' => "{BasePlugin.smart_block}",
     112        'type' => "flag",
     113        'reqd' => "no",
     114    'hiddengli' => "yes" } # deprecated, but leave in for old collections
     115
    110116     
    111117      ];
     
    129135
    130136    my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists);
     137   
     138    if ($self->{'info_only'}) {
     139        # don't worry about any options etc
     140        return bless $self, $class;
     141    }
     142
     143    if ($self->{'smart_block'}) {
     144    print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
     145    }
     146    $self->{'smart_block'} = undef;
    131147
    132148    my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
     
    161177    }
    162178
    163     $self->{'shared_fileroot'} = {};
    164     $self->{'file_blocks'} = {};
    165 
    166 
    167179    return bless $self, $class;
    168180
     
    181193    $self->{'outhandle'} = $outhandle if defined $outhandle;
    182194    $self->{'failhandle'} = $failhandle;
    183 
     195#    $self->SUPER::init(@_);
     196   
    184197    # set process_exp and block_exp to defaults unless they were
    185198    # explicitly set
     
    245258}
    246259
    247 # default implementation is to do nothing.
    248 sub store_block_files
    249 {
     260# default implementation is to do nothing
     261sub store_block_files {
     262   
    250263    my $self =shift (@_);
    251     my ($filename) = @_;
    252     return;
     264    my ($filename_full_path, $block_hash) = @_;
     265
     266}
     267
     268# put files to block into hash
     269sub use_block_expressions {
     270
     271    my $self =shift (@_);
     272    my ($filename_full_path, $block_hash) = @_;
     273
     274    if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
     275    $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
     276    }
     277
    253278}
    254279
     
    257282{
    258283    my $self =shift;
    259     my $filename = shift;
     284    my ($filename, $block_hash) = @_;
    260285
    261286    if ($self->{'cover_image'}) {
     
    266291    }   
    267292    if (-e $coverfile) {
    268         $self->{'file_blocks'}->{$coverfile} = 1;
     293        $block_hash->{'file_blocks'}->{$coverfile} = 1;
    269294    }
    270295    }
     
    273298}
    274299
    275 sub root_ext_split
    276 {
    277     my $self = shift (@_);
    278     my ($filename,$tail_re) = @_;
    279    
    280     my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
    281 
    282     if ((!defined $file_prefix) || (!defined $file_ext)) {
    283     ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
    284     }
    285 
    286     return ($file_prefix,$file_ext);
    287 }
    288 
    289 sub metadata_read {
     300
     301# discover all the files that should be blocked by this plugin
     302# check the args ...
     303sub file_block_read {
     304
    290305    my $self = shift (@_); 
    291     my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
     306    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
    292307    # Keep track of filenames with same root but different extensions
    293308    # Used to support -associate_ext and the more generalised
    294309    # -associate_tail_re
     310    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    295311
    296312    my $associate_tail_re = $self->{'associate_tail_re'};
    297313    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
    298 
     314   
    299315    my ($file_prefix,$file_ext)
    300         = $self->root_ext_split($file,$associate_tail_re);
    301 
     316        = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
     317   
    302318    if ((defined $file_prefix) && (defined $file_ext)) {
    303 
    304         my $shared_fileroot = $self->{'shared_fileroot'};
     319        my $shared_fileroot = $block_hash->{'shared_fileroot'};
    305320        if (!defined $shared_fileroot->{$file_prefix}) {
    306321        my $file_prefix_rec = { 'tie_to'  => undef,
     
    311326        my $file_prefix_rec = $shared_fileroot->{$file_prefix};
    312327
    313         my $process_exp = $self->{'process_exp'};
    314 
    315         if ($file =~ m/$process_exp/) {
     328        if ($self->can_process_this_file($filename_full_path)) {
    316329        # This is the document the others should be tied to
    317330        $file_prefix_rec->{'tie_to'} = $file_ext;
     
    319332        else {
    320333        if ($file_ext =~ m/$associate_tail_re$/) {
     334            # this file should be associated to the main one
    321335            $file_prefix_rec->{'exts'}->{$file_ext} = 1;
    322336        }
     
    326340    }
    327341
     342    # check block expressions
     343    $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
     344
    328345    # now check whether we are actually processing this
    329     my $filename = $file;
    330     $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
    331     if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) {
     346    if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
    332347    return undef; # can't recognise
    333348    }
    334 
    335     # do smart blocking if appropriate
    336     if ($self->{'smart_block'}) {
    337     $self->store_block_files($filename);
    338     }
     349   
     350    $self->store_block_files($filename_full_path, $block_hash) unless $self->{'no_blocking'};
     351
    339352    # block the cover image if there is one
    340353    if ($self->{'cover_image'}) {
    341     $self->block_cover_image($filename);
     354    $self->block_cover_image($filename_full_path, $block_hash) unless $self->{'no_blocking'};
    342355    }
    343356       
     
    345358}
    346359
    347 sub tie_to_filename
    348 {
    349     my $self = shift (@_); 
    350  
    351     my ($file_ext,$file_prefix_rec) = @_;
    352 
    353     if (defined $file_prefix_rec) {
    354     my $tie_to = $file_prefix_rec->{'tie_to'};
    355 
    356     if (defined $tie_to) {
    357         if ($tie_to eq $file_ext) {
    358         return 1;
    359         }
    360     }
    361     }
    362 
     360# plugins that rely on more than process_exp (eg XML plugins) can override this method
     361sub can_process_this_file {
     362    my $self = shift(@_);
     363    my ($filename) = @_;
     364
     365    if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
     366    return 1;
     367    }
    363368    return 0;
    364 }
    365 
    366 sub tie_to_assoc_file
    367 {
    368     my $self = shift (@_);   
    369     my ($file_ext,$file_prefix_rec) = @_;
    370 
    371     if (defined $file_prefix_rec) {
    372     my $tie_to = $file_prefix_rec->{'tie_to'};
    373     if (defined $tie_to) {
    374 
    375         my $exts = $file_prefix_rec->{'exts'};
    376 
    377         my $has_file_ext = $exts->{$file_ext};
    378 
    379         if ($has_file_ext) {
    380         return 1;
    381         }
    382     }
    383     }
    384 
    385     return 0;
    386 }
    387 
    388 
    389 sub associate_with
    390 {
    391     my $self = shift (@_);   
    392     my ($file, $filename, $metadata) = @_;
    393 
    394     my $associate_tail_re = $self->{'associate_tail_re'};
    395     return 0 if (!$associate_tail_re);
    396 
    397     # If file, see if matches with "tie_to" doc or is one of the
    398     # associated filename extensions.
    399 
    400     my ($file_prefix,$file_ext) = $self->root_ext_split($file,$associate_tail_re);
    401 
    402     if ((defined $file_prefix) && (defined $file_ext)) {
    403 
    404     my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
    405    
    406     if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
    407 
    408         # Set up gsdlassocfile_tobe
    409 
    410         my $exts = $file_prefix_rec->{'exts'};
    411        
    412         if (!defined $metadata->{'gsdlassocfile_tobe'}) {
    413         $metadata->{'gsdlassocfile_tobe'} = [];
    414         }
    415 
    416         my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
    417            
    418         my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/);
    419         foreach my $e (keys %$exts) {       
    420         my $assoc_file = "$full_prefix$e";
    421         print STDERR "  $self->{'plugin_type'}: Associating $file_prefix$e with $file_prefix_rec->{'tie_to'} version\n";
    422         my $mime_type = ""; # let system auto detect this
    423         push(@$assoc_tobe,"$assoc_file:$mime_type:");
    424         }
    425 
    426     }
    427     elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
    428 
    429 
    430         # a form of smart block     
    431         return 1;
    432     }
    433     }
    434 
    435     return 0;
    436 }
    437 
    438 sub get_full_filenames {
    439     my $self = shift (@_);
    440     my ($base_dir, $file) = @_;
    441 
    442     my $filename_full_path = $file;
    443     # add on directory if present
    444     $filename_full_path = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
    445     my $filename_no_path = $file;
    446     # remove directory if present
    447     $filename_no_path =~ s/^.*[\/\\]//;
    448     return ($filename_full_path, $filename_no_path);
    449 }
    450 
    451 sub read_block {
    452     my $self = shift (@_); 
    453  
    454     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    455 
    456 
    457     my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file);
    458 
    459     if ($self->associate_with($file,$filename_full_path,$metadata)) {
    460     # a form of smart block
    461     $self->{'num_blocked'} ++;
    462     return (0,undef); # blocked
    463     }
    464 
    465     my $smart_block = $self->{'smart_block'};
    466     my $smart_block_BN = $self->{'smart_block_BN'};
    467    
    468     if ($smart_block || $smart_block_BN) {
    469     if (defined $self->{'file_blocks'}->{$filename_full_path} && $self->{'file_blocks'}->{$filename_full_path} == 1){
    470         $self->{'num_blocked'} ++;
    471         return (0,undef); # blocked
    472     }
    473     } else {
    474     if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
    475         $self->{'num_blocked'} ++;
    476         return (0,undef); # blocked
    477     }
    478     if ($self->{'cover_image'}) {
    479         if (defined $self->{'file_blocks'}->{$filename_full_path} && $self->{'file_blocks'}->{$filename_full_path} == 1){
    480         $self->{'num_blocked'} ++;
    481         return (0,undef); # blocked
    482         }
    483     }
    484     }
    485 
    486     if ($filename_full_path !~ /$self->{'process_exp'}/ || !-f $filename_full_path) {
    487     return (undef,undef); # can't recognise
    488     }
    489    
    490     ##why are we returning the full filename - do we need this??
    491     return (1,$filename_full_path);
    492 }
    493 
    494 
    495 #filename_encoding set by user
    496 sub filename_to_utf8_metadata
    497 {
     369   
     370}
     371
     372# just converts path as is to utf8.
     373sub filepath_to_utf8 {
    498374    my $self = shift (@_); 
    499375    my ($file, $file_encoding) = @_;
    500 
    501     my $outhandle = $self->{'outhandle'};
    502 
    503     my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
     376    my $filemeta = $file;
    504377
    505378    my $filename_encoding = $self->{'filename_encoding'};
     
    529402    );
    530403    }
     404
     405    return $filemeta;
     406}
     407
     408# gets the filename with no path, converts to utf8, and then dm safes it.
     409#filename_encoding set by user
     410sub filename_to_utf8_metadata
     411{
     412    my $self = shift (@_); 
     413    my ($file, $file_encoding) = @_;
     414
     415    my $outhandle = $self->{'outhandle'};
     416
     417    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
     418    $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
     419
    531420    my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
    532421
     
    649538sub read_into_doc_obj {
    650539    my $self = shift (@_); 
    651     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
     540    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    652541
    653542    my $outhandle = $self->{'outhandle'};
     
    658547        if $self->{'verbosity'} > 1;
    659548
    660     my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file);
     549    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    661550    # create a new document
    662551    my $doc_obj = new doc ($filename_full_path, "indexed_doc");
     
    724613}
    725614
     615# implement this if you are extracting metadata for other documents
     616sub metadata_read {
     617    my $self = shift (@_);
     618    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
     619   
     620    # can we process this file??
     621    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
     622    return undef unless $self->can_process_this_file($filename_full_path);
     623
     624    return 1; # we recognise the file, but don't actually do anything with it
     625}
     626
     627
    726628# The BasePlugin read() function. This function calls read_into_doc_obj()
    727629# to ensure all the right things to make general options work for a
     
    741643sub read {
    742644    my $self = shift (@_); 
    743     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    744 
    745     # check that we are not blocked
    746     my ($block_status,$filename) = $self->read_block(@_);   
    747     return $block_status if ((!defined $block_status) || ($block_status==0));
    748 
     645    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
     646
     647    # can we process this file??
     648    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
     649    return undef unless $self->can_process_this_file($filename_full_path);
     650   
    749651    my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
    750652   
     
    771673
    772674    gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n";
    773     # die "BasePlugin::process function must be implemented in sub-class\n";
    774675
    775676    return undef; # never gets here
     
    781682
    782683}
     684
    783685# write_file -- used by ConvertToPlug, for example in post processing
    784686#
     
    848750        # need to be associated with a document, but the document hasn't
    849751        # been formed yet.
    850        
    851752        my $equiv_form = "";
    852753        foreach my $gaf (@{$metadata->{$field}}) {
     
    854755        my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
    855756        my $filename = $full_filename;
    856                  
    857757        $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
    858758
     
    860760
    861761        my ($file_prefix,$file_extended_ext)
    862             = $self->root_ext_split($tail_filename,$associate_tail_re);
     762            = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
    863763        my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
    864764
     
    943843
    944844
     845
    9458461;
Note: See TracChangeset for help on using the changeset viewer.