Changeset 10280


Ignore:
Timestamp:
2005-07-25T14:19:14+12:00 (19 years ago)
Author:
chi
Message:

Some major changes to allow secondary plugin setting.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r10254 r10280  
    213213
    214214
    215 
    216 
    217215sub get_arguments
    218216{
     
    386384    die "\n";
    387385    }
     386
    388387
    389388    # else parsing was successful.
     
    431430        $self->{'places'} = $places_ref;
    432431    }
    433     }   
     432    }
    434433    return bless $self, $class;
    435434   
     
    673672
    674673
    675 # The BasPlug read() function. This function does all the right things
    676 # to make general options work for a given plugin. It calls the process()
    677 # function which does all the work specific to a plugin (like the old
    678 # read functions used to do). Most plugins should define their own
    679 # process() function and let this read() function keep control.
    680 #
    681 # recursive plugins (e.g. RecPlug) and specialized plugins like those
    682 # capable of processing many documents within a single file (e.g.
    683 # GMLPlug) should normally implement their own version of read()
    684 #
    685 # Return number of files processed, undef if can't recognise, -1 if can't
    686 # process
    687 # Note that $base_dir might be "" and that $file might
    688 # include directories
    689 
    690 sub read {
     674sub read_block {
    691675    my $self = shift (@_); 
    692676 
    693677    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    694678
    695     if ($self->is_recursive()) {
    696     gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
    697     }
    698 
    699     my $outhandle   = $self->{'outhandle'};
    700     my $smart_block = $self->{'smart_block'};
    701     my $smart_block_BN = $self->{'smart_block_BN'};
    702679
    703680    my $filename = $file;
     
    707684    # a form of smart block
    708685    $self->{'num_blocked'} ++;
    709     return 0; # blocked
    710     }
     686    return (0,undef); # blocked
     687    }
     688
     689    my $smart_block = $self->{'smart_block'};
     690    my $smart_block_BN = $self->{'smart_block_BN'};
    711691
    712692    if ($smart_block || $smart_block_BN) {
     
    714694    if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
    715695        $self->{'num_blocked'} ++;
    716         return 0; # blocked
     696        return (0,undef); # blocked
    717697    }
    718698    } elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
    719699    $self->{'num_blocked'} ++;
    720     return 0; # blocked
     700    return (0,undef); # blocked
    721701    }
    722702   
    723703    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
    724     return undef; # can't recognise
    725     }
     704    return (undef,undef); # can't recognise
     705    }
     706   
     707    return (1,$filename);
     708}
     709
     710sub read_tidy_file {
     711
     712    my $self = shift (@_); 
     713 
     714    my ($file) = @_;
     715
    726716    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
     717
     718    return $file;
     719}
     720
     721
     722
     723# The BasPlug read_into_doc_obj() function. This function does all the
     724# right things to make general options work for a given plugin.  It reads in
     725# a file and sets up a slew of metadata all saved in doc_obj, which
     726# it then returns as part of a tuple (process_status,doc_obj)
     727#
     728# Much of this functionality used to reside in read, but it was broken
     729# down into a supporting routine to make the code more flexible. 
     730#
     731# recursive plugins (e.g. RecPlug) and specialized plugins like those
     732# capable of processing many documents within a single file (e.g.
     733# GMLPlug) will normally want to implement their own version of
     734# read_into_doc_obj()
     735#
     736# Note that $base_dir might be "" and that $file might
     737# include directories
     738sub read_into_doc_obj {
     739    my $self = shift (@_); 
     740    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
     741
     742    if ($self->is_recursive()) {
     743    gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
     744    }
     745
     746    my $outhandle   = $self->{'outhandle'};
     747
     748    my ($block_status,$filename) = $self->read_block(@_);   
     749    return $block_status if ((!defined $block_status) || ($block_status==0));
     750    $file = $self->read_tidy_file($file);
    727751
    728752    # Do encoding stuff
     
    760784    $self->{'num_not_processed'} ++;
    761785
    762     return 0; # what should we return here?? error but don't want to pass it on
     786    return (0,undef); # what should we return here?? error but don't want to pass it on
    763787    }
    764788   
     
    773797    undef $text;
    774798    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
    775     return -1;
     799    return (-1,undef);
    776800    }
    777801    $text='';
     
    790814    $doc_obj->set_OID();
    791815    }
    792 
    793     # process the document
    794     $processor->process($doc_obj);
    795 
    796     if(defined($self->{'places_filename'})){
    797     &util::rm($self->{'places_filename'});
    798     $self->{'places_filename'} = undef;
    799     }
    800 
    801     $self->{'num_processed'} ++;
    802     undef $doc_obj;
    803     return 1; # processed the file
     816   
     817    return (1,$doc_obj);
     818}
     819
     820
     821# The BasPlug read() function. This function calls read_into_doc_obj(),
     822# which does all the right things to make general options work for a
     823# given plugin. It then calls the process() function which
     824# does all the work specific to a plugin (like the old read functions
     825# used to do). Most plugins should define their own process() function
     826# and let this read() function keep control. 
     827#
     828# recursive plugins (e.g. RecPlug) and specialized plugins like those
     829# capable of processing many documents within a single file (e.g.
     830# GMLPlug) might want to implement their own version of read(), but
     831# more likely need to implement their own version of read_into_doc_obj()
     832#
     833# Return number of files processed, undef if can't recognise, -1 if can't
     834# process
     835
     836sub read {
     837    my $self = shift (@_); 
     838    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
     839
     840    my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
     841   
     842    if ((defined $process_status) && ($process_status == 1)) {
     843    # process the document
     844    $processor->process($doc_obj);
     845   
     846    if(defined($self->{'places_filename'})){
     847        &util::rm($self->{'places_filename'});
     848        $self->{'places_filename'} = undef;
     849    }
     850   
     851    $self->{'num_processed'} ++;
     852    undef $doc_obj;
     853    }
     854
     855    # if process_status == 1, then the file has been processed.
     856    return $process_status;
     857
    804858}
    805859
     
    830884    return;
    831885    }
    832 
    833886    $$textref = "";
    834 
    835887    if (!open (FILE, $filename)) {
    836888    gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
    837      die "\n";
    838      }
    839 
     889    die "\n";
     890    }
     891     
    840892    if ($encoding eq "ascii") {
    841893    undef $/;
     
    847899    $reader->set_encoding ($encoding);
    848900    $reader->read_file ($textref);
    849 
    850     #Now segments chinese if the separate_cjk option is set
     901        # Now segment Chinese text if the separate_cjk option is set
    851902    if ($self->{'separate_cjk'}) {
    852903        # segment the Chinese words
     
    854905    }
    855906    }
    856 
    857907    close FILE;
    858908}
     909
     910# utf8_write_file -- used by ConvertToPlug, for example in post processing
     911#
     912sub utf8_write_file {
     913    my $self = shift (@_);
     914    my ($textref, $filename) = @_;
     915   
     916    if (!open (FILE, ">$filename")) {
     917    gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
     918     die "\n";
     919     }
     920    print FILE $$textref;
     921   
     922    close FILE;
     923}
     924
    859925
    860926sub filename_based_title
     
    887953    my ($filename) = @_;
    888954
     955   
    889956    my ($language, $encoding, $extracted_encoding);
    890957    if ($self->{'input_encoding'} eq "auto") {
     
    910977        $encoding = $self->{'input_encoding'};
    911978    }
     979
    912980    return ($language, $encoding);
    913981}
     
    9421010    }
    9431011    }
    944 
     1012   
    9451013
    9461014    # remove <title>stuff</title> -- as titles tend often to be in English
Note: See TracChangeset for help on using the changeset viewer.