Ignore:
Timestamp:
2000-06-27T17:10:07+12:00 (24 years ago)
Author:
sjboddie
Message:

Caught up most general plugins (that is, the ones in gsdlhome/perllib/plugins)
with changes to BasPlug so that they can all now use the new general plugin
options. Those I didn't do were FoxPlug (as it's not actually used anywhere
and I don't know what it does) and WebPlug (as it's kind of a work in
progress and doesn't really work anyway). All plugins will still work
(including all the collection-specific ones that are lying around); some
of them just won't have access to the general options.
I also wrote a short Perl script (pluginfo.pl) that prints out all the
options available to a given plugin.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1242 r1244  
    3333use doc;
    3434
    35 sub print_usage {
     35sub print_general_usage {
    3636    my ($plugin_name) = @_;
    3737
    38     print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
    39     print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
    40    
    4138    print STDERR "\n  usage: plugin $plugin_name [options]\n\n";
    42     print STDERR "  currently supported general options are:\n";
    4339    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
    4440    print STDERR "                     converted from these encodings and stored internally as\n";
     
    7167}
    7268
     69# print_usage should be overridden for any sub-classes having
     70# their own plugin specific options
     71sub print_usage {
     72    print STDERR "\nThis plugin has no plugin specific options\n\n";
     73
     74}
     75
    7376sub new {
    7477    my $class = shift (@_);
     
    8588             q^extract_acronyms^, \$self->{'extract_acronyms'},
    8689             "allow_extra_options")) {
    87     &print_usage($plugin_name);
     90
     91    print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
     92    print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
     93        &print_general_usage($plugin_name);
    8894    die "\n";
    8995    }
     
    103109    # set process_exp and block_exp to defaults unless they were
    104110    # explicitly set
    105     if ((!$self->is_recursive()) &&
     111
     112    if ((!$self->is_recursive()) and
    106113    (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
    107    
     114
    108115    $self->{'process_exp'} = $self->get_default_process_exp ();
    109116    if ($self->{'process_exp'} eq "") {
    110         warn ref($self) . " Warning: Non-recursive plugin has no process_exp so will have no effect\n";
     117        warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
    111118    }
    112119    }
     
    115122    $self->{'block_exp'} = $self->get_default_block_exp ();
    116123    }
     124   
     125    # handle input_encoding aliases
     126    $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";
     127    $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";
    117128}
    118129
     
    152163# process() function and let this read() function keep control.
    153164#
     165# recursive plugins (e.g. RecPlug) and specialized plugins like those
     166# capable of processing many documents within a single file (e.g.
     167# GMLPlug) should normally implement their own version of read()
     168#
    154169# Return number of files processed, undef if can't process
    155170# Note that $base_dir might be "" and that $file might
     
    165180
    166181    my $filename = &util::filename_cat($base_dir, $file);
    167     return 0 if $filename =~ /$self->{'block_exp'}/;
     182    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
    168183    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
    169184    return undef;
     
    174189    # create a new document
    175190    my $doc_obj = new doc ($file, "indexed_doc");
    176     my $cursection =
    177191   
    178192    # read in file ($text will be in utf8)
     
    190204
    191205    # do plugin specific processing of doc_obj
    192     $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
    193 
    194     # add text
    195     $doc_obj->add_utf8_text ($cursection, $text);
     206    return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
    196207
    197208    # do any automatic metadata extraction
     
    207218}
    208219
     220# returns undef if file is rejected by the plugin
    209221sub process {
    210222    my $self = shift (@_);
     
    212224
    213225    die "Basplug::process function must be implemented in sub-class\n";
     226
     227    return undef; # never gets here
    214228}
    215229
     
    223237
    224238    $$textref = "";
    225     my $encoding = "";
    226     if ($self->{'input_encoding'} =~ /^(Latin1|iso_8859_1)$/) {
    227     $encoding = "iso_8859_1";
    228     } elsif ($self->{'input_encoding'} =~ /^(Arabic|windows_1256)$/) {
    229     $encoding = "windows_1256";
    230     } else {
    231     $encoding = $self->{'input_encoding'};
    232     }
    233239
    234240    open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
    235241
    236     if ($encoding eq "ascii") {
     242    if ($self->{'input_encoding'} eq "ascii") {
    237243    undef $/;
    238244    $$textref = <FILE>;
     
    241247    my $reader = new multiread();
    242248    $reader->set_handle ('BasPlug::FILE');
    243     $reader->set_encoding ($encoding);
     249    $reader->set_encoding ($self->{'input_encoding'});
    244250    $reader->read_file ($textref);
    245251
    246     if ($encoding eq "gb") {
     252    if ($self->{'input_encoding'} eq "gb") {
    247253        # segment the Chinese words
    248254        $$textref = &cnseg::segment($$textref);
Note: See TracChangeset for help on using the changeset viewer.