Ignore:
Timestamp:
2000-06-21T10:14:14+12:00 (24 years ago)
Author:
sjboddie
Message:

Made BasPlug take options (these options are available to all plugins
derived from BasPlug). Also added a -input_encoding option and related
subroutines. Existing plugins will need to be updated to match these changes
before they can take advantage of the new option (or any others I may get
around to adding).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r839 r1219  
    2626package BasPlug;
    2727
     28use parsargv;
     29use multiread;
     30use cnseg;
     31use strict;
     32
     33sub print_usage {
     34    print STDERR "\nOne of your plugins uses an incorrect general option (general options are those\n";
     35    print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
     36   
     37    print STDERR "\n  usage: plugin plugin-name [options]\n\n";
     38    print STDERR "  currently supported general options are:\n";
     39    print STDERR "   -input_encoding  The encoding of the source documents. Documents will be\n";
     40    print STDERR "                    converted from these encodings and stored internally as\n";
     41    print STDERR "                    utf8. The default input_encoding is Latin1. Accepted values\n";
     42    print STDERR "                    are:\n";
     43    print STDERR "                      iso_8859_1 (extended ascii)\n";
     44    print STDERR "                      Latin1 (the same as iso-8859-1)\n";
     45    print STDERR "                      ascii (7 bit ascii -- may be faster than Latin1 as no\n";
     46    print STDERR "                             conversion is neccessary)\n";
     47    print STDERR "                      gb (GB or GBK simplified Chinese)\n";
     48    print STDERR "                      iso_8859_6 (8 bit Arabic)\n";
     49    print STDERR "                      Arabic (the same as iso-8859-6)\n";
     50    print STDERR "                      utf8 (either utf8 or unicode -- automatically detected)\n";
     51    print STDERR "                      unicode (just unicode -- doesn't currently do endian\n";
     52    print STDERR "                               detection)\n\n";
     53}
    2854
    2955sub new {
    30     my ($class) = @_;
     56    my $class = shift (@_);
    3157
    32     return bless {}, $class;
     58    my $self = {};
     59    my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|Arabic|utf8|unicode)\$";
     60
     61    # general options available to all plugins
     62    if (!parsargv::parse(\@_, "input_encoding/$encodings/Latin1", \$self->{'input_encoding'},
     63             "allow_extra_options")) {
     64    &print_usage();
     65    die "\n";
     66    }
     67
     68    return bless $self, $class;
    3369}
    3470
     
    6197}
    6298
    63 sub extra_metadata
    64 {
    65     my ($self,$doc_obj,$cursection, $metadata) = @_;
     99# uses the multiread package to read in the entire file pointed to
     100# by filename and loads the resulting text into $$textref. Input text
     101# may be in any of the encodings handled by multiread, output text
     102# will be in utf8
     103sub read_file {
     104    my $self = shift (@_);
     105    my ($filename, $textref) = @_;
    66106
    67     foreach $field (keys(%$metadata)) {
     107    $$textref = "";
     108    my $encoding = "";
     109    if ($self->{'input_encoding'} =~ /^(Latin1|iso_8859_1)$/) {
     110    $encoding = "latin1";
     111    } elsif ($self->{'input_encoding'} =~ /^(Arabic|iso_8859_6)$/) {
     112    $encoding = "arabic";
     113    } else {
     114    $encoding = $self->{'input_encoding'};
     115    }
     116
     117    open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
     118
     119    if ($encoding eq "ascii") {
     120    undef $/;
     121    $$textref = <FILE>;
     122    $/ = "\n";
     123    } else {
     124    my $reader = new multiread();
     125    $reader->set_handle ('BasPlug::FILE');
     126    $reader->set_encoding ($encoding);
     127    $reader->read_file ($textref);
     128
     129    if ($encoding eq "gb") {
     130        # segment the Chinese words
     131        $$textref = &cnseg::segment($$textref);
     132    }
     133    }
     134
     135    close FILE;
     136}
     137
     138# add any extra metadata that's been passed around from one
     139# plugin to another.
     140# extra_metadata uses add_utf8_metadata so it expects metadata values
     141# to already be in utf8
     142sub extra_metadata {
     143    my $self = shift (@_);
     144    my ($doc_obj, $cursection, $metadata) = @_;
     145
     146    foreach my $field (keys(%$metadata)) {
    68147    # $metadata->{$field} may be an array reference
    69148    if (ref ($metadata->{$field}) eq "ARRAY") {
    70149        map {
    71         $doc_obj->add_metadata ($cursection, $field, $_);
     150        $doc_obj->add_utf8_metadata ($cursection, $field, $_);
    72151        } @{$metadata->{$field}};
    73152    } else {
    74         $doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
     153        $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
    75154    }
    76155    }
Note: See TracChangeset for help on using the changeset viewer.