Ignore:
Timestamp:
2000-07-13T10:21:53+12:00 (24 years ago)
Author:
sjboddie
Message:

merged changes to trunk into New_Config_Format branch

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/New_Config_Format-branch/gsdl/perllib/plugins/BasPlug.pm

    r839 r1279  
    2626package BasPlug;
    2727
     28use parsargv;
     29use multiread;
     30use cnseg;
     31use acronym;
     32use strict;
     33use doc;
     34
     35sub print_general_usage {
     36    my ($plugin_name) = @_;
     37
     38    print STDERR "\n  usage: plugin $plugin_name [options]\n\n";
     39    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
     40    print STDERR "                     converted from these encodings and stored internally as\n";
     41    print STDERR "                     utf8. The default input_encoding is Latin1. Accepted values\n";
     42    print STDERR "                     are:\n";
     43    print STDERR "                        iso_8859_1 (extended ascii)\n";
     44    print STDERR "                        Latin1 (the same as iso-8859-1)\n";
     45    print STDERR "                        ascii (7 bit ascii -- may be faster than Latin1 as no\n";
     46    print STDERR "                               conversion is neccessary)\n";
     47    print STDERR "                        gb (GB or GBK simplified Chinese)\n";
     48    print STDERR "                        iso_8859_6 (8 bit Arabic)\n";
     49    print STDERR "                        windows_1256 (Windows codepage 1256 (Arabic))\n";
     50    print STDERR "                        Arabic (the same as windows_1256)\n";
     51    print STDERR "                        utf8 (either utf8 or unicode -- automatically detected)\n";
     52    print STDERR "                        unicode (just unicode -- doesn't currently do endian\n";
     53    print STDERR "                                 detection)\n";
     54    print STDERR "   -process_exp      A perl regular expression to match against filenames.\n";
     55    print STDERR "                     Matching filenames will be processed by this plugin.\n";
     56    print STDERR "                     Each plugin has its own default process_exp. e.g HTMLPlug\n";
     57    print STDERR "                     defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
     58    print STDERR "                     .htm or .html (case-insensitive).\n";
     59    print STDERR "   -block_exp        Files matching this regular expression will be blocked from\n";
     60    print STDERR "                     being passed to any further plugins in the list. This has no\n";
     61    print STDERR "                     real effect other than to prevent lots of warning messages\n";
     62    print STDERR "                     about input files you don't care about. Each plugin may or may\n";
     63    print STDERR "                     not have a default block_exp. e.g. by default HTMLPlug blocks\n";
     64    print STDERR "                     any files with .gif, .jpg, .jpeg, .png, .pdf, .rtf or .css\n";
     65    print STDERR "                     file extensions.\n";
     66    print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
     67}
     68
     69# print_usage should be overridden for any sub-classes having
     70# their own plugin specific options
     71sub print_usage {
     72    print STDERR "\nThis plugin has no plugin specific options\n\n";
     73
     74}
    2875
    2976sub new {
    30     my ($class) = @_;
    31 
    32     return bless {}, $class;
     77    my $class = shift (@_);
     78    my $plugin_name = shift (@_);
     79
     80    my $self = {};
     81    my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode)\$";
     82
     83    # general options available to all plugins
     84    if (!parsargv::parse(\@_,
     85             qq^input_encoding/$encodings/Latin1^, \$self->{'input_encoding'},
     86             q^process_exp/.*/^, \$self->{'process_exp'},
     87             q^block_exp/.*/^, \$self->{'block_exp'},
     88             q^extract_acronyms^, \$self->{'extract_acronyms'},
     89             "allow_extra_options")) {
     90
     91    print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
     92    print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
     93        &print_general_usage($plugin_name);
     94    die "\n";
     95    }
     96
     97    return bless $self, $class;
     98}
     99
     100# initialize BasPlug options
     101# if init() is overridden in a sub-class, remember to call BasPlug::init()
     102sub init {
     103    my $self = shift (@_);
     104    my ($verbosity) = @_;
     105
     106    # verbosity is passed through from the processor
     107    $self->{'verbosity'} = $verbosity;
     108
     109    # set process_exp and block_exp to defaults unless they were
     110    # explicitly set
     111
     112    if ((!$self->is_recursive()) and
     113    (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
     114
     115    $self->{'process_exp'} = $self->get_default_process_exp ();
     116    if ($self->{'process_exp'} eq "") {
     117        warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
     118    }
     119    }
     120
     121    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
     122    $self->{'block_exp'} = $self->get_default_block_exp ();
     123    }
     124   
     125    # handle input_encoding aliases
     126    $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";
     127    $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";
    33128}
    34129
     
    42137}
    43138
    44 # return 1 if this class might recurse using $pluginfo
     139# this function should be overridden to return 1
     140# in recursive plugins
    45141sub is_recursive {
    46142    my $self = shift (@_);
    47143
    48     die "BasPlug::is_recursive function must be implemented in sub classes\n";
    49 }
    50 
    51 # return number of files processed, undef if can't process
     144    return 0;
     145}
     146
     147sub get_default_block_exp {
     148    my $self = shift (@_);
     149
     150    return "";
     151}
     152
     153sub get_default_process_exp {
     154    my $self = shift (@_);
     155
     156    return "";
     157}
     158
     159# The BasPlug read() function. This function does all the right things
     160# to make general options work for a given plugin. It calls the process()
     161# function which does all the work specific to a plugin (like the old
     162# read functions used to do). Most plugins should define their own
     163# process() function and let this read() function keep control.
     164#
     165# recursive plugins (e.g. RecPlug) and specialized plugins like those
     166# capable of processing many documents within a single file (e.g.
     167# GMLPlug) should normally implement their own version of read()
     168#
     169# Return number of files processed, undef if can't process
    52170# Note that $base_dir might be "" and that $file might
    53171# include directories
     172
    54173sub read {
    55174    my $self = shift (@_);
    56175    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    57176
    58     die "BasPlug::read function must be implemented in sub classes\n";
    59 
    60     return undef; # will never get here
    61 }
    62 
    63 sub extra_metadata
    64 {
    65     my ($self,$doc_obj,$cursection, $metadata) = @_;
    66 
    67     foreach $field (keys(%$metadata)) {
     177    if ($self->is_recursive()) {
     178    die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
     179    }
     180
     181    my $filename = &util::filename_cat($base_dir, $file);
     182    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
     183    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
     184    return undef;
     185    }
     186    my $plugin_name = ref ($self);
     187    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
     188   
     189    # create a new document
     190    my $doc_obj = new doc ($file, "indexed_doc");
     191   
     192    # read in file ($text will be in utf8)
     193    my $text = "";
     194    $self->read_file ($filename, \$text);
     195
     196    if ($text !~ /\w/) {
     197    print STDERR "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
     198    return 0;
     199    }
     200
     201    # include any metadata passed in from previous plugins
     202    # note that this metadata is associated with the top level section
     203    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
     204
     205    # do plugin specific processing of doc_obj
     206    return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
     207
     208    # do any automatic metadata extraction
     209    $self->auto_extract_metadata ($doc_obj);
     210
     211    # add an OID
     212    $doc_obj->set_OID();
     213
     214    # process the document
     215    $processor->process($doc_obj);
     216
     217    return 1; # processed the file
     218}
     219
     220# returns undef if file is rejected by the plugin
     221sub process {
     222    my $self = shift (@_);
     223    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
     224
     225    die "Basplug::process function must be implemented in sub-class\n";
     226
     227    return undef; # never gets here
     228}
     229
     230# uses the multiread package to read in the entire file pointed to
     231# by filename and loads the resulting text into $$textref. Input text
     232# may be in any of the encodings handled by multiread, output text
     233# will be in utf8
     234sub read_file {
     235    my $self = shift (@_);
     236    my ($filename, $textref) = @_;
     237
     238    $$textref = "";
     239
     240    open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
     241
     242    if ($self->{'input_encoding'} eq "ascii") {
     243    undef $/;
     244    $$textref = <FILE>;
     245    $/ = "\n";
     246    } else {
     247    my $reader = new multiread();
     248    $reader->set_handle ('BasPlug::FILE');
     249    $reader->set_encoding ($self->{'input_encoding'});
     250    $reader->read_file ($textref);
     251
     252    if ($self->{'input_encoding'} eq "gb") {
     253        # segment the Chinese words
     254        $$textref = &cnseg::segment($$textref);
     255    }
     256    }
     257
     258    close FILE;
     259}
     260
     261# add any extra metadata that's been passed around from one
     262# plugin to another.
     263# extra_metadata uses add_utf8_metadata so it expects metadata values
     264# to already be in utf8
     265sub extra_metadata {
     266    my $self = shift (@_);
     267    my ($doc_obj, $cursection, $metadata) = @_;
     268
     269    foreach my $field (keys(%$metadata)) {
    68270    # $metadata->{$field} may be an array reference
    69271    if (ref ($metadata->{$field}) eq "ARRAY") {
    70272        map {
    71         $doc_obj->add_metadata ($cursection, $field, $_);
     273        $doc_obj->add_utf8_metadata ($cursection, $field, $_);
    72274        } @{$metadata->{$field}};
    73275    } else {
    74         $doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
     276        $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
     277    }
     278    }
     279}
     280
     281# extract acronyms (and hopefully other stuff soon too).
     282sub auto_extract_metadata {
     283    my $self = shift (@_);
     284    my ($doc_obj) = @_;
     285
     286    if ($self->{'extract_acronyms'}) {
     287    my $thissection = $doc_obj->get_top_section();
     288    while (defined $thissection) {
     289        my $text = $doc_obj->get_text($thissection);
     290        $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
     291        $thissection = $doc_obj->get_next_section ($thissection);
     292    }
     293    }
     294}
     295
     296sub extract_acronyms {
     297    my $self = shift (@_);
     298    my ($textref, $doc_obj, $thissection) = @_;
     299
     300    my $acro_array =  &acronym::acronyms($textref);
     301
     302    foreach my $acro (@$acro_array) {
     303
     304    #do the normal acronym
     305    $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
     306    print "found " . $acro->to_string() . "\n";
     307       
     308    # do the KWIC (Key Word In Context) acronym
     309    my @kwic = $acro->to_string_kwic();
     310    foreach my $kwic (@kwic) {
     311        $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC",  $kwic);
     312        print "found (KWIC)" . $kwic . "\n";
    75313    }
    76314    }
Note: See TracChangeset for help on using the changeset viewer.