Changeset 1242


Ignore:
Timestamp:
2000-06-27T09:36:01+12:00 (24 years ago)
Author:
sjboddie
Message:

Added Stuart Yeates's acronym extraction code and made it a standard
plugin option (i.e. any plugin derived from BasPlug -- and using
BasPlug's read() function -- can include an -extract_acronyms option to
automatically extract acronyms and set them as metadata). Currently sets
"Acronym" and "AcronymKWIC" (key word in context) metadata; one day I'll
add options to allow for setting only one or the other.
The acronym extraction code is currently very slow (Stuart tells me he
has a Java version which is much better); it seems like it could probably
be sped up a little without too much effort, though.
Also moved HTMLPlug's -process_exp and -block_exp options to BasPlug. These
options allow a plugin to specify regular expressions to match against
filenames when deciding which documents to process (or ignore).

Location:
trunk/gsdl/perllib
Files:
1 added
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1229 r1242  
    2929use multiread;
    3030use cnseg;
     31use acronym;
    3132use strict;
     33use doc;
    3234
    3335sub print_usage {
    34     print STDERR "\nOne of your plugins uses an incorrect general option (general options are those\n";
     36    my ($plugin_name) = @_;
     37
     38    print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
    3539    print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
    3640   
    37     print STDERR "\n  usage: plugin plugin-name [options]\n\n";
     41    print STDERR "\n  usage: plugin $plugin_name [options]\n\n";
    3842    print STDERR "  currently supported general options are:\n";
    39     print STDERR "   -input_encoding  The encoding of the source documents. Documents will be\n";
    40     print STDERR "                    converted from these encodings and stored internally as\n";
    41     print STDERR "                    utf8. The default input_encoding is Latin1. Accepted values\n";
    42     print STDERR "                    are:\n";
    43     print STDERR "                      iso_8859_1 (extended ascii)\n";
    44     print STDERR "                      Latin1 (the same as iso-8859-1)\n";
    45     print STDERR "                      ascii (7 bit ascii -- may be faster than Latin1 as no\n";
    46     print STDERR "                             conversion is neccessary)\n";
    47     print STDERR "                      gb (GB or GBK simplified Chinese)\n";
    48     print STDERR "                      iso_8859_6 (8 bit Arabic)\n";
    49     print STDERR "                      windows_1256 (Windows codepage 1256 (Arabic))\n";
    50     print STDERR "                      Arabic (the same as windows_1256)\n";
    51     print STDERR "                      utf8 (either utf8 or unicode -- automatically detected)\n";
    52     print STDERR "                      unicode (just unicode -- doesn't currently do endian\n";
    53     print STDERR "                               detection)\n\n";
     43    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
     44    print STDERR "                     converted from these encodings and stored internally as\n";
     45    print STDERR "                     utf8. The default input_encoding is Latin1. Accepted values\n";
     46    print STDERR "                     are:\n";
     47    print STDERR "                        iso_8859_1 (extended ascii)\n";
     48    print STDERR "                        Latin1 (the same as iso-8859-1)\n";
     49    print STDERR "                        ascii (7 bit ascii -- may be faster than Latin1 as no\n";
     50    print STDERR "                               conversion is neccessary)\n";
     51    print STDERR "                        gb (GB or GBK simplified Chinese)\n";
     52    print STDERR "                        iso_8859_6 (8 bit Arabic)\n";
     53    print STDERR "                        windows_1256 (Windows codepage 1256 (Arabic))\n";
     54    print STDERR "                        Arabic (the same as windows_1256)\n";
     55    print STDERR "                        utf8 (either utf8 or unicode -- automatically detected)\n";
     56    print STDERR "                        unicode (just unicode -- doesn't currently do endian\n";
     57    print STDERR "                                 detection)\n";
     58    print STDERR "   -process_exp      A perl regular expression to match against filenames.\n";
     59    print STDERR "                     Matching filenames will be processed by this plugin.\n";
     60    print STDERR "                     Each plugin has its own default process_exp. e.g HTMLPlug\n";
     61    print STDERR "                     defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
     62    print STDERR "                     .htm or .html (case-insensitive).\n";
     63    print STDERR "   -block_exp        Files matching this regular expression will be blocked from\n";
     64    print STDERR "                     being passed to any further plugins in the list. This has no\n";
     65    print STDERR "                     real effect other than to prevent lots of warning messages\n";
     66    print STDERR "                     about input files you don't care about. Each plugin may or may\n";
     67    print STDERR "                     not have a default block_exp. e.g. by default HTMLPlug blocks\n";
     68    print STDERR "                     any files with .gif, .jpg, .jpeg, .png, .pdf, .rtf or .css\n";
     69    print STDERR "                     file extensions.\n";
     70    print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
    5471}
    5572
    5673sub new {
    5774    my $class = shift (@_);
     75    my $plugin_name = shift (@_);
    5876
    5977    my $self = {};
     
    6179
    6280    # general options available to all plugins
    63     if (!parsargv::parse(\@_, "input_encoding/$encodings/Latin1", \$self->{'input_encoding'},
     81    if (!parsargv::parse(\@_,
     82             qq^input_encoding/$encodings/Latin1^, \$self->{'input_encoding'},
     83             q^process_exp/.*/^, \$self->{'process_exp'},
     84             q^block_exp/.*/^, \$self->{'block_exp'},
     85             q^extract_acronyms^, \$self->{'extract_acronyms'},
    6486             "allow_extra_options")) {
    65     &print_usage();
     87    &print_usage($plugin_name);
    6688    die "\n";
    6789    }
    6890
    6991    return bless $self, $class;
     92}
     93
     94# initialize BasPlug options
     95# if init() is overridden in a sub-class, remember to call BasPlug::init()
     96sub init {
     97    my $self = shift (@_);
     98    my ($verbosity) = @_;
     99
     100    # verbosity is passed through from the processor
     101    $self->{'verbosity'} = $verbosity;
     102
     103    # set process_exp and block_exp to defaults unless they were
     104    # explicitly set
     105    if ((!$self->is_recursive()) &&
     106    (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
     107   
     108    $self->{'process_exp'} = $self->get_default_process_exp ();
     109    if ($self->{'process_exp'} eq "") {
     110        warn ref($self) . " Warning: Non-recursive plugin has no process_exp so will have no effect\n";
     111    }
     112    }
     113
     114    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
     115    $self->{'block_exp'} = $self->get_default_block_exp ();
     116    }
    70117}
    71118
     
    79126}
    80127
    81 # return 1 if this class might recurse using $pluginfo
     128# this function should be overridden to return 1
     129# in recursive plugins
    82130sub is_recursive {
    83131    my $self = shift (@_);
    84132
    85     die "BasPlug::is_recursive function must be implemented in sub classes\n";
    86 }
    87 
    88 # return number of files processed, undef if can't process
     133    return 0;
     134}
     135
     136sub get_default_block_exp {
     137    my $self = shift (@_);
     138
     139    return "";
     140}
     141
     142sub get_default_process_exp {
     143    my $self = shift (@_);
     144
     145    return "";
     146}
     147
     148# The BasPlug read() function. This function does all the right things
     149# to make general options work for a given plugin. It calls the process()
     150# function which does all the work specific to a plugin (like the old
     151# read functions used to do). Most plugins should define their own
     152# process() function and let this read() function keep control.
     153#
     154# Return number of files processed, undef if can't process
    89155# Note that $base_dir might be "" and that $file might
    90156# include directories
     157
    91158sub read {
    92159    my $self = shift (@_);
    93160    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    94161
    95     die "BasPlug::read function must be implemented in sub classes\n";
    96 
    97     return undef; # will never get here
     162    if ($self->is_recursive()) {
     163    die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
     164    }
     165
     166    my $filename = &util::filename_cat($base_dir, $file);
     167    return 0 if $filename =~ /$self->{'block_exp'}/;
     168    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
     169    return undef;
     170    }
     171    my $plugin_name = ref ($self);
     172    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
     173   
     174    # create a new document
     175    my $doc_obj = new doc ($file, "indexed_doc");
     176    my $cursection =
     177   
     178    # read in file ($text will be in utf8)
     179    my $text = "";
     180    $self->read_file ($filename, \$text);
     181
     182    if ($text !~ /\w/) {
     183    print STDERR "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
     184    return 0;
     185    }
     186
     187    # include any metadata passed in from previous plugins
     188    # note that this metadata is associated with the top level section
     189    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
     190
     191    # do plugin specific processing of doc_obj
     192    $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
     193
     194    # add text
     195    $doc_obj->add_utf8_text ($cursection, $text);
     196
     197    # do any automatic metadata extraction
     198    $self->auto_extract_metadata ($doc_obj);
     199
     200    # add an OID
     201    $doc_obj->set_OID();
     202
     203    # process the document
     204    $processor->process($doc_obj);
     205
     206    return 1; # processed the file
     207}
     208
     209sub process {
     210    my $self = shift (@_);
     211    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
     212
     213    die "Basplug::process function must be implemented in sub-class\n";
    98214}
    99215
     
    157273}
    158274
     275# extract acronyms (and hopefully other stuff soon too).
     276sub auto_extract_metadata {
     277    my $self = shift (@_);
     278    my ($doc_obj) = @_;
     279
     280    if ($self->{'extract_acronyms'}) {
     281    my $thissection = $doc_obj->get_top_section();
     282    while (defined $thissection) {
     283        my $text = $doc_obj->get_text($thissection);
     284        $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
     285        $thissection = $doc_obj->get_next_section ($thissection);
     286    }
     287    }
     288}
     289
     290sub extract_acronyms {
     291    my $self = shift (@_);
     292    my ($textref, $doc_obj, $thissection) = @_;
     293
     294    my $acro_array =  &acronym::acronyms($textref);
     295
     296    foreach my $acro (@$acro_array) {
     297
     298    #do the normal acronym
     299    $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
     300    print "found " . $acro->to_string() . "\n";
     301       
     302    # do the KWIC (Key Word In Context) acronym
     303    my @kwic = $acro->to_string_kwic();
     304    foreach my $kwic (@kwic) {
     305        $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC",  $kwic);
     306        print "found (KWIC)" . $kwic . "\n";
     307    }
     308    }
     309}
     310
    1593111;
Note: See TracChangeset for help on using the changeset viewer.