Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1242

Timestamp:

2000-06-27T09:36:01+12:00 (24 years ago)

Author:

sjboddie

Message:

Added Stuart Yeates' acronym extraction code and made it a standard
plugin option (i.e. any plugin derived from BasPlug -- and using
BasPlug's read() function -- can include an -extract_acronyms option to
automatically extract acronyms and set them as metadata). Currently sets
"Acronym" and "AcronymKWIC" (key word in context) metadata; one day I'll
add options to allow for setting only one or the other.
The acronym extraction code is currently very slow (Stuart tells me he
has a Java version which is much better); it seems like it could probably
be sped up a little without too much effort, though.
Also moved HTMLPlug's -process_exp and -block_exp options to BasPlug. These
options allow a plugin to specify regular expressions to match against
filenames when deciding which documents to process (or ignore).

Location:

trunk/gsdl/perllib

Files:

: 1 added
: 1 edited

acronym.pm (added)
plugins/BasPlug.pm (modified) (4 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/BasPlug.pm

-              r1229
+              r1242
 use multiread;
 use cnseg;
+use acronym;
 use strict;
+use doc;
 sub print_usage {
+    print STDERR "\nOne of your plugins uses an incorrect general option (general options are those\n";
+    my ($plugin_name) = @_;
+    print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
     print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
     print STDERR "\n  usage: plugin plugin-name [options]\n\n";
+    print STDERR "\n  usage: plugin $plugin_name [options]\n\n";
     print STDERR "  currently supported general options are:\n";
+    print STDERR "   -input_encoding  The encoding of the source documents. Documents will be\n";
+    print STDERR "                    converted from these encodings and stored internally as\n";
+    print STDERR "                    utf8. The default input_encoding is Latin1. Accepted values\n";
+    print STDERR "                    are:\n";
+    print STDERR "                      iso_8859_1 (extended ascii)\n";
+    print STDERR "                      Latin1 (the same as iso-8859-1)\n";
+    print STDERR "                      ascii (7 bit ascii -- may be faster than Latin1 as no\n";
+    print STDERR "                             conversion is neccessary)\n";
+    print STDERR "                      gb (GB or GBK simplified Chinese)\n";
+    print STDERR "                      iso_8859_6 (8 bit Arabic)\n";
+    print STDERR "                      windows_1256 (Windows codepage 1256 (Arabic))\n";
+    print STDERR "                      Arabic (the same as windows_1256)\n";
+    print STDERR "                      utf8 (either utf8 or unicode -- automatically detected)\n";
+    print STDERR "                      unicode (just unicode -- doesn't currently do endian\n";
+    print STDERR "                               detection)\n\n";
+    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
+    print STDERR "                     converted from these encodings and stored internally as\n";
+    print STDERR "                     utf8. The default input_encoding is Latin1. Accepted values\n";
+    print STDERR "                     are:\n";
+    print STDERR "                        iso_8859_1 (extended ascii)\n";
+    print STDERR "                        Latin1 (the same as iso-8859-1)\n";
+    print STDERR "                        ascii (7 bit ascii -- may be faster than Latin1 as no\n";
+    print STDERR "                               conversion is neccessary)\n";
+    print STDERR "                        gb (GB or GBK simplified Chinese)\n";
+    print STDERR "                        iso_8859_6 (8 bit Arabic)\n";
+    print STDERR "                        windows_1256 (Windows codepage 1256 (Arabic))\n";
+    print STDERR "                        Arabic (the same as windows_1256)\n";
+    print STDERR "                        utf8 (either utf8 or unicode -- automatically detected)\n";
+    print STDERR "                        unicode (just unicode -- doesn't currently do endian\n";
+    print STDERR "                                 detection)\n";
+    print STDERR "   -process_exp      A perl regular expression to match against filenames.\n";
+    print STDERR "                     Matching filenames will be processed by this plugin.\n";
+    print STDERR "                     Each plugin has its own default process_exp. e.g HTMLPlug\n";
+    print STDERR "                     defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
+    print STDERR "                     .htm or .html (case-insensitive).\n";
+    print STDERR "   -block_exp        Files matching this regular expression will be blocked from\n";
+    print STDERR "                     being passed to any further plugins in the list. This has no\n";
+    print STDERR "                     real effect other than to prevent lots of warning messages\n";
+    print STDERR "                     about input files you don't care about. Each plugin may or may\n";
+    print STDERR "                     not have a default block_exp. e.g. by default HTMLPlug blocks\n";
+    print STDERR "                     any files with .gif, .jpg, .jpeg, .png, .pdf, .rtf or .css\n";
+    print STDERR "                     file extensions.\n";
+    print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
+}
 sub new {
     my $class = shift (@_);
+    my $plugin_name = shift (@_);
     my $self = {};
 …
     # general options available to all plugins
+    if (!parsargv::parse(\@_, "input_encoding/$encodings/Latin1", \$self->{'input_encoding'},
+    if (!parsargv::parse(\@_,
+             qq^input_encoding/$encodings/Latin1^, \$self->{'input_encoding'},
+             q^process_exp/.*/^, \$self->{'process_exp'},
+             q^block_exp/.*/^, \$self->{'block_exp'},
+             q^extract_acronyms^, \$self->{'extract_acronyms'},
              "allow_extra_options")) {
     &print_usage();
+    &print_usage($plugin_name);
     die "\n";
+    }
     return bless $self, $class;
+}
+# Initialize the general BasPlug options.
+# If init() is overridden in a sub-class, remember to call BasPlug::init()
+#
+# Parameters:
+#   $verbosity - passed through from the processor and stored in
+#                $self->{'verbosity'}
+#
+# process_exp and block_exp are set to the plugin's defaults (as returned
+# by get_default_process_exp / get_default_block_exp) unless they were
+# explicitly set to a non-empty value.
+sub init {
+    my $self = shift (@_);
+    my ($verbosity) = @_;
+    # verbosity is passed through from the processor
+    $self->{'verbosity'} = $verbosity;
+    # Only non-recursive plugins get a default process_exp. The
+    # "undefined or empty" test must be grouped: && binds tighter than ||,
+    # so without the extra parentheses a recursive plugin with an unset or
+    # empty process_exp would also take this branch and could trigger the
+    # "Non-recursive plugin" warning below.
+    if ((!$self->is_recursive()) &&
+        ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq ""))) {
+        $self->{'process_exp'} = $self->get_default_process_exp ();
+        if ($self->{'process_exp'} eq "") {
+            warn ref($self) . " Warning: Non-recursive plugin has no process_exp so will have no effect\n";
+        }
+    }
+    # block_exp is defaulted for every plugin, recursive or not
+    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
+        $self->{'block_exp'} = $self->get_default_block_exp ();
+    }
+}
 …
+}
+# return 1 if this class might recurse using $pluginfo
+# this function should be overridden to return 1
+# in recursive plugins
 sub is_recursive {
     my $self = shift (@_);
+    die "BasPlug::is_recursive function must be implemented in sub classes\n";
+}
+# return number of files processed, undef if can't process
+    return 0;
+}
+# Default block_exp used by init() when none was explicitly set.
+# This base implementation returns the empty string.
+sub get_default_block_exp {
+    my ($self) = @_;
+    return "";
+}
+# Default process_exp used by init() when none was explicitly set.
+# This base implementation returns the empty string, which init() reports
+# with a warning for non-recursive plugins.
+sub get_default_process_exp {
+    my ($self) = @_;
+    return "";
+}
+# The BasPlug read() function. This function does all the right things
+# to make general options work for a given plugin. It calls the process()
+# function which does all the work specific to a plugin (like the old
+# read functions used to do). Most plugins should define their own
+# process() function and let this read() function keep control.
+#
+# Return number of files processed, undef if can't process
 # Note that $base_dir might be "" and that $file might
 # include directories
 sub read {
     my $self = shift (@_);
     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
+    die "BasPlug::read function must be implemented in sub classes\n";
+    return undef; # will never get here
+    if ($self->is_recursive()) {
+    die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
+    }
+    my $filename = &util::filename_cat($base_dir, $file);
+    return 0 if $filename =~ /$self->{'block_exp'}/;
+    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
+    return undef;
+    }
+    my $plugin_name = ref ($self);
+    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
+    # create a new document
+    my $doc_obj = new doc ($file, "indexed_doc");
+    my $cursection =
+    # read in file ($text will be in utf8)
+    my $text = "";
+    $self->read_file ($filename, \$text);
+    if ($text !~ /\w/) {
+    print STDERR "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
+    return 0;
+    }
+    # include any metadata passed in from previous plugins
+    # note that this metadata is associated with the top level section
+    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
+    # do plugin specific processing of doc_obj
+    $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
+    # add text
+    $doc_obj->add_utf8_text ($cursection, $text);
+    # do any automatic metadata extraction
+    $self->auto_extract_metadata ($doc_obj);
+    # add an OID
+    $doc_obj->set_OID();
+    # process the document
+    $processor->process($doc_obj);
+    return 1; # processed the file
+}
+# Plugin-specific processing hook called by read() with a reference to the
+# document text and the document object. BasPlug provides no implementation:
+# calling this version is a fatal error.
+sub process {
+    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
+    die "Basplug::process function must be implemented in sub-class\n";
+}
 …
+}
+# Automatic metadata extraction for a document. At present the only
+# extractor is acronym extraction, run when $self->{'extract_acronyms'}
+# is set.
+#   $doc_obj - document whose sections are scanned, starting at the top
+#              section and following get_next_section() until it
+#              returns undef
+sub auto_extract_metadata {
+    my ($self, $doc_obj) = @_;
+    if ($self->{'extract_acronyms'}) {
+        for (my $section = $doc_obj->get_top_section();
+             defined $section;
+             $section = $doc_obj->get_next_section ($section)) {
+            my $section_text = $doc_obj->get_text($section);
+            # skip sections whose text has nothing for /./ to match
+            if ($section_text =~ /./) {
+                $self->extract_acronyms (\$section_text, $doc_obj, $section);
+            }
+        }
+    }
+}
+# Extract acronyms from a section's text and record them as metadata on
+# that section.
+#   $textref     - reference to the section text, handed to
+#                  acronym::acronyms()
+#   $doc_obj     - document object that receives the metadata
+#   $thissection - section the metadata is attached to
+# Adds one "Acronym" value per acronym found and one "AcronymKWIC"
+# (Key Word In Context) value per KWIC string; each value added is also
+# printed to standard output.
+sub extract_acronyms {
+    my ($self, $textref, $doc_obj, $thissection) = @_;
+    my $found = &acronym::acronyms($textref);
+    foreach my $acronym (@$found) {
+        # the plain acronym
+        $doc_obj->add_utf8_metadata($thissection, "Acronym", $acronym->to_string());
+        print "found " . $acronym->to_string() . "\n";
+        # the KWIC (Key Word In Context) forms of the same acronym
+        my @contexts = $acronym->to_string_kwic();
+        foreach my $context (@contexts) {
+            $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC", $context);
+            print "found (KWIC)" . $context . "\n";
+        }
+    }
+}
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1242

Legend:

trunk/gsdl/perllib/plugins/BasPlug.pm

Download in other formats: