Context Navigation

← Previous Changeset
Next Changeset →

Changeset 15868

Timestamp:

2008-06-05T09:21:21+12:00 (16 years ago)

Author:

kjdon

Message:

plugin overhaul: BasPlug has been split into several base plugins: PrintInfo just does the printing for pluginfo.pl, and does the argument parsing in the constructor. All plugins and supporting extractors etc inherit directly or indirectly from this. AbstractPlugin adds a few methods to this, and is used by the Directory and ArchivesInf plugins. These are not really plugins, so can we remove them? Anyway, not sure if AbstractPlugin will live for very long. BasePlugin is a proper base plugin, and has read and read_into_doc_obj methods. It does nothing with reading in the file or textcat stuff. It makes a basic doc obj and adds some metadata. It also handles all the blocking stuff, associate ext stuff etc. Binary plugins can implement the process method to do file specific stuff. AutoExtractMetadata inherits from BasePlugin and adds automatic metadata extraction using the new Extractor plugins. ReadTextFile is the equivalent in functionality to the old BasPlug - it does lang and encoding extraction, and reading in the file. It inherits from AutoExtractMetadata. If your file type is binary and will have no text, then inherit from BasePlugin. If it's binary but ends up with text (eg using convert_to) then inherit from AutoExtractMetadata. If your file is a text type file, then inherit from ReadTextFile.

Location:

gsdl/trunk/perllib/plugins

Files:

: 3 added
: 1 edited

AbstractPlugin.pm (added)
BasePlugin.pm (modified) (23 diffs)
PrintInfo.pm (added)
ReadTextFile.pm (added)

Legend:

: Unmodified
: Added
: Removed

gsdl/trunk/perllib/plugins/BasePlugin.pm

-              r15865
+              r15868
 ###########################################################################
+#
 # BasPlug.pm -- base class for all the import plugins
+# BasePlugin.pm -- base class for all the import plugins
 # A component of the Greenstone digital library software
 # from the New Zealand Digital Library Project at the
 …
 ###########################################################################
+package BasPlug;
+BEGIN {
+    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
+}
+eval {require bytes};
+# suppress the annoying "subroutine redefined" warning that various
+# plugins cause under perl 5.6
+$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
+package BasePlugin;
 use strict;
 …
 use File::Basename;
-use Kea;
 use multiread;
 use encodings;
 use unicode;
-use cnseg;
-use acronym;
-use textcat;
 use doc;
 eval "require diagnostics"; # some perl distros (eg mac) don't have this
-use DateExtract;
 use ghtml;
 use gsprintf 'gsprintf';
+use printusage;
+use parse2;
+use GISBasPlug;
+@BasPlug::ISA = ( GISBasPlug );
+my $unicode_list =
+use PrintInfo;
+BEGIN {
+    @BasePlugin::ISA = ( 'PrintInfo' );
+}
+our $encoding_list =
     [ { 'name' => "ascii",
     'desc' => "{BasPlug.input_encoding.ascii}" },
+    'desc' => "{ReadTextFile.input_encoding.ascii}" },
       { 'name' => "utf8",
     'desc' => "{BasPlug.input_encoding.utf8}" },
+    'desc' => "{ReadTextFile.input_encoding.utf8}" },
       { 'name' => "unicode",
+    'desc' => "{BasPlug.input_encoding.unicode}" } ];
+my $auto_unicode_list =
+    [ { 'name' => "auto",
+    'desc' => "{BasPlug.input_encoding.auto}" } ];
+    'desc' => "{ReadTextFile.input_encoding.unicode}" } ];
 my $e = $encodings::encodings;
 …
      'desc' => $e->{$enc}->{'name'}};
+    push(@{$unicode_list},$hashEncode);
+}
+push(@{$auto_unicode_list},@{$unicode_list});
+    push(@{$encoding_list},$hashEncode);
+}
+our $encoding_plus_auto_list =
+    [ { 'name' => "auto",
+    'desc' => "{ReadTextFile.input_encoding.auto}" } ];
+push(@{$encoding_plus_auto_list},@{$encoding_list});
 my $arguments =
     [ { 'name' => "process_exp",
     'desc' => "{BasPlug.process_exp}",
+    'desc' => "{BasePlugin.process_exp}",
     'type' => "regexp",
     'deft' => "",
     'reqd' => "no" },
       { 'name' => "block_exp",
     'desc' => "{BasPlug.block_exp}",
+    'desc' => "{BasePlugin.block_exp}",
     'type' => "regexp",
     'deft' => "",
     'reqd' => "no" },
       { 'name' => "smart_block",
     'desc' => "{BasPlug.smart_block}",
+    'desc' => "{BasePlugin.smart_block}",
     'type' => "flag",
     'reqd' => "no" },
       { 'name' => "associate_ext",
     'desc' => "{BasPlug.associate_ext}",
+    'desc' => "{BasePlugin.associate_ext}",
     'type' => "string",
     'reqd' => "no" },
       { 'name' => "associate_tail_re",
     'desc' => "{BasPlug.associate_tail_re}",
+    'desc' => "{BasePlugin.associate_tail_re}",
     'type' => "string",
     'reqd' => "no" },
       { 'name' => "use_as_doc_identifier",
     'desc' => "{BasPlug.use_as_doc_identifier}",
+    'desc' => "{BasePlugin.use_as_doc_identifier}",
     'type' => "string",
     'reqd' => "no" ,
     'deft' => "" } ,
+      { 'name' => "input_encoding",
+    'desc' => "{BasPlug.input_encoding}",
+    'type' => "enum",
+    'list' => $auto_unicode_list,
+    'reqd' => "no" ,
+    'deft' => "auto" } ,
+      { 'name' => "default_encoding",
+    'desc' => "{BasPlug.default_encoding}",
+    'type' => "enum",
+    'list' => $unicode_list,
+    'reqd' => "no",
+        'deft' => "utf8" },
+      { 'name' => "extract_language",
+    'desc' => "{BasPlug.extract_language}",
+     { 'name' => "no_cover_image",
+    'desc' => "{BasePlugin.no_cover_image}",
     'type' => "flag",
     'reqd' => "no" },
+      { 'name' => "default_language",
+    'desc' => "{BasPlug.default_language}",
+    'type' => "string",
+    'deft' => "en",
+    'reqd' => "no" },
+      { 'name' => "extract_acronyms",
+    'desc' => "{BasPlug.extract_acronyms}",
+    'type' => "flag",
+    'reqd' => "no" },
+      { 'name' => "markup_acronyms",
+    'desc' => "{BasPlug.markup_acronyms}",
+    'type' => "flag",
+    'reqd' => "no" },
+      { 'name' => "extract_keyphrases",
+    'desc' => "{BasPlug.extract_keyphrases}",
+    'type' => "flag",
+    'reqd' => "no" },
+      { 'name' => "extract_keyphrases_kea4",
+    'desc' => "{BasPlug.extract_keyphrases_kea4}",
+    'type' => "flag",
+    'reqd' => "no" },
+      { 'name' => "extract_keyphrase_options",
+    'desc' => "{BasPlug.extract_keyphrase_options}",
+    'type' => "string",
+    'deft' => "",
+    'reqd' => "no" },
+      { 'name' => "first",
+    'desc' => "{BasPlug.first}",
+    'type' => "string",
+    'reqd' => "no" },
+      { 'name' => "extract_email",
+    'desc' => "{BasPlug.extract_email}",
+    'type' => "flag",
+    'reqd' => "no" },
+      { 'name' => "extract_historical_years",
+    'desc' => "{BasPlug.extract_historical_years}",
+    'type' => "flag",
+    'reqd' => "no" },
+      { 'name' => "maximum_year",
+    'desc' => "{BasPlug.maximum_year}",
+    'type' => "int",
+    'deft' => (localtime)[5]+1900,
+    'char_length' => "4",
+    #'range' => "2,100",
+    'reqd' => "no"},
+      { 'name' => "maximum_century",
+    'desc' => "{BasPlug.maximum_century}",
+    'type' => "string",
+    'deft' => "-1",
+    'reqd' => "no" },
+      { 'name' => "no_bibliography",
+    'desc' => "{BasPlug.no_bibliography}",
+    'type' => "flag",
+    'reqd' => "no"},
+      { 'name' => "no_cover_image",
+    'desc' => "{BasPlug.no_cover_image}",
+    'type' => "flag",
+    'reqd' => "no" },
+      { 'name' => "separate_cjk",
+    'desc' => "{BasPlug.separate_cjk}",
+    'type' => "flag",
+    'reqd' => "no",
+    'hiddengli' => "yes" },
+      { 'name' => "new_extract_email",
+    'desc' => "",
+    'type' => "flag",
+    'reqd' => "no",
+    'hiddengli' => "yes" } ];
+my $gis_arguments =
+    [ { 'name' => "extract_placenames",
+    'desc' => "{GISBasPlug.extract_placenames}",
+    'type' => "flag",
+    'reqd' => "no" },
+      { 'name' => "gazetteer",
+    'desc' => "{GISBasPlug.gazetteer}",
+    'type' => "string",
+    'reqd' => "no" },
+      { 'name' => "place_list",
+    'desc' => "{GISBasPlug.place_list}",
+    'type' => "flag",
+    'reqd' => "no" } ];
+my $options = { 'name'     => "BasPlug",
+        'desc'     => "{BasPlug.desc}",
+      { 'name' => "filename_encoding",
+    'desc' => "{BasePlugin.filename_encoding}",
+    'type' => "enum",
+    'deft' => "auto",
+    'list' => $encoding_plus_auto_list,
+    'reqd' => "no" }
+      ];
+my $options = { 'name'     => "BasePlugin",
+        'desc'     => "{BasePlugin.desc}",
         'abstract' => "yes",
         'inherits' => "no",
 …
-sub set_incremental {
-    my $self = shift(@_);
-    my ($incremental) = @_;
-    $self->{'incremental'} = $incremental;
+}
-sub get_arguments
+{
-    my $self = shift(@_);
-    my $optionlistref = $self->{'option_list'};
-    my @optionlist = @$optionlistref;
-    my $pluginoptions = pop(@$optionlistref);
-    my $pluginarguments = $pluginoptions->{'args'};
-    return $pluginarguments;
+}
-sub print_xml_usage
+{
-    my $self = shift(@_);
-    my $header = shift(@_);
-    my $high_level_information_only = shift(@_);
-    # XML output is always in UTF-8
-    gsprintf::output_strings_in_UTF8;
-    if ($header) {
-    &PrintUsage::print_xml_header("plugin");
+    }
-    $self->print_xml($high_level_information_only);
+}
-sub print_xml
+{
-    my $self = shift(@_);
-    my $high_level_information_only = shift(@_);
-    my $optionlistref = $self->{'option_list'};
-    my @optionlist = @$optionlistref;
-    my $pluginoptions = shift(@$optionlistref);
-    return if (!defined($pluginoptions));
-    # Find the process and block default expressions in the plugin arguments
-    my $process_exp = "";
-    my $block_exp = "";
-    if (defined($pluginoptions->{'args'})) {
-    foreach my $option (@{$pluginoptions->{'args'}}) {
-        if ($option->{'name'} eq "process_exp") {
-        $process_exp = $option->{'deft'};
+        }
-        if ($option->{'name'} eq "block_exp") {
-        $block_exp = $option->{'deft'};
+        }
+    }
+    }
-    gsprintf(STDERR, "<PlugInfo>\n");
-    gsprintf(STDERR, "  <Name>$pluginoptions->{'name'}</Name>\n");
-    my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
-    $desc =~ s/</&amp;lt;/g; # doubly escaped
-    $desc =~ s/>/&amp;gt;/g;
-    gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
-    gsprintf(STDERR, "  <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
-    gsprintf(STDERR, "  <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
-    gsprintf(STDERR, "  <Processes>$process_exp</Processes>\n");
-    gsprintf(STDERR, "  <Blocks>$block_exp</Blocks>\n");
-    gsprintf(STDERR, "  <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
-    # adding new option that works with replace_srcdoc_with_html.pl
-    gsprintf(STDERR, "  <SourceReplaceable>" . ($pluginoptions->{'srcreplaceable'} || "no") . "</SourceReplaceable>\n");
-    unless (defined($high_level_information_only)) {
-    gsprintf(STDERR, "  <Arguments>\n");
-    if (defined($pluginoptions->{'args'})) {
-        &PrintUsage::print_options_xml($pluginoptions->{'args'});
+    }
-    gsprintf(STDERR, "  </Arguments>\n");
-    # Recurse up the plugin hierarchy
-    $self->print_xml();
+    }
-    gsprintf(STDERR, "</PlugInfo>\n");
+}
-sub print_txt_usage
+{
-    my $self = shift(@_);
-    # Print the usage message for a plugin (recursively)
-    my $descoffset = $self->determine_description_offset(0);
-    $self->print_plugin_usage($descoffset, 1);
+}
-sub determine_description_offset
+{
-    my $self = shift(@_);
-    my $maxoffset = shift(@_);
-    my $optionlistref = $self->{'option_list'};
-    my @optionlist = @$optionlistref;
-    my $pluginoptions = shift(@$optionlistref);
-    return $maxoffset if (!defined($pluginoptions));
-    # Find the length of the longest option string of this plugin
-    my $pluginargs = $pluginoptions->{'args'};
-    if (defined($pluginargs)) {
-    my $longest = &PrintUsage::find_longest_option_string($pluginargs);
-    if ($longest > $maxoffset) {
-        $maxoffset = $longest;
+    }
+    }
-    # Recurse up the plugin hierarchy
-    $maxoffset = $self->determine_description_offset($maxoffset);
-    $self->{'option_list'} = \@optionlist;
-    return $maxoffset;
+}
-sub print_plugin_usage
+{
-    my $self = shift(@_);
-    my $descoffset = shift(@_);
-    my $isleafclass = shift(@_);
-    my $optionlistref = $self->{'option_list'};
-    my @optionlist = @$optionlistref;
-    my $pluginoptions = shift(@$optionlistref);
-    return if (!defined($pluginoptions));
-    my $pluginname = $pluginoptions->{'name'};
-    my $pluginargs = $pluginoptions->{'args'};
-    my $plugindesc = $pluginoptions->{'desc'};
-    # Produce the usage information using the data structure above
-    if ($isleafclass) {
-    if (defined($plugindesc)) {
-        gsprintf(STDERR, "$plugindesc\n\n");
+    }
-    gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
+    }
-    # Display the plugin options, if there are some
-    if (defined($pluginargs)) {
-    # Calculate the column offset of the option descriptions
-    my $optiondescoffset = $descoffset + 2;  # 2 spaces between options & descriptions
-    if ($isleafclass) {
-        gsprintf(STDERR, " {common.specific_options}:\n");
+    }
-    else {
-        gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
+    }
-    # Display the plugin options
-    &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
+    }
-    # Recurse up the plugin hierarchy
-    $self->print_plugin_usage($descoffset, 0);
-    $self->{'option_list'} = \@optionlist;
+}
 sub new {
+    # Set Encodings to the list!!
+    # Start the BasPlug Constructor
+    my $class = shift (@_);
+    my ($pluginlist,$args,$hashArgOptLists) = @_;
+    my ($class) = shift (@_);
+    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
     push(@$pluginlist, $class);
+    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
+    push(@{$hashArgOptLists->{"OptList"}},$options);
+    my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists);
     my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
-    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
-    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
-    if (GISBasPlug::has_mapdata()) {
-    push(@$arguments,@$gis_arguments);
+    }
-    my $self = {};
-    $self->{'outhandle'} = STDERR;
-    $self->{'option_list'} = $hashArgOptLists->{"OptList"};
-    $self->{"info_only"} = 0;
-    # Check if gsdlinfo is in the argument list or not - if it is, don't parse
-    # the args, just return the object.
-    foreach my $strArg (@{$args})
+    {
-    if($strArg eq "-gsdlinfo")
+    {
-        $self->{"info_only"} = 1;
-        return bless $self, $class;
+    }
+    }
-    if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
+    {
-    my $classTempClass = bless $self, $class;
-    print STDERR "<BadPlugin p=$plugin_name>\n";
-    &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
-    $classTempClass->print_txt_usage("");  # Use default resource bundle
-    die "\n";
+    }
-    delete $self->{"info_only"};
-    # else parsing was successful.
     $self->{'plugin_type'} = $plugin_name;
+    #$self->{'outhandle'} = STDERR;
     $self->{'num_processed'} = 0;
     $self->{'num_not_processed'} = 0;
 …
     $self->{'file_blocks'} = {};
-    if ($self->{'extract_placenames'}) {
-    my $outhandle = $self->{'outhandle'};
-    my $places_ref
-        = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
-    if (!defined $places_ref) {
-        print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
-        print $outhandle "         No placename extraction will take place.\n";
-        $self->{'extract_placenames'} = undef;
+    }
-    else {
-        $self->{'places'} = $places_ref;
+    }
+    }
     return bless $self, $class;
+}
 # initialize BasPlug options
 # if init() is overridden in a sub-class, remember to call BasPlug::init()
+}
+# initialize BasePlugin options
+# if init() is overridden in a sub-class, remember to call BasePlugin::init()
 sub init {
     my $self = shift (@_);
 …
     my $self = shift (@_);
     my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
-   #my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
-   #print STDERR "Calling method; $cfilename:$cline $cpackage->$csubr\n";
-    $self->initialise_extractors();
+}
 …
     # import.pl only has one plugin pass, but buildcol.pl has multiple ones
+    my ($self) = @_;
+    $self->finalise_extractors();
+    my ($self) = shift (@_);
+}
 …
     my ($self) = @_;
+}
+sub set_incremental {
+    my $self = shift(@_);
+    my ($incremental) = @_;
+    $self->{'incremental'} = $incremental;
+}
 …
+}
+sub get_full_filenames {
+    my $self = shift (@_);
+    my ($base_dir, $file) = @_;
+    my $filename_full_path = $file;
+    # add on directory if present
+    $filename_full_path = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
+    my $filename_no_path = $file;
+    # remove directory if present
+    $filename_no_path =~ s/^.*[\/\\]//;
+    return ($filename_full_path, $filename_no_path);
+}
 sub read_block {
 …
+    my $filename = $file;
+    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
+    if ($self->associate_with($file,$filename,$metadata)) {
+    my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file);
+    if ($self->associate_with($file,$filename_full_path,$metadata)) {
     # a form of smart block
     $self->{'num_blocked'} ++;
 …
     if ($smart_block || $smart_block_BN) {
     if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
+    if (defined $self->{'file_blocks'}->{$filename_full_path} && $self->{'file_blocks'}->{$filename_full_path} == 1){
         $self->{'num_blocked'} ++;
         return (0,undef); # blocked
+    }
     } else {
     if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
+    if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
         $self->{'num_blocked'} ++;
         return (0,undef); # blocked
+    }
     if ($self->{'cover_image'}) {
         if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
+        if (defined $self->{'file_blocks'}->{$filename_full_path} && $self->{'file_blocks'}->{$filename_full_path} == 1){
         $self->{'num_blocked'} ++;
         return (0,undef); # blocked
 …
+    }
     if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
+    if ($filename_full_path !~ /$self->{'process_exp'}/ || !-f $filename_full_path) {
     return (undef,undef); # can't recognise
+    }
+    return (1,$filename);
+}
+sub read_tidy_file {
+    ##why are we returning the full filename - do we need this??
+    return (1,$filename_full_path);
+}
+#filename_encoding set by user
+sub filename_to_utf8_metadata
+{
     my $self = shift (@_);
+    my ($file) = @_;
+    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
+    return $file;
+}
+sub filename_to_metadata
+{
+    my $self = shift (@_);
+    my ($file, $encoding) = @_;
+    my ($file, $file_encoding) = @_;
     my $outhandle = $self->{'outhandle'};
+    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
+    my $filename_encoding = $self->{'filename_encoding'};
+    if ($filename_encoding eq "auto") {
+    # we check the locale first
+    if (!defined $self->{'filesystem_encoding'}) {
+        $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
+        $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'};
+    }
+    if ($self->{'filesystem_encoding'} ne "undefined") {
+        $filename_encoding = $self->{'filesystem_encoding'};
+    } else {
+        # try the encoding of the document, if available
+        if (defined $file_encoding) {
+        $filename_encoding = $file_encoding;
+        } else {
+        # use utf8
+        $filename_encoding = "utf8";
+        }
+    }
+    }
+    if ($filename_encoding !~ /(?:ascii|utf8|unicode)/) {
+    $filemeta = unicode::unicode2utf8(
+      unicode::convert2unicode($filename_encoding, \$filemeta)
+    );
+    }
+    my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
+    return $dmsafe_filemeta;
+}
+sub get_filesystem_encoding {
+    my $self = shift(@_);
+    my $outhandle = $self->{'outhandle'};
     my $filesystem_encoding = undef;
     eval {
     use POSIX qw(locale_h);
+    # With only one parameter, setlocale retrieves the current value
+    # With only one parameter, setlocale retrieves the
+    # current value
     my $current_locale = setlocale(LC_CTYPE);
     if ($current_locale =~ m/^.*\.(.*?)$/) {
         my $char_encoding = lc($1);
 …
         $char_encoding =~ s/-/_/g;
         $char_encoding =~ s/^utf_8$/utf8/;
         if ($char_encoding =~ m/^\d+$/) {
         if (defined $encodings::encodings->{"windows_$char_encoding"}) {
 …
+        }
+        }
         if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
         || (defined $encodings::encodings->{$char_encoding})) {
 …
+        }
+    }
     };
 …
+    }
+    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
+    # how do we know what encoding the filename is in?
+    # => one answer is to check the locale
+    if (defined $filesystem_encoding) {
+    if ($filesystem_encoding !~ /(?:ascii|utf8|unicode)/) {
+        $filemeta = unicode::unicode2utf8(
+            unicode::convert2unicode($filesystem_encoding, \$filemeta)
+              );
+    }
+    }
+    # assume it is in the same encoding as its contents
+    elsif ((defined $encoding) && ($encoding !~ /(?:ascii|utf8|unicode)/)) {
+    $filemeta = unicode::unicode2utf8(
+        unicode::convert2unicode($encoding, \$filemeta)
+    );
+    }
+    my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
+    return $dmsafe_filemeta;
+}
+sub add_OID
+{
+    return $filesystem_encoding;
+}
+# is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
+sub set_Source_metadata {
+    my $self = shift (@_);
+    my ($doc_obj, $filename_no_path, $file_encoding) = @_;
+    my $top_section = $doc_obj->get_top_section();
+    # the original encoding filename
+    $doc_obj->set_metadata_element($top_section, "Source", $filename_no_path);
+    # UTF-8 version of filename
+    my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
+    $doc_obj->set_utf8_metadata_element($top_section, "SourceUTF8", $filemeta);
+}
+sub add_OID {
     my $self = shift (@_);
     my ($doc_obj) = @_;
 …
+}
+# The BasPlug read_into_doc_obj() function. This function does all the
+# right things to make general options work for a given plugin.  It reads in
+# The BasePlugin read_into_doc_obj() function. This function does all the
+# right things to make general options work for a given plugin.  It doesn't do anything with the file other than setting reads in
 # a file and sets up a slew of metadata all saved in doc_obj, which
 # it then returns as part of a tuple (process_status,doc_obj)
 …
 # Note that $base_dir might be "" and that $file might
 # include directories
+# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
 sub read_into_doc_obj {
     my $self = shift (@_);
     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
+    if ($self->is_recursive()) {
+    gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
+    }
+    my $outhandle   = $self->{'outhandle'};
+    my ($block_status,$filename) = $self->read_block(@_);
+    return $block_status if ((!defined $block_status) || ($block_status==0));
+    $file = $self->read_tidy_file($file);
+    # Do encoding stuff
+    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
+    if ($self->{'verbosity'} > 2) {
+    print $outhandle "BasPlug: reading $file as ($encoding,$language)\n";
+    }
+    my $outhandle = $self->{'outhandle'};
+    # should we move this to read? What about secondary plugins?
+    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
+    print $outhandle "$self->{'plugin_type'} processing $file\n"
+        if $self->{'verbosity'} > 1;
+    my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file);
     # create a new document
     my $doc_obj = new doc ($filename, "indexed_doc");
+    my $doc_obj = new doc ($filename_full_path, "indexed_doc");
     my $top_section = $doc_obj->get_top_section();
+    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
+    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
+    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
+    # this should look at the plugin option too...
+    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
     $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
+    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename));
+    my $filemeta = $self->filename_to_metadata($file,$encoding);
+    $doc_obj->add_utf8_metadata($top_section, "Source", $filemeta);
+    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
+    $self->Set_Source_metadata($doc_obj, $filename_no_path);
+    # plugin specific stuff - what args do we need here??
+    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
+    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
+    return -1;
+    }
+    # include any metadata passed in from previous plugins
+    # note that this metadata is associated with the top level section
+    my $section = $doc_obj->get_top_section();
+    # can we merge these two methods??
+    $self->add_associated_files($doc_obj, $filename_full_path);
+    $self->extra_metadata ($doc_obj, $section, $metadata);
+    $self->auto_extract_metadata($doc_obj);
+    # if we haven't found any Title so far, assign one
+    # this was shifted to here from inside read()
+    $self->title_fallback($doc_obj,$section,$filename_no_path);
+    $self->add_OID($doc_obj);
+    return (1,$doc_obj);
+}
+sub add_dummy_text {
+    my $self = shift(@_);
+    my ($doc_obj, $section) = @_;
+    # add NoText metadata so we can hide this dummy text in format statements
+    $doc_obj->add_metadata($section, "NoText", "1");
+    $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
+}
+# does nothing. Can be overridden by subclass
+sub auto_extract_metadata {
+    my $self = shift(@_);
+    my ($doc_obj) = @_;
+}
+# adds cover image, associate_file options stuff. Should be called by sub class
+# read_into_doc_obj
+sub add_associated_files {
+    my $self = shift(@_);
+    # whatis filename??
+    my ($doc_obj, $filename) = @_;
+    # add in the cover image
     if ($self->{'cover_image'}) {
     $self->associate_cover_image($doc_obj, $filename);
+    }
-    # read in file ($text will be in utf8)
-    my $text = "";
-    $self->read_file ($filename, $encoding, $language, \$text);
-    if (!length ($text)) {
-    my $plugin_name = ref ($self);
-    if ($gli) {
-        print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
+    }
-    gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
-    my $failhandle = $self->{'failhandle'};
-    gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
-    # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
-    $self->{'num_not_processed'} ++;
-    return (0,undef); # what should we return here?? error but don't want to pass it on
+    }
+    # include any metadata passed in from previous plugins
+    # note that this metadata is associated with the top level section
+    my $associate_tail_re = $self->{'associate_tail_re'};
+    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
+    # do plugin specific processing of doc_obj
+    unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
+    $text = '';
+    undef $text;
+    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
+    return (-1,undef);
+    }
+    $text='';
+    undef $text;
+    # do any automatic metadata extraction
+    $self->auto_extract_metadata ($doc_obj);
+    $self->add_OID($doc_obj);
+    return (1,$doc_obj);
+}
+# The BasPlug read() function. This function calls read_into_doc_obj()
+}
+# The BasePlugin read() function. This function calls read_into_doc_obj()
 # to ensure all the right things to make general options work for a
 # given plugin are done. It then calls the process() function which
 …
     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
+    # check that we are not blocked
+    my ($block_status,$filename) = $self->read_block(@_);
+    return $block_status if ((!defined $block_status) || ($block_status==0));
     my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
     if ((defined $process_status) && ($process_status == 1)) {
     # process the document
     $processor->process($doc_obj);
+    if(defined($self->{'places_filename'})){
+        &util::rm($self->{'places_filename'});
+        $self->{'places_filename'} = undef;
+    }
     $self->{'num_processed'} ++;
     undef $doc_obj;
+    }
+    # delete any temp files that we may have created
+    $self->clean_up_after_doc_obj_processing();
     # if process_status == 1, then the file has been processed.
 …
     my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
     gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
     # die "Basplug::process function must be implemented in sub-class\n";
+    gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n";
+    # die "BasePlugin::process function must be implemented in sub-class\n";
     return undef; # never gets here
+}
+# uses the multiread package to read in the entire file pointed to
+# by filename and loads the resulting text into $$textref. Input text
+# may be in any of the encodings handled by multiread, output text
+# will be in utf8
+sub read_file {
+    my $self = shift (@_);
+    my ($filename, $encoding, $language, $textref) = @_;
+    if (!-r $filename)
+    {
+    my $outhandle = $self->{'outhandle'};
+    gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
+    # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
+    return;
+    }
+    $$textref = "";
+    if (!open (FILE, $filename)) {
+    gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
+    die "\n";
+    }
+    if ($encoding eq "ascii") {
+    undef $/;
+    $$textref = <FILE>;
+    $/ = "\n";
+    } else {
+    my $reader = new multiread();
+    $reader->set_handle ('BasPlug::FILE');
+    $reader->set_encoding ($encoding);
+    $reader->read_file ($textref);
+        #Now segments chinese if the separate_cjk option is set
+    if ($self->{'separate_cjk'}) {
+        # segment the Chinese words
+        $$textref = &cnseg::segment($$textref);
+    }
+    }
+    close FILE;
+}
+# overwrite this method to delete any temp files that we have created
+sub clean_up_after_doc_obj_processing {
+    my $self = shift(@_);
+}
 # write_file -- used by ConvertToPlug, for example in post processing
+#
+# where should this go, is here the best place??
 sub utf8_write_file {
     my $self = shift (@_);
 …
     my ($doc_obj,$section,$file) = @_;
+    if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
+    my $file_derived_title = $self->filename_based_title($file);
+    $doc_obj->add_utf8_metadata ($section, "Title", $self->filename_to_metadata($file_derived_title));
+    }
+}
+sub textcat_get_language_encoding { # returns ($language, $encoding) for $filename, chosen according to the input_encoding and extract_language options
+    my $self = shift (@_);
+    my ($filename) = @_;
+    my ($language, $encoding, $extracted_encoding);
+    if ($self->{'input_encoding'} eq "auto") {
+        # use textcat (via get_language_encoding) to work out both the input encoding and the language
+        ($language, $encoding) = $self->get_language_encoding ($filename);
+    } elsif ($self->{'extract_language'}) {
+        # fixed input encoding: textcat is only used to get language metadata
+        ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
+        $encoding = $self->{'input_encoding'}; # the configured encoding is the one returned; the detected one is only compared with it
+    # don't print this message for english... english in utf8 is identical
+    # to english in iso-8859-1 (except for some punctuation). We don't have
+    # a language model for en_utf8, so textcat always says iso-8859-1!
+        if ($extracted_encoding ne $encoding && $language ne "en"
+        && $self->{'verbosity'}) {
+        my $plugin_name = ref ($self);
+        my $outhandle = $self->{'outhandle'};
+        gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
+        }
+    } else { # no detection requested: get_language_encoding is not called at all
+        $language = $self->{'default_language'};
+        $encoding = $self->{'input_encoding'};
+    }
+    return ($language, $encoding); # two-element list: language first, then encoding
+}
+# Uses textcat to work out the encoding and language of the text in
+# $filename. HTML tags are stripped first, but only when the plugin is HTMLPlug itself or has converted_to set to HTML.
+# Returns the list (language, encoding); if the file cannot be opened it returns (default_language, input_encoding).
+sub get_language_encoding {
+    my $self = shift (@_);
+    my ($filename) = @_;
+    my $outhandle = $self->{'outhandle'};
+    my $unicode_format = "";
+    my $best_language = "";
+    my $best_encoding = "";
+    # read in file
+    if (!open (FILE, $filename)) {
+    gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
+    # this is a pretty bad error, but try to continue anyway
+    return ($self->{'default_language'}, $self->{'input_encoding'});
+    }
+    undef $/; # no record separator, so the next read returns the whole file
+    my $text = <FILE>;
+    $/ = "\n"; # reset to newline (not a restore of the previous value)
+    close FILE;
+    # check if first few bytes have a Byte Order Marker
+    my $bom=substr($text,0,2); # check 16bit unicode
+    if ($bom eq "\xff\xfe") { # little endian 16bit unicode
+    $unicode_format="unicode";
+    } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
+    $unicode_format="unicode";
+    } else {
+    $bom=substr($text,0,3); # check utf-8
+    if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
+        $unicode_format="utf8";
+#   } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
+#       $unicode_format="utf8";
+    }
+    }
+    # handle html files specially
+    # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
+    if (ref($self) eq 'HTMLPlug' ||
+    (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
+    # remove <title>stuff</title> -- as titles tend often to be in English
+    # for foreign language documents
+    $text =~ s!<title>.*?</title>!!si;
+    # see if this html file specifies its encoding
+    if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
+        $best_encoding = $1;
+    } elsif ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)"/i) {#"
+        $best_encoding = $1;
+    }
+    if ($best_encoding) { # we extracted an encoding
+        $best_encoding =~ s/-+/_/g;
+        $best_encoding = lc($best_encoding); # lowercase
+        if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
+        $self->{'input_encoding'} = $best_encoding; # NOTE(review): this changes the option stored on $self, not just a local - confirm that is intended
+    }
+    # remove all HTML tags
+    $text =~ s/<[^>]*>//sg;
+    }
+    # get the language/encoding
+    $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
+    my $results = $self->{'textcat'}->classify(\$text);
+    # if textcat returns 3 or fewer possibilities we'll use the
+    # first one in the list - otherwise use the default language (see below for the encoding)
+    if (scalar @$results > 3) {
+    if ($unicode_format) { # a BOM was found above, so use the format it indicated
+        $best_encoding=$unicode_format;
+    } else {
+        my %guessed_encodings = ();
+        foreach my $result (@$results) {
+        $result =~ /([^\-]+)$/;
+        my $enc=$1; # the text after the last hyphen of the textcat result
+        if (!defined($guessed_encodings{$enc})) {
+            $guessed_encodings{$enc}=0;
+        }
+        $guessed_encodings{$enc}++;
+        }
+        $guessed_encodings{""}=-1; # for default best_encoding of ""
+        foreach my $enc (keys %guessed_encodings) {
+        if ($guessed_encodings{$enc} >
+            $guessed_encodings{$best_encoding}){
+            $best_encoding=$enc;
+        }
+        }
+    }
+    if ($self->{'input_encoding'} ne 'auto') {
+        if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
+        gsprintf($outhandle,
+             "BasPlug: {BasPlug.could_not_extract_language}\n",
+             $filename, $self->{'default_language'});
+        }
+        $best_language = $self->{'default_language'};
+        $best_encoding = $self->{'input_encoding'}; # a configured encoding replaces whatever was guessed above
+    } else {
+        if ($self->{'verbosity'}>2) {
+        gsprintf($outhandle,
+             "BasPlug: {BasPlug.could_not_extract_language}\n",
+             $filename, $self->{'default_language'});
+        }
+        $best_language = $self->{'default_language'};
+    }
+    } else { # <= 3 suggestions
+    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
+    if (!defined $language) {
+        if ($self->{'verbosity'}>2) {
+        gsprintf($outhandle,
+            "BasPlug: {BasPlug.could_not_extract_language}\n",
+            $filename, $self->{'default_language'});
+        }
+        $language = $self->{'default_language'};
+    }
+    if (!defined $encoding) {
+        if ($self->{'verbosity'}>2) {
+        gsprintf($outhandle,
+            "BasPlug: {BasPlug.could_not_extract_encoding}\n",
+            $filename, $self->{'default_encoding'});
+        }
+        $encoding = $self->{'default_encoding'};
+    }
+    $best_language = $language;
+    if (! $best_encoding ) { # may already be set... eg from html meta tag
+        $best_encoding = $encoding;
+    }
+    }
+    my $text_copy = $text;
+    if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) {
+    # the text is valid utf8, so assume that's the real encoding
+    # (since textcat is based on probabilities)
+    $best_encoding = 'utf8';
+    }
+    # check for equivalents where textcat doesn't have some encodings...
+    # eg MS versions of standard encodings
+    if ($best_encoding =~ /^iso_8859_(\d+)/) {
+    my $iso = $1; # which variant of the iso standard?
+    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
+    if ($text =~ /[\x80-\x9f]/) {
+        # Western Europe
+        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
+        elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
+        elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
+        elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
+        elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
+        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
+        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
+    }
+    }
+    if ($best_encoding !~ /^(ascii|utf8|unicode)$/ &&
+    !defined $encodings::encodings->{$best_encoding}) {
+    if ($self->{'verbosity'}) {
+        gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
+             $filename, $best_encoding, $self->{'default_encoding'});
+    }
+    $best_encoding = $self->{'default_encoding'}; # no entry in the encodings table: fall back to default_encoding
+    }
+    return ($best_language, $best_encoding);
+}
+    if (!defined $doc_obj->get_metadata_element ($section, "Title") or $doc_obj->get_metadata_element($section, "Title") eq "") {
+    my $file_derived_title = $self->filename_to_metadata($self->filename_based_title($file));
+    if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
+        $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
+    }
+    else {
+        $doc_obj->set_utf8_metadata ($section, "Title", $file_derived_title);
+    }
+    }
+}
 # add any extra metadata that's been passed around from one
 # plugin to another.
 …
+}
-# initialise metadata extractors
-sub initialise_extractors {
-    my $self = shift (@_);
-    if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
-    &acronym::initialise_acronyms();
+    }
+}
-# finalise metadata extractors
-sub finalise_extractors {
-    my $self = shift (@_);
-    if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
-    &acronym::finalise_acronyms();
+    }
+}
-# FIRSTNNN: extract the first NNN characters as metadata
-sub extract_first_NNNN_characters {
-    my $self = shift (@_);
-    my ($textref, $doc_obj, $thissection) = @_;
-    foreach my $size (split /,/, $self->{'first'}) {
-    my $tmptext =  $$textref;
-    $tmptext =~ s/^\s+//;
-    $tmptext =~ s/\s+$//;
-    $tmptext =~ s/\s+/ /gs;
-    $tmptext = substr ($tmptext, 0, $size);
-    $tmptext =~ s/\s\S*$/&#8230;/;
-    $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
+    }
+}
-sub extract_email {
-    my $self = shift (@_);
-    my ($textref, $doc_obj, $thissection) = @_;
-    my $outhandle = $self->{'outhandle'};
-    gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
-    if ($self->{'verbosity'} > 2);
-    my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
-    @email = sort @email;
-#    if($self->{"new_extract_email"} == 0)
-#    {
-#    my @email2 = ();
-#    foreach my $address (@email)
-#   {
-#   if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))
-#       {
-#       push @email2, $address;
-#       $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
-#       # print $outhandle "  extracting $address\n"
-#       &gsprintf($outhandle, "  {BasPlug.extracting} $address\n")
-#           if ($self->{'verbosity'} > 3);
-#       }
-#   }
-#    }
-#    else
-#    {
-    my $hashExistMail = {};
-    foreach my $address (@email) {
-    if (!(defined $hashExistMail->{$address}))
+    {
-        $hashExistMail->{$address} = 1;
-        $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
-        gsprintf($outhandle, "  {BasPlug.extracting} $address\n")
-        if ($self->{'verbosity'} > 3);
+    }
+    }
-    gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
-    if ($self->{'verbosity'} > 2);
+}
-# extract metadata
-sub auto_extract_metadata {
-    my $self = shift (@_);
-    my ($doc_obj) = @_;
-    if ($self->{'extract_email'}) {
-    my $thissection = $doc_obj->get_top_section();
-    while (defined $thissection) {
-        my $text = $doc_obj->get_text($thissection);
-        $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
-        $thissection = $doc_obj->get_next_section ($thissection);
+    }
+    }
-    if ($self->{'extract_placenames'}) {
-    my $thissection = $doc_obj->get_top_section();
-    while (defined $thissection) {
-        my $text = $doc_obj->get_text($thissection);
-        $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
-        $thissection = $doc_obj->get_next_section ($thissection);
+    }
+    }
-    if ($self->{'extract_keyphrases'} || $self->{'extract_keyphrases_kea4'}) {
-    $self->extract_keyphrases($doc_obj);
+    }
-    if ($self->{'first'}) {
-    my $thissection = $doc_obj->get_top_section();
-    while (defined $thissection) {
-        my $text = $doc_obj->get_text($thissection);
-        $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
-        $thissection = $doc_obj->get_next_section ($thissection);
+    }
+    }
-    if ($self->{'extract_acronyms'}) {
-    my $thissection = $doc_obj->get_top_section();
-    while (defined $thissection) {
-        my $text = $doc_obj->get_text($thissection);
-        $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
-        $thissection = $doc_obj->get_next_section ($thissection);
+    }
+    }
-    if ($self->{'markup_acronyms'}) {
-    my $thissection = $doc_obj->get_top_section();
-    while (defined $thissection) {
-        my $text = $doc_obj->get_text($thissection);
-        $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
-        $doc_obj->delete_text($thissection);
-        $doc_obj->add_text($thissection, $text);
-        $thissection = $doc_obj->get_next_section ($thissection);
+    }
+    }
-    if($self->{'extract_historical_years'}) {
-    my $thissection = $doc_obj->get_top_section();
-    while (defined $thissection) {
-        my $text = $doc_obj->get_text($thissection);
-        &DateExtract::get_date_metadata($text, $doc_obj,
-                        $thissection,
-                        $self->{'no_bibliography'},
-                        $self->{'maximum_year'},
-                        $self->{'maximum_century'});
-        $thissection = $doc_obj->get_next_section ($thissection);
+    }
+    }
+}
-#adding kea keyphrases
-sub extract_keyphrases
+{
-    my $self = shift(@_);
-    my $doc_obj = shift(@_);
-    # Use Kea 3.0 unless 4.0 has been specified
-    my $kea_version = "3.0";
-    if ($self->{'extract_keyphrases_kea4'}) {
-    $kea_version = "4.0";
+    }
-    # Check that Kea exists, and tell the user where to get it if not
-    my $keahome = &Kea::get_Kea_directory($kea_version);
-    if (!-e $keahome) {
-    gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);
-    return;
+    }
-    my $thissection = $doc_obj->get_top_section();
-    my $text = "";
-    my $list;
-    #loop through sections to gather whole doc
-    while (defined $thissection) {
-    my $sectiontext = $doc_obj->get_text($thissection);
-    $text = $text.$sectiontext;
-    $thissection = $doc_obj->get_next_section ($thissection);
+    }
-    if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
-    $list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
-    } else { #otherwise call Kea with no options
-    $list = &Kea::extract_KeyPhrases ($kea_version, $text);
+    }
-    if ($list){
-    # if a list of kea keyphrases was returned (ie not empty)
-    if ($self->{'verbosity'}) {
-        gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
+    }
-    #add metadata to top section
-    $thissection = $doc_obj->get_top_section();
-    # add all key phrases as one metadata
-    $doc_obj->add_metadata($thissection, "Keyphrases", $list);
-    # add individual key phrases as multiple metadata
-    foreach my $keyphrase (split(',', $list)) {
-        $keyphrase =~ s/^\s+|\s+$//g;
-        $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
+    }
+    }
+}
-# extract acronyms from a section in a document. progress is
-# reported to outhandle based on the verbosity. both the Acronym
-# and the AcronymKWIC metadata items are created.
-sub extract_acronyms {
-    my $self = shift (@_);
-    my ($textref, $doc_obj, $thissection) = @_;
-    my $outhandle = $self->{'outhandle'};
-    # print $outhandle " extracting acronyms ...\n"
-    gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
-    if ($self->{'verbosity'} > 2);
-    my $acro_array =  &acronym::acronyms($textref);
-    foreach my $acro (@$acro_array) {
-    #check that this is the first time ...
-    my $seen_before = "false";
-    my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
-    foreach my $thisAcro (@$previous_data) {
-        if ($thisAcro eq $acro->to_string()) {
-        $seen_before = "true";
-        if ($self->{'verbosity'} >= 4) {
-            gsprintf($outhandle, " {BasPlug.already_seen} " .
-                 $acro->to_string() . "\n");
+        }
+        }
+    }
-    if ($seen_before eq "false") {
-        #write it to the file ...
-        $acro->write_to_file();
-        #do the normal acronym
-        $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
-        gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
-        if ($self->{'verbosity'} > 3);
+    }
+    }
-    gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
-    if ($self->{'verbosity'} > 2);
+}
-sub markup_acronyms {
-    my $self = shift (@_);
-    my ($text, $doc_obj, $thissection) = @_;
-    my $outhandle = $self->{'outhandle'};
-    gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
-    if ($self->{'verbosity'} > 2);
-    #self is passed in to check for verbosity ...
-    $text = &acronym::markup_acronyms($text, $self);
-    gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
-    if ($self->{'verbosity'} > 2);
-    return $text;
+}
 sub compile_stats {

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 15868

Legend:

gsdl/trunk/perllib/plugins/BasePlugin.pm

Download in other formats: