Changeset 15868


Ignore:
Timestamp:
2008-06-05T09:21:21+12:00 (16 years ago)
Author:
kjdon
Message:

plugin overhaul: BasPlug has been split into several base plugins: PrintInfo just does the printing for pluginfo.pl, and does the argument parsing in the constructor. All plugins and supporting extractors etc inherit directly or indirectly from this. AbstractPlugin adds a few methods to this, is used by Directory and ArchivesInf plugins. These are not really plugins so can we remove them? anyway, not sure if AbstractPlugin will live for very long. BasePlugin is a proper base plugin, has read and read_into_doc_obj methods. It does nothing with reading in the file or textcat stuff. Makes a basic doc obj and adds some metadata. It also handles all the blocking stuff, associate ext stuff etc. Binary plugins can implement the process method to do file specific stuff. AutoExtractMetadata inherits BasePlugin and adds automatic metadata extraction using hte new Extractor plugins. ReadTextFile is the equivalent in functionality to the old BasPlug - does lang and encoding extraction, and reading in the file. It inherits from AutoExtractMetadata. If your file type is binary and will have no text, then inherit from BasePlugin. If its binary but ends up with text (eg using convert_to) then inherit from AutoExtractMetadata. If your file is a text type file, then inherit from ReadTextFile.

Location:
gsdl/trunk/perllib/plugins
Files:
3 added
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/BasePlugin.pm

    r15865 r15868  
    11###########################################################################
    22#
    3 # BasPlug.pm -- base class for all the import plugins
     3# BasePlugin.pm -- base class for all the import plugins
    44# A component of the Greenstone digital library software
    55# from the New Zealand Digital Library Project at the
     
    2424###########################################################################
    2525
    26 package BasPlug;
    27 
    28 BEGIN {
    29     die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    30 }
    31 
    32 eval {require bytes};
    33 
    34 # suppress the annoying "subroutine redefined" warning that various
    35 # plugins cause under perl 5.6
    36 $SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
     26package BasePlugin;
    3727
    3828use strict;
     
    4232use File::Basename;
    4333
    44 use Kea;
    4534use multiread;
    4635use encodings;
    4736use unicode;
    48 use cnseg;
    49 use acronym;
    50 use textcat;
    5137use doc;
    5238eval "require diagnostics"; # some perl distros (eg mac) don't have this
    53 use DateExtract;
    5439use ghtml;
    5540use gsprintf 'gsprintf';
    56 use printusage;
    57 use parse2;
    58 
    59 
    60 use GISBasPlug;
    61 
    62 @BasPlug::ISA = ( GISBasPlug );
    63 
    64 my $unicode_list =
     41
     42use PrintInfo;
     43
     44BEGIN {
     45    @BasePlugin::ISA = ( 'PrintInfo' );
     46}
     47
     48our $encoding_list =
    6549    [ { 'name' => "ascii",
    66     'desc' => "{BasPlug.input_encoding.ascii}" },
     50    'desc' => "{ReadTextFile.input_encoding.ascii}" },
    6751      { 'name' => "utf8",
    68     'desc' => "{BasPlug.input_encoding.utf8}" },
     52    'desc' => "{ReadTextFile.input_encoding.utf8}" },
    6953      { 'name' => "unicode",
    70     'desc' => "{BasPlug.input_encoding.unicode}" } ];
    71 
    72 my $auto_unicode_list =
    73     [ { 'name' => "auto",
    74     'desc' => "{BasPlug.input_encoding.auto}" } ];
     54    'desc' => "{ReadTextFile.input_encoding.unicode}" } ];
    7555
    7656my $e = $encodings::encodings;
     
    8161     'desc' => $e->{$enc}->{'name'}};
    8262   
    83     push(@{$unicode_list},$hashEncode);
    84 }
    85 
    86 push(@{$auto_unicode_list},@{$unicode_list});
     63    push(@{$encoding_list},$hashEncode);
     64}
     65
     66our $encoding_plus_auto_list =
     67    [ { 'name' => "auto",
     68    'desc' => "{ReadTextFile.input_encoding.auto}" } ];
     69
     70push(@{$encoding_plus_auto_list},@{$encoding_list});
    8771
    8872my $arguments =
    8973    [ { 'name' => "process_exp",
    90     'desc' => "{BasPlug.process_exp}",
     74    'desc' => "{BasePlugin.process_exp}",
    9175    'type' => "regexp",
    9276    'deft' => "",
    9377    'reqd' => "no" },
    9478      { 'name' => "block_exp",
    95     'desc' => "{BasPlug.block_exp}",
     79    'desc' => "{BasePlugin.block_exp}",
    9680    'type' => "regexp",
    9781    'deft' => "",
    9882    'reqd' => "no" },
    9983      { 'name' => "smart_block",
    100     'desc' => "{BasPlug.smart_block}",
     84    'desc' => "{BasePlugin.smart_block}",
    10185    'type' => "flag",
    10286    'reqd' => "no" },
    10387      { 'name' => "associate_ext",
    104     'desc' => "{BasPlug.associate_ext}",
     88    'desc' => "{BasePlugin.associate_ext}",
    10589    'type' => "string",
    10690    'reqd' => "no" },
    10791      { 'name' => "associate_tail_re",
    108     'desc' => "{BasPlug.associate_tail_re}",
     92    'desc' => "{BasePlugin.associate_tail_re}",
    10993    'type' => "string",
    11094    'reqd' => "no" },
    11195      { 'name' => "use_as_doc_identifier",
    112     'desc' => "{BasPlug.use_as_doc_identifier}",
     96    'desc' => "{BasePlugin.use_as_doc_identifier}",
    11397    'type' => "string",
    11498    'reqd' => "no" ,
    11599    'deft' => "" } ,
    116       { 'name' => "input_encoding",
    117     'desc' => "{BasPlug.input_encoding}",
    118     'type' => "enum",
    119     'list' => $auto_unicode_list,
    120     'reqd' => "no" ,
    121     'deft' => "auto" } ,
    122       { 'name' => "default_encoding",
    123     'desc' => "{BasPlug.default_encoding}",
    124     'type' => "enum",
    125     'list' => $unicode_list,
    126     'reqd' => "no",
    127         'deft' => "utf8" },
    128       { 'name' => "extract_language",
    129     'desc' => "{BasPlug.extract_language}",
     100     { 'name' => "no_cover_image",
     101    'desc' => "{BasePlugin.no_cover_image}",
    130102    'type' => "flag",
    131103    'reqd' => "no" },
    132       { 'name' => "default_language",
    133     'desc' => "{BasPlug.default_language}",
    134     'type' => "string",
    135     'deft' => "en",
    136     'reqd' => "no" },
    137       { 'name' => "extract_acronyms",
    138     'desc' => "{BasPlug.extract_acronyms}",
    139     'type' => "flag",
    140     'reqd' => "no" },
    141       { 'name' => "markup_acronyms",
    142     'desc' => "{BasPlug.markup_acronyms}",
    143     'type' => "flag",
    144     'reqd' => "no" },
    145       { 'name' => "extract_keyphrases",
    146     'desc' => "{BasPlug.extract_keyphrases}",
    147     'type' => "flag",
    148     'reqd' => "no" },
    149       { 'name' => "extract_keyphrases_kea4",
    150     'desc' => "{BasPlug.extract_keyphrases_kea4}",
    151     'type' => "flag",
    152     'reqd' => "no" },
    153       { 'name' => "extract_keyphrase_options",
    154     'desc' => "{BasPlug.extract_keyphrase_options}",
    155     'type' => "string",
    156     'deft' => "",
    157     'reqd' => "no" },
    158       { 'name' => "first",
    159     'desc' => "{BasPlug.first}",
    160     'type' => "string",
    161     'reqd' => "no" },
    162       { 'name' => "extract_email",
    163     'desc' => "{BasPlug.extract_email}",
    164     'type' => "flag",
    165     'reqd' => "no" },
    166       { 'name' => "extract_historical_years",
    167     'desc' => "{BasPlug.extract_historical_years}",
    168     'type' => "flag",
    169     'reqd' => "no" },
    170       { 'name' => "maximum_year",
    171     'desc' => "{BasPlug.maximum_year}",
    172     'type' => "int",
    173     'deft' => (localtime)[5]+1900,
    174     'char_length' => "4",
    175     #'range' => "2,100",
    176     'reqd' => "no"},
    177       { 'name' => "maximum_century",
    178     'desc' => "{BasPlug.maximum_century}",
    179     'type' => "string",
    180     'deft' => "-1",
    181     'reqd' => "no" },
    182       { 'name' => "no_bibliography",
    183     'desc' => "{BasPlug.no_bibliography}",
    184     'type' => "flag",
    185     'reqd' => "no"},
    186       { 'name' => "no_cover_image",
    187     'desc' => "{BasPlug.no_cover_image}",
    188     'type' => "flag",
    189     'reqd' => "no" },
    190       { 'name' => "separate_cjk",
    191     'desc' => "{BasPlug.separate_cjk}",
    192     'type' => "flag",
    193     'reqd' => "no",
    194     'hiddengli' => "yes" },
    195       { 'name' => "new_extract_email",
    196     'desc' => "",
    197     'type' => "flag",
    198     'reqd' => "no",
    199     'hiddengli' => "yes" } ];
    200 
    201 my $gis_arguments =
    202     [ { 'name' => "extract_placenames",
    203     'desc' => "{GISBasPlug.extract_placenames}",
    204     'type' => "flag",
    205     'reqd' => "no" },
    206       { 'name' => "gazetteer",
    207     'desc' => "{GISBasPlug.gazetteer}",
    208     'type' => "string",
    209     'reqd' => "no" },
    210       { 'name' => "place_list",
    211     'desc' => "{GISBasPlug.place_list}",
    212     'type' => "flag",
    213     'reqd' => "no" } ];
    214 
    215 
    216 my $options = { 'name'     => "BasPlug",
    217         'desc'     => "{BasPlug.desc}",
     104      { 'name' => "filename_encoding",
     105    'desc' => "{BasePlugin.filename_encoding}",
     106    'type' => "enum",
     107    'deft' => "auto",
     108    'list' => $encoding_plus_auto_list,
     109    'reqd' => "no" }
     110     
     111      ];
     112
     113
     114my $options = { 'name'     => "BasePlugin",
     115        'desc'     => "{BasePlugin.desc}",
    218116        'abstract' => "yes",
    219117        'inherits' => "no",
     
    221119
    222120
    223 sub set_incremental {
    224     my $self = shift(@_);
    225     my ($incremental) = @_;
    226 
    227     $self->{'incremental'} = $incremental;
    228 }
    229 
    230 sub get_arguments
    231 {
    232     my $self = shift(@_);
    233     my $optionlistref = $self->{'option_list'};
    234     my @optionlist = @$optionlistref;
    235     my $pluginoptions = pop(@$optionlistref);
    236     my $pluginarguments = $pluginoptions->{'args'};
    237     return $pluginarguments;
    238 }
    239 
    240 
    241 sub print_xml_usage
    242 {
    243     my $self = shift(@_);
    244     my $header = shift(@_);
    245     my $high_level_information_only = shift(@_);
    246    
    247     # XML output is always in UTF-8
    248     gsprintf::output_strings_in_UTF8;
    249 
    250     if ($header) {
    251     &PrintUsage::print_xml_header("plugin");
    252     }
    253     $self->print_xml($high_level_information_only);
    254 }
    255 
    256 
    257 sub print_xml
    258 {
    259     my $self = shift(@_);
    260     my $high_level_information_only = shift(@_);
    261 
    262     my $optionlistref = $self->{'option_list'};
    263     my @optionlist = @$optionlistref;
    264     my $pluginoptions = shift(@$optionlistref);
    265     return if (!defined($pluginoptions));
    266 
    267     # Find the process and block default expressions in the plugin arguments
    268     my $process_exp = "";
    269     my $block_exp = "";
    270     if (defined($pluginoptions->{'args'})) {
    271     foreach my $option (@{$pluginoptions->{'args'}}) {
    272         if ($option->{'name'} eq "process_exp") {
    273         $process_exp = $option->{'deft'};
    274         }
    275         if ($option->{'name'} eq "block_exp") {
    276         $block_exp = $option->{'deft'};
    277         }
    278     }
    279     }
    280 
    281     gsprintf(STDERR, "<PlugInfo>\n");
    282     gsprintf(STDERR, "  <Name>$pluginoptions->{'name'}</Name>\n");
    283     my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
    284     $desc =~ s/</&amp;lt;/g; # doubly escaped
    285     $desc =~ s/>/&amp;gt;/g;
    286     gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
    287     gsprintf(STDERR, "  <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
    288     gsprintf(STDERR, "  <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
    289     gsprintf(STDERR, "  <Processes>$process_exp</Processes>\n");
    290     gsprintf(STDERR, "  <Blocks>$block_exp</Blocks>\n");
    291     gsprintf(STDERR, "  <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
    292     # adding new option that works with replace_srcdoc_with_html.pl
    293     gsprintf(STDERR, "  <SourceReplaceable>" . ($pluginoptions->{'srcreplaceable'} || "no") . "</SourceReplaceable>\n");
    294     unless (defined($high_level_information_only)) {
    295     gsprintf(STDERR, "  <Arguments>\n");
    296     if (defined($pluginoptions->{'args'})) {
    297         &PrintUsage::print_options_xml($pluginoptions->{'args'});
    298     }
    299     gsprintf(STDERR, "  </Arguments>\n");
    300 
    301     # Recurse up the plugin hierarchy
    302     $self->print_xml();
    303     }
    304     gsprintf(STDERR, "</PlugInfo>\n");
    305 }
    306 
    307 
    308 sub print_txt_usage
    309 {
    310     my $self = shift(@_);
    311     # Print the usage message for a plugin (recursively)
    312     my $descoffset = $self->determine_description_offset(0);
    313     $self->print_plugin_usage($descoffset, 1);
    314 }
    315 
    316 
    317 sub determine_description_offset
    318 {
    319     my $self = shift(@_);
    320     my $maxoffset = shift(@_);
    321 
    322     my $optionlistref = $self->{'option_list'};
    323     my @optionlist = @$optionlistref;
    324     my $pluginoptions = shift(@$optionlistref);
    325     return $maxoffset if (!defined($pluginoptions));
    326 
    327     # Find the length of the longest option string of this plugin
    328     my $pluginargs = $pluginoptions->{'args'};
    329     if (defined($pluginargs)) {
    330     my $longest = &PrintUsage::find_longest_option_string($pluginargs);
    331     if ($longest > $maxoffset) {
    332         $maxoffset = $longest;
    333     }
    334     }
    335 
    336     # Recurse up the plugin hierarchy
    337     $maxoffset = $self->determine_description_offset($maxoffset);
    338     $self->{'option_list'} = \@optionlist;
    339     return $maxoffset;
    340 }
    341 
    342 
    343 sub print_plugin_usage
    344 {
    345     my $self = shift(@_);
    346     my $descoffset = shift(@_);
    347     my $isleafclass = shift(@_);
    348 
    349     my $optionlistref = $self->{'option_list'};
    350     my @optionlist = @$optionlistref;
    351     my $pluginoptions = shift(@$optionlistref);
    352     return if (!defined($pluginoptions));
    353 
    354     my $pluginname = $pluginoptions->{'name'};
    355     my $pluginargs = $pluginoptions->{'args'};
    356     my $plugindesc = $pluginoptions->{'desc'};
    357 
    358     # Produce the usage information using the data structure above
    359     if ($isleafclass) {
    360     if (defined($plugindesc)) {
    361         gsprintf(STDERR, "$plugindesc\n\n");
    362     }
    363     gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
    364     }
    365 
    366     # Display the plugin options, if there are some
    367     if (defined($pluginargs)) {
    368     # Calculate the column offset of the option descriptions
    369     my $optiondescoffset = $descoffset + 2;  # 2 spaces between options & descriptions
    370 
    371     if ($isleafclass) {
    372         gsprintf(STDERR, " {common.specific_options}:\n");
    373     }
    374     else {
    375         gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
    376     }
    377 
    378     # Display the plugin options
    379     &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
    380     }
    381 
    382     # Recurse up the plugin hierarchy
    383     $self->print_plugin_usage($descoffset, 0);
    384     $self->{'option_list'} = \@optionlist;
    385 }
    386 
    387 
    388121sub new {
    389     # Set Encodings to the list!!
    390 
    391 
    392     # Start the BasPlug Constructor
    393     my $class = shift (@_);
    394     my ($pluginlist,$args,$hashArgOptLists) = @_;
     122
     123    my ($class) = shift (@_);
     124    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    395125    push(@$pluginlist, $class);
     126
     127    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
     128    push(@{$hashArgOptLists->{"OptList"}},$options);
     129
     130    my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists);
     131
    396132    my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
    397 
    398     if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    399     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
    400 
    401     if (GISBasPlug::has_mapdata()) {
    402     push(@$arguments,@$gis_arguments);
    403     }
    404    
    405     my $self = {};
    406     $self->{'outhandle'} = STDERR;
    407     $self->{'option_list'} = $hashArgOptLists->{"OptList"};
    408     $self->{"info_only"} = 0;
    409 
    410     # Check if gsdlinfo is in the argument list or not - if it is, don't parse
    411     # the args, just return the object. 
    412     foreach my $strArg (@{$args})
    413     {
    414     if($strArg eq "-gsdlinfo")
    415     {
    416         $self->{"info_only"} = 1;
    417         return bless $self, $class;
    418     }
    419     }
    420 
    421     if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
    422     {
    423     my $classTempClass = bless $self, $class;
    424     print STDERR "<BadPlugin p=$plugin_name>\n";
    425     &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
    426     $classTempClass->print_txt_usage("");  # Use default resource bundle
    427     die "\n";
    428     }
    429 
    430 
    431     delete $self->{"info_only"};
    432     # else parsing was successful.
    433 
    434133    $self->{'plugin_type'} = $plugin_name;
    435     #$self->{'outhandle'} = STDERR;
     134
    436135    $self->{'num_processed'} = 0;
    437136    $self->{'num_not_processed'} = 0;
     
    465164    $self->{'file_blocks'} = {};
    466165
    467     if ($self->{'extract_placenames'}) {
    468 
    469     my $outhandle = $self->{'outhandle'};
    470    
    471     my $places_ref
    472         = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
    473    
    474     if (!defined $places_ref) {
    475         print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
    476         print $outhandle "         No placename extraction will take place.\n";
    477         $self->{'extract_placenames'} = undef;
    478     }
    479     else {
    480         $self->{'places'} = $places_ref;
    481     }
    482     }
    483166
    484167    return bless $self, $class;
    485    
    486 }
    487 
    488 # initialize BasPlug options
    489 # if init() is overridden in a sub-class, remember to call BasPlug::init()
     168
     169}
     170
     171# initialize BasePlugin options
     172# if init() is overridden in a sub-class, remember to call BasePlugin::init()
    490173sub init {
    491174    my $self = shift (@_);
     
    520203    my $self = shift (@_);
    521204    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
    522 
    523    #my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
    524    #print STDERR "Calling method; $cfilename:$cline $cpackage->$csubr\n";
    525 
    526     $self->initialise_extractors();
    527205}
    528206
     
    531209    # import.pl only has one plugin pass, but buildcol.pl has multiple ones
    532210
    533     my ($self) = @_;
    534     $self->finalise_extractors();
     211    my ($self) = shift (@_);
    535212}
    536213
     
    539216
    540217    my ($self) = @_;
     218}
     219
     220sub set_incremental {
     221    my $self = shift(@_);
     222    my ($incremental) = @_;
     223
     224    $self->{'incremental'} = $incremental;
    541225}
    542226
     
    752436}
    753437
     438sub get_full_filenames {
     439    my $self = shift (@_);
     440    my ($base_dir, $file) = @_;
     441
     442    my $filename_full_path = $file;
     443    # add on directory if present
     444    $filename_full_path = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
     445    my $filename_no_path = $file;
     446    # remove directory if present
     447    $filename_no_path =~ s/^.*[\/\\]//;
     448    return ($filename_full_path, $filename_no_path);
     449}
    754450
    755451sub read_block {
     
    759455
    760456
    761     my $filename = $file;
    762     $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
    763 
    764     if ($self->associate_with($file,$filename,$metadata)) {
     457    my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file);
     458
     459    if ($self->associate_with($file,$filename_full_path,$metadata)) {
    765460    # a form of smart block
    766461    $self->{'num_blocked'} ++;
     
    772467   
    773468    if ($smart_block || $smart_block_BN) {
    774     if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
     469    if (defined $self->{'file_blocks'}->{$filename_full_path} && $self->{'file_blocks'}->{$filename_full_path} == 1){
    775470        $self->{'num_blocked'} ++;
    776471        return (0,undef); # blocked
    777472    }
    778473    } else {
    779     if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
     474    if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
    780475        $self->{'num_blocked'} ++;
    781476        return (0,undef); # blocked
    782477    }
    783478    if ($self->{'cover_image'}) {
    784         if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
     479        if (defined $self->{'file_blocks'}->{$filename_full_path} && $self->{'file_blocks'}->{$filename_full_path} == 1){
    785480        $self->{'num_blocked'} ++;
    786481        return (0,undef); # blocked
     
    789484    }
    790485
    791     if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
     486    if ($filename_full_path !~ /$self->{'process_exp'}/ || !-f $filename_full_path) {
    792487    return (undef,undef); # can't recognise
    793488    }
    794489   
    795     return (1,$filename);
    796 }
    797 
    798 sub read_tidy_file {
    799 
     490    ##why are we returning the full filename - do we need this??
     491    return (1,$filename_full_path);
     492}
     493
     494
     495#filename_encoding set by user
     496sub filename_to_utf8_metadata
     497{
    800498    my $self = shift (@_); 
    801  
    802     my ($file) = @_;
    803 
    804     $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    805 
    806     return $file;
    807 }
    808 
    809 
    810 sub filename_to_metadata
    811 {
    812     my $self = shift (@_); 
    813     my ($file, $encoding) = @_;
     499    my ($file, $file_encoding) = @_;
    814500
    815501    my $outhandle = $self->{'outhandle'};
    816502
     503    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
     504
     505    my $filename_encoding = $self->{'filename_encoding'};
     506    if ($filename_encoding eq "auto") {
     507    # we check the locale first
     508    if (!defined $self->{'filesystem_encoding'}) {
     509        $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
     510        $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'};
     511    }
     512    if ($self->{'filesystem_encoding'} ne "undefined") {
     513        $filename_encoding = $self->{'filesystem_encoding'};
     514    } else {
     515        # try the encoding of the document, if available
     516        if (defined $file_encoding) {
     517        $filename_encoding = $file_encoding;
     518        } else {
     519        # use utf8
     520        $filename_encoding = "utf8";
     521        }
     522    }
     523   
     524    }
     525
     526    if ($filename_encoding !~ /(?:ascii|utf8|unicode)/) {
     527    $filemeta = unicode::unicode2utf8(
     528      unicode::convert2unicode($filename_encoding, \$filemeta)
     529    );
     530    }
     531    my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
     532
     533    return $dmsafe_filemeta;
     534
     535}
     536
     537
     538sub get_filesystem_encoding {
     539
     540    my $self = shift(@_);
     541
     542    my $outhandle = $self->{'outhandle'};
    817543    my $filesystem_encoding = undef;
    818544
    819545    eval {
    820546    use POSIX qw(locale_h);
    821 
    822     # With only one parameter, setlocale retrieves the current value
     547   
     548    # With only one parameter, setlocale retrieves the
     549    # current value
    823550    my $current_locale = setlocale(LC_CTYPE);
    824 
     551   
    825552    if ($current_locale =~ m/^.*\.(.*?)$/) {
    826553        my $char_encoding = lc($1);
     
    831558        $char_encoding =~ s/-/_/g;
    832559        $char_encoding =~ s/^utf_8$/utf8/;
    833 
     560       
    834561        if ($char_encoding =~ m/^\d+$/) {
    835562        if (defined $encodings::encodings->{"windows_$char_encoding"}) {
     
    840567        }
    841568        }
    842 
     569       
    843570        if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
    844571        || (defined $encodings::encodings->{$char_encoding})) {
     
    849576        }
    850577    }
    851 
     578   
    852579
    853580    };
     
    857584   
    858585    }
    859    
    860     my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
    861    
    862     # how do we know what encoding the filename is in?
    863     # => one answer is to check the locale
    864 
    865     if (defined $filesystem_encoding) {
    866     if ($filesystem_encoding !~ /(?:ascii|utf8|unicode)/) {
    867         $filemeta = unicode::unicode2utf8(
    868             unicode::convert2unicode($filesystem_encoding, \$filemeta)
    869               );
    870     }
    871     }
    872     # assume it is in the same encoding as its contents
    873     elsif ((defined $encoding) && ($encoding !~ /(?:ascii|utf8|unicode)/)) {
    874     $filemeta = unicode::unicode2utf8(
    875         unicode::convert2unicode($encoding, \$filemeta)
    876     );
    877     }
    878    
    879     my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
    880 
    881     return $dmsafe_filemeta;
    882 }
    883 
    884 
    885 sub add_OID
    886 {
     586    return $filesystem_encoding;
     587}
     588
     589# is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
     590sub set_Source_metadata {
     591    my $self = shift (@_); 
     592    my ($doc_obj, $filename_no_path, $file_encoding) = @_;
     593
     594    my $top_section = $doc_obj->get_top_section();
     595   
     596    # the original encoding filename
     597    $doc_obj->set_metadata_element($top_section, "Source", $filename_no_path);
     598    # UTF-8 version of filename
     599    my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
     600    $doc_obj->set_utf8_metadata_element($top_section, "SourceUTF8", $filemeta);
     601
     602}
     603     
     604sub add_OID {
     605
    887606    my $self = shift (@_); 
    888607    my ($doc_obj) = @_;
     
    911630}
    912631
    913 # The BasPlug read_into_doc_obj() function. This function does all the
    914 # right things to make general options work for a given plugin.  It reads in
     632
     633
     634# The BasePlugin read_into_doc_obj() function. This function does all the
     635# right things to make general options work for a given plugin.  It doesn't do anything with the file other than setting reads in
    915636# a file and sets up a slew of metadata all saved in doc_obj, which
    916637# it then returns as part of a tuple (process_status,doc_obj)
     
    926647# Note that $base_dir might be "" and that $file might
    927648# include directories
     649
     650# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
    928651sub read_into_doc_obj {
    929652    my $self = shift (@_); 
    930653    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    931654
    932     if ($self->is_recursive()) {
    933     gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
    934     }
    935 
    936     my $outhandle   = $self->{'outhandle'};
    937 
    938     my ($block_status,$filename) = $self->read_block(@_);   
    939     return $block_status if ((!defined $block_status) || ($block_status==0));
    940     $file = $self->read_tidy_file($file);
    941 
    942     # Do encoding stuff
    943     my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
    944     if ($self->{'verbosity'} > 2) {
    945     print $outhandle "BasPlug: reading $file as ($encoding,$language)\n";
    946     }
    947 
     655    my $outhandle = $self->{'outhandle'};
     656
     657    # should we move this to read? What about secondary plugins?
     658    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
     659    print $outhandle "$self->{'plugin_type'} processing $file\n"
     660        if $self->{'verbosity'} > 1;
     661
     662    my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file);
    948663    # create a new document
    949     my $doc_obj = new doc ($filename, "indexed_doc");
     664    my $doc_obj = new doc ($filename_full_path, "indexed_doc");
    950665    my $top_section = $doc_obj->get_top_section();
    951666
    952     $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    953     $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    954     $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
     667    # this should look at the plugin option too...
     668    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});   
    955669    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
    956     $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename));
    957 
    958     my $filemeta = $self->filename_to_metadata($file,$encoding);
    959     $doc_obj->add_utf8_metadata($top_section, "Source", $filemeta);
     670    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
     671 
     672    $self->Set_Source_metadata($doc_obj, $filename_no_path);
     673
     674    # plugin specific stuff - what args do we need here??
     675    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
     676    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
     677    return -1;
     678    }
     679   
     680    # include any metadata passed in from previous plugins
     681    # note that this metadata is associated with the top level section
     682    my $section = $doc_obj->get_top_section();
     683    # can we merge these two methods??
     684    $self->add_associated_files($doc_obj, $filename_full_path);
     685    $self->extra_metadata ($doc_obj, $section, $metadata);
     686    $self->auto_extract_metadata($doc_obj);
     687
     688    # if we haven't found any Title so far, assign one
     689    # this was shifted to here from inside read()
     690    $self->title_fallback($doc_obj,$section,$filename_no_path);
     691   
     692    $self->add_OID($doc_obj);
     693   
     694    return (1,$doc_obj);
     695}
     696
     697sub add_dummy_text {
     698    my $self = shift(@_);
     699    my ($doc_obj, $section) = @_;
     700
     701    # add NoText metadata so we can hide this dummy text in format statements
     702    $doc_obj->add_metadata($section, "NoText", "1");
     703    $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
     704   
     705}
     706
     707# does nothing. Can be overridden by subclass
     708sub auto_extract_metadata {
     709    my $self = shift(@_);
     710    my ($doc_obj) = @_;
     711}
     712
     713# adds cover image, associate_file options stuff. Should be called by sub class
     714# read_into_doc_obj
     715sub add_associated_files {
     716    my $self = shift(@_);
     717    # whatis filename??
     718    my ($doc_obj, $filename) = @_;
     719   
     720    # add in the cover image
    960721    if ($self->{'cover_image'}) {
    961722    $self->associate_cover_image($doc_obj, $filename);
    962723    }
    963    
    964     # read in file ($text will be in utf8)
    965     my $text = "";
    966     $self->read_file ($filename, $encoding, $language, \$text);
    967 
    968     if (!length ($text)) {
    969     my $plugin_name = ref ($self);
    970     if ($gli) {
    971         print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
    972     }
    973     gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
    974 
    975     my $failhandle = $self->{'failhandle'};
    976     gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
    977     # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
    978     $self->{'num_not_processed'} ++;
    979 
    980     return (0,undef); # what should we return here?? error but don't want to pass it on
    981     }
    982724   
    983     # include any metadata passed in from previous plugins
    984     # note that this metadata is associated with the top level section
    985 
    986     my $associate_tail_re = $self->{'associate_tail_re'};
    987 
    988     $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
    989 
    990     # do plugin specific processing of doc_obj
    991     unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
    992     $text = '';
    993     undef $text;
    994     print STDERR "<ProcessingError n='$file'>\n" if ($gli);
    995     return (-1,undef);
    996     }
    997     $text='';
    998     undef $text;
    999    
    1000     # do any automatic metadata extraction
    1001     $self->auto_extract_metadata ($doc_obj);
    1002 
    1003     $self->add_OID($doc_obj);
    1004    
    1005     return (1,$doc_obj);
    1006 }
    1007 
    1008 
    1009 # The BasPlug read() function. This function calls read_into_doc_obj()
     725
     726}
     727
     728# The BasePlugin read() function. This function calls read_into_doc_obj()
    1010729# to ensure all the right things to make general options work for a
    1011730# given plugin are done. It then calls the process() function which
     
    1026745    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    1027746
     747    # check that we are not blocked
     748    my ($block_status,$filename) = $self->read_block(@_);   
     749    return $block_status if ((!defined $block_status) || ($block_status==0));
     750
    1028751    my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
    1029     
     752   
    1030753    if ((defined $process_status) && ($process_status == 1)) {
     754
    1031755    # process the document
    1032756    $processor->process($doc_obj);
    1033    
    1034     if(defined($self->{'places_filename'})){
    1035         &util::rm($self->{'places_filename'});
    1036         $self->{'places_filename'} = undef;
    1037     }
    1038    
     757
    1039758    $self->{'num_processed'} ++;
    1040759    undef $doc_obj;
    1041760    }
     761    # delete any temp files that we may have created
     762    $self->clean_up_after_doc_obj_processing();
    1042763
    1043764    # if process_status == 1, then the file has been processed.
     
    1051772    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    1052773
    1053     gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
    1054     # die "Basplug::process function must be implemented in sub-class\n";
     774    gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n";
     775    # die "BasePlugin::process function must be implemented in sub-class\n";
    1055776
    1056777    return undef; # never gets here
    1057778}
    1058779
    1059 # uses the multiread package to read in the entire file pointed to
    1060 # by filename and loads the resulting text into $$textref. Input text
    1061 # may be in any of the encodings handled by multiread, output text
    1062 # will be in utf8
    1063 sub read_file {
    1064     my $self = shift (@_);
    1065     my ($filename, $encoding, $language, $textref) = @_;
    1066 
    1067     if (!-r $filename)
    1068     {
    1069     my $outhandle = $self->{'outhandle'};
    1070     gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
    1071     # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
    1072     return;
    1073     }
    1074     $$textref = "";
    1075     if (!open (FILE, $filename)) {
    1076     gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
    1077     die "\n";
    1078     }
    1079      
    1080     if ($encoding eq "ascii") {
    1081     undef $/;
    1082     $$textref = <FILE>;
    1083     $/ = "\n";
    1084     } else {
    1085     my $reader = new multiread();
    1086     $reader->set_handle ('BasPlug::FILE');
    1087     $reader->set_encoding ($encoding);
    1088     $reader->read_file ($textref);
    1089         #Now segments chinese if the separate_cjk option is set
    1090     if ($self->{'separate_cjk'}) {
    1091         # segment the Chinese words
    1092         $$textref = &cnseg::segment($$textref);
    1093     }
    1094     }
    1095     close FILE;
    1096 }
    1097 
     780# overwrite this method to delete any temp files that we have created
     781sub clean_up_after_doc_obj_processing {
     782    my $self = shift(@_);
     783
     784}
    1098785# write_file -- used by ConvertToPlug, for example in post processing
    1099786#
     787# where should this go, is here the best place??
    1100788sub utf8_write_file {
    1101789    my $self = shift (@_);
     
    1130818    my ($doc_obj,$section,$file) = @_;
    1131819
    1132     if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
    1133 
    1134     my $file_derived_title = $self->filename_based_title($file);
    1135    
    1136     $doc_obj->add_utf8_metadata ($section, "Title", $self->filename_to_metadata($file_derived_title));
    1137     }
    1138 }
    1139 
    1140 sub textcat_get_language_encoding {
    1141     my $self = shift (@_);
    1142     my ($filename) = @_;
    1143 
    1144    
    1145     my ($language, $encoding, $extracted_encoding);
    1146     if ($self->{'input_encoding'} eq "auto") {
    1147         # use textcat to automatically work out the input encoding and language
    1148         ($language, $encoding) = $self->get_language_encoding ($filename);
    1149     } elsif ($self->{'extract_language'}) {
    1150         # use textcat to get language metadata
    1151         ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
    1152         $encoding = $self->{'input_encoding'};
    1153     # don't print this message for english... english in utf8 is identical
    1154     # to english in iso-8859-1 (except for some punctuation). We don't have
    1155     # a language model for en_utf8, so textcat always says iso-8859-1!
    1156         if ($extracted_encoding ne $encoding && $language ne "en"
    1157         && $self->{'verbosity'}) {
    1158         my $plugin_name = ref ($self);
    1159         my $outhandle = $self->{'outhandle'};
    1160         gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
    1161         }
    1162     } else {
    1163         $language = $self->{'default_language'};
    1164         $encoding = $self->{'input_encoding'};
    1165     }
    1166 
    1167     return ($language, $encoding);
    1168 }
    1169 
    1170 # Uses textcat to work out the encoding and language of the text in
    1171 # $filename. All html tags are removed before processing.
    1172 # returns an array containing "language" and "encoding"
    1173 sub get_language_encoding {
    1174     my $self = shift (@_);
    1175     my ($filename) = @_;
    1176     my $outhandle = $self->{'outhandle'};
    1177     my $unicode_format = "";
    1178     my $best_language = "";
    1179     my $best_encoding = "";
    1180    
    1181     # read in file
    1182     if (!open (FILE, $filename)) {
    1183     gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
    1184     # this is a pretty bad error, but try to continue anyway
    1185     return ($self->{'default_language'}, $self->{'input_encoding'});
    1186     }
    1187     undef $/;
    1188     my $text = <FILE>;
    1189     $/ = "\n";
    1190     close FILE;
    1191 
    1192     # check if first few bytes have a Byte Order Marker
    1193     my $bom=substr($text,0,2); # check 16bit unicode
    1194     if ($bom eq "\xff\xfe") { # little endian 16bit unicode
    1195     $unicode_format="unicode";
    1196     } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
    1197     $unicode_format="unicode";
    1198     } else {
    1199     $bom=substr($text,0,3); # check utf-8
    1200     if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
    1201         $unicode_format="utf8";
    1202 #   } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
    1203 #       $unicode_format="utf8";
    1204     }
    1205     }
    1206    
    1207 
    1208     # handle html files specially
    1209     # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
    1210     if (ref($self) eq 'HTMLPlug' ||
    1211     (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
    1212 
    1213     # remove <title>stuff</title> -- as titles tend often to be in English
    1214     # for foreign language documents
    1215     $text =~ s!<title>.*?</title>!!si;
    1216 
    1217     # see if this html file specifies its encoding
    1218     if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
    1219         $best_encoding = $1;
    1220     } elsif ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)"/i) {#"
    1221         $best_encoding = $1;
    1222     }
    1223     if ($best_encoding) { # we extracted an encoding
    1224         $best_encoding =~ s/-+/_/g;
    1225         $best_encoding = lc($best_encoding); # lowercase
    1226         if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
    1227         $self->{'input_encoding'} = $best_encoding;
    1228     }
    1229        
    1230     # remove all HTML tags
    1231     $text =~ s/<[^>]*>//sg;
    1232     }
    1233 
    1234     # get the language/encoding
    1235     $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
    1236     my $results = $self->{'textcat'}->classify(\$text);
    1237 
    1238     # if textcat returns 3 or less possibilities we'll use the
    1239     # first one in the list - otherwise use the defaults
    1240     if (scalar @$results > 3) {
    1241     if ($unicode_format) { # in case the first had a BOM
    1242         $best_encoding=$unicode_format;
    1243     } else {
    1244         my %guessed_encodings = ();
    1245         foreach my $result (@$results) {
    1246         $result =~ /([^\-]+)$/;
    1247         my $enc=$1;
    1248         if (!defined($guessed_encodings{$enc})) {
    1249             $guessed_encodings{$enc}=0;
    1250         }
    1251         $guessed_encodings{$enc}++;
    1252         }
    1253 
    1254         $guessed_encodings{""}=-1; # for default best_encoding of ""
    1255         foreach my $enc (keys %guessed_encodings) {
    1256         if ($guessed_encodings{$enc} >
    1257             $guessed_encodings{$best_encoding}){
    1258             $best_encoding=$enc;
    1259         }
    1260         }
    1261     }
    1262 
    1263     if ($self->{'input_encoding'} ne 'auto') {
    1264         if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
    1265         gsprintf($outhandle,
    1266              "BasPlug: {BasPlug.could_not_extract_language}\n",
    1267              $filename, $self->{'default_language'});
    1268         }       
    1269         $best_language = $self->{'default_language'};
    1270         $best_encoding = $self->{'input_encoding'};
    1271 
    1272     } else {
    1273         if ($self->{'verbosity'}>2) {
    1274         gsprintf($outhandle,
    1275              "BasPlug: {BasPlug.could_not_extract_language}\n",
    1276              $filename, $self->{'default_language'});
    1277         }
    1278         $best_language = $self->{'default_language'};
    1279     }
    1280     } else { # <= 3 suggestions
    1281     my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
    1282     if (!defined $language) {
    1283         if ($self->{'verbosity'}>2) {
    1284         gsprintf($outhandle,
    1285             "BasPlug: {BasPlug.could_not_extract_language}\n",
    1286             $filename, $self->{'default_language'});
    1287         }
    1288         $language = $self->{'default_language'};
    1289     }
    1290     if (!defined $encoding) {
    1291         if ($self->{'verbosity'}>2) {
    1292         gsprintf($outhandle,
    1293             "BasPlug: {BasPlug.could_not_extract_encoding}\n",
    1294             $filename, $self->{'default_encoding'});
    1295         }
    1296         $encoding = $self->{'default_encoding'};
    1297     }
    1298     $best_language = $language;
    1299     if (! $best_encoding ) { # may already be set... eg from html meta tag
    1300         $best_encoding = $encoding;
    1301     }
    1302     }
    1303 
    1304     my $text_copy = $text;
    1305     if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) {
    1306     # the text is valid utf8, so assume that's the real encoding
    1307     # (since textcat is based on probabilities)
    1308     $best_encoding = 'utf8';
    1309     }
    1310 
    1311     # check for equivalents where textcat doesn't have some encodings...
    1312     # eg MS versions of standard encodings
    1313     if ($best_encoding =~ /^iso_8859_(\d+)/) {
    1314     my $iso = $1; # which variant of the iso standard?
    1315     # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
    1316     if ($text =~ /[\x80-\x9f]/) {
    1317         # Western Europe
    1318         if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
    1319         elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
    1320         elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
    1321         elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
    1322         elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
    1323         elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
    1324         elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
    1325     }
    1326     }
    1327 
    1328     if ($best_encoding !~ /^(ascii|utf8|unicode)$/ &&
    1329     !defined $encodings::encodings->{$best_encoding}) {
    1330     if ($self->{'verbosity'}) {
    1331         gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
    1332              $filename, $best_encoding, $self->{'default_encoding'});
    1333     }
    1334     $best_encoding = $self->{'default_encoding'};
    1335     }
    1336 
    1337     return ($best_language, $best_encoding);
    1338 }
    1339 
     820    if (!defined $doc_obj->get_metadata_element ($section, "Title") or $doc_obj->get_metadata_element($section, "Title") eq "") {
     821
     822    my $file_derived_title = $self->filename_to_metadata($self->filename_based_title($file));
     823    if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
     824        $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
     825    }
     826    else {
     827        $doc_obj->set_utf8_metadata ($section, "Title", $file_derived_title);
     828    }
     829    }
     830       
     831}
     832 
    1340833# add any extra metadata that's been passed around from one
    1341834# plugin to another.
     
    1401894}
    1402895
    1403 # initialise metadata extractors
    1404 sub initialise_extractors {
    1405     my $self = shift (@_);
    1406 
    1407     if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
    1408     &acronym::initialise_acronyms();
    1409     }
    1410 }
    1411 
    1412 # finalise metadata extractors
    1413 sub finalise_extractors {
    1414     my $self = shift (@_);
    1415 
    1416     if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
    1417     &acronym::finalise_acronyms();
    1418     }
    1419 }
    1420 
    1421 # FIRSTNNN: extract the first NNN characters as metadata
    1422 sub extract_first_NNNN_characters {
    1423     my $self = shift (@_);
    1424     my ($textref, $doc_obj, $thissection) = @_;
    1425    
    1426     foreach my $size (split /,/, $self->{'first'}) {
    1427     my $tmptext =  $$textref;
    1428     $tmptext =~ s/^\s+//;
    1429     $tmptext =~ s/\s+$//;
    1430     $tmptext =~ s/\s+/ /gs;
    1431     $tmptext = substr ($tmptext, 0, $size);
    1432     $tmptext =~ s/\s\S*$/&#8230;/;
    1433     $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    1434     }
    1435 }
    1436 
    1437 sub extract_email {
    1438     my $self = shift (@_);
    1439     my ($textref, $doc_obj, $thissection) = @_;
    1440     my $outhandle = $self->{'outhandle'};
    1441 
    1442     gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
    1443     if ($self->{'verbosity'} > 2);
    1444    
    1445     my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
    1446     @email = sort @email;
    1447    
    1448 #    if($self->{"new_extract_email"} == 0)
    1449 #    {
    1450 #    my @email2 = ();
    1451 #    foreach my $address (@email)
    1452 #   {
    1453 #   if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))
    1454 #       {
    1455 #       push @email2, $address;
    1456 #       $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
    1457 #       # print $outhandle "  extracting $address\n"
    1458 #       &gsprintf($outhandle, "  {BasPlug.extracting} $address\n")
    1459 #           if ($self->{'verbosity'} > 3);
    1460 #       }
    1461 #   }
    1462 #    }
    1463 #    else
    1464 #    {
    1465     my $hashExistMail = {};
    1466     foreach my $address (@email) {
    1467     if (!(defined $hashExistMail->{$address}))
    1468     {
    1469         $hashExistMail->{$address} = 1;
    1470         $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
    1471         gsprintf($outhandle, "  {BasPlug.extracting} $address\n")
    1472         if ($self->{'verbosity'} > 3);
    1473     }
    1474     }
    1475     gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
    1476     if ($self->{'verbosity'} > 2);
    1477 }
    1478 
    1479 # extract metadata
    1480 sub auto_extract_metadata {
    1481 
    1482     my $self = shift (@_);
    1483     my ($doc_obj) = @_;
    1484    
    1485     if ($self->{'extract_email'}) {
    1486     my $thissection = $doc_obj->get_top_section();
    1487     while (defined $thissection) {
    1488         my $text = $doc_obj->get_text($thissection);
    1489         $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
    1490         $thissection = $doc_obj->get_next_section ($thissection);
    1491     }
    1492     }
    1493     if ($self->{'extract_placenames'}) {
    1494     my $thissection = $doc_obj->get_top_section();
    1495     while (defined $thissection) {
    1496         my $text = $doc_obj->get_text($thissection);
    1497         $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
    1498         $thissection = $doc_obj->get_next_section ($thissection);
    1499     }
    1500     }
    1501 
    1502     if ($self->{'extract_keyphrases'} || $self->{'extract_keyphrases_kea4'}) {
    1503     $self->extract_keyphrases($doc_obj);
    1504     }
    1505 
    1506     if ($self->{'first'}) {
    1507     my $thissection = $doc_obj->get_top_section();
    1508     while (defined $thissection) {
    1509         my $text = $doc_obj->get_text($thissection);
    1510         $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
    1511         $thissection = $doc_obj->get_next_section ($thissection);
    1512     }
    1513     }   
    1514    
    1515     if ($self->{'extract_acronyms'}) {
    1516     my $thissection = $doc_obj->get_top_section();
    1517     while (defined $thissection) {
    1518         my $text = $doc_obj->get_text($thissection);
    1519         $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
    1520         $thissection = $doc_obj->get_next_section ($thissection);
    1521     }
    1522     }
    1523    
    1524     if ($self->{'markup_acronyms'}) {
    1525     my $thissection = $doc_obj->get_top_section();
    1526     while (defined $thissection) {
    1527         my $text = $doc_obj->get_text($thissection);
    1528         $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
    1529         $doc_obj->delete_text($thissection);
    1530         $doc_obj->add_text($thissection, $text);
    1531         $thissection = $doc_obj->get_next_section ($thissection);
    1532     }
    1533     }
    1534 
    1535     if($self->{'extract_historical_years'}) {
    1536     my $thissection = $doc_obj->get_top_section();
    1537     while (defined $thissection) {
    1538 
    1539         my $text = $doc_obj->get_text($thissection);
    1540         &DateExtract::get_date_metadata($text, $doc_obj,
    1541                         $thissection,
    1542                         $self->{'no_bibliography'},
    1543                         $self->{'maximum_year'},
    1544                         $self->{'maximum_century'});
    1545         $thissection = $doc_obj->get_next_section ($thissection);
    1546     }
    1547     }
    1548 }
    1549 
    1550 
    1551 #adding kea keyphrases
    1552 sub extract_keyphrases
    1553 {
    1554     my $self = shift(@_);
    1555     my $doc_obj = shift(@_);
    1556 
    1557     # Use Kea 3.0 unless 4.0 has been specified
    1558     my $kea_version = "3.0";
    1559     if ($self->{'extract_keyphrases_kea4'}) {
    1560     $kea_version = "4.0";
    1561     }
    1562 
    1563     # Check that Kea exists, and tell the user where to get it if not
    1564     my $keahome = &Kea::get_Kea_directory($kea_version);
    1565     if (!-e $keahome) {
    1566     gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);
    1567     return;
    1568     }
    1569 
    1570     my $thissection = $doc_obj->get_top_section();
    1571     my $text = "";
    1572     my $list;
    1573 
    1574     #loop through sections to gather whole doc
    1575     while (defined $thissection) {
    1576     my $sectiontext = $doc_obj->get_text($thissection);   
    1577     $text = $text.$sectiontext;
    1578     $thissection = $doc_obj->get_next_section ($thissection);
    1579     }
    1580    
    1581     if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
    1582     $list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
    1583     } else { #otherwise call Kea with no options
    1584     $list = &Kea::extract_KeyPhrases ($kea_version, $text);
    1585     }
    1586  
    1587     if ($list){
    1588     # if a list of kea keyphrases was returned (ie not empty)
    1589     if ($self->{'verbosity'}) {
    1590         gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
    1591     }
    1592 
    1593     #add metadata to top section
    1594     $thissection = $doc_obj->get_top_section();
    1595 
    1596     # add all key phrases as one metadata
    1597     $doc_obj->add_metadata($thissection, "Keyphrases", $list);
    1598 
    1599     # add individual key phrases as multiple metadata
    1600     foreach my $keyphrase (split(',', $list)) {
    1601         $keyphrase =~ s/^\s+|\s+$//g;
    1602         $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
    1603     }
    1604     }
    1605 }
    1606 
    1607 
    1608 # extract acronyms from a section in a document. progress is
    1609 # reported to outhandle based on the verbosity. both the Acronym
    1610 # and the AcronymKWIC metadata items are created.
    1611 
    1612 sub extract_acronyms {
    1613     my $self = shift (@_);
    1614     my ($textref, $doc_obj, $thissection) = @_;
    1615     my $outhandle = $self->{'outhandle'};
    1616 
    1617     # print $outhandle " extracting acronyms ...\n"
    1618     gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
    1619     if ($self->{'verbosity'} > 2);
    1620 
    1621     my $acro_array =  &acronym::acronyms($textref);
    1622    
    1623     foreach my $acro (@$acro_array) {
    1624 
    1625     #check that this is the first time ...
    1626     my $seen_before = "false";
    1627     my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
    1628     foreach my $thisAcro (@$previous_data) {
    1629         if ($thisAcro eq $acro->to_string()) {
    1630         $seen_before = "true";
    1631         if ($self->{'verbosity'} >= 4) {
    1632             gsprintf($outhandle, " {BasPlug.already_seen} " .
    1633                  $acro->to_string() . "\n");
    1634         }
    1635         }
    1636     }
    1637 
    1638     if ($seen_before eq "false") {
    1639         #write it to the file ...
    1640         $acro->write_to_file();
    1641 
    1642         #do the normal acronym
    1643         $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
    1644         gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
    1645         if ($self->{'verbosity'} > 3);
    1646     }
    1647     }
    1648 
    1649     gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
    1650     if ($self->{'verbosity'} > 2);
    1651 }
    1652 
    1653 sub markup_acronyms {
    1654     my $self = shift (@_);
    1655     my ($text, $doc_obj, $thissection) = @_;
    1656     my $outhandle = $self->{'outhandle'};
    1657 
    1658     gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
    1659     if ($self->{'verbosity'} > 2);
    1660 
    1661     #self is passed in to check for verbosity ...
    1662     $text = &acronym::markup_acronyms($text, $self);
    1663 
    1664     gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
    1665     if ($self->{'verbosity'} > 2);
    1666 
    1667     return $text;
    1668 }
    1669896
    1670897sub compile_stats {
Note: See TracChangeset for help on using the changeset viewer.