########################################################################### # # BasPlug.pm -- base class for all the import plugins # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package BasPlug; eval {require bytes}; # suppress the annoying "subroutine redefined" warning that various # plugins cause under perl 5.6 $SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)}; use Kea; use parsargv; use multiread; use encodings; use cnseg; use acronym; use textcat; use doc; use diagnostics; use DateExtract; use ghtml; use printusage; my $unicode_list = [ { 'name' => "auto", 'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } , { 'name' => "ascii", 'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." }, { 'name' => "utf8", 'desc' => "either utf8 or unicode -- automatically detected." }, { 'name' => "unicode", 'desc' => "just unicode" } ]; my $arguments = [ { 'name' => "process_exp", 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 'type' => "string", 'deft' => "", 'reqd' => "no" }, { 'name' => "block_exp", 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 'type' => 'string', 'deft' => "", 'reqd' => "no" }, { 'name' => "input_encoding", 'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8.", 'type' => "enum", 'list' => $unicode_list, 'reqd' => "no" , 'deft' => "auto" } , { 'name' => "default_encoding", 'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone.", 'type' => "enum", 'reqd' => "no", 'deft' => "utf8" }, { 'name' => "extract_language", 'desc' => "Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.", 'type' => "flag", 'reqd' => "no" }, { 'name' => "default_language", 'desc' => "If Greenstone fails to work out what language a document is the 'Language' metadata element will be set to this value. The default is 'en' (ISO 639 language symbols are used: en = English). Note that if -input_encoding is not set to 'auto' and -extract_language is not set, all documents will have their 'Language' metadata set to this value.", 'type' => "language", 'deft' => "en", 'reqd' => "no" }, { 'name' => "extract_acronyms", 'desc' => "Extract acronyms from within text and set as metadata.", 'type' => "flag", 'reqd' => "no" }, { 'name' => "markup_acronyms", 'desc' => "Add acronym metadata into document text.", 'type' => "flag", 'reqd' => "no" }, { 'name' => "first", 'desc' => "Comma separated list of first sizes to extract from the text into a metadata field. The field is called 'FirstNNN'.", 'type' => "string", 'reqd' => "no" }, { 'name' => "extract_email", 'desc' => "Extract email addresses as metadata.", 'type' => "flag", 'reqd' => "no" }, { 'name' => "extract_historical_years", 'desc' => "Extract time-period information from historical documents. This is stored as metadata with the document. There is a search interface for this metadata, which you can include in your collection by adding the statement, \"format QueryInterface DateSearch\" to your collection configuration file.", 'type' => "flag", 'reqd' => "no" }, { 'name' => "maximum_year", 'desc' => "The maximum historical date to be used as metadata (in a Common Era date, such as 1950).", 'type' => "int", 'deft' => (localtime)[5]+1900, 'reqd' => "no"}, { 'name' => "maximum_century", 'desc' => "The maximum named century to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century).", 'type' => "int", 'deft' => "-1", 'reqd' => "no" }, { 'name' => "no_bibliography", 'desc' => "Do not try to block bibliographic dates when extracting historical dates.", 'type' => "flag", 'reqd' => "no"}, { 'name' => "cover_image", 'desc' => "Will look for a prefix.jpg file (where prefix is the same prefix as the file being processed) and associate it as a cover image.", 'type' => "flag", 'reqd' => "no" } ]; my $options = { 'name' => "BasPlug", 'desc' => "Base class for all the import plugins.", 'inherits' => "No", 'args' => $arguments }; sub print_xml_usage { local $self = shift(@_); print STDERR "\n\n"; $self->print_xml(); } sub print_xml { local $self = shift(@_); local $optionlistref = $self->{'option_list'}; local @optionlist = @$optionlistref; local $pluginoptions = pop(@$optionlistref); return if (!defined($pluginoptions)); print STDERR "\n"; print STDERR " $pluginoptions->{'name'}\n"; print STDERR " $pluginoptions->{'desc'}\n"; print STDERR " $pluginoptions->{'inherits'}\n"; print STDERR " \n"; if (defined($pluginoptions->{'args'})) { &PrintUsage::print_options_xml($pluginoptions->{'args'}); } # Recurse up the plugin hierarchy $self->print_xml(); print STDERR " \n"; print STDERR "\n"; } sub print_txt_usage { local $self = shift(@_); # Print the usage message for a plugin (recursively) local $descoffset = $self->determine_description_offset(0); $self->print_plugin_usage($descoffset, 1); } sub determine_description_offset { local $self = shift(@_); local $maxoffset = shift(@_); local $optionlistref = $self->{'option_list'}; local @optionlist = @$optionlistref; local $pluginoptions = pop(@$optionlistref); return $maxoffset if (!defined($pluginoptions)); # Find the length of the longest option string of this plugin local $pluginargs = $pluginoptions->{'args'}; if (defined($pluginargs)) { local $longest = &PrintUsage::find_longest_option_string($pluginargs); if ($longest > $maxoffset) { $maxoffset = $longest; } } # Recurse up the plugin hierarchy $maxoffset = $self->determine_description_offset($maxoffset); $self->{'option_list'} = \@optionlist; return $maxoffset; } sub print_plugin_usage { local $self = shift(@_); local $descoffset = shift(@_); local $isleafclass = shift(@_); local $optionlistref = $self->{'option_list'}; local @optionlist = @$optionlistref; local $pluginoptions = pop(@$optionlistref); return if (!defined($pluginoptions)); local $pluginname = $pluginoptions->{'name'}; local $pluginargs = $pluginoptions->{'args'}; # Produce the usage information using the data structure above if ($isleafclass) { print STDERR " usage: plugin $pluginname [options]\n\n"; } # Display the plugin options, if there are some if (defined($pluginargs)) { # Calculate the column offset of the option descriptions local $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions if ($isleafclass) { print STDERR " specific options:\n"; } else { print STDERR " general options (from $pluginname):\n"; } # Display the plugin options &PrintUsage::print_options_txt($pluginargs, $optiondescoffset); } # Recurse up the plugin hierarchy $self->print_plugin_usage($descoffset, 0); $self->{'option_list'} = \@optionlist; } # sub print_general_usage { # my ($plugin_name) = @_; # print STDERR "\n usage: plugin $plugin_name [options]\n\n"; # print STDERR " -process_exp A perl regular expression to match against filenames.\n"; # print STDERR " Matching filenames will be processed by this plugin.\n"; # print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n"; # print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n"; # print STDERR " .htm or .html (case-insensitive).\n\n"; # print STDERR " -block_exp Files matching this regular expression will be blocked from\n"; # print STDERR " being passed to any later plugins in the list. This has no\n"; # print STDERR " real effect other than to prevent lots of warning messages\n"; # print STDERR " about input files you don't care about. Each plugin might\n"; # print STDERR " have a default block_exp. e.g. by default HTMLPlug blocks\n"; # print STDERR " any files with .gif, .jpg, .jpeg, .png or .css\n"; # print STDERR " file extensions.\n\n"; # print STDERR " -input_encoding The encoding of the source documents. Documents will be\n"; # print STDERR " converted from these encodings and stored internally as\n"; # print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n"; # print STDERR " are:\n"; # print STDERR " auto: Use text categorization algorithm to automatically\n"; # print STDERR " identify the encoding of each source document. This\n"; # print STDERR " will be slower than explicitly setting the encoding\n"; # print STDERR " but will work where more than one encoding is used\n"; # print STDERR " within the same collection.\n"; # print STDERR " ascii: Plain 7 bit ascii. This may be a bit faster than\n"; # print STDERR " using iso_8859_1. Beware of using this on a collection\n"; # print STDERR " of documents that may contain characters outside the\n"; # print STDERR " plain 7 bit ascii set though (e.g. German or French\n"; # print STDERR " documents containing accents), use iso_8859_1 instead.\n"; # print STDERR " utf8: either utf8 or unicode -- automatically detected\n"; # print STDERR " unicode: just unicode\n"; # my $e = $encodings::encodings; # foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) { # print STDERR " $enc: $e->{$enc}->{'name'}\n"; # } # print STDERR "\n"; # print STDERR " -default_encoding Use this encoding if -input_encoding is set to 'auto' and\n"; # print STDERR " the text categorization algorithm fails to extract the\n"; # print STDERR " encoding or extracts an encoding unsupported by Greenstone.\n"; # print STDERR " The default is iso_8859_1.\n\n"; # print STDERR " -extract_language Identify the language of each document and set 'Language'\n"; # print STDERR " metadata. Note that this will be done automatically if\n"; # print STDERR " -input_encoding is 'auto'.\n\n"; # print STDERR " -default_language If Greenstone fails to work out what language a document is\n"; # print STDERR " the 'Language' metadata element will be set to this value.\n"; # print STDERR " The default is 'en' (ISO 639 language symbols are used:\n"; # print STDERR " en = English). Note that if -input_encoding is not set to\n"; # print STDERR " 'auto' and -extract_language is not set, all documents will\n"; # print STDERR " have their 'Language' metadata set to this value.\n\n"; # print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n"; # print STDERR " -markup_acronyms Add acronym metadata into document text\n\n"; # print STDERR " -first Comma separated list of first sizes to extract from the\n"; # print STDERR " text into a metadata field. The field is called 'FirstNNN'.\n\n"; # print STDERR " -extract_email Extract email addresses as metadata\n\n"; # print STDERR " -extract_historical_years Extract time-period information from historical\n"; # print STDERR " documents. This is stored as metadata with the document.\n"; # print STDERR " There is a search interface for this metadata, which you \n"; # print STDERR " can include in your collection by adding the statement:\n"; # print STDERR " format QueryInterface DateSearch\n"; # print STDERR " to your collection configuration file\n"; # print STDERR " -maximum_year The maximum historical date to be used as metadata (in a\n"; # print STDERR " Common Era date, such as 1950)\n"; # print STDERR " -maximum_century The maximum named century to be extracted as historical\n"; # print STDERR " metadata (e.g. 14 will extract all references up to the\n"; # print STDERR " 14th century)\n"; # print STDERR " -no_bibliography Do not try and block bibliographic dates when extracting\n"; # print STDERR " historical dates.\n"; # print STDERR " -cover_image Will look for a prefix.jpg file (where prefix is the same\n"; # print STDERR " prefix as the file being processed) and associate it as a\n"; # print STDERR " cover image\n\n"; # } # sub print_usage { # print STDERR "\nThis plugin has no plugin specific options\n\n"; # } sub new { my $class = shift (@_); my $plugin_name = shift (@_); my $self = {}; my $enc = "^("; map {$enc .= "$_|";} keys %$encodings::encodings; my $denc = $enc . "ascii|utf8|unicode)\$"; $enc .= "ascii|utf8|unicode|auto)\$"; $self->{'outhandle'} = STDERR; my $year = (localtime)[5]+1900; $self->{'textcat'} = new textcat(); $self->{'num_processed'} = 0; $self->{'num_not_processed'} = 0; $self->{'num_blocked'} = 0; $self->{'num_archives'} = 0; # 14-05-02 To allow for proper inheritance of arguments - John Thompson $self->{'option_list'} = [ $options ]; # general options available to all plugins if (!parsargv::parse(\@_, q^process_exp/.*/^, \$self->{'process_exp'}, q^block_exp/.*/^, \$self->{'block_exp'}, q^extract_language^, \$self->{'extract_language'}, q^extract_acronyms^, \$self->{'extract_acronyms'}, q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED) q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED) qq^input_encoding/$enc/auto^, \$self->{'input_encoding'}, qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'}, q^extract_email^, \$self->{'extract_email'}, q^markup_acronyms^, \$self->{'markup_acronyms'}, q^default_language/.{2}/en^, \$self->{'default_language'}, q^first/.*/^, \$self->{'first'}, q^extract_historical_years^, \$self->{'date_extract'}, qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'}, q^no_bibliography^, \$self->{'no_biblio'}, qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'}, q^cover_image^, \$self->{'cover_image'}, "allow_extra_options")) { print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n"; print STDERR "available to all plugins). Check your collect.cfg configuration file.\n"; # &print_general_usage($plugin_name); $self->print_txt_usage(); die "\n"; } return bless $self, $class; } # initialize BasPlug options # if init() is overridden in a sub-class, remember to call BasPlug::init() sub init { my $self = shift (@_); my ($verbosity, $outhandle, $failhandle) = @_; # verbosity is passed through from the processor $self->{'verbosity'} = $verbosity; # as are the outhandle and failhandle $self->{'outhandle'} = $outhandle if defined $outhandle; $self->{'failhandle'} = $failhandle; # set process_exp and block_exp to defaults unless they were # explicitly set if ((!$self->is_recursive()) and (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) { $self->{'process_exp'} = $self->get_default_process_exp (); if ($self->{'process_exp'} eq "") { warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n"; } } if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) { $self->{'block_exp'} = $self->get_default_block_exp (); } } sub begin { my $self = shift (@_); my ($pluginfo, $base_dir, $processor, $maxdocs) = @_; $self->initialise_extractors(); } sub end { my ($self) = @_; $self->finalise_extractors(); } # this function should be overridden to return 1 # in recursive plugins sub is_recursive { my $self = shift (@_); return 0; } sub get_default_block_exp { my $self = shift (@_); return ""; } sub get_default_process_exp { my $self = shift (@_); return ""; } # The BasPlug read() function. This function does all the right things # to make general options work for a given plugin. It calls the process() # function which does all the work specific to a plugin (like the old # read functions used to do). Most plugins should define their own # process() function and let this read() function keep control. # # recursive plugins (e.g. RecPlug) and specialized plugins like those # capable of processing many documents within a single file (e.g. # GMLPlug) should normally implement their own version of read() # # Return number of files processed, undef if can't process # Note that $base_dir might be "" and that $file might # include directories sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; if ($self->is_recursive()) { die "BasPlug::read function must be implemented in sub-class for recursive plugins\n"; } my $outhandle = $self->{'outhandle'}; my $filename = $file; $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) { $self->{'num_blocked'} ++; return 0; } if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { return undef; } $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up # Do encoding stuff my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); # create a new document my $doc_obj = new doc ($filename, "indexed_doc"); $doc_obj->set_OIDtype ($processor->{'OIDtype'}); $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); my ($filemeta) = $file =~ /([^\\\/]+)$/; # how do we know what encoding the filename is in? $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta)); if ($self->{'cover_image'}) { $self->associate_cover_image($doc_obj, $filename); } # read in file ($text will be in utf8) my $text = ""; $self->read_file ($filename, $encoding, $language, \$text); if (!length ($text)) { my $plugin_name = ref ($self); print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'}; my $failhandle = $self->{'failhandle'}; print $failhandle "$file: " . ref($self) . ": file contains no text\n"; $self->{'num_not_processed'} ++; return 0; } # include any metadata passed in from previous plugins # note that this metadata is associated with the top level section $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); # do plugin specific processing of doc_obj return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj)); # do any automatic metadata extraction $self->auto_extract_metadata ($doc_obj); # add an OID # see if there is a plugin-specific set_OID function... if (defined ($self->can(set_OID))) { # it will need $doc_obj to set the Identifier metadata... $self->set_OID($doc_obj); } else { # use the default set_OID() in doc.pm $doc_obj->set_OID(); } # process the document $processor->process($doc_obj); $self->{'num_processed'} ++; return 1; # processed the file } # returns undef if file is rejected by the plugin sub process { my $self = shift (@_); my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_; die "Basplug::process function must be implemented in sub-class\n"; return undef; # never gets here } # uses the multiread package to read in the entire file pointed to # by filename and loads the resulting text into $$textref. Input text # may be in any of the encodings handled by multiread, output text # will be in utf8 sub read_file { my $self = shift (@_); my ($filename, $encoding, $language, $textref) = @_; if (!-r $filename) { my $outhandle = $self->{'outhandle'}; print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'}; return; } $$textref = ""; open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n"; if ($encoding eq "ascii") { undef $/; $$textref = ; $/ = "\n"; } else { my $reader = new multiread(); $reader->set_handle ('BasPlug::FILE'); $reader->set_encoding ($encoding); $reader->read_file ($textref); if ($language eq "zh") { # segment the Chinese words $$textref = &cnseg::segment($$textref); } } close FILE; } sub textcat_get_language_encoding { my $self = shift (@_); my ($filename) = @_; my ($language, $encoding, $extracted_encoding); if ($self->{'input_encoding'} eq "auto") { # use textcat to automatically work out the input encoding and language ($language, $encoding) = $self->get_language_encoding ($filename); } elsif ($self->{'extract_language'}) { # use textcat to get language metadata ($language, $extracted_encoding) = $self->get_language_encoding ($filename); $encoding = $self->{'input_encoding'}; if ($extracted_encoding ne $encoding && $self->{'verbosity'}) { my $plugin_name = ref ($self); my $outhandle = $self->{'outhandle'}; print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but "; print $outhandle "appears to be encoded as $extracted_encoding.\n"; } } else { $language = $self->{'default_language'}; $encoding = $self->{'input_encoding'}; } return ($language, $encoding); } # Uses textcat to work out the encoding and language of the text in # $filename. All html tags are removed before processing. # returns an array containing "language" and "encoding" sub get_language_encoding { my $self = shift (@_); my ($filename) = @_; my $outhandle = $self->{'outhandle'}; # read in file open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n"; undef $/; my $text = ; $/ = "\n"; close FILE; # remove stuff -- as titles tend often to be in English # for foreign language documents $text =~ s/.*?<\/title>//i; # remove all HTML tags $text =~ s/<[^>]*>//sg; # get the language/encoding my $results = $self->{'textcat'}->classify(\$text); # if textcat returns 3 or less possibilities we'll use the # first one in the list - otherwise use the defaults if (scalar @$results > 3) { # changed 12 Feb 2003 by jrm21 # use the most popular encoding at least... otherwise we might # generate invalid archive files! my %guessed_encodings = (); foreach my $result (@$results) { $result =~ /([^\-]+)$/; my $enc=$1; if (!defined($guessed_encodings{$enc})) { $guessed_encodings{$enc}=0; } $guessed_encodings{$enc}++; } my $best_encoding=""; $guessed_encodings{""}=-1; foreach my $enc (keys %guessed_encodings) { if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}){ $best_encoding=$enc; } } if ($self->{'input_encoding'} ne 'auto') { if ($self->{'extract_language'} && $self->{'verbosity'}) { print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - "; print $outhandle "defaulting to $self->{'default_language'}\n"; } return ($self->{'default_language'}, $self->{'input_encoding'}); } else { if ($self->{'verbosity'}) { print $outhandle "BASPlug: WARNING: language could not be extracted from $filename - "; print $outhandle "defaulting to $self->{'default_language'}.\n"; } return ($self->{'default_language'}, $best_encoding); } } # format language/encoding my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; if (!defined $language) { if ($self->{'verbosity'}) { print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - "; print $outhandle "defaulting to $self->{'default_language'}\n"; } $language = $self->{'default_language'}; } if (!defined $encoding) { if ($self->{'verbosity'}) { print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - "; print $outhandle "defaulting to $self->{'default_encoding'}\n"; } $encoding = $self->{'default_encoding'}; } if ($encoding !~ /^(ascii|utf8|unicode)$/ && !defined $encodings::encodings->{$encoding}) { if ($self->{'verbosity'}) { print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - "; print $outhandle "using $self->{'default_encoding'}\n"; } $encoding = $self->{'default_encoding'}; } return ($language, $encoding); } # add any extra metadata that's been passed around from one # plugin to another. # extra_metadata uses add_utf8_metadata so it expects metadata values # to already be in utf8 sub extra_metadata { my $self = shift (@_); my ($doc_obj, $cursection, $metadata) = @_; foreach my $field (keys(%$metadata)) { # $metadata->{$field} may be an array reference if (ref ($metadata->{$field}) eq "ARRAY") { map { $doc_obj->add_utf8_metadata ($cursection, $field, $_); } @{$metadata->{$field}}; } else { $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field}); } } } # initialise metadata extractors sub initialise_extractors { my $self = shift (@_); if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) { &acronym::initialise_acronyms(); } } # finalise metadata extractors sub finalise_extractors { my $self = shift (@_); if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) { &acronym::finalise_acronyms(); } } # FIRSTNNN: extract the first NNN characters as metadata sub extract_first_NNNN_characters { my $self = shift (@_); my ($textref, $doc_obj, $thissection) = @_; foreach my $size (split /,/, $self->{'first'}) { my $tmptext = $$textref; $tmptext =~ s/^\s+//; $tmptext =~ s/\s+$//; $tmptext =~ s/\s+/ /gs; $tmptext = substr ($tmptext, 0, $size); $tmptext =~ s/\s\S*$/…/; $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext); } } sub extract_email { my $self = shift (@_); my ($textref, $doc_obj, $thissection) = @_; my $outhandle = $self->{'outhandle'}; print $outhandle " extracting email addresses ...\n" if ($self->{'verbosity'} > 2); my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g); @email = sort @email; my @email2 = (); foreach my $address (@email) { if (!(join(" ",@email2) =~ m/$address/ )) { push @email2, $address; $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address); print $outhandle " extracting $address\n" if ($self->{'verbosity'} > 3); } } print $outhandle " done extracting email addresses.\n" if ($self->{'verbosity'} > 2); } # extract metadata sub auto_extract_metadata { my $self = shift (@_); my ($doc_obj) = @_; if ($self->{'extract_email'}) { my $thissection = $doc_obj->get_top_section(); while (defined $thissection) { my $text = $doc_obj->get_text($thissection); $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./; $thissection = $doc_obj->get_next_section ($thissection); } } #adding kea keyphrases if ($self->{'kea'}) { my $thissection = $doc_obj->get_top_section(); my $text = ""; my @list; while (defined $thissection) { #loop through sections to gather whole doc my $sectiontext = $doc_obj->get_text($thissection); $text = $text.$sectiontext; $thissection = $doc_obj->get_next_section ($thissection); } if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options @list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'}); } else { #otherwise call Kea with no options @list = &Kea::extract_KeyPhrases ($text); } if(@list){ #if a list of kea keyphrases was returned (ie not empty) my $keyphrases = $list[0]; #first arg is keyphrase list my $stems = $list[1]; #second arg is stemmed keyphrase list print STDERR "keyphrases: $keyphrases\n"; print STDERR "stems: $stems\n"; $thissection = $doc_obj->get_top_section(); #add metadata to top section $doc_obj->add_metadata($thissection, "kea", $keyphrases); $doc_obj->add_metadata($thissection, "stems", $stems); } } #end of kea if ($self->{'first'}) { my $thissection = $doc_obj->get_top_section(); while (defined $thissection) { my $text = $doc_obj->get_text($thissection); $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./; $thissection = $doc_obj->get_next_section ($thissection); } } if ($self->{'extract_acronyms'}) { my $thissection = $doc_obj->get_top_section(); while (defined $thissection) { my $text = $doc_obj->get_text($thissection); $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./; $thissection = $doc_obj->get_next_section ($thissection); } } if ($self->{'markup_acronyms'}) { my $thissection = $doc_obj->get_top_section(); while (defined $thissection) { my $text = $doc_obj->get_text($thissection); $text = $self->markup_acronyms ($text, $doc_obj, $thissection); $doc_obj->delete_text($thissection); $doc_obj->add_text($thissection, $text); $thissection = $doc_obj->get_next_section ($thissection); } } if($self->{'date_extract'}) { my $thissection = $doc_obj->get_top_section(); while (defined $thissection) { my $text = $doc_obj->get_text($thissection); &DateExtract::get_date_metadata($text, $doc_obj, $thissection, $self->{'no_biblio'}, $self->{'max_year'}, $self->{'max_century'}); $thissection = $doc_obj->get_next_section ($thissection); } } } # extract acronyms from a section in a document. progress is # reported to outhandle based on the verbosity. both the Acronym # and the AcronymKWIC metadata items are created. sub extract_acronyms { my $self = shift (@_); my ($textref, $doc_obj, $thissection) = @_; my $outhandle = $self->{'outhandle'}; print $outhandle " extracting acronyms ...\n" if ($self->{'verbosity'} > 2); my $acro_array = &acronym::acronyms($textref); foreach my $acro (@$acro_array) { #check that this is the first time ... my $seen_before = "false"; my $previous_data = $doc_obj->get_metadata($thissection, "Acronym"); foreach my $thisAcro (@$previous_data) { if ($thisAcro eq $acro->to_string()) { $seen_before = "true"; print $outhandle " already seen ". $acro->to_string() . "\n" if ($self->{'verbosity'} >= 4); } } if ($seen_before eq "false") { #write it to the file ... $acro->write_to_file(); #do the normal acronym $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string()); print $outhandle " adding ". $acro->to_string() . "\n" if ($self->{'verbosity'} > 3); } } print $outhandle " done extracting acronyms. \n" if ($self->{'verbosity'} > 2); } sub markup_acronyms { my $self = shift (@_); my ($text, $doc_obj, $thissection) = @_; my $outhandle = $self->{'outhandle'}; print $outhandle " marking up acronyms ...\n" if ($self->{'verbosity'} > 2); #self is passed in to check for verbosity ... $text = &acronym::markup_acronyms($text, $self); print $outhandle " done marking up acronyms. \n" if ($self->{'verbosity'} > 2); return $text; } sub compile_stats { my $self = shift(@_); my ($stats) = @_; $stats->{'num_processed'} += $self->{'num_processed'}; $stats->{'num_not_processed'} += $self->{'num_not_processed'}; $stats->{'num_archives'} += $self->{'num_archives'}; } sub associate_cover_image { my $self = shift(@_); my ($doc_obj, $filename) = @_; $filename =~ s/\.[^\\\/\.]+$/\.jpg/; if (-e $filename) { $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg"); } else { $filename =~ s/jpg$/JPG/; if (-e $filename) { $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg"); } } } 1;