Changeset 15868
- Timestamp:
- 2008-06-05T09:21:21+12:00 (16 years ago)
- Location:
- gsdl/trunk/perllib/plugins
- Files:
-
- 3 added
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/BasePlugin.pm
r15865 r15868 1 1 ########################################################################### 2 2 # 3 # Bas Plug.pm -- base class for all the import plugins3 # BasePlugin.pm -- base class for all the import plugins 4 4 # A component of the Greenstone digital library software 5 5 # from the New Zealand Digital Library Project at the … … 24 24 ########################################################################### 25 25 26 package BasPlug; 27 28 BEGIN { 29 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'}; 30 } 31 32 eval {require bytes}; 33 34 # suppress the annoying "subroutine redefined" warning that various 35 # plugins cause under perl 5.6 36 $SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)}; 26 package BasePlugin; 37 27 38 28 use strict; … … 42 32 use File::Basename; 43 33 44 use Kea;45 34 use multiread; 46 35 use encodings; 47 36 use unicode; 48 use cnseg;49 use acronym;50 use textcat;51 37 use doc; 52 38 eval "require diagnostics"; # some perl distros (eg mac) don't have this 53 use DateExtract;54 39 use ghtml; 55 40 use gsprintf 'gsprintf'; 56 use printusage; 57 use parse2; 58 59 60 use GISBasPlug; 61 62 @BasPlug::ISA = ( GISBasPlug ); 63 64 my $unicode_list = 41 42 use PrintInfo; 43 44 BEGIN { 45 @BasePlugin::ISA = ( 'PrintInfo' ); 46 } 47 48 our $encoding_list = 65 49 [ { 'name' => "ascii", 66 'desc' => "{ BasPlug.input_encoding.ascii}" },50 'desc' => "{ReadTextFile.input_encoding.ascii}" }, 67 51 { 'name' => "utf8", 68 'desc' => "{ BasPlug.input_encoding.utf8}" },52 'desc' => "{ReadTextFile.input_encoding.utf8}" }, 69 53 { 'name' => "unicode", 70 'desc' => "{BasPlug.input_encoding.unicode}" } ]; 71 72 my $auto_unicode_list = 73 [ { 'name' => "auto", 74 'desc' => "{BasPlug.input_encoding.auto}" } ]; 54 'desc' => "{ReadTextFile.input_encoding.unicode}" } ]; 75 55 76 56 my $e = $encodings::encodings; … … 81 61 'desc' => $e->{$enc}->{'name'}}; 82 62 83 push(@{$unicode_list},$hashEncode); 84 } 85 86 push(@{$auto_unicode_list},@{$unicode_list}); 63 push(@{$encoding_list},$hashEncode); 64 } 65 66 our $encoding_plus_auto_list = 67 [ { 'name' => "auto", 68 'desc' => "{ReadTextFile.input_encoding.auto}" } ]; 69 70 push(@{$encoding_plus_auto_list},@{$encoding_list}); 87 71 88 72 my $arguments = 89 73 [ { 'name' => "process_exp", 90 'desc' => "{Bas Plug.process_exp}",74 'desc' => "{BasePlugin.process_exp}", 91 75 'type' => "regexp", 92 76 'deft' => "", 93 77 'reqd' => "no" }, 94 78 { 'name' => "block_exp", 95 'desc' => "{Bas Plug.block_exp}",79 'desc' => "{BasePlugin.block_exp}", 96 80 'type' => "regexp", 97 81 'deft' => "", 98 82 'reqd' => "no" }, 99 83 { 'name' => "smart_block", 100 'desc' => "{Bas Plug.smart_block}",84 'desc' => "{BasePlugin.smart_block}", 101 85 'type' => "flag", 102 86 'reqd' => "no" }, 103 87 { 'name' => "associate_ext", 104 'desc' => "{Bas Plug.associate_ext}",88 'desc' => "{BasePlugin.associate_ext}", 105 89 'type' => "string", 106 90 'reqd' => "no" }, 107 91 { 'name' => "associate_tail_re", 108 'desc' => "{Bas Plug.associate_tail_re}",92 'desc' => "{BasePlugin.associate_tail_re}", 109 93 'type' => "string", 110 94 'reqd' => "no" }, 111 95 { 'name' => "use_as_doc_identifier", 112 'desc' => "{Bas Plug.use_as_doc_identifier}",96 'desc' => "{BasePlugin.use_as_doc_identifier}", 113 97 'type' => "string", 114 98 'reqd' => "no" , 115 99 'deft' => "" } , 116 { 'name' => "input_encoding", 117 'desc' => "{BasPlug.input_encoding}", 118 'type' => "enum", 119 'list' => $auto_unicode_list, 120 'reqd' => "no" , 121 'deft' => "auto" } , 122 { 'name' => "default_encoding", 123 'desc' => "{BasPlug.default_encoding}", 124 'type' => "enum", 125 'list' => $unicode_list, 126 'reqd' => "no", 127 'deft' => "utf8" }, 128 { 'name' => "extract_language", 129 'desc' => "{BasPlug.extract_language}", 100 { 'name' => "no_cover_image", 101 'desc' => "{BasePlugin.no_cover_image}", 130 102 'type' => "flag", 131 103 'reqd' => "no" }, 132 { 'name' => "default_language", 133 'desc' => "{BasPlug.default_language}", 134 'type' => "string", 135 'deft' => "en", 136 'reqd' => "no" }, 137 { 'name' => "extract_acronyms", 138 'desc' => "{BasPlug.extract_acronyms}", 139 'type' => "flag", 140 'reqd' => "no" }, 141 { 'name' => "markup_acronyms", 142 'desc' => "{BasPlug.markup_acronyms}", 143 'type' => "flag", 144 'reqd' => "no" }, 145 { 'name' => "extract_keyphrases", 146 'desc' => "{BasPlug.extract_keyphrases}", 147 'type' => "flag", 148 'reqd' => "no" }, 149 { 'name' => "extract_keyphrases_kea4", 150 'desc' => "{BasPlug.extract_keyphrases_kea4}", 151 'type' => "flag", 152 'reqd' => "no" }, 153 { 'name' => "extract_keyphrase_options", 154 'desc' => "{BasPlug.extract_keyphrase_options}", 155 'type' => "string", 156 'deft' => "", 157 'reqd' => "no" }, 158 { 'name' => "first", 159 'desc' => "{BasPlug.first}", 160 'type' => "string", 161 'reqd' => "no" }, 162 { 'name' => "extract_email", 163 'desc' => "{BasPlug.extract_email}", 164 'type' => "flag", 165 'reqd' => "no" }, 166 { 'name' => "extract_historical_years", 167 'desc' => "{BasPlug.extract_historical_years}", 168 'type' => "flag", 169 'reqd' => "no" }, 170 { 'name' => "maximum_year", 171 'desc' => "{BasPlug.maximum_year}", 172 'type' => "int", 173 'deft' => (localtime)[5]+1900, 174 'char_length' => "4", 175 #'range' => "2,100", 176 'reqd' => "no"}, 177 { 'name' => "maximum_century", 178 'desc' => "{BasPlug.maximum_century}", 179 'type' => "string", 180 'deft' => "-1", 181 'reqd' => "no" }, 182 { 'name' => "no_bibliography", 183 'desc' => "{BasPlug.no_bibliography}", 184 'type' => "flag", 185 'reqd' => "no"}, 186 { 'name' => "no_cover_image", 187 'desc' => "{BasPlug.no_cover_image}", 188 'type' => "flag", 189 'reqd' => "no" }, 190 { 'name' => "separate_cjk", 191 'desc' => "{BasPlug.separate_cjk}", 192 'type' => "flag", 193 'reqd' => "no", 194 'hiddengli' => "yes" }, 195 { 'name' => "new_extract_email", 196 'desc' => "", 197 'type' => "flag", 198 'reqd' => "no", 199 'hiddengli' => "yes" } ]; 200 201 my $gis_arguments = 202 [ { 'name' => "extract_placenames", 203 'desc' => "{GISBasPlug.extract_placenames}", 204 'type' => "flag", 205 'reqd' => "no" }, 206 { 'name' => "gazetteer", 207 'desc' => "{GISBasPlug.gazetteer}", 208 'type' => "string", 209 'reqd' => "no" }, 210 { 'name' => "place_list", 211 'desc' => "{GISBasPlug.place_list}", 212 'type' => "flag", 213 'reqd' => "no" } ]; 214 215 216 my $options = { 'name' => "BasPlug", 217 'desc' => "{BasPlug.desc}", 104 { 'name' => "filename_encoding", 105 'desc' => "{BasePlugin.filename_encoding}", 106 'type' => "enum", 107 'deft' => "auto", 108 'list' => $encoding_plus_auto_list, 109 'reqd' => "no" } 110 111 ]; 112 113 114 my $options = { 'name' => "BasePlugin", 115 'desc' => "{BasePlugin.desc}", 218 116 'abstract' => "yes", 219 117 'inherits' => "no", … … 221 119 222 120 223 sub set_incremental {224 my $self = shift(@_);225 my ($incremental) = @_;226 227 $self->{'incremental'} = $incremental;228 }229 230 sub get_arguments231 {232 my $self = shift(@_);233 my $optionlistref = $self->{'option_list'};234 my @optionlist = @$optionlistref;235 my $pluginoptions = pop(@$optionlistref);236 my $pluginarguments = $pluginoptions->{'args'};237 return $pluginarguments;238 }239 240 241 sub print_xml_usage242 {243 my $self = shift(@_);244 my $header = shift(@_);245 my $high_level_information_only = shift(@_);246 247 # XML output is always in UTF-8248 gsprintf::output_strings_in_UTF8;249 250 if ($header) {251 &PrintUsage::print_xml_header("plugin");252 }253 $self->print_xml($high_level_information_only);254 }255 256 257 sub print_xml258 {259 my $self = shift(@_);260 my $high_level_information_only = shift(@_);261 262 my $optionlistref = $self->{'option_list'};263 my @optionlist = @$optionlistref;264 my $pluginoptions = shift(@$optionlistref);265 return if (!defined($pluginoptions));266 267 # Find the process and block default expressions in the plugin arguments268 my $process_exp = "";269 my $block_exp = "";270 if (defined($pluginoptions->{'args'})) {271 foreach my $option (@{$pluginoptions->{'args'}}) {272 if ($option->{'name'} eq "process_exp") {273 $process_exp = $option->{'deft'};274 }275 if ($option->{'name'} eq "block_exp") {276 $block_exp = $option->{'deft'};277 }278 }279 }280 281 gsprintf(STDERR, "<PlugInfo>\n");282 gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");283 my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});284 $desc =~ s/</&lt;/g; # doubly escaped285 $desc =~ s/>/&gt;/g;286 gsprintf(STDERR, " <Desc>$desc</Desc>\n");287 gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");288 gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");289 gsprintf(STDERR, " <Processes>$process_exp</Processes>\n");290 gsprintf(STDERR, " <Blocks>$block_exp</Blocks>\n");291 gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");292 # adding new option that works with replace_srcdoc_with_html.pl293 gsprintf(STDERR, " <SourceReplaceable>" . ($pluginoptions->{'srcreplaceable'} || "no") . "</SourceReplaceable>\n");294 unless (defined($high_level_information_only)) {295 gsprintf(STDERR, " <Arguments>\n");296 if (defined($pluginoptions->{'args'})) {297 &PrintUsage::print_options_xml($pluginoptions->{'args'});298 }299 gsprintf(STDERR, " </Arguments>\n");300 301 # Recurse up the plugin hierarchy302 $self->print_xml();303 }304 gsprintf(STDERR, "</PlugInfo>\n");305 }306 307 308 sub print_txt_usage309 {310 my $self = shift(@_);311 # Print the usage message for a plugin (recursively)312 my $descoffset = $self->determine_description_offset(0);313 $self->print_plugin_usage($descoffset, 1);314 }315 316 317 sub determine_description_offset318 {319 my $self = shift(@_);320 my $maxoffset = shift(@_);321 322 my $optionlistref = $self->{'option_list'};323 my @optionlist = @$optionlistref;324 my $pluginoptions = shift(@$optionlistref);325 return $maxoffset if (!defined($pluginoptions));326 327 # Find the length of the longest option string of this plugin328 my $pluginargs = $pluginoptions->{'args'};329 if (defined($pluginargs)) {330 my $longest = &PrintUsage::find_longest_option_string($pluginargs);331 if ($longest > $maxoffset) {332 $maxoffset = $longest;333 }334 }335 336 # Recurse up the plugin hierarchy337 $maxoffset = $self->determine_description_offset($maxoffset);338 $self->{'option_list'} = \@optionlist;339 return $maxoffset;340 }341 342 343 sub print_plugin_usage344 {345 my $self = shift(@_);346 my $descoffset = shift(@_);347 my $isleafclass = shift(@_);348 349 my $optionlistref = $self->{'option_list'};350 my @optionlist = @$optionlistref;351 my $pluginoptions = shift(@$optionlistref);352 return if (!defined($pluginoptions));353 354 my $pluginname = $pluginoptions->{'name'};355 my $pluginargs = $pluginoptions->{'args'};356 my $plugindesc = $pluginoptions->{'desc'};357 358 # Produce the usage information using the data structure above359 if ($isleafclass) {360 if (defined($plugindesc)) {361 gsprintf(STDERR, "$plugindesc\n\n");362 }363 gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");364 }365 366 # Display the plugin options, if there are some367 if (defined($pluginargs)) {368 # Calculate the column offset of the option descriptions369 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions370 371 if ($isleafclass) {372 gsprintf(STDERR, " {common.specific_options}:\n");373 }374 else {375 gsprintf(STDERR, " {common.general_options}:\n", $pluginname);376 }377 378 # Display the plugin options379 &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);380 }381 382 # Recurse up the plugin hierarchy383 $self->print_plugin_usage($descoffset, 0);384 $self->{'option_list'} = \@optionlist;385 }386 387 388 121 sub new { 389 # Set Encodings to the list!! 390 391 392 # Start the BasPlug Constructor 393 my $class = shift (@_); 394 my ($pluginlist,$args,$hashArgOptLists) = @_; 122 123 my ($class) = shift (@_); 124 my ($pluginlist,$inputargs,$hashArgOptLists) = @_; 395 125 push(@$pluginlist, $class); 126 127 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 128 push(@{$hashArgOptLists->{"OptList"}},$options); 129 130 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists); 131 396 132 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class; 397 398 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}399 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};400 401 if (GISBasPlug::has_mapdata()) {402 push(@$arguments,@$gis_arguments);403 }404 405 my $self = {};406 $self->{'outhandle'} = STDERR;407 $self->{'option_list'} = $hashArgOptLists->{"OptList"};408 $self->{"info_only"} = 0;409 410 # Check if gsdlinfo is in the argument list or not - if it is, don't parse411 # the args, just return the object.412 foreach my $strArg (@{$args})413 {414 if($strArg eq "-gsdlinfo")415 {416 $self->{"info_only"} = 1;417 return bless $self, $class;418 }419 }420 421 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)422 {423 my $classTempClass = bless $self, $class;424 print STDERR "<BadPlugin p=$plugin_name>\n";425 &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);426 $classTempClass->print_txt_usage(""); # Use default resource bundle427 die "\n";428 }429 430 431 delete $self->{"info_only"};432 # else parsing was successful.433 434 133 $self->{'plugin_type'} = $plugin_name; 435 #$self->{'outhandle'} = STDERR; 134 436 135 $self->{'num_processed'} = 0; 437 136 $self->{'num_not_processed'} = 0; … … 465 164 $self->{'file_blocks'} = {}; 466 165 467 if ($self->{'extract_placenames'}) {468 469 my $outhandle = $self->{'outhandle'};470 471 my $places_ref472 = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});473 474 if (!defined $places_ref) {475 print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";476 print $outhandle " No placename extraction will take place.\n";477 $self->{'extract_placenames'} = undef;478 }479 else {480 $self->{'places'} = $places_ref;481 }482 }483 166 484 167 return bless $self, $class; 485 486 } 487 488 # initialize Bas Plugoptions489 # if init() is overridden in a sub-class, remember to call Bas Plug::init()168 169 } 170 171 # initialize BasePlugin options 172 # if init() is overridden in a sub-class, remember to call BasePlugin::init() 490 173 sub init { 491 174 my $self = shift (@_); … … 520 203 my $self = shift (@_); 521 204 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_; 522 523 #my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);524 #print STDERR "Calling method; $cfilename:$cline $cpackage->$csubr\n";525 526 $self->initialise_extractors();527 205 } 528 206 … … 531 209 # import.pl only has one plugin pass, but buildcol.pl has multiple ones 532 210 533 my ($self) = @_; 534 $self->finalise_extractors(); 211 my ($self) = shift (@_); 535 212 } 536 213 … … 539 216 540 217 my ($self) = @_; 218 } 219 220 sub set_incremental { 221 my $self = shift(@_); 222 my ($incremental) = @_; 223 224 $self->{'incremental'} = $incremental; 541 225 } 542 226 … … 752 436 } 753 437 438 sub get_full_filenames { 439 my $self = shift (@_); 440 my ($base_dir, $file) = @_; 441 442 my $filename_full_path = $file; 443 # add on directory if present 444 $filename_full_path = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 445 my $filename_no_path = $file; 446 # remove directory if present 447 $filename_no_path =~ s/^.*[\/\\]//; 448 return ($filename_full_path, $filename_no_path); 449 } 754 450 755 451 sub read_block { … … 759 455 760 456 761 my $filename = $file; 762 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 763 764 if ($self->associate_with($file,$filename,$metadata)) { 457 my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file); 458 459 if ($self->associate_with($file,$filename_full_path,$metadata)) { 765 460 # a form of smart block 766 461 $self->{'num_blocked'} ++; … … 772 467 773 468 if ($smart_block || $smart_block_BN) { 774 if (defined $self->{'file_blocks'}->{$filename } && $self->{'file_blocks'}->{$filename} == 1){469 if (defined $self->{'file_blocks'}->{$filename_full_path} && $self->{'file_blocks'}->{$filename_full_path} == 1){ 775 470 $self->{'num_blocked'} ++; 776 471 return (0,undef); # blocked 777 472 } 778 473 } else { 779 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {474 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) { 780 475 $self->{'num_blocked'} ++; 781 476 return (0,undef); # blocked 782 477 } 783 478 if ($self->{'cover_image'}) { 784 if (defined $self->{'file_blocks'}->{$filename } && $self->{'file_blocks'}->{$filename} == 1){479 if (defined $self->{'file_blocks'}->{$filename_full_path} && $self->{'file_blocks'}->{$filename_full_path} == 1){ 785 480 $self->{'num_blocked'} ++; 786 481 return (0,undef); # blocked … … 789 484 } 790 485 791 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {486 if ($filename_full_path !~ /$self->{'process_exp'}/ || !-f $filename_full_path) { 792 487 return (undef,undef); # can't recognise 793 488 } 794 489 795 return (1,$filename); 796 } 797 798 sub read_tidy_file { 799 490 ##why are we returning the full filename - do we need this?? 491 return (1,$filename_full_path); 492 } 493 494 495 #filename_encoding set by user 496 sub filename_to_utf8_metadata 497 { 800 498 my $self = shift (@_); 801 802 my ($file) = @_; 803 804 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 805 806 return $file; 807 } 808 809 810 sub filename_to_metadata 811 { 812 my $self = shift (@_); 813 my ($file, $encoding) = @_; 499 my ($file, $file_encoding) = @_; 814 500 815 501 my $outhandle = $self->{'outhandle'}; 816 502 503 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end) 504 505 my $filename_encoding = $self->{'filename_encoding'}; 506 if ($filename_encoding eq "auto") { 507 # we check the locale first 508 if (!defined $self->{'filesystem_encoding'}) { 509 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding(); 510 $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'}; 511 } 512 if ($self->{'filesystem_encoding'} ne "undefined") { 513 $filename_encoding = $self->{'filesystem_encoding'}; 514 } else { 515 # try the encoding of the document, if available 516 if (defined $file_encoding) { 517 $filename_encoding = $file_encoding; 518 } else { 519 # use utf8 520 $filename_encoding = "utf8"; 521 } 522 } 523 524 } 525 526 if ($filename_encoding !~ /(?:ascii|utf8|unicode)/) { 527 $filemeta = unicode::unicode2utf8( 528 unicode::convert2unicode($filename_encoding, \$filemeta) 529 ); 530 } 531 my $dmsafe_filemeta = &ghtml::dmsafe($filemeta); 532 533 return $dmsafe_filemeta; 534 535 } 536 537 538 sub get_filesystem_encoding { 539 540 my $self = shift(@_); 541 542 my $outhandle = $self->{'outhandle'}; 817 543 my $filesystem_encoding = undef; 818 544 819 545 eval { 820 546 use POSIX qw(locale_h); 821 822 # With only one parameter, setlocale retrieves the current value 547 548 # With only one parameter, setlocale retrieves the 549 # current value 823 550 my $current_locale = setlocale(LC_CTYPE); 824 551 825 552 if ($current_locale =~ m/^.*\.(.*?)$/) { 826 553 my $char_encoding = lc($1); … … 831 558 $char_encoding =~ s/-/_/g; 832 559 $char_encoding =~ s/^utf_8$/utf8/; 833 560 834 561 if ($char_encoding =~ m/^\d+$/) { 835 562 if (defined $encodings::encodings->{"windows_$char_encoding"}) { … … 840 567 } 841 568 } 842 569 843 570 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/) 844 571 || (defined $encodings::encodings->{$char_encoding})) { … … 849 576 } 850 577 } 851 578 852 579 853 580 }; … … 857 584 858 585 } 859 860 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end) 861 862 # how do we know what encoding the filename is in? 863 # => one answer is to check the locale 864 865 if (defined $filesystem_encoding) { 866 if ($filesystem_encoding !~ /(?:ascii|utf8|unicode)/) { 867 $filemeta = unicode::unicode2utf8( 868 unicode::convert2unicode($filesystem_encoding, \$filemeta) 869 ); 870 } 871 } 872 # assume it is in the same encoding as its contents 873 elsif ((defined $encoding) && ($encoding !~ /(?:ascii|utf8|unicode)/)) { 874 $filemeta = unicode::unicode2utf8( 875 unicode::convert2unicode($encoding, \$filemeta) 876 ); 877 } 878 879 my $dmsafe_filemeta = &ghtml::dmsafe($filemeta); 880 881 return $dmsafe_filemeta; 882 } 883 884 885 sub add_OID 886 { 586 return $filesystem_encoding; 587 } 588 589 # is there ever only one Source? Sometimes this will be called twice, for images etc that are converted. 590 sub set_Source_metadata { 591 my $self = shift (@_); 592 my ($doc_obj, $filename_no_path, $file_encoding) = @_; 593 594 my $top_section = $doc_obj->get_top_section(); 595 596 # the original encoding filename 597 $doc_obj->set_metadata_element($top_section, "Source", $filename_no_path); 598 # UTF-8 version of filename 599 my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding); 600 $doc_obj->set_utf8_metadata_element($top_section, "SourceUTF8", $filemeta); 601 602 } 603 604 sub add_OID { 605 887 606 my $self = shift (@_); 888 607 my ($doc_obj) = @_; … … 911 630 } 912 631 913 # The BasPlug read_into_doc_obj() function. This function does all the 914 # right things to make general options work for a given plugin. It reads in 632 633 634 # The BasePlugin read_into_doc_obj() function. This function does all the 635 # right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in 915 636 # a file and sets up a slew of metadata all saved in doc_obj, which 916 637 # it then returns as part of a tuple (process_status,doc_obj) … … 926 647 # Note that $base_dir might be "" and that $file might 927 648 # include directories 649 650 # currently blocking has been done before it gets here - does this affect secondary plugin stuff?? 928 651 sub read_into_doc_obj { 929 652 my $self = shift (@_); 930 653 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 931 654 932 if ($self->is_recursive()) { 933 gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n"; 934 } 935 936 my $outhandle = $self->{'outhandle'}; 937 938 my ($block_status,$filename) = $self->read_block(@_); 939 return $block_status if ((!defined $block_status) || ($block_status==0)); 940 $file = $self->read_tidy_file($file); 941 942 # Do encoding stuff 943 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); 944 if ($self->{'verbosity'} > 2) { 945 print $outhandle "BasPlug: reading $file as ($encoding,$language)\n"; 946 } 947 655 my $outhandle = $self->{'outhandle'}; 656 657 # should we move this to read? What about secondary plugins? 658 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli); 659 print $outhandle "$self->{'plugin_type'} processing $file\n" 660 if $self->{'verbosity'} > 1; 661 662 my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file); 948 663 # create a new document 949 my $doc_obj = new doc ($filename , "indexed_doc");664 my $doc_obj = new doc ($filename_full_path, "indexed_doc"); 950 665 my $top_section = $doc_obj->get_top_section(); 951 666 952 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); 953 $doc_obj->add_utf8_metadata($top_section, "Language", $language); 954 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding); 667 # this should look at the plugin option too... 668 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); 955 669 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}"); 956 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename)); 957 958 my $filemeta = $self->filename_to_metadata($file,$encoding); 959 $doc_obj->add_utf8_metadata($top_section, "Source", $filemeta); 670 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path)); 671 672 $self->Set_Source_metadata($doc_obj, $filename_no_path); 673 674 # plugin specific stuff - what args do we need here?? 675 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) { 676 print STDERR "<ProcessingError n='$file'>\n" if ($gli); 677 return -1; 678 } 679 680 # include any metadata passed in from previous plugins 681 # note that this metadata is associated with the top level section 682 my $section = $doc_obj->get_top_section(); 683 # can we merge these two methods?? 684 $self->add_associated_files($doc_obj, $filename_full_path); 685 $self->extra_metadata ($doc_obj, $section, $metadata); 686 $self->auto_extract_metadata($doc_obj); 687 688 # if we haven't found any Title so far, assign one 689 # this was shifted to here from inside read() 690 $self->title_fallback($doc_obj,$section,$filename_no_path); 691 692 $self->add_OID($doc_obj); 693 694 return (1,$doc_obj); 695 } 696 697 sub add_dummy_text { 698 my $self = shift(@_); 699 my ($doc_obj, $section) = @_; 700 701 # add NoText metadata so we can hide this dummy text in format statements 702 $doc_obj->add_metadata($section, "NoText", "1"); 703 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1)); 704 705 } 706 707 # does nothing. Can be overridden by subclass 708 sub auto_extract_metadata { 709 my $self = shift(@_); 710 my ($doc_obj) = @_; 711 } 712 713 # adds cover image, associate_file options stuff. Should be called by sub class 714 # read_into_doc_obj 715 sub add_associated_files { 716 my $self = shift(@_); 717 # whatis filename?? 718 my ($doc_obj, $filename) = @_; 719 720 # add in the cover image 960 721 if ($self->{'cover_image'}) { 961 722 $self->associate_cover_image($doc_obj, $filename); 962 723 } 963 964 # read in file ($text will be in utf8)965 my $text = "";966 $self->read_file ($filename, $encoding, $language, \$text);967 968 if (!length ($text)) {969 my $plugin_name = ref ($self);970 if ($gli) {971 print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";972 }973 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};974 975 my $failhandle = $self->{'failhandle'};976 gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");977 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";978 $self->{'num_not_processed'} ++;979 980 return (0,undef); # what should we return here?? error but don't want to pass it on981 }982 724 983 # include any metadata passed in from previous plugins 984 # note that this metadata is associated with the top level section 985 986 my $associate_tail_re = $self->{'associate_tail_re'}; 987 988 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 989 990 # do plugin specific processing of doc_obj 991 unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) { 992 $text = ''; 993 undef $text; 994 print STDERR "<ProcessingError n='$file'>\n" if ($gli); 995 return (-1,undef); 996 } 997 $text=''; 998 undef $text; 999 1000 # do any automatic metadata extraction 1001 $self->auto_extract_metadata ($doc_obj); 1002 1003 $self->add_OID($doc_obj); 1004 1005 return (1,$doc_obj); 1006 } 1007 1008 1009 # The BasPlug read() function. This function calls read_into_doc_obj() 725 726 } 727 728 # The BasePlugin read() function. This function calls read_into_doc_obj() 1010 729 # to ensure all the right things to make general options work for a 1011 730 # given plugin are done. It then calls the process() function which … … 1026 745 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 1027 746 747 # check that we are not blocked 748 my ($block_status,$filename) = $self->read_block(@_); 749 return $block_status if ((!defined $block_status) || ($block_status==0)); 750 1028 751 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_); 1029 752 1030 753 if ((defined $process_status) && ($process_status == 1)) { 754 1031 755 # process the document 1032 756 $processor->process($doc_obj); 1033 1034 if(defined($self->{'places_filename'})){ 1035 &util::rm($self->{'places_filename'}); 1036 $self->{'places_filename'} = undef; 1037 } 1038 757 1039 758 $self->{'num_processed'} ++; 1040 759 undef $doc_obj; 1041 760 } 761 # delete any temp files that we may have created 762 $self->clean_up_after_doc_obj_processing(); 1042 763 1043 764 # if process_status == 1, then the file has been processed. … … 1051 772 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; 1052 773 1053 gsprintf(STDERR, "Bas Plug::process {common.must_be_implemented}\n") && die "\n";1054 # die "Bas plug::process function must be implemented in sub-class\n";774 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n"; 775 # die "BasePlugin::process function must be implemented in sub-class\n"; 1055 776 1056 777 return undef; # never gets here 1057 778 } 1058 779 1059 # uses the multiread package to read in the entire file pointed to 1060 # by filename and loads the resulting text into $$textref. Input text 1061 # may be in any of the encodings handled by multiread, output text 1062 # will be in utf8 1063 sub read_file { 1064 my $self = shift (@_); 1065 my ($filename, $encoding, $language, $textref) = @_; 1066 1067 if (!-r $filename) 1068 { 1069 my $outhandle = $self->{'outhandle'}; 1070 gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'}; 1071 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'}; 1072 return; 1073 } 1074 $$textref = ""; 1075 if (!open (FILE, $filename)) { 1076 gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename); 1077 die "\n"; 1078 } 1079 1080 if ($encoding eq "ascii") { 1081 undef $/; 1082 $$textref = <FILE>; 1083 $/ = "\n"; 1084 } else { 1085 my $reader = new multiread(); 1086 $reader->set_handle ('BasPlug::FILE'); 1087 $reader->set_encoding ($encoding); 1088 $reader->read_file ($textref); 1089 #Now segments chinese if the separate_cjk option is set 1090 if ($self->{'separate_cjk'}) { 1091 # segment the Chinese words 1092 $$textref = &cnseg::segment($$textref); 1093 } 1094 } 1095 close FILE; 1096 } 1097 780 # overwrite this method to delete any temp files that we have created 781 sub clean_up_after_doc_obj_processing { 782 my $self = shift(@_); 783 784 } 1098 785 # write_file -- used by ConvertToPlug, for example in post processing 1099 786 # 787 # where should this go, is here the best place?? 1100 788 sub utf8_write_file { 1101 789 my $self = shift (@_); … … 1130 818 my ($doc_obj,$section,$file) = @_; 1131 819 1132 if (!defined $doc_obj->get_metadata_element ($section, "Title")) { 1133 1134 my $file_derived_title = $self->filename_based_title($file); 1135 1136 $doc_obj->add_utf8_metadata ($section, "Title", $self->filename_to_metadata($file_derived_title)); 1137 } 1138 } 1139 1140 sub textcat_get_language_encoding { 1141 my $self = shift (@_); 1142 my ($filename) = @_; 1143 1144 1145 my ($language, $encoding, $extracted_encoding); 1146 if ($self->{'input_encoding'} eq "auto") { 1147 # use textcat to automatically work out the input encoding and language 1148 ($language, $encoding) = $self->get_language_encoding ($filename); 1149 } elsif ($self->{'extract_language'}) { 1150 # use textcat to get language metadata 1151 ($language, $extracted_encoding) = $self->get_language_encoding ($filename); 1152 $encoding = $self->{'input_encoding'}; 1153 # don't print this message for english... english in utf8 is identical 1154 # to english in iso-8859-1 (except for some punctuation). We don't have 1155 # a language model for en_utf8, so textcat always says iso-8859-1! 1156 if ($extracted_encoding ne $encoding && $language ne "en" 1157 && $self->{'verbosity'}) { 1158 my $plugin_name = ref ($self); 1159 my $outhandle = $self->{'outhandle'}; 1160 gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding); 1161 } 1162 } else { 1163 $language = $self->{'default_language'}; 1164 $encoding = $self->{'input_encoding'}; 1165 } 1166 1167 return ($language, $encoding); 1168 } 1169 1170 # Uses textcat to work out the encoding and language of the text in 1171 # $filename. All html tags are removed before processing. 1172 # returns an array containing "language" and "encoding" 1173 sub get_language_encoding { 1174 my $self = shift (@_); 1175 my ($filename) = @_; 1176 my $outhandle = $self->{'outhandle'}; 1177 my $unicode_format = ""; 1178 my $best_language = ""; 1179 my $best_encoding = ""; 1180 1181 # read in file 1182 if (!open (FILE, $filename)) { 1183 gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename); 1184 # this is a pretty bad error, but try to continue anyway 1185 return ($self->{'default_language'}, $self->{'input_encoding'}); 1186 } 1187 undef $/; 1188 my $text = <FILE>; 1189 $/ = "\n"; 1190 close FILE; 1191 1192 # check if first few bytes have a Byte Order Marker 1193 my $bom=substr($text,0,2); # check 16bit unicode 1194 if ($bom eq "\xff\xfe") { # little endian 16bit unicode 1195 $unicode_format="unicode"; 1196 } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode 1197 $unicode_format="unicode"; 1198 } else { 1199 $bom=substr($text,0,3); # check utf-8 1200 if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom 1201 $unicode_format="utf8"; 1202 # } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!? 1203 # $unicode_format="utf8"; 1204 } 1205 } 1206 1207 1208 # handle html files specially 1209 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo) 1210 if (ref($self) eq 'HTMLPlug' || 1211 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){ 1212 1213 # remove <title>stuff</title> -- as titles tend often to be in English 1214 # for foreign language documents 1215 $text =~ s!<title>.*?</title>!!si; 1216 1217 # see if this html file specifies its encoding 1218 if ($text =~ /^<\?xml.*encoding="(.+?)"/) { 1219 $best_encoding = $1; 1220 } elsif ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)"/i) {#" 1221 $best_encoding = $1; 1222 } 1223 if ($best_encoding) { # we extracted an encoding 1224 $best_encoding =~ s/-+/_/g; 1225 $best_encoding = lc($best_encoding); # lowercase 1226 if ($best_encoding eq "utf_8") { $best_encoding = "utf8" } 1227 $self->{'input_encoding'} = $best_encoding; 1228 } 1229 1230 # remove all HTML tags 1231 $text =~ s/<[^>]*>//sg; 1232 } 1233 1234 # get the language/encoding 1235 $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'})); 1236 my $results = $self->{'textcat'}->classify(\$text); 1237 1238 # if textcat returns 3 or less possibilities we'll use the 1239 # first one in the list - otherwise use the defaults 1240 if (scalar @$results > 3) { 1241 if ($unicode_format) { # in case the first had a BOM 1242 $best_encoding=$unicode_format; 1243 } else { 1244 my %guessed_encodings = (); 1245 foreach my $result (@$results) { 1246 $result =~ /([^\-]+)$/; 1247 my $enc=$1; 1248 if (!defined($guessed_encodings{$enc})) { 1249 $guessed_encodings{$enc}=0; 1250 } 1251 $guessed_encodings{$enc}++; 1252 } 1253 1254 $guessed_encodings{""}=-1; # for default best_encoding of "" 1255 foreach my $enc (keys %guessed_encodings) { 1256 if ($guessed_encodings{$enc} > 1257 $guessed_encodings{$best_encoding}){ 1258 $best_encoding=$enc; 1259 } 1260 } 1261 } 1262 1263 if ($self->{'input_encoding'} ne 'auto') { 1264 if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) { 1265 gsprintf($outhandle, 1266 "BasPlug: {BasPlug.could_not_extract_language}\n", 1267 $filename, $self->{'default_language'}); 1268 } 1269 $best_language = $self->{'default_language'}; 1270 $best_encoding = $self->{'input_encoding'}; 1271 1272 } else { 1273 if ($self->{'verbosity'}>2) { 1274 gsprintf($outhandle, 1275 "BasPlug: {BasPlug.could_not_extract_language}\n", 1276 $filename, $self->{'default_language'}); 1277 } 1278 $best_language = $self->{'default_language'}; 1279 } 1280 } else { # <= 3 suggestions 1281 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; 1282 if (!defined $language) { 1283 if ($self->{'verbosity'}>2) { 1284 gsprintf($outhandle, 1285 "BasPlug: {BasPlug.could_not_extract_language}\n", 1286 $filename, $self->{'default_language'}); 1287 } 1288 $language = $self->{'default_language'}; 1289 } 1290 if (!defined $encoding) { 1291 if ($self->{'verbosity'}>2) { 1292 gsprintf($outhandle, 1293 "BasPlug: {BasPlug.could_not_extract_encoding}\n", 1294 $filename, $self->{'default_encoding'}); 1295 } 1296 $encoding = $self->{'default_encoding'}; 1297 } 1298 $best_language = $language; 1299 if (! $best_encoding ) { # may already be set... eg from html meta tag 1300 $best_encoding = $encoding; 1301 } 1302 } 1303 1304 my $text_copy = $text; 1305 if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) { 1306 # the text is valid utf8, so assume that's the real encoding 1307 # (since textcat is based on probabilities) 1308 $best_encoding = 'utf8'; 1309 } 1310 1311 # check for equivalents where textcat doesn't have some encodings... 1312 # eg MS versions of standard encodings 1313 if ($best_encoding =~ /^iso_8859_(\d+)/) { 1314 my $iso = $1; # which variant of the iso standard? 1315 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do 1316 if ($text =~ /[\x80-\x9f]/) { 1317 # Western Europe 1318 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' } 1319 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe 1320 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic 1321 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic 1322 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek 1323 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew 1324 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish 1325 } 1326 } 1327 1328 if ($best_encoding !~ /^(ascii|utf8|unicode)$/ && 1329 !defined $encodings::encodings->{$best_encoding}) { 1330 if ($self->{'verbosity'}) { 1331 gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n", 1332 $filename, $best_encoding, $self->{'default_encoding'}); 1333 } 1334 $best_encoding = $self->{'default_encoding'}; 1335 } 1336 1337 return ($best_language, $best_encoding); 1338 } 1339 820 if (!defined $doc_obj->get_metadata_element ($section, "Title") or $doc_obj->get_metadata_element($section, "Title") eq "") { 821 822 my $file_derived_title = $self->filename_to_metadata($self->filename_based_title($file)); 823 if (!defined $doc_obj->get_metadata_element ($section, "Title")) { 824 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title); 825 } 826 else { 827 $doc_obj->set_utf8_metadata ($section, "Title", $file_derived_title); 828 } 829 } 830 831 } 832 1340 833 # add any extra metadata that's been passed around from one 1341 834 # plugin to another. … … 1401 894 } 1402 895 1403 # initialise metadata extractors1404 sub initialise_extractors {1405 my $self = shift (@_);1406 1407 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {1408 &acronym::initialise_acronyms();1409 }1410 }1411 1412 # finalise metadata extractors1413 sub finalise_extractors {1414 my $self = shift (@_);1415 1416 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {1417 &acronym::finalise_acronyms();1418 }1419 }1420 1421 # FIRSTNNN: extract the first NNN characters as metadata1422 sub extract_first_NNNN_characters {1423 my $self = shift (@_);1424 my ($textref, $doc_obj, $thissection) = @_;1425 1426 foreach my $size (split /,/, $self->{'first'}) {1427 my $tmptext = $$textref;1428 $tmptext =~ s/^\s+//;1429 $tmptext =~ s/\s+$//;1430 $tmptext =~ s/\s+/ /gs;1431 $tmptext = substr ($tmptext, 0, $size);1432 $tmptext =~ s/\s\S*$/…/;1433 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);1434 }1435 }1436 1437 sub extract_email {1438 my $self = shift (@_);1439 my ($textref, $doc_obj, $thissection) = @_;1440 my $outhandle = $self->{'outhandle'};1441 1442 gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")1443 if ($self->{'verbosity'} > 2);1444 1445 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);1446 @email = sort @email;1447 1448 # if($self->{"new_extract_email"} == 0)1449 # {1450 # my @email2 = ();1451 # foreach my $address (@email)1452 # {1453 # if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))1454 # {1455 # push @email2, $address;1456 # $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);1457 # # print $outhandle " extracting $address\n"1458 # &gsprintf($outhandle, " {BasPlug.extracting} $address\n")1459 # if ($self->{'verbosity'} > 3);1460 # }1461 # }1462 # }1463 # else1464 # {1465 my $hashExistMail = {};1466 foreach my $address (@email) {1467 if (!(defined $hashExistMail->{$address}))1468 {1469 $hashExistMail->{$address} = 1;1470 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);1471 gsprintf($outhandle, " {BasPlug.extracting} $address\n")1472 if ($self->{'verbosity'} > 3);1473 }1474 }1475 gsprintf($outhandle, " {BasPlug.done_email_extract}\n")1476 if ($self->{'verbosity'} > 2);1477 }1478 1479 # extract metadata1480 sub auto_extract_metadata {1481 1482 my $self = shift (@_);1483 my ($doc_obj) = @_;1484 1485 if ($self->{'extract_email'}) {1486 my $thissection = $doc_obj->get_top_section();1487 while (defined $thissection) {1488 my $text = $doc_obj->get_text($thissection);1489 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;1490 $thissection = $doc_obj->get_next_section ($thissection);1491 }1492 }1493 if ($self->{'extract_placenames'}) {1494 my $thissection = $doc_obj->get_top_section();1495 while (defined $thissection) {1496 my $text = $doc_obj->get_text($thissection);1497 $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;1498 $thissection = $doc_obj->get_next_section ($thissection);1499 }1500 }1501 1502 if ($self->{'extract_keyphrases'} || $self->{'extract_keyphrases_kea4'}) {1503 $self->extract_keyphrases($doc_obj);1504 }1505 1506 if ($self->{'first'}) {1507 my $thissection = $doc_obj->get_top_section();1508 while (defined $thissection) {1509 my $text = $doc_obj->get_text($thissection);1510 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;1511 $thissection = $doc_obj->get_next_section ($thissection);1512 }1513 }1514 1515 if ($self->{'extract_acronyms'}) {1516 my $thissection = $doc_obj->get_top_section();1517 while (defined $thissection) {1518 my $text = $doc_obj->get_text($thissection);1519 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;1520 $thissection = $doc_obj->get_next_section ($thissection);1521 }1522 }1523 1524 if ($self->{'markup_acronyms'}) {1525 my $thissection = $doc_obj->get_top_section();1526 while (defined $thissection) {1527 my $text = $doc_obj->get_text($thissection);1528 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);1529 $doc_obj->delete_text($thissection);1530 $doc_obj->add_text($thissection, $text);1531 $thissection = $doc_obj->get_next_section ($thissection);1532 }1533 }1534 1535 if($self->{'extract_historical_years'}) {1536 my $thissection = $doc_obj->get_top_section();1537 while (defined $thissection) {1538 1539 my $text = $doc_obj->get_text($thissection);1540 &DateExtract::get_date_metadata($text, $doc_obj,1541 $thissection,1542 $self->{'no_bibliography'},1543 $self->{'maximum_year'},1544 $self->{'maximum_century'});1545 $thissection = $doc_obj->get_next_section ($thissection);1546 }1547 }1548 }1549 1550 1551 #adding kea keyphrases1552 sub extract_keyphrases1553 {1554 my $self = shift(@_);1555 my $doc_obj = shift(@_);1556 1557 # Use Kea 3.0 unless 4.0 has been specified1558 my $kea_version = "3.0";1559 if ($self->{'extract_keyphrases_kea4'}) {1560 $kea_version = "4.0";1561 }1562 1563 # Check that Kea exists, and tell the user where to get it if not1564 my $keahome = &Kea::get_Kea_directory($kea_version);1565 if (!-e $keahome) {1566 gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);1567 return;1568 }1569 1570 my $thissection = $doc_obj->get_top_section();1571 my $text = "";1572 my $list;1573 1574 #loop through sections to gather whole doc1575 while (defined $thissection) {1576 my $sectiontext = $doc_obj->get_text($thissection);1577 $text = $text.$sectiontext;1578 $thissection = $doc_obj->get_next_section ($thissection);1579 }1580 1581 if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options1582 $list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});1583 } else { #otherwise call Kea with no options1584 $list = &Kea::extract_KeyPhrases ($kea_version, $text);1585 }1586 1587 if ($list){1588 # if a list of kea keyphrases was returned (ie not empty)1589 if ($self->{'verbosity'}) {1590 gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");1591 }1592 1593 #add metadata to top section1594 $thissection = $doc_obj->get_top_section();1595 1596 # add all key phrases as one metadata1597 $doc_obj->add_metadata($thissection, "Keyphrases", $list);1598 1599 # add individual key phrases as multiple metadata1600 foreach my $keyphrase (split(',', $list)) {1601 $keyphrase =~ s/^\s+|\s+$//g;1602 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);1603 }1604 }1605 }1606 1607 1608 # extract acronyms from a section in a document. progress is1609 # reported to outhandle based on the verbosity. both the Acronym1610 # and the AcronymKWIC metadata items are created.1611 1612 sub extract_acronyms {1613 my $self = shift (@_);1614 my ($textref, $doc_obj, $thissection) = @_;1615 my $outhandle = $self->{'outhandle'};1616 1617 # print $outhandle " extracting acronyms ...\n"1618 gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")1619 if ($self->{'verbosity'} > 2);1620 1621 my $acro_array = &acronym::acronyms($textref);1622 1623 foreach my $acro (@$acro_array) {1624 1625 #check that this is the first time ...1626 my $seen_before = "false";1627 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");1628 foreach my $thisAcro (@$previous_data) {1629 if ($thisAcro eq $acro->to_string()) {1630 $seen_before = "true";1631 if ($self->{'verbosity'} >= 4) {1632 gsprintf($outhandle, " {BasPlug.already_seen} " .1633 $acro->to_string() . "\n");1634 }1635 }1636 }1637 1638 if ($seen_before eq "false") {1639 #write it to the file ...1640 $acro->write_to_file();1641 1642 #do the normal acronym1643 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());1644 gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")1645 if ($self->{'verbosity'} > 3);1646 }1647 }1648 1649 gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")1650 if ($self->{'verbosity'} > 2);1651 }1652 1653 sub markup_acronyms {1654 my $self = shift (@_);1655 my ($text, $doc_obj, $thissection) = @_;1656 my $outhandle = $self->{'outhandle'};1657 1658 gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")1659 if ($self->{'verbosity'} > 2);1660 1661 #self is passed in to check for verbosity ...1662 $text = &acronym::markup_acronyms($text, $self);1663 1664 gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")1665 if ($self->{'verbosity'} > 2);1666 1667 return $text;1668 }1669 896 1670 897 sub compile_stats {
Note:
See TracChangeset
for help on using the changeset viewer.