source: gsdl/trunk/perllib/plugins/BasPlug.pm@ 14961

Last change on this file since 14961 was 14961, checked in by davidb, 16 years ago

Setting filename metadata (Source) in BasPlug.pm looks to user its locale, in the first instance, to resolve what character encoding the file system uses.

  • Property svn:keywords set to Author Date Id Revision
File size: 48.6 KB
RevLine 
[537]1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
[9413]8# Copyright (C) 1999-2005 New Zealand Digital Library Project
[537]9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[4]25
26package BasPlug;
[2219]27
[9398]28BEGIN {
29 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30}
31
[3834]32eval {require bytes};
[3767]33
[2219]34# suppress the annoying "subroutine redefined" warning that various
35# plugins cause under perl 5.6
36$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
[10254]38use strict;
39no strict 'subs';
40no strict 'refs'; # allow filehandles to be variables and viceversa
[9413]41
[8892]42use File::Basename;
43
[1954]44use Kea;
[1219]45use multiread;
[1870]46use encodings;
[11389]47use unicode;
[1219]48use cnseg;
[1242]49use acronym;
[1317]50use textcat;
[1242]51use doc;
[7645]52eval "require diagnostics"; # some perl distros (eg mac) don't have this
[1411]53use DateExtract;
[2751]54use ghtml;
[9413]55use gsprintf 'gsprintf';
[4778]56use printusage;
[10218]57use parse2;
[4]58
[10218]59
[9398]60use GISBasPlug;
[5681]61
[9413]62@BasPlug::ISA = ( GISBasPlug );
[9398]63
[4873]64my $unicode_list =
[10218]65 [ { 'name' => "ascii",
[4873]66 'desc' => "{BasPlug.input_encoding.ascii}" },
[4744]67 { 'name' => "utf8",
[4873]68 'desc' => "{BasPlug.input_encoding.utf8}" },
[4744]69 { 'name' => "unicode",
[4873]70 'desc' => "{BasPlug.input_encoding.unicode}" } ];
[3540]71
[10218]72my $auto_unicode_list =
73 [ { 'name' => "auto",
74 'desc' => "{BasPlug.input_encoding.auto}" } ];
75
[10620]76my $e = $encodings::encodings;
77foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
78{
79 my $hashEncode =
80 {'name' => $enc,
81 'desc' => $e->{$enc}->{'name'}};
82
83 push(@{$unicode_list},$hashEncode);
84}
85
86push(@{$auto_unicode_list},@{$unicode_list});
87
[4873]88my $arguments =
[3540]89 [ { 'name' => "process_exp",
[4873]90 'desc' => "{BasPlug.process_exp}",
[6408]91 'type' => "regexp",
[3540]92 'deft' => "",
93 'reqd' => "no" },
94 { 'name' => "block_exp",
[4873]95 'desc' => "{BasPlug.block_exp}",
[6408]96 'type' => "regexp",
[3540]97 'deft' => "",
98 'reqd' => "no" },
[9067]99 { 'name' => "smart_block",
100 'desc' => "{BasPlug.smart_block}",
101 'type' => "flag",
[11368]102 'reqd' => "no" },
[8892]103 { 'name' => "associate_ext",
104 'desc' => "{BasPlug.associate_ext}",
105 'type' => "string",
106 'reqd' => "no" },
[11122]107 { 'name' => "associate_tail_re",
108 'desc' => "{BasPlug.associate_tail_re}",
109 'type' => "string",
110 'reqd' => "no" },
[3540]111 { 'name' => "input_encoding",
[4873]112 'desc' => "{BasPlug.input_encoding}",
[3540]113 'type' => "enum",
[10218]114 'list' => $auto_unicode_list,
[3540]115 'reqd' => "no" ,
116 'deft' => "auto" } ,
117 { 'name' => "default_encoding",
[4873]118 'desc' => "{BasPlug.default_encoding}",
[4744]119 'type' => "enum",
[6332]120 'list' => $unicode_list,
[4744]121 'reqd' => "no",
122 'deft' => "utf8" },
[3540]123 { 'name' => "extract_language",
[4873]124 'desc' => "{BasPlug.extract_language}",
[3540]125 'type' => "flag",
126 'reqd' => "no" },
127 { 'name' => "default_language",
[4873]128 'desc' => "{BasPlug.default_language}",
[10329]129 'type' => "string",
[3540]130 'deft' => "en",
131 'reqd' => "no" },
132 { 'name' => "extract_acronyms",
[4873]133 'desc' => "{BasPlug.extract_acronyms}",
[3540]134 'type' => "flag",
135 'reqd' => "no" },
136 { 'name' => "markup_acronyms",
[4873]137 'desc' => "{BasPlug.markup_acronyms}",
[3540]138 'type' => "flag",
[9398]139 'reqd' => "no" },
[8789]140 { 'name' => "extract_keyphrases",
141 'desc' => "{BasPlug.extract_keyphrases}",
142 'type' => "flag",
[8814]143 'reqd' => "no" },
[11069]144 { 'name' => "extract_keyphrases_kea4",
145 'desc' => "{BasPlug.extract_keyphrases_kea4}",
146 'type' => "flag",
147 'reqd' => "no" },
[8789]148 { 'name' => "extract_keyphrase_options",
149 'desc' => "{BasPlug.extract_keyphrase_options}",
150 'type' => "string",
151 'deft' => "",
[8814]152 'reqd' => "no" },
[3540]153 { 'name' => "first",
[4873]154 'desc' => "{BasPlug.first}",
[3540]155 'type' => "string",
156 'reqd' => "no" },
157 { 'name' => "extract_email",
[4873]158 'desc' => "{BasPlug.extract_email}",
[3540]159 'type' => "flag",
160 'reqd' => "no" },
161 { 'name' => "extract_historical_years",
[4873]162 'desc' => "{BasPlug.extract_historical_years}",
[3540]163 'type' => "flag",
164 'reqd' => "no" },
165 { 'name' => "maximum_year",
[4873]166 'desc' => "{BasPlug.maximum_year}",
[3540]167 'type' => "int",
[4744]168 'deft' => (localtime)[5]+1900,
[10218]169 'char_length' => "4",
170 #'range' => "2,100",
[3540]171 'reqd' => "no"},
172 { 'name' => "maximum_century",
[4873]173 'desc' => "{BasPlug.maximum_century}",
[7105]174 'type' => "string",
[10218]175 'deft' => "-1",
[3540]176 'reqd' => "no" },
177 { 'name' => "no_bibliography",
[4873]178 'desc' => "{BasPlug.no_bibliography}",
[3540]179 'type' => "flag",
180 'reqd' => "no"},
[8678]181 { 'name' => "no_cover_image",
182 'desc' => "{BasPlug.no_cover_image}",
[3540]183 'type' => "flag",
[10218]184 'reqd' => "no" },
185 { 'name' => "separate_cjk",
186 'desc' => "{BasPlug.separate_cjk}",
187 'type' => "flag",
188 'reqd' => "no",
189 'hiddengli' => "yes" },
190 { 'name' => "new_extract_email",
191 'desc' => "",
192 'type' => "flag",
193 'reqd' => "no",
194 'hiddengli' => "yes" } ];
[3540]195
[9398]196my $gis_arguments =
197 [ { 'name' => "extract_placenames",
198 'desc' => "{GISBasPlug.extract_placenames}",
199 'type' => "flag",
200 'reqd' => "no" },
201 { 'name' => "gazetteer",
202 'desc' => "{GISBasPlug.gazetteer}",
203 'type' => "string",
204 'reqd' => "no" },
205 { 'name' => "place_list",
206 'desc' => "{GISBasPlug.place_list}",
207 'type' => "flag",
208 'reqd' => "no" } ];
209
210
[3540]211my $options = { 'name' => "BasPlug",
[5681]212 'desc' => "{BasPlug.desc}",
[6408]213 'abstract' => "yes",
214 'inherits' => "no",
[4750]215 'args' => $arguments };
[3540]216
[4778]217
[12970]218sub set_incremental {
[10478]219 my $self = shift(@_);
[12970]220 my ($incremental) = @_;
[10478]221
[12970]222 $self->{'incremental'} = $incremental;
[10478]223}
224
[4873]225sub get_arguments
226{
[8716]227 my $self = shift(@_);
228 my $optionlistref = $self->{'option_list'};
229 my @optionlist = @$optionlistref;
230 my $pluginoptions = pop(@$optionlistref);
231 my $pluginarguments = $pluginoptions->{'args'};
[4873]232 return $pluginarguments;
233}
234
235
[4778]236sub print_xml_usage
237{
[8716]238 my $self = shift(@_);
[11681]239 my $header = shift(@_);
[12624]240 my $high_level_information_only = shift(@_);
[11681]241
[6945]242 # XML output is always in UTF-8
[9413]243 gsprintf::output_strings_in_UTF8;
[6945]244
[11681]245 if ($header) {
246 &PrintUsage::print_xml_header("plugin");
247 }
[12624]248 $self->print_xml($high_level_information_only);
[3540]249}
250
[4778]251
252sub print_xml
253{
[8716]254 my $self = shift(@_);
[12624]255 my $high_level_information_only = shift(@_);
[4778]256
[8716]257 my $optionlistref = $self->{'option_list'};
258 my @optionlist = @$optionlistref;
[10229]259 my $pluginoptions = shift(@$optionlistref);
[4778]260 return if (!defined($pluginoptions));
261
[12630]262 # Find the process and block default expressions in the plugin arguments
263 my $process_exp = "";
264 my $block_exp = "";
265 if (defined($pluginoptions->{'args'})) {
266 foreach my $option (@{$pluginoptions->{'args'}}) {
267 if ($option->{'name'} eq "process_exp") {
268 $process_exp = $option->{'deft'};
269 }
270 if ($option->{'name'} eq "block_exp") {
271 $block_exp = $option->{'deft'};
272 }
273 }
274 }
275
[9413]276 gsprintf(STDERR, "<PlugInfo>\n");
277 gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
278 my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
[7023]279 $desc =~ s/</&amp;lt;/g; # doubly escaped
280 $desc =~ s/>/&amp;gt;/g;
[9413]281 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
282 gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
283 gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
[12630]284 gsprintf(STDERR, " <Processes>$process_exp</Processes>\n");
285 gsprintf(STDERR, " <Blocks>$block_exp</Blocks>\n");
[9413]286 gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
[12624]287 unless (defined($high_level_information_only)) {
288 gsprintf(STDERR, " <Arguments>\n");
289 if (defined($pluginoptions->{'args'})) {
290 &PrintUsage::print_options_xml($pluginoptions->{'args'});
291 }
292 gsprintf(STDERR, " </Arguments>\n");
293
294 # Recurse up the plugin hierarchy
295 $self->print_xml();
[3540]296 }
[9413]297 gsprintf(STDERR, "</PlugInfo>\n");
[3540]298}
299
[4744]300
[4778]301sub print_txt_usage
[4744]302{
[8716]303 my $self = shift(@_);
[4750]304 # Print the usage message for a plugin (recursively)
[8716]305 my $descoffset = $self->determine_description_offset(0);
[6925]306 $self->print_plugin_usage($descoffset, 1);
[4750]307}
[4744]308
309
[4750]310sub determine_description_offset
311{
[8716]312 my $self = shift(@_);
313 my $maxoffset = shift(@_);
[4750]314
[8716]315 my $optionlistref = $self->{'option_list'};
316 my @optionlist = @$optionlistref;
[10229]317 my $pluginoptions = shift(@$optionlistref);
[4750]318 return $maxoffset if (!defined($pluginoptions));
319
320 # Find the length of the longest option string of this plugin
[8716]321 my $pluginargs = $pluginoptions->{'args'};
[4744]322 if (defined($pluginargs)) {
[8716]323 my $longest = &PrintUsage::find_longest_option_string($pluginargs);
[4778]324 if ($longest > $maxoffset) {
325 $maxoffset = $longest;
[4744]326 }
[4750]327 }
[4744]328
[4750]329 # Recurse up the plugin hierarchy
330 $maxoffset = $self->determine_description_offset($maxoffset);
331 $self->{'option_list'} = \@optionlist;
332 return $maxoffset;
333}
334
335
336sub print_plugin_usage
337{
[8716]338 my $self = shift(@_);
339 my $descoffset = shift(@_);
340 my $isleafclass = shift(@_);
[4750]341
[8716]342 my $optionlistref = $self->{'option_list'};
343 my @optionlist = @$optionlistref;
[10229]344 my $pluginoptions = shift(@$optionlistref);
[4750]345 return if (!defined($pluginoptions));
346
[8716]347 my $pluginname = $pluginoptions->{'name'};
348 my $pluginargs = $pluginoptions->{'args'};
349 my $plugindesc = $pluginoptions->{'desc'};
[4750]350
351 # Produce the usage information using the data structure above
352 if ($isleafclass) {
[6932]353 if (defined($plugindesc)) {
[9413]354 gsprintf(STDERR, "$plugindesc\n\n");
[6932]355 }
[9413]356 gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
[4750]357 }
358
359 # Display the plugin options, if there are some
360 if (defined($pluginargs)) {
[4744]361 # Calculate the column offset of the option descriptions
[8716]362 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
[4744]363
[4750]364 if ($isleafclass) {
[9413]365 gsprintf(STDERR, " {common.specific_options}:\n");
[4750]366 }
367 else {
[9413]368 gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
[4750]369 }
370
[4744]371 # Display the plugin options
[6925]372 &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
[4744]373 }
374
[4750]375 # Recurse up the plugin hierarchy
[6925]376 $self->print_plugin_usage($descoffset, 0);
[4750]377 $self->{'option_list'} = \@optionlist;
[4744]378}
379
380
[4]381sub new {
[10218]382 # Set Encodings to the list!!
383
384
385 # Start the BasPlug Constructor
[1219]386 my $class = shift (@_);
[10218]387 my ($pluginlist,$args,$hashArgOptLists) = @_;
388 push(@$pluginlist, $class);
389 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
[9398]390
[10218]391 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
392 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
393
[9398]394 if (GISBasPlug::has_mapdata()) {
395 push(@$arguments,@$gis_arguments);
396 }
[10218]397
398 my $self = {};
[10579]399 $self->{'outhandle'} = STDERR;
400 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
401 $self->{"info_only"} = 0;
402
403 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
404 # the args, just return the object.
405 foreach my $strArg (@{$args})
406 {
407 if($strArg eq "-gsdlinfo")
408 {
409 $self->{"info_only"} = 1;
410 return bless $self, $class;
411 }
412 }
413
[12546]414 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
[10218]415 {
416 my $classTempClass = bless $self, $class;
[10620]417 print STDERR "<BadPlugin p=$plugin_name>\n";
[10218]418 &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
419 $classTempClass->print_txt_usage(""); # Use default resource bundle
420 die "\n";
421 }
[9398]422
[10280]423
[10579]424 delete $self->{"info_only"};
[10218]425 # else parsing was successful.
426
427 $self->{'plugin_type'} = $plugin_name;
[10579]428 #$self->{'outhandle'} = STDERR;
[2785]429 $self->{'num_processed'} = 0;
430 $self->{'num_not_processed'} = 0;
431 $self->{'num_blocked'} = 0;
432 $self->{'num_archives'} = 0;
[8678]433 $self->{'cover_image'} = 1; # cover image is on by default
[10218]434 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
[10579]435 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
[3540]436
[8892]437 my $associate_ext = $self->{'associate_ext'};
438 if ((defined $associate_ext) && ($associate_ext ne "")) {
[9351]439
[11122]440 my $associate_tail_re = $self->{'associate_tail_re'};
441 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
442 my $outhandle = $self->{'outhandle'};
443 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
444 print $outhandle " defaulting to 'associate_tail_re'\n";
[8892]445 }
[11122]446 else {
447 my @exts = split(/,/,$associate_ext);
[8892]448
[11122]449 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
450 my $associate_tail_re = join("|",@exts_bracketed);
451 $self->{'associate_tail_re'} = $associate_tail_re;
452 }
453
454 delete $self->{'associate_ext'};
[8892]455 }
456
457 $self->{'shared_fileroot'} = {};
[8510]458 $self->{'file_blocks'} = {};
[1219]459
[9398]460 if ($self->{'extract_placenames'}) {
461
462 my $outhandle = $self->{'outhandle'};
[10218]463
[9398]464 my $places_ref
465 = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
[10218]466
[9398]467 if (!defined $places_ref) {
468 print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
469 print $outhandle " No placename extraction will take place.\n";
470 $self->{'extract_placenames'} = undef;
471 }
472 else {
473 $self->{'places'} = $places_ref;
474 }
[10280]475 }
[11089]476
[1219]477 return bless $self, $class;
[10218]478
[4]479}
480
[1242]481# initialize BasPlug options
482# if init() is overridden in a sub-class, remember to call BasPlug::init()
483sub init {
484 my $self = shift (@_);
[2785]485 my ($verbosity, $outhandle, $failhandle) = @_;
[1242]486
487 # verbosity is passed through from the processor
488 $self->{'verbosity'} = $verbosity;
489
[2785]490 # as are the outhandle and failhandle
[1424]491 $self->{'outhandle'} = $outhandle if defined $outhandle;
[2785]492 $self->{'failhandle'} = $failhandle;
[1424]493
[1242]494 # set process_exp and block_exp to defaults unless they were
495 # explicitly set
[1244]496
497 if ((!$self->is_recursive()) and
[1242]498 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
[1244]499
[1242]500 $self->{'process_exp'} = $self->get_default_process_exp ();
501 if ($self->{'process_exp'} eq "") {
[1244]502 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
[1242]503 }
504 }
505
506 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
507 $self->{'block_exp'} = $self->get_default_block_exp ();
508 }
[11089]509
[1242]510}
511
[839]512sub begin {
513 my $self = shift (@_);
514 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
[1396]515 $self->initialise_extractors();
[839]516}
517
518sub end {
[10155]519 # potentially called at the end of each plugin pass
520 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
521
[839]522 my ($self) = @_;
[1396]523 $self->finalise_extractors();
[839]524}
525
[10155]526sub deinit {
527 # called only once, after all plugin passes have been done
528
529 my ($self) = @_;
530}
531
[1242]532# this function should be overridden to return 1
533# in recursive plugins
[4]534sub is_recursive {
535 my $self = shift (@_);
536
[1242]537 return 0;
[4]538}
539
[1242]540sub get_default_block_exp {
541 my $self = shift (@_);
542
543 return "";
544}
545
546sub get_default_process_exp {
547 my $self = shift (@_);
548
549 return "";
550}
551
[9067]552# default implementation is to do nothing.
553sub store_block_files
554{
555 my $self =shift (@_);
556 my ($filename) = @_;
557 return;
558}
559
560#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
561sub block_cover_image
562{
[10833]563 my $self =shift;
564 my $filename = shift;
565
[9067]566 if ($self->{'cover_image'}) {
567 my $coverfile = $filename;
568 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
569 if (!-e $coverfile) {
570 $coverfile =~ s/jpg$/JPG/;
571 }
572 if (-e $coverfile) {
573 $self->{'file_blocks'}->{$coverfile} = 1;
[11089]574 }
[9067]575 }
576
577 return;
578}
[11122]579
580sub root_ext_split
581{
582 my $self = shift (@_);
583 my ($filename,$tail_re) = @_;
[9067]584
[11122]585 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
586
587 if ((!defined $file_prefix) || (!defined $file_ext)) {
588 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
589 }
590
591 return ($file_prefix,$file_ext);
592}
593
[8510]594sub metadata_read {
595 my $self = shift (@_);
596 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
[8892]597 # Keep track of filenames with same root but different extensions
[11122]598 # Used to support -associate_ext and the more generalised
599 # -associate_tail_re
[8892]600
[11122]601 my $associate_tail_re = $self->{'associate_tail_re'};
602 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
[8892]603
[11122]604 my ($file_prefix,$file_ext)
605 = $self->root_ext_split($file,$associate_tail_re);
606
[8892]607 if ((defined $file_prefix) && (defined $file_ext)) {
[11122]608
[8892]609 my $shared_fileroot = $self->{'shared_fileroot'};
610 if (!defined $shared_fileroot->{$file_prefix}) {
[11122]611 my $file_prefix_rec = { 'tie_to' => undef,
612 'exts' => {} };
[8892]613 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
614 }
615
616 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
617
618 my $process_exp = $self->{'process_exp'};
619
[11122]620 if ($file =~ m/$process_exp/) {
[8892]621 # This is the document the others should be tied to
622 $file_prefix_rec->{'tie_to'} = $file_ext;
623 }
624 else {
[11122]625 if ($file_ext =~ m/$associate_tail_re$/) {
[9351]626 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
627 }
[8892]628 }
[11122]629
[8892]630 }
631 }
[11122]632
[9067]633 # now check whether we are actually processing this
634 my $filename = $file;
635 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
636 if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) {
637 return undef; # can't recognise
638 }
[8892]639
[9067]640 # do smart blocking if appropriate
[11089]641 if ($self->{'smart_block'}) {
[9067]642 $self->store_block_files($filename);
643 }
[11089]644 # block the cover image if there is one
645 if ($self->{'cover_image'}) {
646 $self->block_cover_image($filename);
647 }
[9067]648
649 return 1;
[8510]650}
651
[8892]652sub tie_to_filename
653{
654 my $self = shift (@_);
655
656 my ($file_ext,$file_prefix_rec) = @_;
657
658 if (defined $file_prefix_rec) {
659 my $tie_to = $file_prefix_rec->{'tie_to'};
[9351]660
[8892]661 if (defined $tie_to) {
662 if ($tie_to eq $file_ext) {
663 return 1;
664 }
665 }
666 }
667
668 return 0;
669}
670
671sub tie_to_assoc_file
672{
673 my $self = shift (@_);
674 my ($file_ext,$file_prefix_rec) = @_;
675
676 if (defined $file_prefix_rec) {
677 my $tie_to = $file_prefix_rec->{'tie_to'};
678 if (defined $tie_to) {
679
680 my $exts = $file_prefix_rec->{'exts'};
681
682 my $has_file_ext = $exts->{$file_ext};
[11122]683
[8892]684 if ($has_file_ext) {
685 return 1;
686 }
687 }
688 }
689
690 return 0;
691}
692
693
694sub associate_with
695{
696 my $self = shift (@_);
697 my ($file, $filename, $metadata) = @_;
698
[11122]699 my $associate_tail_re = $self->{'associate_tail_re'};
700 return 0 if (!$associate_tail_re);
[8892]701
702 # If file, see if matches with "tie_to" doc or is one of the
703 # associated filename extensions.
704
[11122]705 my ($file_prefix,$file_ext) = $self->root_ext_split($file,$associate_tail_re);
706
[8892]707 if ((defined $file_prefix) && (defined $file_ext)) {
708
709 my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
[9351]710
[8892]711 if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
712
713 # Set up gsdlassocfile_tobe
714
715 my $exts = $file_prefix_rec->{'exts'};
716
717 if (!defined $metadata->{'gsdlassocfile_tobe'}) {
718 $metadata->{'gsdlassocfile_tobe'} = [];
719 }
720
721 my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
722
723 my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/);
724 foreach my $e (keys %$exts) {
[11122]725 my $assoc_file = "$full_prefix$e";
726 print STDERR " $self->{'plugin_type'}: Associating $file_prefix$e with $file_prefix_rec->{'tie_to'} version\n";
[8892]727 my $mime_type = ""; # let system auto detect this
728 push(@$assoc_tobe,"$assoc_file:$mime_type:");
729 }
[11122]730
[8892]731 }
732 elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
[11122]733
734
735 # a form of smart block
[8892]736 return 1;
737 }
738 }
739
740 return 0;
741}
742
743
[10280]744sub read_block {
[1954]745 my $self = shift (@_);
746
[9853]747 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[4]748
[8908]749
[2795]750 my $filename = $file;
751 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
752
[8892]753 if ($self->associate_with($file,$filename,$metadata)) {
754 # a form of smart block
755 $self->{'num_blocked'} ++;
[10280]756 return (0,undef); # blocked
[8892]757 }
[8510]758
[10280]759 my $smart_block = $self->{'smart_block'};
760 my $smart_block_BN = $self->{'smart_block_BN'};
[11089]761
[8915]762 if ($smart_block || $smart_block_BN) {
[8510]763 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
[8892]764 $self->{'num_blocked'} ++;
[10280]765 return (0,undef); # blocked
[8510]766 }
[11089]767 } else {
768 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
769 $self->{'num_blocked'} ++;
770 return (0,undef); # blocked
771 }
772 if ($self->{'cover_image'}) {
773 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
774 $self->{'num_blocked'} ++;
775 return (0,undef); # blocked
776 }
777 }
[2785]778 }
[11122]779
[1242]780 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
[10280]781 return (undef,undef); # can't recognise
[1242]782 }
[10280]783
784 return (1,$filename);
785}
786
787sub read_tidy_file {
788
789 my $self = shift (@_);
790
791 my ($file) = @_;
792
[1242]793 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[1844]794
[10280]795 return $file;
796}
797
798
[14961]799sub filename_to_metadata
800{
801 my $self = shift (@_);
802 my ($file, $encoding) = @_;
[10280]803
[14961]804 my $outhandle = $self->{'outhandle'};
805
806 my $filesystem_encoding = undef;
807
808 eval {
809 use POSIX qw(locale_h);
810
811 # With only one parameter, setlocale retrieves the current value
812 my $current_locale = setlocale(LC_CTYPE);
813
814 if ($current_locale =~ m/^.*\.(.*?)$/) {
815 my $char_encoding = lc($1);
816 $char_encoding =~ s/-/_/g;
817 $char_encoding =~ s/^utf_8$/utf8/;
818
819 if ($char_encoding =~ m/^\d+$/) {
820 if (defined $encodings::encoding->{"windows_$char_encoding"}) {
821 $char_encoding = "windows_$char_encoding";
822 }
823 elsif (defined $encodings::encoding->{"dos_$char_encoding"}) {
824 $char_encoding = "dos_$char_encoding";
825 }
826 }
827
828 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
829 || (defined $encodings::encoding->{$char_encoding})) {
830 $filesystem_encoding = $char_encoding;
831 }
832 else {
833 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
834 }
835 }
836
837
838 };
839 if ($@) {
840 print $outhandle "$@\n";
841 print $outhandle "Warning: Unable to establish locale. Will assume filesytem is UTF-8\n";
842
843 }
844
845 my ($filemeta) = $file =~ /([^\\\/]+)$/;
846
847 # how do we know what encoding the filename is in?
848 # => one answer is to check the locale
849
850 if (defined $filesystem_encoding) {
851 if ($filesystem_encoding !~ /(?:ascii|utf8|unicode)/) {
852 $filemeta = unicode::unicode2utf8(
853 unicode::convert2unicode($filesystem_encoding, \$filemeta)
854 );
855 }
856 }
857 # assume it is in the same encoding as its contents
858 elsif ((defined $encoding) && ($encoding !~ /(?:ascii|utf8|unicode)/)) {
859 $filemeta = unicode::unicode2utf8(
860 unicode::convert2unicode($encoding, \$filemeta)
861 );
862 }
863
864 my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
865
866 return $dmsafe_filemeta;
867}
868
869
[10280]870# The BasPlug read_into_doc_obj() function. This function does all the
871# right things to make general options work for a given plugin. It reads in
872# a file and sets up a slew of metadata all saved in doc_obj, which
873# it then returns as part of a tuple (process_status,doc_obj)
874#
875# Much of this functionality used to reside in read, but it was broken
876# down into a supporting routine to make the code more flexible.
877#
878# recursive plugins (e.g. RecPlug) and specialized plugins like those
879# capable of processing many documents within a single file (e.g.
880# GMLPlug) will normally want to implement their own version of
881# read_into_doc_obj()
882#
883# Note that $base_dir might be "" and that $file might
884# include directories
885sub read_into_doc_obj {
886 my $self = shift (@_);
887 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
888
889 if ($self->is_recursive()) {
890 gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
891 }
892
893 my $outhandle = $self->{'outhandle'};
894
895 my ($block_status,$filename) = $self->read_block(@_);
896 return $block_status if ((!defined $block_status) || ($block_status==0));
897 $file = $self->read_tidy_file($file);
898
[2811]899 # Do encoding stuff
900 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
[11389]901 if ($self->{'verbosity'} > 2) {
902 print $outhandle "BasPlug: reading $file as ($encoding,$language)\n";
903 }
[1844]904
[1242]905 # create a new document
[1379]906 my $doc_obj = new doc ($filename, "indexed_doc");
[14961]907 my $top_section = $doc_obj->get_top_section();
908
[12270]909 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
[14961]910 $doc_obj->add_utf8_metadata($top_section, "Language", $language);
911 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
912 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
913 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename));
[8166]914
[14961]915 my $filemeta = $self->filename_to_metadata($file,$encoding);
916 $doc_obj->add_utf8_metadata($top_section, "Source", $filemeta);
[2816]917 if ($self->{'cover_image'}) {
918 $self->associate_cover_image($doc_obj, $filename);
919 }
[1242]920
921 # read in file ($text will be in utf8)
922 my $text = "";
[2734]923 $self->read_file ($filename, $encoding, $language, \$text);
[1242]924
[1844]925 if (!length ($text)) {
[2811]926 my $plugin_name = ref ($self);
[9586]927 if ($gli) {
928 print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
929 }
[9703]930 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
[2785]931
932 my $failhandle = $self->{'failhandle'};
[9413]933 gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
[5681]934 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
[2785]935 $self->{'num_not_processed'} ++;
936
[10280]937 return (0,undef); # what should we return here?? error but don't want to pass it on
[1242]938 }
[1954]939
[1242]940 # include any metadata passed in from previous plugins
941 # note that this metadata is associated with the top level section
[8510]942
[11122]943 my $associate_tail_re = $self->{'associate_tail_re'};
944
[1242]945 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
[9398]946
[8716]947 # do plugin specific processing of doc_obj
948 unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
949 $text = '';
950 undef $text;
[9584]951 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
[10280]952 return (-1,undef);
[8716]953 }
954 $text='';
955 undef $text;
[9398]956
[1242]957 # do any automatic metadata extraction
958 $self->auto_extract_metadata ($doc_obj);
[1954]959
[1242]960 # add an OID
[3515]961 # see if there is a plugin-specific set_OID function...
[8716]962 if (defined ($self->can('set_OID'))) {
[3515]963 # it will need $doc_obj to set the Identifier metadata...
964 $self->set_OID($doc_obj);
965 } else {
966 # use the default set_OID() in doc.pm
967 $doc_obj->set_OID();
968 }
[10280]969
970 return (1,$doc_obj);
971}
[1242]972
973
[10280]974# The BasPlug read() function. This function calls read_into_doc_obj()
975# to ensure all the right things to make general options work for a
976# given plugin are done. It then calls the process() function which
977# does all the work specific to a plugin (like the old read functions
978# used to do). Most plugins should define their own process() function
979# and let this read() function keep control.
980#
981# recursive plugins (e.g. RecPlug) and specialized plugins like those
982# capable of processing many documents within a single file (e.g.
983# GMLPlug) might want to implement their own version of read(), but
984# more likely need to implement their own version of read_into_doc_obj()
985#
986# Return number of files processed, undef if can't recognise, -1 if can't
987# process
988
989sub read {
990 my $self = shift (@_);
991 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
992
993 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
994
995 if ((defined $process_status) && ($process_status == 1)) {
996 # process the document
997 $processor->process($doc_obj);
998
999 if(defined($self->{'places_filename'})){
1000 &util::rm($self->{'places_filename'});
1001 $self->{'places_filename'} = undef;
1002 }
1003
1004 $self->{'num_processed'} ++;
1005 undef $doc_obj;
[9398]1006 }
1007
[10280]1008 # if process_status == 1, then the file has been processed.
1009 return $process_status;
1010
[4]1011}
1012
[1244]1013# returns undef if file is rejected by the plugin
[1242]1014sub process {
1015 my $self = shift (@_);
[11089]1016 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1242]1017
[9413]1018 gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
[5681]1019 # die "Basplug::process function must be implemented in sub-class\n";
[1244]1020
1021 return undef; # never gets here
[1242]1022}
1023
[1219]1024# uses the multiread package to read in the entire file pointed to
1025# by filename and loads the resulting text into $$textref. Input text
1026# may be in any of the encodings handled by multiread, output text
1027# will be in utf8
1028sub read_file {
1029 my $self = shift (@_);
[2734]1030 my ($filename, $encoding, $language, $textref) = @_;
[4]1031
[1756]1032 if (!-r $filename)
1033 {
[1844]1034 my $outhandle = $self->{'outhandle'};
[9413]1035 gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
[5681]1036 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
[1756]1037 return;
1038 }
[1219]1039 $$textref = "";
[9413]1040 if (!open (FILE, $filename)) {
1041 gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
[10280]1042 die "\n";
1043 }
1044
[1844]1045 if ($encoding eq "ascii") {
[1219]1046 undef $/;
1047 $$textref = <FILE>;
1048 $/ = "\n";
1049 } else {
1050 my $reader = new multiread();
1051 $reader->set_handle ('BasPlug::FILE');
[1844]1052 $reader->set_encoding ($encoding);
[1219]1053 $reader->read_file ($textref);
[10280]1054 #Now segments chinese if the separate_cjk option is set
[6584]1055 if ($self->{'separate_cjk'}) {
[1219]1056 # segment the Chinese words
1057 $$textref = &cnseg::segment($$textref);
1058 }
1059 }
[10280]1060 close FILE;
1061}
[1219]1062
[10280]1063# write_file -- used by ConvertToPlug, for example in post processing
1064#
1065sub utf8_write_file {
1066 my $self = shift (@_);
1067 my ($textref, $filename) = @_;
1068
1069 if (!open (FILE, ">$filename")) {
1070 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1071 die "\n";
1072 }
1073 print FILE $$textref;
1074
[1219]1075 close FILE;
1076}
1077
[10280]1078
[7504]1079sub filename_based_title
1080{
1081 my $self = shift (@_);
1082 my ($file) = @_;
1083
1084 my $file_derived_title = $file;
1085 $file_derived_title =~ s/_/ /g;
1086 $file_derived_title =~ s/\..*?$//;
1087
1088 return $file_derived_title;
1089}
1090
[9398]1091
[7504]1092sub title_fallback
1093{
1094 my $self = shift (@_);
1095 my ($doc_obj,$section,$file) = @_;
1096
1097 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1098
1099 my $file_derived_title = $self->filename_based_title($file);
1100 $doc_obj->add_metadata ($section, "Title", $file_derived_title);
1101 }
1102}
1103
[2811]1104sub textcat_get_language_encoding {
1105 my $self = shift (@_);
1106 my ($filename) = @_;
1107
[10280]1108
[2811]1109 my ($language, $encoding, $extracted_encoding);
1110 if ($self->{'input_encoding'} eq "auto") {
1111 # use textcat to automatically work out the input encoding and language
1112 ($language, $encoding) = $self->get_language_encoding ($filename);
1113 } elsif ($self->{'extract_language'}) {
1114 # use textcat to get language metadata
1115 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
1116 $encoding = $self->{'input_encoding'};
[7644]1117 # don't print this message for english... english in utf8 is identical
1118 # to english in iso-8859-1 (except for some punctuation). We don't have
1119 # a language model for en_utf8, so textcat always says iso-8859-1!
1120 if ($extracted_encoding ne $encoding && $language ne "en"
1121 && $self->{'verbosity'}) {
[2811]1122 my $plugin_name = ref ($self);
1123 my $outhandle = $self->{'outhandle'};
[9413]1124 gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
[2811]1125 }
1126 } else {
1127 $language = $self->{'default_language'};
1128 $encoding = $self->{'input_encoding'};
1129 }
[10280]1130
[2811]1131 return ($language, $encoding);
1132}
1133
[1844]1134# Uses textcat to work out the encoding and language of the text in
1135# $filename. All html tags are removed before processing.
1136# returns an array containing "language" and "encoding"
1137sub get_language_encoding {
1138 my $self = shift (@_);
1139 my ($filename) = @_;
1140 my $outhandle = $self->{'outhandle'};
[9413]1141 my $unicode_format = "";
[11389]1142 my $best_language = "";
1143 my $best_encoding = "";
1144
[1844]1145 # read in file
[11389]1146 if (!open (FILE, $filename)) {
1147 gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
1148 # this is a pretty bad error, but try to continue anyway
1149 return ($self->{'default_language'}, $self->{'input_encoding'});
1150 }
[1844]1151 undef $/;
1152 my $text = <FILE>;
1153 $/ = "\n";
1154 close FILE;
1155
[9413]1156 # check if first few bytes have a Byte Order Marker
1157 my $bom=substr($text,0,2); # check 16bit unicode
1158 if ($bom eq "\xff\xfe") { # little endian 16bit unicode
1159 $unicode_format="unicode";
1160 } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
1161 $unicode_format="unicode";
1162 } else {
1163 $bom=substr($text,0,3); # check utf-8
1164 if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
1165 $unicode_format="utf8";
1166# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
1167# $unicode_format="utf8";
1168 }
1169 }
[10442]1170
[1999]1171
[11389]1172 # handle html files specially
[9413]1173 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
1174 if (ref($self) eq 'HTMLPlug' ||
1175 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
[11389]1176
1177 # remove <title>stuff</title> -- as titles tend often to be in English
1178 # for foreign language documents
1179 $text =~ s!<title>.*?</title>!!si;
1180
1181 # see if this html file specifies its encoding
1182 if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
1183 $best_encoding = $1;
[11880]1184 } elsif ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)"/i) {#"
[11389]1185 $best_encoding = $1;
1186 }
1187 if ($best_encoding) { # we extracted an encoding
1188 $best_encoding =~ s/-+/_/g;
1189 $best_encoding = lc($best_encoding); # lowercase
1190 if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
1191 $self->{'input_encoding'} = $best_encoding;
1192 }
1193
1194 # remove all HTML tags
[9413]1195 $text =~ s/<[^>]*>//sg;
1196 }
[1844]1197
1198 # get the language/encoding
[11966]1199 $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
[2235]1200 my $results = $self->{'textcat'}->classify(\$text);
[1844]1201
[1903]1202 # if textcat returns 3 or less possibilities we'll use the
1203 # first one in the list - otherwise use the defaults
[2235]1204 if (scalar @$results > 3) {
[9413]1205 if ($unicode_format) { # in case the first had a BOM
1206 $best_encoding=$unicode_format;
1207 } else {
1208 my %guessed_encodings = ();
1209 foreach my $result (@$results) {
1210 $result =~ /([^\-]+)$/;
1211 my $enc=$1;
1212 if (!defined($guessed_encodings{$enc})) {
1213 $guessed_encodings{$enc}=0;
1214 }
1215 $guessed_encodings{$enc}++;
[3731]1216 }
[9413]1217
1218 $guessed_encodings{""}=-1; # for default best_encoding of ""
1219 foreach my $enc (keys %guessed_encodings) {
1220 if ($guessed_encodings{$enc} >
1221 $guessed_encodings{$best_encoding}){
1222 $best_encoding=$enc;
1223 }
1224 }
[3731]1225 }
1226
[1844]1227 if ($self->{'input_encoding'} ne 'auto') {
[9961]1228 if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
[9413]1229 gsprintf($outhandle,
1230 "BasPlug: {BasPlug.could_not_extract_language}\n",
1231 $filename, $self->{'default_language'});
[1844]1232 }
[11389]1233 $best_language = $self->{'default_language'};
1234 $best_encoding = $self->{'input_encoding'};
[1844]1235
1236 } else {
[9961]1237 if ($self->{'verbosity'}>2) {
[9413]1238 gsprintf($outhandle,
1239 "BasPlug: {BasPlug.could_not_extract_language}\n",
1240 $filename, $self->{'default_language'});
[1844]1241 }
[11389]1242 $best_language = $self->{'default_language'};
[1844]1243 }
[11389]1244 } else { # <= 3 suggestions
1245 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
1246 if (!defined $language) {
1247 if ($self->{'verbosity'}>2) {
1248 gsprintf($outhandle,
1249 "BasPlug: {BasPlug.could_not_extract_language}\n",
1250 $filename, $self->{'default_language'});
1251 }
1252 $language = $self->{'default_language'};
[1870]1253 }
[11389]1254 if (!defined $encoding) {
1255 if ($self->{'verbosity'}>2) {
1256 gsprintf($outhandle,
1257 "BasPlug: {BasPlug.could_not_extract_encoding}\n",
1258 $filename, $self->{'default_encoding'});
1259 }
1260 $encoding = $self->{'default_encoding'};
[1870]1261 }
[11389]1262 $best_language = $language;
1263 if (! $best_encoding ) { # may already be set... eg from html meta tag
1264 $best_encoding = $encoding;
1265 }
[1844]1266 }
1267
[11389]1268 my $text_copy = $text;
1269 if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) {
1270 # the text is valid utf8, so assume that's the real encoding
1271 # (since textcat is based on probabilities)
1272 $best_encoding = 'utf8';
1273 }
[7818]1274
1275 # check for equivalents where textcat doesn't have some encodings...
1276 # eg MS versions of standard encodings
[11389]1277 if ($best_encoding =~ /^iso_8859_(\d+)/) {
[7818]1278 my $iso = $1; # which variant of the iso standard?
1279 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
1280 if ($text =~ /[\x80-\x9f]/) {
1281 # Western Europe
[11389]1282 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
1283 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
1284 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
1285 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
1286 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
1287 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
1288 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
[7818]1289 }
1290 }
1291
[11389]1292 if ($best_encoding !~ /^(ascii|utf8|unicode)$/ &&
1293 !defined $encodings::encodings->{$best_encoding}) {
[1844]1294 if ($self->{'verbosity'}) {
[7818]1295 gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
[11389]1296 $filename, $best_encoding, $self->{'default_encoding'});
[1844]1297 }
[11389]1298 $best_encoding = $self->{'default_encoding'};
[1844]1299 }
1300
[11389]1301 return ($best_language, $best_encoding);
[1844]1302}
1303
[1219]1304# add any extra metadata that's been passed around from one
1305# plugin to another.
1306# extra_metadata uses add_utf8_metadata so it expects metadata values
1307# to already be in utf8
1308sub extra_metadata {
1309 my $self = shift (@_);
1310 my ($doc_obj, $cursection, $metadata) = @_;
1311
[11122]1312 my $associate_tail_re = $self->{'associate_tail_re'};
1313
[1219]1314 foreach my $field (keys(%$metadata)) {
[839]1315 # $metadata->{$field} may be an array reference
[8510]1316 if ($field eq "gsdlassocfile_tobe") {
1317 # 'gsdlassocfile_tobe' is artificially introduced metadata
1318 # that is used to signal that certain additional files should
1319 # be tied to this document. Useful in situations where a
1320 # metadata pass in the plugin pipeline works out some files
1321 # need to be associated with a document, but the document hasn't
1322 # been formed yet.
1323
1324 my $equiv_form = "";
1325 foreach my $gaf (@{$metadata->{$field}}) {
1326 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1327 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1328 my $filename = $full_filename;
1329
1330 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
[11122]1331
1332 # work out extended tail extension (i.e. matching tail re)
1333
1334 my ($file_prefix,$file_extended_ext)
1335 = $self->root_ext_split($tail_filename,$associate_tail_re);
1336 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1337
[8510]1338 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
[11834]1339 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
[8510]1340 my $srcicon = "_icon".$doc_ext."_";
1341 my $end_doclink = "</a>";
1342
[11122]1343 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1344
1345 if (defined $pre_doc_ext) {
1346 # for metadata such as [mp3._edited] [mp3._full] ...
1347 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1348 }
1349
1350 # for multiple metadata such as [mp3.assoclink]
1351 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1352
1353 $equiv_form .= " $assoc_form";
[8510]1354 }
1355 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1356 }
1357 elsif (ref ($metadata->{$field}) eq "ARRAY") {
[839]1358 map {
[1219]1359 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
[839]1360 } @{$metadata->{$field}};
1361 } else {
[1219]1362 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
[839]1363 }
1364 }
1365}
1366
[1396]1367# initialise metadata extractors
1368sub initialise_extractors {
1369 my $self = shift (@_);
1370
1371 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1372 &acronym::initialise_acronyms();
1373 }
1374}
1375
1376# finalise metadata extractors
1377sub finalise_extractors {
1378 my $self = shift (@_);
1379
1380 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1381 &acronym::finalise_acronyms();
1382 }
1383}
1384
[1602]1385# FIRSTNNN: extract the first NNN characters as metadata
1386sub extract_first_NNNN_characters {
1387 my $self = shift (@_);
1388 my ($textref, $doc_obj, $thissection) = @_;
1389
1390 foreach my $size (split /,/, $self->{'first'}) {
1391 my $tmptext = $$textref;
1392 $tmptext =~ s/^\s+//;
1393 $tmptext =~ s/\s+$//;
1394 $tmptext =~ s/\s+/ /gs;
1395 $tmptext = substr ($tmptext, 0, $size);
1396 $tmptext =~ s/\s\S*$/&#8230;/;
1397 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1398 }
1399}
1400
1401sub extract_email {
1402 my $self = shift (@_);
1403 my ($textref, $doc_obj, $thissection) = @_;
1404 my $outhandle = $self->{'outhandle'};
1405
[9413]1406 gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
[1844]1407 if ($self->{'verbosity'} > 2);
[1602]1408
[2604]1409 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
[1602]1410 @email = sort @email;
1411
[10218]1412# if($self->{"new_extract_email"} == 0)
1413# {
1414# my @email2 = ();
1415# foreach my $address (@email)
1416# {
1417# if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))
1418# {
1419# push @email2, $address;
1420# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1421# # print $outhandle " extracting $address\n"
1422# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1423# if ($self->{'verbosity'} > 3);
1424# }
1425# }
1426# }
1427# else
1428# {
1429 my $hashExistMail = {};
[1602]1430 foreach my $address (@email) {
[10218]1431 if (!(defined $hashExistMail->{$address}))
1432 {
1433 $hashExistMail->{$address} = 1;
[1602]1434 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
[9413]1435 gsprintf($outhandle, " {BasPlug.extracting} $address\n")
[1844]1436 if ($self->{'verbosity'} > 3);
[1602]1437 }
1438 }
[9413]1439 gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
[1844]1440 if ($self->{'verbosity'} > 2);
[1602]1441}
1442
1443# extract metadata
[5681]1444sub auto_extract_metadata {
[1954]1445
[1242]1446 my $self = shift (@_);
1447 my ($doc_obj) = @_;
[1602]1448
1449 if ($self->{'extract_email'}) {
1450 my $thissection = $doc_obj->get_top_section();
1451 while (defined $thissection) {
1452 my $text = $doc_obj->get_text($thissection);
1453 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1454 $thissection = $doc_obj->get_next_section ($thissection);
1455 }
[1954]1456 }
[9398]1457 if ($self->{'extract_placenames'}) {
1458 my $thissection = $doc_obj->get_top_section();
1459 while (defined $thissection) {
1460 my $text = $doc_obj->get_text($thissection);
1461 $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1462 $thissection = $doc_obj->get_next_section ($thissection);
1463 }
1464 }
[1954]1465
[11069]1466 if ($self->{'extract_keyphrases'} || $self->{'extract_keyphrases_kea4'}) {
1467 $self->extract_keyphrases($doc_obj);
1468 }
[1954]1469
[1602]1470 if ($self->{'first'}) {
1471 my $thissection = $doc_obj->get_top_section();
1472 while (defined $thissection) {
1473 my $text = $doc_obj->get_text($thissection);
1474 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1475 $thissection = $doc_obj->get_next_section ($thissection);
1476 }
1477 }
1478
[1242]1479 if ($self->{'extract_acronyms'}) {
1480 my $thissection = $doc_obj->get_top_section();
1481 while (defined $thissection) {
1482 my $text = $doc_obj->get_text($thissection);
1483 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1484 $thissection = $doc_obj->get_next_section ($thissection);
1485 }
1486 }
[1602]1487
[1393]1488 if ($self->{'markup_acronyms'}) {
1489 my $thissection = $doc_obj->get_top_section();
1490 while (defined $thissection) {
1491 my $text = $doc_obj->get_text($thissection);
1492 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1493 $doc_obj->delete_text($thissection);
1494 $doc_obj->add_text($thissection, $text);
1495 $thissection = $doc_obj->get_next_section ($thissection);
1496 }
1497 }
1498
[10218]1499 if($self->{'extract_historical_years'}) {
[1317]1500 my $thissection = $doc_obj->get_top_section();
1501 while (defined $thissection) {
[10218]1502
[1317]1503 my $text = $doc_obj->get_text($thissection);
[1846]1504 &DateExtract::get_date_metadata($text, $doc_obj,
1505 $thissection,
[10218]1506 $self->{'no_bibliography'},
1507 $self->{'maximum_year'},
1508 $self->{'maximum_century'});
[1317]1509 $thissection = $doc_obj->get_next_section ($thissection);
1510 }
1511 }
[1242]1512}
1513
[11069]1514
1515#adding kea keyphrases
1516sub extract_keyphrases
1517{
1518 my $self = shift(@_);
1519 my $doc_obj = shift(@_);
1520
1521 # Use Kea 3.0 unless 4.0 has been specified
1522 my $kea_version = "3.0";
1523 if ($self->{'extract_keyphrases_kea4'}) {
1524 $kea_version = "4.0";
1525 }
1526
1527 # Check that Kea exists, and tell the user where to get it if not
1528 my $keahome = &Kea::get_Kea_directory($kea_version);
1529 if (!-e $keahome) {
1530 gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);
1531 return;
1532 }
1533
1534 my $thissection = $doc_obj->get_top_section();
1535 my $text = "";
1536 my $list;
1537
1538 #loop through sections to gather whole doc
1539 while (defined $thissection) {
1540 my $sectiontext = $doc_obj->get_text($thissection);
1541 $text = $text.$sectiontext;
1542 $thissection = $doc_obj->get_next_section ($thissection);
1543 }
1544
1545 if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
1546 $list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
1547 } else { #otherwise call Kea with no options
1548 $list = &Kea::extract_KeyPhrases ($kea_version, $text);
1549 }
1550
1551 if ($list){
1552 # if a list of kea keyphrases was returned (ie not empty)
1553 if ($self->{'verbosity'}) {
1554 gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1555 }
1556
1557 #add metadata to top section
1558 $thissection = $doc_obj->get_top_section();
1559
1560 # add all key phrases as one metadata
1561 $doc_obj->add_metadata($thissection, "Keyphrases", $list);
1562
1563 # add individual key phrases as multiple metadata
1564 foreach my $keyphrase (split(',', $list)) {
1565 $keyphrase =~ s/^\s+|\s+$//g;
1566 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1567 }
1568 }
1569}
1570
1571
[1335]1572# extract acronyms from a section in a document. progress is
[1424]1573# reported to outhandle based on the verbosity. both the Acronym
[1335]1574# and the AcronymKWIC metadata items are created.
1575
[1242]1576sub extract_acronyms {
1577 my $self = shift (@_);
1578 my ($textref, $doc_obj, $thissection) = @_;
[1424]1579 my $outhandle = $self->{'outhandle'};
[1242]1580
[5681]1581 # print $outhandle " extracting acronyms ...\n"
[9413]1582 gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
[1844]1583 if ($self->{'verbosity'} > 2);
[1335]1584
[1242]1585 my $acro_array = &acronym::acronyms($textref);
[1360]1586
[1242]1587 foreach my $acro (@$acro_array) {
1588
[1335]1589 #check that this is the first time ...
1590 my $seen_before = "false";
1591 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1592 foreach my $thisAcro (@$previous_data) {
[1602]1593 if ($thisAcro eq $acro->to_string()) {
[1335]1594 $seen_before = "true";
[9413]1595 if ($self->{'verbosity'} >= 4) {
1596 gsprintf($outhandle, " {BasPlug.already_seen} " .
1597 $acro->to_string() . "\n");
1598 }
[5681]1599 }
[1242]1600 }
[1335]1601
[1602]1602 if ($seen_before eq "false") {
[1393]1603 #write it to the file ...
1604 $acro->write_to_file();
1605
[1335]1606 #do the normal acronym
1607 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
[9413]1608 gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
[1844]1609 if ($self->{'verbosity'} > 3);
[1335]1610 }
[1242]1611 }
[5681]1612
[9413]1613 gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
[1844]1614 if ($self->{'verbosity'} > 2);
[1242]1615}
1616
[1393]1617sub markup_acronyms {
1618 my $self = shift (@_);
1619 my ($text, $doc_obj, $thissection) = @_;
[1424]1620 my $outhandle = $self->{'outhandle'};
[1393]1621
[9413]1622 gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
[1844]1623 if ($self->{'verbosity'} > 2);
[1393]1624
1625 #self is passed in to check for verbosity ...
1626 $text = &acronym::markup_acronyms($text, $self);
1627
[9413]1628 gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
[1844]1629 if ($self->{'verbosity'} > 2);
[1393]1630
1631 return $text;
1632}
1633
[2785]1634sub compile_stats {
1635 my $self = shift(@_);
1636 my ($stats) = @_;
1637
1638 $stats->{'num_processed'} += $self->{'num_processed'};
1639 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
[2796]1640 $stats->{'num_archives'} += $self->{'num_archives'};
[2785]1641
1642}
1643
[2816]1644sub associate_cover_image {
[10833]1645 my $self = shift;
[2816]1646 my ($doc_obj, $filename) = @_;
1647
[10833]1648 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1649 if (exists $self->{'covers_missing_cache'}->{$filename}) {
1650 # don't stat() for existence eg for multiple document input files
1651 # (eg SplitPlug)
1652 return;
1653 }
1654
[9413]1655 my $top_section=$doc_obj->get_top_section();
1656
[2816]1657 if (-e $filename) {
[13968]1658 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
[9413]1659 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[3086]1660 } else {
[10833]1661 my $upper_filename = $filename;
1662 $upper_filename =~ s/jpg$/JPG/;
1663 if (-e $upper_filename) {
1664 $doc_obj->associate_file($upper_filename, "cover.jpg",
1665 "image/jpeg");
[9413]1666 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[10833]1667 } else {
1668 # file doesn't exist, so record the fact that it's missing so
1669 # we don't stat() again (stat is slow)
1670 $self->{'covers_missing_cache'}->{$filename} = 1;
[3086]1671 }
[2816]1672 }
[10833]1673
[2816]1674}
1675
[11332]1676
1677# Overridden by exploding plugins (eg. ISISPlug)
1678sub clean_up_after_exploding
1679{
1680 my $self = shift(@_);
1681}
1682
1683
[4]16841;
Note: See TracBrowser for help on using the repository browser.