source: gsdl/trunk/perllib/plugins/BasPlug.pm@ 14961

Last change on this file since 14961 was 14961, checked in by davidb, 16 years ago

Setting filename metadata (Source) in BasPlug.pm looks to user its locale, in the first instance, to resolve what character encoding the file system uses.

  • Property svn:keywords set to Author Date Id Revision
File size: 48.6 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28BEGIN {
29 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30}
31
32eval {require bytes};
33
34# suppress the annoying "subroutine redefined" warning that various
35# plugins cause under perl 5.6
36$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38use strict;
39no strict 'subs';
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
42use File::Basename;
43
44use Kea;
45use multiread;
46use encodings;
47use unicode;
48use cnseg;
49use acronym;
50use textcat;
51use doc;
52eval "require diagnostics"; # some perl distros (eg mac) don't have this
53use DateExtract;
54use ghtml;
55use gsprintf 'gsprintf';
56use printusage;
57use parse2;
58
59
60use GISBasPlug;
61
62@BasPlug::ISA = ( GISBasPlug );
63
64my $unicode_list =
65 [ { 'name' => "ascii",
66 'desc' => "{BasPlug.input_encoding.ascii}" },
67 { 'name' => "utf8",
68 'desc' => "{BasPlug.input_encoding.utf8}" },
69 { 'name' => "unicode",
70 'desc' => "{BasPlug.input_encoding.unicode}" } ];
71
72my $auto_unicode_list =
73 [ { 'name' => "auto",
74 'desc' => "{BasPlug.input_encoding.auto}" } ];
75
76my $e = $encodings::encodings;
77foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
78{
79 my $hashEncode =
80 {'name' => $enc,
81 'desc' => $e->{$enc}->{'name'}};
82
83 push(@{$unicode_list},$hashEncode);
84}
85
86push(@{$auto_unicode_list},@{$unicode_list});
87
88my $arguments =
89 [ { 'name' => "process_exp",
90 'desc' => "{BasPlug.process_exp}",
91 'type' => "regexp",
92 'deft' => "",
93 'reqd' => "no" },
94 { 'name' => "block_exp",
95 'desc' => "{BasPlug.block_exp}",
96 'type' => "regexp",
97 'deft' => "",
98 'reqd' => "no" },
99 { 'name' => "smart_block",
100 'desc' => "{BasPlug.smart_block}",
101 'type' => "flag",
102 'reqd' => "no" },
103 { 'name' => "associate_ext",
104 'desc' => "{BasPlug.associate_ext}",
105 'type' => "string",
106 'reqd' => "no" },
107 { 'name' => "associate_tail_re",
108 'desc' => "{BasPlug.associate_tail_re}",
109 'type' => "string",
110 'reqd' => "no" },
111 { 'name' => "input_encoding",
112 'desc' => "{BasPlug.input_encoding}",
113 'type' => "enum",
114 'list' => $auto_unicode_list,
115 'reqd' => "no" ,
116 'deft' => "auto" } ,
117 { 'name' => "default_encoding",
118 'desc' => "{BasPlug.default_encoding}",
119 'type' => "enum",
120 'list' => $unicode_list,
121 'reqd' => "no",
122 'deft' => "utf8" },
123 { 'name' => "extract_language",
124 'desc' => "{BasPlug.extract_language}",
125 'type' => "flag",
126 'reqd' => "no" },
127 { 'name' => "default_language",
128 'desc' => "{BasPlug.default_language}",
129 'type' => "string",
130 'deft' => "en",
131 'reqd' => "no" },
132 { 'name' => "extract_acronyms",
133 'desc' => "{BasPlug.extract_acronyms}",
134 'type' => "flag",
135 'reqd' => "no" },
136 { 'name' => "markup_acronyms",
137 'desc' => "{BasPlug.markup_acronyms}",
138 'type' => "flag",
139 'reqd' => "no" },
140 { 'name' => "extract_keyphrases",
141 'desc' => "{BasPlug.extract_keyphrases}",
142 'type' => "flag",
143 'reqd' => "no" },
144 { 'name' => "extract_keyphrases_kea4",
145 'desc' => "{BasPlug.extract_keyphrases_kea4}",
146 'type' => "flag",
147 'reqd' => "no" },
148 { 'name' => "extract_keyphrase_options",
149 'desc' => "{BasPlug.extract_keyphrase_options}",
150 'type' => "string",
151 'deft' => "",
152 'reqd' => "no" },
153 { 'name' => "first",
154 'desc' => "{BasPlug.first}",
155 'type' => "string",
156 'reqd' => "no" },
157 { 'name' => "extract_email",
158 'desc' => "{BasPlug.extract_email}",
159 'type' => "flag",
160 'reqd' => "no" },
161 { 'name' => "extract_historical_years",
162 'desc' => "{BasPlug.extract_historical_years}",
163 'type' => "flag",
164 'reqd' => "no" },
165 { 'name' => "maximum_year",
166 'desc' => "{BasPlug.maximum_year}",
167 'type' => "int",
168 'deft' => (localtime)[5]+1900,
169 'char_length' => "4",
170 #'range' => "2,100",
171 'reqd' => "no"},
172 { 'name' => "maximum_century",
173 'desc' => "{BasPlug.maximum_century}",
174 'type' => "string",
175 'deft' => "-1",
176 'reqd' => "no" },
177 { 'name' => "no_bibliography",
178 'desc' => "{BasPlug.no_bibliography}",
179 'type' => "flag",
180 'reqd' => "no"},
181 { 'name' => "no_cover_image",
182 'desc' => "{BasPlug.no_cover_image}",
183 'type' => "flag",
184 'reqd' => "no" },
185 { 'name' => "separate_cjk",
186 'desc' => "{BasPlug.separate_cjk}",
187 'type' => "flag",
188 'reqd' => "no",
189 'hiddengli' => "yes" },
190 { 'name' => "new_extract_email",
191 'desc' => "",
192 'type' => "flag",
193 'reqd' => "no",
194 'hiddengli' => "yes" } ];
195
196my $gis_arguments =
197 [ { 'name' => "extract_placenames",
198 'desc' => "{GISBasPlug.extract_placenames}",
199 'type' => "flag",
200 'reqd' => "no" },
201 { 'name' => "gazetteer",
202 'desc' => "{GISBasPlug.gazetteer}",
203 'type' => "string",
204 'reqd' => "no" },
205 { 'name' => "place_list",
206 'desc' => "{GISBasPlug.place_list}",
207 'type' => "flag",
208 'reqd' => "no" } ];
209
210
211my $options = { 'name' => "BasPlug",
212 'desc' => "{BasPlug.desc}",
213 'abstract' => "yes",
214 'inherits' => "no",
215 'args' => $arguments };
216
217
218sub set_incremental {
219 my $self = shift(@_);
220 my ($incremental) = @_;
221
222 $self->{'incremental'} = $incremental;
223}
224
225sub get_arguments
226{
227 my $self = shift(@_);
228 my $optionlistref = $self->{'option_list'};
229 my @optionlist = @$optionlistref;
230 my $pluginoptions = pop(@$optionlistref);
231 my $pluginarguments = $pluginoptions->{'args'};
232 return $pluginarguments;
233}
234
235
236sub print_xml_usage
237{
238 my $self = shift(@_);
239 my $header = shift(@_);
240 my $high_level_information_only = shift(@_);
241
242 # XML output is always in UTF-8
243 gsprintf::output_strings_in_UTF8;
244
245 if ($header) {
246 &PrintUsage::print_xml_header("plugin");
247 }
248 $self->print_xml($high_level_information_only);
249}
250
251
252sub print_xml
253{
254 my $self = shift(@_);
255 my $high_level_information_only = shift(@_);
256
257 my $optionlistref = $self->{'option_list'};
258 my @optionlist = @$optionlistref;
259 my $pluginoptions = shift(@$optionlistref);
260 return if (!defined($pluginoptions));
261
262 # Find the process and block default expressions in the plugin arguments
263 my $process_exp = "";
264 my $block_exp = "";
265 if (defined($pluginoptions->{'args'})) {
266 foreach my $option (@{$pluginoptions->{'args'}}) {
267 if ($option->{'name'} eq "process_exp") {
268 $process_exp = $option->{'deft'};
269 }
270 if ($option->{'name'} eq "block_exp") {
271 $block_exp = $option->{'deft'};
272 }
273 }
274 }
275
276 gsprintf(STDERR, "<PlugInfo>\n");
277 gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
278 my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
279 $desc =~ s/</&amp;lt;/g; # doubly escaped
280 $desc =~ s/>/&amp;gt;/g;
281 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
282 gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
283 gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
284 gsprintf(STDERR, " <Processes>$process_exp</Processes>\n");
285 gsprintf(STDERR, " <Blocks>$block_exp</Blocks>\n");
286 gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
287 unless (defined($high_level_information_only)) {
288 gsprintf(STDERR, " <Arguments>\n");
289 if (defined($pluginoptions->{'args'})) {
290 &PrintUsage::print_options_xml($pluginoptions->{'args'});
291 }
292 gsprintf(STDERR, " </Arguments>\n");
293
294 # Recurse up the plugin hierarchy
295 $self->print_xml();
296 }
297 gsprintf(STDERR, "</PlugInfo>\n");
298}
299
300
301sub print_txt_usage
302{
303 my $self = shift(@_);
304 # Print the usage message for a plugin (recursively)
305 my $descoffset = $self->determine_description_offset(0);
306 $self->print_plugin_usage($descoffset, 1);
307}
308
309
310sub determine_description_offset
311{
312 my $self = shift(@_);
313 my $maxoffset = shift(@_);
314
315 my $optionlistref = $self->{'option_list'};
316 my @optionlist = @$optionlistref;
317 my $pluginoptions = shift(@$optionlistref);
318 return $maxoffset if (!defined($pluginoptions));
319
320 # Find the length of the longest option string of this plugin
321 my $pluginargs = $pluginoptions->{'args'};
322 if (defined($pluginargs)) {
323 my $longest = &PrintUsage::find_longest_option_string($pluginargs);
324 if ($longest > $maxoffset) {
325 $maxoffset = $longest;
326 }
327 }
328
329 # Recurse up the plugin hierarchy
330 $maxoffset = $self->determine_description_offset($maxoffset);
331 $self->{'option_list'} = \@optionlist;
332 return $maxoffset;
333}
334
335
336sub print_plugin_usage
337{
338 my $self = shift(@_);
339 my $descoffset = shift(@_);
340 my $isleafclass = shift(@_);
341
342 my $optionlistref = $self->{'option_list'};
343 my @optionlist = @$optionlistref;
344 my $pluginoptions = shift(@$optionlistref);
345 return if (!defined($pluginoptions));
346
347 my $pluginname = $pluginoptions->{'name'};
348 my $pluginargs = $pluginoptions->{'args'};
349 my $plugindesc = $pluginoptions->{'desc'};
350
351 # Produce the usage information using the data structure above
352 if ($isleafclass) {
353 if (defined($plugindesc)) {
354 gsprintf(STDERR, "$plugindesc\n\n");
355 }
356 gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
357 }
358
359 # Display the plugin options, if there are some
360 if (defined($pluginargs)) {
361 # Calculate the column offset of the option descriptions
362 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
363
364 if ($isleafclass) {
365 gsprintf(STDERR, " {common.specific_options}:\n");
366 }
367 else {
368 gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
369 }
370
371 # Display the plugin options
372 &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
373 }
374
375 # Recurse up the plugin hierarchy
376 $self->print_plugin_usage($descoffset, 0);
377 $self->{'option_list'} = \@optionlist;
378}
379
380
381sub new {
382 # Set Encodings to the list!!
383
384
385 # Start the BasPlug Constructor
386 my $class = shift (@_);
387 my ($pluginlist,$args,$hashArgOptLists) = @_;
388 push(@$pluginlist, $class);
389 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
390
391 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
392 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
393
394 if (GISBasPlug::has_mapdata()) {
395 push(@$arguments,@$gis_arguments);
396 }
397
398 my $self = {};
399 $self->{'outhandle'} = STDERR;
400 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
401 $self->{"info_only"} = 0;
402
403 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
404 # the args, just return the object.
405 foreach my $strArg (@{$args})
406 {
407 if($strArg eq "-gsdlinfo")
408 {
409 $self->{"info_only"} = 1;
410 return bless $self, $class;
411 }
412 }
413
414 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
415 {
416 my $classTempClass = bless $self, $class;
417 print STDERR "<BadPlugin p=$plugin_name>\n";
418 &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
419 $classTempClass->print_txt_usage(""); # Use default resource bundle
420 die "\n";
421 }
422
423
424 delete $self->{"info_only"};
425 # else parsing was successful.
426
427 $self->{'plugin_type'} = $plugin_name;
428 #$self->{'outhandle'} = STDERR;
429 $self->{'num_processed'} = 0;
430 $self->{'num_not_processed'} = 0;
431 $self->{'num_blocked'} = 0;
432 $self->{'num_archives'} = 0;
433 $self->{'cover_image'} = 1; # cover image is on by default
434 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
435 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
436
437 my $associate_ext = $self->{'associate_ext'};
438 if ((defined $associate_ext) && ($associate_ext ne "")) {
439
440 my $associate_tail_re = $self->{'associate_tail_re'};
441 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
442 my $outhandle = $self->{'outhandle'};
443 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
444 print $outhandle " defaulting to 'associate_tail_re'\n";
445 }
446 else {
447 my @exts = split(/,/,$associate_ext);
448
449 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
450 my $associate_tail_re = join("|",@exts_bracketed);
451 $self->{'associate_tail_re'} = $associate_tail_re;
452 }
453
454 delete $self->{'associate_ext'};
455 }
456
457 $self->{'shared_fileroot'} = {};
458 $self->{'file_blocks'} = {};
459
460 if ($self->{'extract_placenames'}) {
461
462 my $outhandle = $self->{'outhandle'};
463
464 my $places_ref
465 = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
466
467 if (!defined $places_ref) {
468 print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
469 print $outhandle " No placename extraction will take place.\n";
470 $self->{'extract_placenames'} = undef;
471 }
472 else {
473 $self->{'places'} = $places_ref;
474 }
475 }
476
477 return bless $self, $class;
478
479}
480
481# initialize BasPlug options
482# if init() is overridden in a sub-class, remember to call BasPlug::init()
483sub init {
484 my $self = shift (@_);
485 my ($verbosity, $outhandle, $failhandle) = @_;
486
487 # verbosity is passed through from the processor
488 $self->{'verbosity'} = $verbosity;
489
490 # as are the outhandle and failhandle
491 $self->{'outhandle'} = $outhandle if defined $outhandle;
492 $self->{'failhandle'} = $failhandle;
493
494 # set process_exp and block_exp to defaults unless they were
495 # explicitly set
496
497 if ((!$self->is_recursive()) and
498 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
499
500 $self->{'process_exp'} = $self->get_default_process_exp ();
501 if ($self->{'process_exp'} eq "") {
502 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
503 }
504 }
505
506 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
507 $self->{'block_exp'} = $self->get_default_block_exp ();
508 }
509
510}
511
512sub begin {
513 my $self = shift (@_);
514 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
515 $self->initialise_extractors();
516}
517
518sub end {
519 # potentially called at the end of each plugin pass
520 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
521
522 my ($self) = @_;
523 $self->finalise_extractors();
524}
525
526sub deinit {
527 # called only once, after all plugin passes have been done
528
529 my ($self) = @_;
530}
531
532# this function should be overridden to return 1
533# in recursive plugins
534sub is_recursive {
535 my $self = shift (@_);
536
537 return 0;
538}
539
540sub get_default_block_exp {
541 my $self = shift (@_);
542
543 return "";
544}
545
546sub get_default_process_exp {
547 my $self = shift (@_);
548
549 return "";
550}
551
552# default implementation is to do nothing.
553sub store_block_files
554{
555 my $self =shift (@_);
556 my ($filename) = @_;
557 return;
558}
559
560#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
561sub block_cover_image
562{
563 my $self =shift;
564 my $filename = shift;
565
566 if ($self->{'cover_image'}) {
567 my $coverfile = $filename;
568 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
569 if (!-e $coverfile) {
570 $coverfile =~ s/jpg$/JPG/;
571 }
572 if (-e $coverfile) {
573 $self->{'file_blocks'}->{$coverfile} = 1;
574 }
575 }
576
577 return;
578}
579
580sub root_ext_split
581{
582 my $self = shift (@_);
583 my ($filename,$tail_re) = @_;
584
585 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
586
587 if ((!defined $file_prefix) || (!defined $file_ext)) {
588 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
589 }
590
591 return ($file_prefix,$file_ext);
592}
593
594sub metadata_read {
595 my $self = shift (@_);
596 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
597 # Keep track of filenames with same root but different extensions
598 # Used to support -associate_ext and the more generalised
599 # -associate_tail_re
600
601 my $associate_tail_re = $self->{'associate_tail_re'};
602 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
603
604 my ($file_prefix,$file_ext)
605 = $self->root_ext_split($file,$associate_tail_re);
606
607 if ((defined $file_prefix) && (defined $file_ext)) {
608
609 my $shared_fileroot = $self->{'shared_fileroot'};
610 if (!defined $shared_fileroot->{$file_prefix}) {
611 my $file_prefix_rec = { 'tie_to' => undef,
612 'exts' => {} };
613 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
614 }
615
616 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
617
618 my $process_exp = $self->{'process_exp'};
619
620 if ($file =~ m/$process_exp/) {
621 # This is the document the others should be tied to
622 $file_prefix_rec->{'tie_to'} = $file_ext;
623 }
624 else {
625 if ($file_ext =~ m/$associate_tail_re$/) {
626 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
627 }
628 }
629
630 }
631 }
632
633 # now check whether we are actually processing this
634 my $filename = $file;
635 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
636 if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) {
637 return undef; # can't recognise
638 }
639
640 # do smart blocking if appropriate
641 if ($self->{'smart_block'}) {
642 $self->store_block_files($filename);
643 }
644 # block the cover image if there is one
645 if ($self->{'cover_image'}) {
646 $self->block_cover_image($filename);
647 }
648
649 return 1;
650}
651
652sub tie_to_filename
653{
654 my $self = shift (@_);
655
656 my ($file_ext,$file_prefix_rec) = @_;
657
658 if (defined $file_prefix_rec) {
659 my $tie_to = $file_prefix_rec->{'tie_to'};
660
661 if (defined $tie_to) {
662 if ($tie_to eq $file_ext) {
663 return 1;
664 }
665 }
666 }
667
668 return 0;
669}
670
671sub tie_to_assoc_file
672{
673 my $self = shift (@_);
674 my ($file_ext,$file_prefix_rec) = @_;
675
676 if (defined $file_prefix_rec) {
677 my $tie_to = $file_prefix_rec->{'tie_to'};
678 if (defined $tie_to) {
679
680 my $exts = $file_prefix_rec->{'exts'};
681
682 my $has_file_ext = $exts->{$file_ext};
683
684 if ($has_file_ext) {
685 return 1;
686 }
687 }
688 }
689
690 return 0;
691}
692
693
694sub associate_with
695{
696 my $self = shift (@_);
697 my ($file, $filename, $metadata) = @_;
698
699 my $associate_tail_re = $self->{'associate_tail_re'};
700 return 0 if (!$associate_tail_re);
701
702 # If file, see if matches with "tie_to" doc or is one of the
703 # associated filename extensions.
704
705 my ($file_prefix,$file_ext) = $self->root_ext_split($file,$associate_tail_re);
706
707 if ((defined $file_prefix) && (defined $file_ext)) {
708
709 my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
710
711 if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
712
713 # Set up gsdlassocfile_tobe
714
715 my $exts = $file_prefix_rec->{'exts'};
716
717 if (!defined $metadata->{'gsdlassocfile_tobe'}) {
718 $metadata->{'gsdlassocfile_tobe'} = [];
719 }
720
721 my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
722
723 my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/);
724 foreach my $e (keys %$exts) {
725 my $assoc_file = "$full_prefix$e";
726 print STDERR " $self->{'plugin_type'}: Associating $file_prefix$e with $file_prefix_rec->{'tie_to'} version\n";
727 my $mime_type = ""; # let system auto detect this
728 push(@$assoc_tobe,"$assoc_file:$mime_type:");
729 }
730
731 }
732 elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
733
734
735 # a form of smart block
736 return 1;
737 }
738 }
739
740 return 0;
741}
742
743
744sub read_block {
745 my $self = shift (@_);
746
747 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
748
749
750 my $filename = $file;
751 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
752
753 if ($self->associate_with($file,$filename,$metadata)) {
754 # a form of smart block
755 $self->{'num_blocked'} ++;
756 return (0,undef); # blocked
757 }
758
759 my $smart_block = $self->{'smart_block'};
760 my $smart_block_BN = $self->{'smart_block_BN'};
761
762 if ($smart_block || $smart_block_BN) {
763 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
764 $self->{'num_blocked'} ++;
765 return (0,undef); # blocked
766 }
767 } else {
768 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
769 $self->{'num_blocked'} ++;
770 return (0,undef); # blocked
771 }
772 if ($self->{'cover_image'}) {
773 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
774 $self->{'num_blocked'} ++;
775 return (0,undef); # blocked
776 }
777 }
778 }
779
780 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
781 return (undef,undef); # can't recognise
782 }
783
784 return (1,$filename);
785}
786
787sub read_tidy_file {
788
789 my $self = shift (@_);
790
791 my ($file) = @_;
792
793 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
794
795 return $file;
796}
797
798
799sub filename_to_metadata
800{
801 my $self = shift (@_);
802 my ($file, $encoding) = @_;
803
804 my $outhandle = $self->{'outhandle'};
805
806 my $filesystem_encoding = undef;
807
808 eval {
809 use POSIX qw(locale_h);
810
811 # With only one parameter, setlocale retrieves the current value
812 my $current_locale = setlocale(LC_CTYPE);
813
814 if ($current_locale =~ m/^.*\.(.*?)$/) {
815 my $char_encoding = lc($1);
816 $char_encoding =~ s/-/_/g;
817 $char_encoding =~ s/^utf_8$/utf8/;
818
819 if ($char_encoding =~ m/^\d+$/) {
820 if (defined $encodings::encoding->{"windows_$char_encoding"}) {
821 $char_encoding = "windows_$char_encoding";
822 }
823 elsif (defined $encodings::encoding->{"dos_$char_encoding"}) {
824 $char_encoding = "dos_$char_encoding";
825 }
826 }
827
828 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
829 || (defined $encodings::encoding->{$char_encoding})) {
830 $filesystem_encoding = $char_encoding;
831 }
832 else {
833 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
834 }
835 }
836
837
838 };
839 if ($@) {
840 print $outhandle "$@\n";
841 print $outhandle "Warning: Unable to establish locale. Will assume filesytem is UTF-8\n";
842
843 }
844
845 my ($filemeta) = $file =~ /([^\\\/]+)$/;
846
847 # how do we know what encoding the filename is in?
848 # => one answer is to check the locale
849
850 if (defined $filesystem_encoding) {
851 if ($filesystem_encoding !~ /(?:ascii|utf8|unicode)/) {
852 $filemeta = unicode::unicode2utf8(
853 unicode::convert2unicode($filesystem_encoding, \$filemeta)
854 );
855 }
856 }
857 # assume it is in the same encoding as its contents
858 elsif ((defined $encoding) && ($encoding !~ /(?:ascii|utf8|unicode)/)) {
859 $filemeta = unicode::unicode2utf8(
860 unicode::convert2unicode($encoding, \$filemeta)
861 );
862 }
863
864 my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
865
866 return $dmsafe_filemeta;
867}
868
869
870# The BasPlug read_into_doc_obj() function. This function does all the
871# right things to make general options work for a given plugin. It reads in
872# a file and sets up a slew of metadata all saved in doc_obj, which
873# it then returns as part of a tuple (process_status,doc_obj)
874#
875# Much of this functionality used to reside in read, but it was broken
876# down into a supporting routine to make the code more flexible.
877#
878# recursive plugins (e.g. RecPlug) and specialized plugins like those
879# capable of processing many documents within a single file (e.g.
880# GMLPlug) will normally want to implement their own version of
881# read_into_doc_obj()
882#
883# Note that $base_dir might be "" and that $file might
884# include directories
885sub read_into_doc_obj {
886 my $self = shift (@_);
887 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
888
889 if ($self->is_recursive()) {
890 gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
891 }
892
893 my $outhandle = $self->{'outhandle'};
894
895 my ($block_status,$filename) = $self->read_block(@_);
896 return $block_status if ((!defined $block_status) || ($block_status==0));
897 $file = $self->read_tidy_file($file);
898
899 # Do encoding stuff
900 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
901 if ($self->{'verbosity'} > 2) {
902 print $outhandle "BasPlug: reading $file as ($encoding,$language)\n";
903 }
904
905 # create a new document
906 my $doc_obj = new doc ($filename, "indexed_doc");
907 my $top_section = $doc_obj->get_top_section();
908
909 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
910 $doc_obj->add_utf8_metadata($top_section, "Language", $language);
911 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
912 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
913 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename));
914
915 my $filemeta = $self->filename_to_metadata($file,$encoding);
916 $doc_obj->add_utf8_metadata($top_section, "Source", $filemeta);
917 if ($self->{'cover_image'}) {
918 $self->associate_cover_image($doc_obj, $filename);
919 }
920
921 # read in file ($text will be in utf8)
922 my $text = "";
923 $self->read_file ($filename, $encoding, $language, \$text);
924
925 if (!length ($text)) {
926 my $plugin_name = ref ($self);
927 if ($gli) {
928 print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
929 }
930 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
931
932 my $failhandle = $self->{'failhandle'};
933 gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
934 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
935 $self->{'num_not_processed'} ++;
936
937 return (0,undef); # what should we return here?? error but don't want to pass it on
938 }
939
940 # include any metadata passed in from previous plugins
941 # note that this metadata is associated with the top level section
942
943 my $associate_tail_re = $self->{'associate_tail_re'};
944
945 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
946
947 # do plugin specific processing of doc_obj
948 unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
949 $text = '';
950 undef $text;
951 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
952 return (-1,undef);
953 }
954 $text='';
955 undef $text;
956
957 # do any automatic metadata extraction
958 $self->auto_extract_metadata ($doc_obj);
959
960 # add an OID
961 # see if there is a plugin-specific set_OID function...
962 if (defined ($self->can('set_OID'))) {
963 # it will need $doc_obj to set the Identifier metadata...
964 $self->set_OID($doc_obj);
965 } else {
966 # use the default set_OID() in doc.pm
967 $doc_obj->set_OID();
968 }
969
970 return (1,$doc_obj);
971}
972
973
974# The BasPlug read() function. This function calls read_into_doc_obj()
975# to ensure all the right things to make general options work for a
976# given plugin are done. It then calls the process() function which
977# does all the work specific to a plugin (like the old read functions
978# used to do). Most plugins should define their own process() function
979# and let this read() function keep control.
980#
981# recursive plugins (e.g. RecPlug) and specialized plugins like those
982# capable of processing many documents within a single file (e.g.
983# GMLPlug) might want to implement their own version of read(), but
984# more likely need to implement their own version of read_into_doc_obj()
985#
986# Return number of files processed, undef if can't recognise, -1 if can't
987# process
988
989sub read {
990 my $self = shift (@_);
991 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
992
993 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
994
995 if ((defined $process_status) && ($process_status == 1)) {
996 # process the document
997 $processor->process($doc_obj);
998
999 if(defined($self->{'places_filename'})){
1000 &util::rm($self->{'places_filename'});
1001 $self->{'places_filename'} = undef;
1002 }
1003
1004 $self->{'num_processed'} ++;
1005 undef $doc_obj;
1006 }
1007
1008 # if process_status == 1, then the file has been processed.
1009 return $process_status;
1010
1011}
1012
1013# returns undef if file is rejected by the plugin
1014sub process {
1015 my $self = shift (@_);
1016 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1017
1018 gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
1019 # die "Basplug::process function must be implemented in sub-class\n";
1020
1021 return undef; # never gets here
1022}
1023
1024# uses the multiread package to read in the entire file pointed to
1025# by filename and loads the resulting text into $$textref. Input text
1026# may be in any of the encodings handled by multiread, output text
1027# will be in utf8
1028sub read_file {
1029 my $self = shift (@_);
1030 my ($filename, $encoding, $language, $textref) = @_;
1031
1032 if (!-r $filename)
1033 {
1034 my $outhandle = $self->{'outhandle'};
1035 gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
1036 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
1037 return;
1038 }
1039 $$textref = "";
1040 if (!open (FILE, $filename)) {
1041 gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
1042 die "\n";
1043 }
1044
1045 if ($encoding eq "ascii") {
1046 undef $/;
1047 $$textref = <FILE>;
1048 $/ = "\n";
1049 } else {
1050 my $reader = new multiread();
1051 $reader->set_handle ('BasPlug::FILE');
1052 $reader->set_encoding ($encoding);
1053 $reader->read_file ($textref);
1054 #Now segments chinese if the separate_cjk option is set
1055 if ($self->{'separate_cjk'}) {
1056 # segment the Chinese words
1057 $$textref = &cnseg::segment($$textref);
1058 }
1059 }
1060 close FILE;
1061}
1062
1063# write_file -- used by ConvertToPlug, for example in post processing
1064#
1065sub utf8_write_file {
1066 my $self = shift (@_);
1067 my ($textref, $filename) = @_;
1068
1069 if (!open (FILE, ">$filename")) {
1070 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1071 die "\n";
1072 }
1073 print FILE $$textref;
1074
1075 close FILE;
1076}
1077
1078
1079sub filename_based_title
1080{
1081 my $self = shift (@_);
1082 my ($file) = @_;
1083
1084 my $file_derived_title = $file;
1085 $file_derived_title =~ s/_/ /g;
1086 $file_derived_title =~ s/\..*?$//;
1087
1088 return $file_derived_title;
1089}
1090
1091
1092sub title_fallback
1093{
1094 my $self = shift (@_);
1095 my ($doc_obj,$section,$file) = @_;
1096
1097 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1098
1099 my $file_derived_title = $self->filename_based_title($file);
1100 $doc_obj->add_metadata ($section, "Title", $file_derived_title);
1101 }
1102}
1103
1104sub textcat_get_language_encoding {
1105 my $self = shift (@_);
1106 my ($filename) = @_;
1107
1108
1109 my ($language, $encoding, $extracted_encoding);
1110 if ($self->{'input_encoding'} eq "auto") {
1111 # use textcat to automatically work out the input encoding and language
1112 ($language, $encoding) = $self->get_language_encoding ($filename);
1113 } elsif ($self->{'extract_language'}) {
1114 # use textcat to get language metadata
1115 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
1116 $encoding = $self->{'input_encoding'};
1117 # don't print this message for english... english in utf8 is identical
1118 # to english in iso-8859-1 (except for some punctuation). We don't have
1119 # a language model for en_utf8, so textcat always says iso-8859-1!
1120 if ($extracted_encoding ne $encoding && $language ne "en"
1121 && $self->{'verbosity'}) {
1122 my $plugin_name = ref ($self);
1123 my $outhandle = $self->{'outhandle'};
1124 gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
1125 }
1126 } else {
1127 $language = $self->{'default_language'};
1128 $encoding = $self->{'input_encoding'};
1129 }
1130
1131 return ($language, $encoding);
1132}
1133
1134# Uses textcat to work out the encoding and language of the text in
1135# $filename. All html tags are removed before processing.
1136# returns an array containing "language" and "encoding"
1137sub get_language_encoding {
1138 my $self = shift (@_);
1139 my ($filename) = @_;
1140 my $outhandle = $self->{'outhandle'};
1141 my $unicode_format = "";
1142 my $best_language = "";
1143 my $best_encoding = "";
1144
1145 # read in file
1146 if (!open (FILE, $filename)) {
1147 gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
1148 # this is a pretty bad error, but try to continue anyway
1149 return ($self->{'default_language'}, $self->{'input_encoding'});
1150 }
1151 undef $/;
1152 my $text = <FILE>;
1153 $/ = "\n";
1154 close FILE;
1155
1156 # check if first few bytes have a Byte Order Marker
1157 my $bom=substr($text,0,2); # check 16bit unicode
1158 if ($bom eq "\xff\xfe") { # little endian 16bit unicode
1159 $unicode_format="unicode";
1160 } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
1161 $unicode_format="unicode";
1162 } else {
1163 $bom=substr($text,0,3); # check utf-8
1164 if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
1165 $unicode_format="utf8";
1166# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
1167# $unicode_format="utf8";
1168 }
1169 }
1170
1171
1172 # handle html files specially
1173 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
1174 if (ref($self) eq 'HTMLPlug' ||
1175 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
1176
1177 # remove <title>stuff</title> -- as titles tend often to be in English
1178 # for foreign language documents
1179 $text =~ s!<title>.*?</title>!!si;
1180
1181 # see if this html file specifies its encoding
1182 if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
1183 $best_encoding = $1;
1184 } elsif ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)"/i) {#"
1185 $best_encoding = $1;
1186 }
1187 if ($best_encoding) { # we extracted an encoding
1188 $best_encoding =~ s/-+/_/g;
1189 $best_encoding = lc($best_encoding); # lowercase
1190 if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
1191 $self->{'input_encoding'} = $best_encoding;
1192 }
1193
1194 # remove all HTML tags
1195 $text =~ s/<[^>]*>//sg;
1196 }
1197
1198 # get the language/encoding
1199 $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
1200 my $results = $self->{'textcat'}->classify(\$text);
1201
1202 # if textcat returns 3 or less possibilities we'll use the
1203 # first one in the list - otherwise use the defaults
1204 if (scalar @$results > 3) {
1205 if ($unicode_format) { # in case the first had a BOM
1206 $best_encoding=$unicode_format;
1207 } else {
1208 my %guessed_encodings = ();
1209 foreach my $result (@$results) {
1210 $result =~ /([^\-]+)$/;
1211 my $enc=$1;
1212 if (!defined($guessed_encodings{$enc})) {
1213 $guessed_encodings{$enc}=0;
1214 }
1215 $guessed_encodings{$enc}++;
1216 }
1217
1218 $guessed_encodings{""}=-1; # for default best_encoding of ""
1219 foreach my $enc (keys %guessed_encodings) {
1220 if ($guessed_encodings{$enc} >
1221 $guessed_encodings{$best_encoding}){
1222 $best_encoding=$enc;
1223 }
1224 }
1225 }
1226
1227 if ($self->{'input_encoding'} ne 'auto') {
1228 if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
1229 gsprintf($outhandle,
1230 "BasPlug: {BasPlug.could_not_extract_language}\n",
1231 $filename, $self->{'default_language'});
1232 }
1233 $best_language = $self->{'default_language'};
1234 $best_encoding = $self->{'input_encoding'};
1235
1236 } else {
1237 if ($self->{'verbosity'}>2) {
1238 gsprintf($outhandle,
1239 "BasPlug: {BasPlug.could_not_extract_language}\n",
1240 $filename, $self->{'default_language'});
1241 }
1242 $best_language = $self->{'default_language'};
1243 }
1244 } else { # <= 3 suggestions
1245 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
1246 if (!defined $language) {
1247 if ($self->{'verbosity'}>2) {
1248 gsprintf($outhandle,
1249 "BasPlug: {BasPlug.could_not_extract_language}\n",
1250 $filename, $self->{'default_language'});
1251 }
1252 $language = $self->{'default_language'};
1253 }
1254 if (!defined $encoding) {
1255 if ($self->{'verbosity'}>2) {
1256 gsprintf($outhandle,
1257 "BasPlug: {BasPlug.could_not_extract_encoding}\n",
1258 $filename, $self->{'default_encoding'});
1259 }
1260 $encoding = $self->{'default_encoding'};
1261 }
1262 $best_language = $language;
1263 if (! $best_encoding ) { # may already be set... eg from html meta tag
1264 $best_encoding = $encoding;
1265 }
1266 }
1267
1268 my $text_copy = $text;
1269 if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) {
1270 # the text is valid utf8, so assume that's the real encoding
1271 # (since textcat is based on probabilities)
1272 $best_encoding = 'utf8';
1273 }
1274
1275 # check for equivalents where textcat doesn't have some encodings...
1276 # eg MS versions of standard encodings
1277 if ($best_encoding =~ /^iso_8859_(\d+)/) {
1278 my $iso = $1; # which variant of the iso standard?
1279 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
1280 if ($text =~ /[\x80-\x9f]/) {
1281 # Western Europe
1282 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
1283 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
1284 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
1285 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
1286 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
1287 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
1288 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
1289 }
1290 }
1291
1292 if ($best_encoding !~ /^(ascii|utf8|unicode)$/ &&
1293 !defined $encodings::encodings->{$best_encoding}) {
1294 if ($self->{'verbosity'}) {
1295 gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1296 $filename, $best_encoding, $self->{'default_encoding'});
1297 }
1298 $best_encoding = $self->{'default_encoding'};
1299 }
1300
1301 return ($best_language, $best_encoding);
1302}
1303
1304# add any extra metadata that's been passed around from one
1305# plugin to another.
1306# extra_metadata uses add_utf8_metadata so it expects metadata values
1307# to already be in utf8
1308sub extra_metadata {
1309 my $self = shift (@_);
1310 my ($doc_obj, $cursection, $metadata) = @_;
1311
1312 my $associate_tail_re = $self->{'associate_tail_re'};
1313
1314 foreach my $field (keys(%$metadata)) {
1315 # $metadata->{$field} may be an array reference
1316 if ($field eq "gsdlassocfile_tobe") {
1317 # 'gsdlassocfile_tobe' is artificially introduced metadata
1318 # that is used to signal that certain additional files should
1319 # be tied to this document. Useful in situations where a
1320 # metadata pass in the plugin pipeline works out some files
1321 # need to be associated with a document, but the document hasn't
1322 # been formed yet.
1323
1324 my $equiv_form = "";
1325 foreach my $gaf (@{$metadata->{$field}}) {
1326 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1327 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1328 my $filename = $full_filename;
1329
1330 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1331
1332 # work out extended tail extension (i.e. matching tail re)
1333
1334 my ($file_prefix,$file_extended_ext)
1335 = $self->root_ext_split($tail_filename,$associate_tail_re);
1336 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1337
1338 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1339 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1340 my $srcicon = "_icon".$doc_ext."_";
1341 my $end_doclink = "</a>";
1342
1343 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1344
1345 if (defined $pre_doc_ext) {
1346 # for metadata such as [mp3._edited] [mp3._full] ...
1347 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1348 }
1349
1350 # for multiple metadata such as [mp3.assoclink]
1351 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1352
1353 $equiv_form .= " $assoc_form";
1354 }
1355 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1356 }
1357 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1358 map {
1359 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1360 } @{$metadata->{$field}};
1361 } else {
1362 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1363 }
1364 }
1365}
1366
1367# initialise metadata extractors
1368sub initialise_extractors {
1369 my $self = shift (@_);
1370
1371 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1372 &acronym::initialise_acronyms();
1373 }
1374}
1375
1376# finalise metadata extractors
1377sub finalise_extractors {
1378 my $self = shift (@_);
1379
1380 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1381 &acronym::finalise_acronyms();
1382 }
1383}
1384
1385# FIRSTNNN: extract the first NNN characters as metadata
1386sub extract_first_NNNN_characters {
1387 my $self = shift (@_);
1388 my ($textref, $doc_obj, $thissection) = @_;
1389
1390 foreach my $size (split /,/, $self->{'first'}) {
1391 my $tmptext = $$textref;
1392 $tmptext =~ s/^\s+//;
1393 $tmptext =~ s/\s+$//;
1394 $tmptext =~ s/\s+/ /gs;
1395 $tmptext = substr ($tmptext, 0, $size);
1396 $tmptext =~ s/\s\S*$/&#8230;/;
1397 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1398 }
1399}
1400
1401sub extract_email {
1402 my $self = shift (@_);
1403 my ($textref, $doc_obj, $thissection) = @_;
1404 my $outhandle = $self->{'outhandle'};
1405
1406 gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1407 if ($self->{'verbosity'} > 2);
1408
1409 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
1410 @email = sort @email;
1411
1412# if($self->{"new_extract_email"} == 0)
1413# {
1414# my @email2 = ();
1415# foreach my $address (@email)
1416# {
1417# if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))
1418# {
1419# push @email2, $address;
1420# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1421# # print $outhandle " extracting $address\n"
1422# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1423# if ($self->{'verbosity'} > 3);
1424# }
1425# }
1426# }
1427# else
1428# {
1429 my $hashExistMail = {};
1430 foreach my $address (@email) {
1431 if (!(defined $hashExistMail->{$address}))
1432 {
1433 $hashExistMail->{$address} = 1;
1434 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1435 gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1436 if ($self->{'verbosity'} > 3);
1437 }
1438 }
1439 gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1440 if ($self->{'verbosity'} > 2);
1441}
1442
1443# extract metadata
1444sub auto_extract_metadata {
1445
1446 my $self = shift (@_);
1447 my ($doc_obj) = @_;
1448
1449 if ($self->{'extract_email'}) {
1450 my $thissection = $doc_obj->get_top_section();
1451 while (defined $thissection) {
1452 my $text = $doc_obj->get_text($thissection);
1453 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1454 $thissection = $doc_obj->get_next_section ($thissection);
1455 }
1456 }
1457 if ($self->{'extract_placenames'}) {
1458 my $thissection = $doc_obj->get_top_section();
1459 while (defined $thissection) {
1460 my $text = $doc_obj->get_text($thissection);
1461 $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1462 $thissection = $doc_obj->get_next_section ($thissection);
1463 }
1464 }
1465
1466 if ($self->{'extract_keyphrases'} || $self->{'extract_keyphrases_kea4'}) {
1467 $self->extract_keyphrases($doc_obj);
1468 }
1469
1470 if ($self->{'first'}) {
1471 my $thissection = $doc_obj->get_top_section();
1472 while (defined $thissection) {
1473 my $text = $doc_obj->get_text($thissection);
1474 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1475 $thissection = $doc_obj->get_next_section ($thissection);
1476 }
1477 }
1478
1479 if ($self->{'extract_acronyms'}) {
1480 my $thissection = $doc_obj->get_top_section();
1481 while (defined $thissection) {
1482 my $text = $doc_obj->get_text($thissection);
1483 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1484 $thissection = $doc_obj->get_next_section ($thissection);
1485 }
1486 }
1487
1488 if ($self->{'markup_acronyms'}) {
1489 my $thissection = $doc_obj->get_top_section();
1490 while (defined $thissection) {
1491 my $text = $doc_obj->get_text($thissection);
1492 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1493 $doc_obj->delete_text($thissection);
1494 $doc_obj->add_text($thissection, $text);
1495 $thissection = $doc_obj->get_next_section ($thissection);
1496 }
1497 }
1498
1499 if($self->{'extract_historical_years'}) {
1500 my $thissection = $doc_obj->get_top_section();
1501 while (defined $thissection) {
1502
1503 my $text = $doc_obj->get_text($thissection);
1504 &DateExtract::get_date_metadata($text, $doc_obj,
1505 $thissection,
1506 $self->{'no_bibliography'},
1507 $self->{'maximum_year'},
1508 $self->{'maximum_century'});
1509 $thissection = $doc_obj->get_next_section ($thissection);
1510 }
1511 }
1512}
1513
1514
1515#adding kea keyphrases
1516sub extract_keyphrases
1517{
1518 my $self = shift(@_);
1519 my $doc_obj = shift(@_);
1520
1521 # Use Kea 3.0 unless 4.0 has been specified
1522 my $kea_version = "3.0";
1523 if ($self->{'extract_keyphrases_kea4'}) {
1524 $kea_version = "4.0";
1525 }
1526
1527 # Check that Kea exists, and tell the user where to get it if not
1528 my $keahome = &Kea::get_Kea_directory($kea_version);
1529 if (!-e $keahome) {
1530 gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);
1531 return;
1532 }
1533
1534 my $thissection = $doc_obj->get_top_section();
1535 my $text = "";
1536 my $list;
1537
1538 #loop through sections to gather whole doc
1539 while (defined $thissection) {
1540 my $sectiontext = $doc_obj->get_text($thissection);
1541 $text = $text.$sectiontext;
1542 $thissection = $doc_obj->get_next_section ($thissection);
1543 }
1544
1545 if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
1546 $list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
1547 } else { #otherwise call Kea with no options
1548 $list = &Kea::extract_KeyPhrases ($kea_version, $text);
1549 }
1550
1551 if ($list){
1552 # if a list of kea keyphrases was returned (ie not empty)
1553 if ($self->{'verbosity'}) {
1554 gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1555 }
1556
1557 #add metadata to top section
1558 $thissection = $doc_obj->get_top_section();
1559
1560 # add all key phrases as one metadata
1561 $doc_obj->add_metadata($thissection, "Keyphrases", $list);
1562
1563 # add individual key phrases as multiple metadata
1564 foreach my $keyphrase (split(',', $list)) {
1565 $keyphrase =~ s/^\s+|\s+$//g;
1566 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1567 }
1568 }
1569}
1570
1571
1572# extract acronyms from a section in a document. progress is
1573# reported to outhandle based on the verbosity. both the Acronym
1574# and the AcronymKWIC metadata items are created.
1575
1576sub extract_acronyms {
1577 my $self = shift (@_);
1578 my ($textref, $doc_obj, $thissection) = @_;
1579 my $outhandle = $self->{'outhandle'};
1580
1581 # print $outhandle " extracting acronyms ...\n"
1582 gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1583 if ($self->{'verbosity'} > 2);
1584
1585 my $acro_array = &acronym::acronyms($textref);
1586
1587 foreach my $acro (@$acro_array) {
1588
1589 #check that this is the first time ...
1590 my $seen_before = "false";
1591 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1592 foreach my $thisAcro (@$previous_data) {
1593 if ($thisAcro eq $acro->to_string()) {
1594 $seen_before = "true";
1595 if ($self->{'verbosity'} >= 4) {
1596 gsprintf($outhandle, " {BasPlug.already_seen} " .
1597 $acro->to_string() . "\n");
1598 }
1599 }
1600 }
1601
1602 if ($seen_before eq "false") {
1603 #write it to the file ...
1604 $acro->write_to_file();
1605
1606 #do the normal acronym
1607 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1608 gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1609 if ($self->{'verbosity'} > 3);
1610 }
1611 }
1612
1613 gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1614 if ($self->{'verbosity'} > 2);
1615}
1616
1617sub markup_acronyms {
1618 my $self = shift (@_);
1619 my ($text, $doc_obj, $thissection) = @_;
1620 my $outhandle = $self->{'outhandle'};
1621
1622 gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1623 if ($self->{'verbosity'} > 2);
1624
1625 #self is passed in to check for verbosity ...
1626 $text = &acronym::markup_acronyms($text, $self);
1627
1628 gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1629 if ($self->{'verbosity'} > 2);
1630
1631 return $text;
1632}
1633
1634sub compile_stats {
1635 my $self = shift(@_);
1636 my ($stats) = @_;
1637
1638 $stats->{'num_processed'} += $self->{'num_processed'};
1639 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1640 $stats->{'num_archives'} += $self->{'num_archives'};
1641
1642}
1643
1644sub associate_cover_image {
1645 my $self = shift;
1646 my ($doc_obj, $filename) = @_;
1647
1648 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1649 if (exists $self->{'covers_missing_cache'}->{$filename}) {
1650 # don't stat() for existence eg for multiple document input files
1651 # (eg SplitPlug)
1652 return;
1653 }
1654
1655 my $top_section=$doc_obj->get_top_section();
1656
1657 if (-e $filename) {
1658 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1659 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1660 } else {
1661 my $upper_filename = $filename;
1662 $upper_filename =~ s/jpg$/JPG/;
1663 if (-e $upper_filename) {
1664 $doc_obj->associate_file($upper_filename, "cover.jpg",
1665 "image/jpeg");
1666 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1667 } else {
1668 # file doesn't exist, so record the fact that it's missing so
1669 # we don't stat() again (stat is slow)
1670 $self->{'covers_missing_cache'}->{$filename} = 1;
1671 }
1672 }
1673
1674}
1675
1676
1677# Overridden by exploding plugins (eg. ISISPlug)
1678sub clean_up_after_exploding
1679{
1680 my $self = shift(@_);
1681}
1682
1683
16841;
Note: See TracBrowser for help on using the repository browser.