source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 11467

Last change on this file since 11467 was 11389, checked in by jrm21, 18 years ago

try to get the encoding from a '<meta http-equiv' tag if HTML.

make sure we add the filename/Source metadata as utf-8, so that we
won't create invalid xml files if the filename is in some other encoding.

If the file contents aren't in utf8, assume the filename is in the same
encoding as the contents.

print out the encoding we used to read this file if verbosity >= 3.

  • Property svn:keywords set to Author Date Id Revision
File size: 46.1 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28BEGIN {
29 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30}
31
32eval {require bytes};
33
34# suppress the annoying "subroutine redefined" warning that various
35# plugins cause under perl 5.6
36$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38use strict;
39no strict 'subs';
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
42use File::Basename;
43
44use Kea;
45use multiread;
46use encodings;
47use unicode;
48use cnseg;
49use acronym;
50use textcat;
51use doc;
52eval "require diagnostics"; # some perl distros (eg mac) don't have this
53use DateExtract;
54use ghtml;
55use gsprintf 'gsprintf';
56use printusage;
57use parse2;
58
59
60use GISBasPlug;
61
62@BasPlug::ISA = ( GISBasPlug );
63
64my $unicode_list =
65 [ { 'name' => "ascii",
66 'desc' => "{BasPlug.input_encoding.ascii}" },
67 { 'name' => "utf8",
68 'desc' => "{BasPlug.input_encoding.utf8}" },
69 { 'name' => "unicode",
70 'desc' => "{BasPlug.input_encoding.unicode}" } ];
71
72my $auto_unicode_list =
73 [ { 'name' => "auto",
74 'desc' => "{BasPlug.input_encoding.auto}" } ];
75
76my $e = $encodings::encodings;
77foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
78{
79 my $hashEncode =
80 {'name' => $enc,
81 'desc' => $e->{$enc}->{'name'}};
82
83 push(@{$unicode_list},$hashEncode);
84}
85
86push(@{$auto_unicode_list},@{$unicode_list});
87
88my $arguments =
89 [ { 'name' => "process_exp",
90 'desc' => "{BasPlug.process_exp}",
91 'type' => "regexp",
92 'deft' => "",
93 'reqd' => "no" },
94 { 'name' => "block_exp",
95 'desc' => "{BasPlug.block_exp}",
96 'type' => "regexp",
97 'deft' => "",
98 'reqd' => "no" },
99 { 'name' => "smart_block",
100 'desc' => "{BasPlug.smart_block}",
101 'type' => "flag",
102 'reqd' => "no" },
103 { 'name' => "associate_ext",
104 'desc' => "{BasPlug.associate_ext}",
105 'type' => "string",
106 'reqd' => "no" },
107 { 'name' => "associate_tail_re",
108 'desc' => "{BasPlug.associate_tail_re}",
109 'type' => "string",
110 'reqd' => "no" },
111 { 'name' => "input_encoding",
112 'desc' => "{BasPlug.input_encoding}",
113 'type' => "enum",
114 'list' => $auto_unicode_list,
115 'reqd' => "no" ,
116 'deft' => "auto" } ,
117 { 'name' => "default_encoding",
118 'desc' => "{BasPlug.default_encoding}",
119 'type' => "enum",
120 'list' => $unicode_list,
121 'reqd' => "no",
122 'deft' => "utf8" },
123 { 'name' => "extract_language",
124 'desc' => "{BasPlug.extract_language}",
125 'type' => "flag",
126 'reqd' => "no" },
127 { 'name' => "default_language",
128 'desc' => "{BasPlug.default_language}",
129 'type' => "string",
130 'deft' => "en",
131 'reqd' => "no" },
132 { 'name' => "extract_acronyms",
133 'desc' => "{BasPlug.extract_acronyms}",
134 'type' => "flag",
135 'reqd' => "no" },
136 { 'name' => "markup_acronyms",
137 'desc' => "{BasPlug.markup_acronyms}",
138 'type' => "flag",
139 'reqd' => "no" },
140 { 'name' => "extract_keyphrases",
141 'desc' => "{BasPlug.extract_keyphrases}",
142 'type' => "flag",
143 'reqd' => "no" },
144 { 'name' => "extract_keyphrases_kea4",
145 'desc' => "{BasPlug.extract_keyphrases_kea4}",
146 'type' => "flag",
147 'reqd' => "no" },
148 { 'name' => "extract_keyphrase_options",
149 'desc' => "{BasPlug.extract_keyphrase_options}",
150 'type' => "string",
151 'deft' => "",
152 'reqd' => "no" },
153 { 'name' => "first",
154 'desc' => "{BasPlug.first}",
155 'type' => "string",
156 'reqd' => "no" },
157 { 'name' => "extract_email",
158 'desc' => "{BasPlug.extract_email}",
159 'type' => "flag",
160 'reqd' => "no" },
161 { 'name' => "extract_historical_years",
162 'desc' => "{BasPlug.extract_historical_years}",
163 'type' => "flag",
164 'reqd' => "no" },
165 { 'name' => "maximum_year",
166 'desc' => "{BasPlug.maximum_year}",
167 'type' => "int",
168 'deft' => (localtime)[5]+1900,
169 'char_length' => "4",
170 #'range' => "2,100",
171 'reqd' => "no"},
172 { 'name' => "maximum_century",
173 'desc' => "{BasPlug.maximum_century}",
174 'type' => "string",
175 'deft' => "-1",
176 'reqd' => "no" },
177 { 'name' => "no_bibliography",
178 'desc' => "{BasPlug.no_bibliography}",
179 'type' => "flag",
180 'reqd' => "no"},
181 { 'name' => "no_cover_image",
182 'desc' => "{BasPlug.no_cover_image}",
183 'type' => "flag",
184 'reqd' => "no" },
185 { 'name' => "separate_cjk",
186 'desc' => "{BasPlug.separate_cjk}",
187 'type' => "flag",
188 'reqd' => "no",
189 'hiddengli' => "yes" },
190 { 'name' => "new_extract_email",
191 'desc' => "",
192 'type' => "flag",
193 'reqd' => "no",
194 'hiddengli' => "yes" } ];
195
196my $gis_arguments =
197 [ { 'name' => "extract_placenames",
198 'desc' => "{GISBasPlug.extract_placenames}",
199 'type' => "flag",
200 'reqd' => "no" },
201 { 'name' => "gazetteer",
202 'desc' => "{GISBasPlug.gazetteer}",
203 'type' => "string",
204 'reqd' => "no" },
205 { 'name' => "place_list",
206 'desc' => "{GISBasPlug.place_list}",
207 'type' => "flag",
208 'reqd' => "no" } ];
209
210
211my $options = { 'name' => "BasPlug",
212 'desc' => "{BasPlug.desc}",
213 'abstract' => "yes",
214 'inherits' => "no",
215 'args' => $arguments };
216
217
218sub set_keepold {
219 my $self = shift(@_);
220 my ($keepold) = @_;
221
222 $self->{'keepold'} = $keepold;
223}
224
225sub get_arguments
226{
227 my $self = shift(@_);
228 my $optionlistref = $self->{'option_list'};
229 my @optionlist = @$optionlistref;
230 my $pluginoptions = pop(@$optionlistref);
231 my $pluginarguments = $pluginoptions->{'args'};
232 return $pluginarguments;
233}
234
235
236sub print_xml_usage
237{
238 my $self = shift(@_);
239
240 # XML output is always in UTF-8
241 gsprintf::output_strings_in_UTF8;
242
243 PrintUsage::print_xml_header();
244 $self->print_xml();
245}
246
247
248sub print_xml
249{
250 my $self = shift(@_);
251
252 my $optionlistref = $self->{'option_list'};
253 my @optionlist = @$optionlistref;
254 my $pluginoptions = shift(@$optionlistref);
255 return if (!defined($pluginoptions));
256
257 gsprintf(STDERR, "<PlugInfo>\n");
258 gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
259 my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
260 $desc =~ s/</&amp;lt;/g; # doubly escaped
261 $desc =~ s/>/&amp;gt;/g;
262
263 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
264 gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
265 gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
266 gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
267 gsprintf(STDERR, " <Arguments>\n");
268 if (defined($pluginoptions->{'args'})) {
269 &PrintUsage::print_options_xml($pluginoptions->{'args'});
270 }
271
272 # Recurse up the plugin hierarchy
273 $self->print_xml();
274
275 gsprintf(STDERR, " </Arguments>\n");
276 gsprintf(STDERR, "</PlugInfo>\n");
277}
278
279
280sub print_txt_usage
281{
282 my $self = shift(@_);
283 # Print the usage message for a plugin (recursively)
284 my $descoffset = $self->determine_description_offset(0);
285 $self->print_plugin_usage($descoffset, 1);
286}
287
288
289sub determine_description_offset
290{
291 my $self = shift(@_);
292 my $maxoffset = shift(@_);
293
294 my $optionlistref = $self->{'option_list'};
295 my @optionlist = @$optionlistref;
296 my $pluginoptions = shift(@$optionlistref);
297 return $maxoffset if (!defined($pluginoptions));
298
299 # Find the length of the longest option string of this plugin
300 my $pluginargs = $pluginoptions->{'args'};
301 if (defined($pluginargs)) {
302 my $longest = &PrintUsage::find_longest_option_string($pluginargs);
303 if ($longest > $maxoffset) {
304 $maxoffset = $longest;
305 }
306 }
307
308 # Recurse up the plugin hierarchy
309 $maxoffset = $self->determine_description_offset($maxoffset);
310 $self->{'option_list'} = \@optionlist;
311 return $maxoffset;
312}
313
314
315sub print_plugin_usage
316{
317 my $self = shift(@_);
318 my $descoffset = shift(@_);
319 my $isleafclass = shift(@_);
320
321 my $optionlistref = $self->{'option_list'};
322 my @optionlist = @$optionlistref;
323 my $pluginoptions = shift(@$optionlistref);
324 return if (!defined($pluginoptions));
325
326 my $pluginname = $pluginoptions->{'name'};
327 my $pluginargs = $pluginoptions->{'args'};
328 my $plugindesc = $pluginoptions->{'desc'};
329
330 # Produce the usage information using the data structure above
331 if ($isleafclass) {
332 if (defined($plugindesc)) {
333 gsprintf(STDERR, "$plugindesc\n\n");
334 }
335 gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
336 }
337
338 # Display the plugin options, if there are some
339 if (defined($pluginargs)) {
340 # Calculate the column offset of the option descriptions
341 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
342
343 if ($isleafclass) {
344 gsprintf(STDERR, " {common.specific_options}:\n");
345 }
346 else {
347 gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
348 }
349
350 # Display the plugin options
351 &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
352 }
353
354 # Recurse up the plugin hierarchy
355 $self->print_plugin_usage($descoffset, 0);
356 $self->{'option_list'} = \@optionlist;
357}
358
359
360sub new {
361 # Set Encodings to the list!!
362
363
364 # Start the BasPlug Constructor
365 my $class = shift (@_);
366 my ($pluginlist,$args,$hashArgOptLists) = @_;
367 push(@$pluginlist, $class);
368 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
369
370 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
371 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
372
373 if (GISBasPlug::has_mapdata()) {
374 push(@$arguments,@$gis_arguments);
375 }
376
377 my $self = {};
378 $self->{'outhandle'} = STDERR;
379 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
380 $self->{"info_only"} = 0;
381
382 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
383 # the args, just return the object.
384 foreach my $strArg (@{$args})
385 {
386 if($strArg eq "-gsdlinfo")
387 {
388 $self->{"info_only"} = 1;
389 return bless $self, $class;
390 }
391 }
392
393 if(!parse2::parse($args,$hashArgOptLists->{"ArgList"},$self))
394 {
395 my $classTempClass = bless $self, $class;
396 print STDERR "<BadPlugin p=$plugin_name>\n";
397 &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
398 $classTempClass->print_txt_usage(""); # Use default resource bundle
399 die "\n";
400 }
401
402
403 delete $self->{"info_only"};
404 # else parsing was successful.
405
406 $self->{'plugin_type'} = $plugin_name;
407 #$self->{'outhandle'} = STDERR;
408 $self->{'textcat'} = new textcat();
409 $self->{'num_processed'} = 0;
410 $self->{'num_not_processed'} = 0;
411 $self->{'num_blocked'} = 0;
412 $self->{'num_archives'} = 0;
413 $self->{'cover_image'} = 1; # cover image is on by default
414 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
415 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
416
417 my $associate_ext = $self->{'associate_ext'};
418 if ((defined $associate_ext) && ($associate_ext ne "")) {
419
420 my $associate_tail_re = $self->{'associate_tail_re'};
421 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
422 my $outhandle = $self->{'outhandle'};
423 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
424 print $outhandle " defaulting to 'associate_tail_re'\n";
425 }
426 else {
427 my @exts = split(/,/,$associate_ext);
428
429 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
430 my $associate_tail_re = join("|",@exts_bracketed);
431 $self->{'associate_tail_re'} = $associate_tail_re;
432 }
433
434 delete $self->{'associate_ext'};
435 }
436
437 $self->{'shared_fileroot'} = {};
438 $self->{'file_blocks'} = {};
439
440 if ($self->{'extract_placenames'}) {
441
442 my $outhandle = $self->{'outhandle'};
443
444 my $places_ref
445 = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
446
447 if (!defined $places_ref) {
448 print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
449 print $outhandle " No placename extraction will take place.\n";
450 $self->{'extract_placenames'} = undef;
451 }
452 else {
453 $self->{'places'} = $places_ref;
454 }
455 }
456
457 return bless $self, $class;
458
459}
460
461# initialize BasPlug options
462# if init() is overridden in a sub-class, remember to call BasPlug::init()
463sub init {
464 my $self = shift (@_);
465 my ($verbosity, $outhandle, $failhandle) = @_;
466
467 # verbosity is passed through from the processor
468 $self->{'verbosity'} = $verbosity;
469
470 # as are the outhandle and failhandle
471 $self->{'outhandle'} = $outhandle if defined $outhandle;
472 $self->{'failhandle'} = $failhandle;
473
474 # set process_exp and block_exp to defaults unless they were
475 # explicitly set
476
477 if ((!$self->is_recursive()) and
478 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
479
480 $self->{'process_exp'} = $self->get_default_process_exp ();
481 if ($self->{'process_exp'} eq "") {
482 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
483 }
484 }
485
486 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
487 $self->{'block_exp'} = $self->get_default_block_exp ();
488 }
489
490}
491
492sub begin {
493 my $self = shift (@_);
494 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
495 $self->initialise_extractors();
496}
497
498sub end {
499 # potentially called at the end of each plugin pass
500 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
501
502 my ($self) = @_;
503 $self->finalise_extractors();
504}
505
506sub deinit {
507 # called only once, after all plugin passes have been done
508
509 my ($self) = @_;
510}
511
512# this function should be overridden to return 1
513# in recursive plugins
514sub is_recursive {
515 my $self = shift (@_);
516
517 return 0;
518}
519
520sub get_default_block_exp {
521 my $self = shift (@_);
522
523 return "";
524}
525
526sub get_default_process_exp {
527 my $self = shift (@_);
528
529 return "";
530}
531
532# default implementation is to do nothing.
533sub store_block_files
534{
535 my $self =shift (@_);
536 my ($filename) = @_;
537 return;
538}
539
540#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
541sub block_cover_image
542{
543 my $self =shift;
544 my $filename = shift;
545
546 if ($self->{'cover_image'}) {
547 my $coverfile = $filename;
548 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
549 if (!-e $coverfile) {
550 $coverfile =~ s/jpg$/JPG/;
551 }
552 if (-e $coverfile) {
553 $self->{'file_blocks'}->{$coverfile} = 1;
554 }
555 }
556
557 return;
558}
559
560sub root_ext_split
561{
562 my $self = shift (@_);
563 my ($filename,$tail_re) = @_;
564
565 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
566
567 if ((!defined $file_prefix) || (!defined $file_ext)) {
568 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
569 }
570
571 return ($file_prefix,$file_ext);
572}
573
574sub metadata_read {
575 my $self = shift (@_);
576 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
577 # Keep track of filenames with same root but different extensions
578 # Used to support -associate_ext and the more generalised
579 # -associate_tail_re
580
581 my $associate_tail_re = $self->{'associate_tail_re'};
582 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
583
584 my ($file_prefix,$file_ext)
585 = $self->root_ext_split($file,$associate_tail_re);
586
587 if ((defined $file_prefix) && (defined $file_ext)) {
588
589 my $shared_fileroot = $self->{'shared_fileroot'};
590 if (!defined $shared_fileroot->{$file_prefix}) {
591 my $file_prefix_rec = { 'tie_to' => undef,
592 'exts' => {} };
593 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
594 }
595
596 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
597
598 my $process_exp = $self->{'process_exp'};
599
600 if ($file =~ m/$process_exp/) {
601 # This is the document the others should be tied to
602 $file_prefix_rec->{'tie_to'} = $file_ext;
603 }
604 else {
605 if ($file_ext =~ m/$associate_tail_re$/) {
606 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
607 }
608 }
609
610 }
611 }
612
613 # now check whether we are actually processing this
614 my $filename = $file;
615 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
616 if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) {
617 return undef; # can't recognise
618 }
619
620 # do smart blocking if appropriate
621 if ($self->{'smart_block'}) {
622 $self->store_block_files($filename);
623 }
624 # block the cover image if there is one
625 if ($self->{'cover_image'}) {
626 $self->block_cover_image($filename);
627 }
628
629 return 1;
630}
631
632sub tie_to_filename
633{
634 my $self = shift (@_);
635
636 my ($file_ext,$file_prefix_rec) = @_;
637
638 if (defined $file_prefix_rec) {
639 my $tie_to = $file_prefix_rec->{'tie_to'};
640
641 if (defined $tie_to) {
642 if ($tie_to eq $file_ext) {
643 return 1;
644 }
645 }
646 }
647
648 return 0;
649}
650
651sub tie_to_assoc_file
652{
653 my $self = shift (@_);
654 my ($file_ext,$file_prefix_rec) = @_;
655
656 if (defined $file_prefix_rec) {
657 my $tie_to = $file_prefix_rec->{'tie_to'};
658 if (defined $tie_to) {
659
660 my $exts = $file_prefix_rec->{'exts'};
661
662 my $has_file_ext = $exts->{$file_ext};
663
664 if ($has_file_ext) {
665 return 1;
666 }
667 }
668 }
669
670 return 0;
671}
672
673
674sub associate_with
675{
676 my $self = shift (@_);
677 my ($file, $filename, $metadata) = @_;
678
679 my $associate_tail_re = $self->{'associate_tail_re'};
680 return 0 if (!$associate_tail_re);
681
682 # If file, see if matches with "tie_to" doc or is one of the
683 # associated filename extensions.
684
685 my ($file_prefix,$file_ext) = $self->root_ext_split($file,$associate_tail_re);
686
687 if ((defined $file_prefix) && (defined $file_ext)) {
688
689 my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
690
691 if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
692
693 # Set up gsdlassocfile_tobe
694
695 my $exts = $file_prefix_rec->{'exts'};
696
697 if (!defined $metadata->{'gsdlassocfile_tobe'}) {
698 $metadata->{'gsdlassocfile_tobe'} = [];
699 }
700
701 my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
702
703 my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/);
704 foreach my $e (keys %$exts) {
705 my $assoc_file = "$full_prefix$e";
706 print STDERR " $self->{'plugin_type'}: Associating $file_prefix$e with $file_prefix_rec->{'tie_to'} version\n";
707 my $mime_type = ""; # let system auto detect this
708 push(@$assoc_tobe,"$assoc_file:$mime_type:");
709 }
710
711 }
712 elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
713
714
715 # a form of smart block
716 return 1;
717 }
718 }
719
720 return 0;
721}
722
723
724sub read_block {
725 my $self = shift (@_);
726
727 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
728
729
730 my $filename = $file;
731 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
732
733 if ($self->associate_with($file,$filename,$metadata)) {
734 # a form of smart block
735 $self->{'num_blocked'} ++;
736 return (0,undef); # blocked
737 }
738
739 my $smart_block = $self->{'smart_block'};
740 my $smart_block_BN = $self->{'smart_block_BN'};
741
742 if ($smart_block || $smart_block_BN) {
743 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
744 $self->{'num_blocked'} ++;
745 return (0,undef); # blocked
746 }
747 } else {
748 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
749 $self->{'num_blocked'} ++;
750 return (0,undef); # blocked
751 }
752 if ($self->{'cover_image'}) {
753 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
754 $self->{'num_blocked'} ++;
755 return (0,undef); # blocked
756 }
757 }
758 }
759
760 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
761 return (undef,undef); # can't recognise
762 }
763
764 return (1,$filename);
765}
766
767sub read_tidy_file {
768
769 my $self = shift (@_);
770
771 my ($file) = @_;
772
773 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
774
775 return $file;
776}
777
778
779
780# The BasPlug read_into_doc_obj() function. This function does all the
781# right things to make general options work for a given plugin. It reads in
782# a file and sets up a slew of metadata all saved in doc_obj, which
783# it then returns as part of a tuple (process_status,doc_obj)
784#
785# Much of this functionality used to reside in read, but it was broken
786# down into a supporting routine to make the code more flexible.
787#
788# recursive plugins (e.g. RecPlug) and specialized plugins like those
789# capable of processing many documents within a single file (e.g.
790# GMLPlug) will normally want to implement their own version of
791# read_into_doc_obj()
792#
793# Note that $base_dir might be "" and that $file might
794# include directories
795sub read_into_doc_obj {
796 my $self = shift (@_);
797 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
798
799 if ($self->is_recursive()) {
800 gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
801 }
802
803 my $outhandle = $self->{'outhandle'};
804
805 my ($block_status,$filename) = $self->read_block(@_);
806 return $block_status if ((!defined $block_status) || ($block_status==0));
807 $file = $self->read_tidy_file($file);
808
809 # Do encoding stuff
810 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
811 if ($self->{'verbosity'} > 2) {
812 print $outhandle "BasPlug: reading $file as ($encoding,$language)\n";
813 }
814
815 # create a new document
816 my $doc_obj = new doc ($filename, "indexed_doc");
817 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
818 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
819 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
820 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
821 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
822
823 my ($filemeta) = $file =~ /([^\\\/]+)$/;
824 # how do we know what encoding the filename is in?
825 # assume it is in the same encoding as its contents
826 if ($encoding !~ /(?:ascii|utf8|unicode)/) {
827 $filemeta = unicode::unicode2utf8(
828 unicode::convert2unicode($encoding, \$filemeta)
829 );
830 }
831 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
832 if ($self->{'cover_image'}) {
833 $self->associate_cover_image($doc_obj, $filename);
834 }
835
836 # read in file ($text will be in utf8)
837 my $text = "";
838 $self->read_file ($filename, $encoding, $language, \$text);
839
840 if (!length ($text)) {
841 my $plugin_name = ref ($self);
842 if ($gli) {
843 print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
844 }
845 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
846
847 my $failhandle = $self->{'failhandle'};
848 gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
849 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
850 $self->{'num_not_processed'} ++;
851
852 return (0,undef); # what should we return here?? error but don't want to pass it on
853 }
854
855 # include any metadata passed in from previous plugins
856 # note that this metadata is associated with the top level section
857
858 my $associate_tail_re = $self->{'associate_tail_re'};
859
860 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
861
862 # do plugin specific processing of doc_obj
863 unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
864 $text = '';
865 undef $text;
866 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
867 return (-1,undef);
868 }
869 $text='';
870 undef $text;
871
872 # do any automatic metadata extraction
873 $self->auto_extract_metadata ($doc_obj);
874
875 # add an OID
876 # see if there is a plugin-specific set_OID function...
877 if (defined ($self->can('set_OID'))) {
878 # it will need $doc_obj to set the Identifier metadata...
879 $self->set_OID($doc_obj);
880 } else {
881 # use the default set_OID() in doc.pm
882 $doc_obj->set_OID();
883 }
884
885 return (1,$doc_obj);
886}
887
888
889# The BasPlug read() function. This function calls read_into_doc_obj()
890# to ensure all the right things to make general options work for a
891# given plugin are done. It then calls the process() function which
892# does all the work specific to a plugin (like the old read functions
893# used to do). Most plugins should define their own process() function
894# and let this read() function keep control.
895#
896# recursive plugins (e.g. RecPlug) and specialized plugins like those
897# capable of processing many documents within a single file (e.g.
898# GMLPlug) might want to implement their own version of read(), but
899# more likely need to implement their own version of read_into_doc_obj()
900#
901# Return number of files processed, undef if can't recognise, -1 if can't
902# process
903
904sub read {
905 my $self = shift (@_);
906 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
907
908 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
909
910 if ((defined $process_status) && ($process_status == 1)) {
911 # process the document
912 $processor->process($doc_obj);
913
914 if(defined($self->{'places_filename'})){
915 &util::rm($self->{'places_filename'});
916 $self->{'places_filename'} = undef;
917 }
918
919 $self->{'num_processed'} ++;
920 undef $doc_obj;
921 }
922
923 # if process_status == 1, then the file has been processed.
924 return $process_status;
925
926}
927
928# returns undef if file is rejected by the plugin
929sub process {
930 my $self = shift (@_);
931 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
932
933 gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
934 # die "Basplug::process function must be implemented in sub-class\n";
935
936 return undef; # never gets here
937}
938
939# uses the multiread package to read in the entire file pointed to
940# by filename and loads the resulting text into $$textref. Input text
941# may be in any of the encodings handled by multiread, output text
942# will be in utf8
943sub read_file {
944 my $self = shift (@_);
945 my ($filename, $encoding, $language, $textref) = @_;
946
947 if (!-r $filename)
948 {
949 my $outhandle = $self->{'outhandle'};
950 gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
951 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
952 return;
953 }
954 $$textref = "";
955 if (!open (FILE, $filename)) {
956 gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
957 die "\n";
958 }
959
960 if ($encoding eq "ascii") {
961 undef $/;
962 $$textref = <FILE>;
963 $/ = "\n";
964 } else {
965 my $reader = new multiread();
966 $reader->set_handle ('BasPlug::FILE');
967 $reader->set_encoding ($encoding);
968 $reader->read_file ($textref);
969 #Now segments chinese if the separate_cjk option is set
970 if ($self->{'separate_cjk'}) {
971 # segment the Chinese words
972 $$textref = &cnseg::segment($$textref);
973 }
974 }
975 close FILE;
976}
977
978# write_file -- used by ConvertToPlug, for example in post processing
979#
980sub utf8_write_file {
981 my $self = shift (@_);
982 my ($textref, $filename) = @_;
983
984 if (!open (FILE, ">$filename")) {
985 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
986 die "\n";
987 }
988 print FILE $$textref;
989
990 close FILE;
991}
992
993
994sub filename_based_title
995{
996 my $self = shift (@_);
997 my ($file) = @_;
998
999 my $file_derived_title = $file;
1000 $file_derived_title =~ s/_/ /g;
1001 $file_derived_title =~ s/\..*?$//;
1002
1003 return $file_derived_title;
1004}
1005
1006
1007sub title_fallback
1008{
1009 my $self = shift (@_);
1010 my ($doc_obj,$section,$file) = @_;
1011
1012 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1013
1014 my $file_derived_title = $self->filename_based_title($file);
1015 $doc_obj->add_metadata ($section, "Title", $file_derived_title);
1016 }
1017}
1018
1019sub textcat_get_language_encoding {
1020 my $self = shift (@_);
1021 my ($filename) = @_;
1022
1023
1024 my ($language, $encoding, $extracted_encoding);
1025 if ($self->{'input_encoding'} eq "auto") {
1026 # use textcat to automatically work out the input encoding and language
1027 ($language, $encoding) = $self->get_language_encoding ($filename);
1028 } elsif ($self->{'extract_language'}) {
1029 # use textcat to get language metadata
1030 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
1031 $encoding = $self->{'input_encoding'};
1032 # don't print this message for english... english in utf8 is identical
1033 # to english in iso-8859-1 (except for some punctuation). We don't have
1034 # a language model for en_utf8, so textcat always says iso-8859-1!
1035 if ($extracted_encoding ne $encoding && $language ne "en"
1036 && $self->{'verbosity'}) {
1037 my $plugin_name = ref ($self);
1038 my $outhandle = $self->{'outhandle'};
1039 gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
1040 }
1041 } else {
1042 $language = $self->{'default_language'};
1043 $encoding = $self->{'input_encoding'};
1044 }
1045
1046 return ($language, $encoding);
1047}
1048
1049# Uses textcat to work out the encoding and language of the text in
1050# $filename. All html tags are removed before processing.
1051# returns an array containing "language" and "encoding"
1052sub get_language_encoding {
1053 my $self = shift (@_);
1054 my ($filename) = @_;
1055 my $outhandle = $self->{'outhandle'};
1056 my $unicode_format = "";
1057 my $best_language = "";
1058 my $best_encoding = "";
1059
1060 # read in file
1061 if (!open (FILE, $filename)) {
1062 gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
1063 # this is a pretty bad error, but try to continue anyway
1064 return ($self->{'default_language'}, $self->{'input_encoding'});
1065 }
1066 undef $/;
1067 my $text = <FILE>;
1068 $/ = "\n";
1069 close FILE;
1070
1071 # check if first few bytes have a Byte Order Marker
1072 my $bom=substr($text,0,2); # check 16bit unicode
1073 if ($bom eq "\xff\xfe") { # little endian 16bit unicode
1074 $unicode_format="unicode";
1075 } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
1076 $unicode_format="unicode";
1077 } else {
1078 $bom=substr($text,0,3); # check utf-8
1079 if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
1080 $unicode_format="utf8";
1081# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
1082# $unicode_format="utf8";
1083 }
1084 }
1085
1086
1087 # handle html files specially
1088 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
1089 if (ref($self) eq 'HTMLPlug' ||
1090 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
1091
1092 # remove <title>stuff</title> -- as titles tend often to be in English
1093 # for foreign language documents
1094 $text =~ s!<title>.*?</title>!!si;
1095
1096 # see if this html file specifies its encoding
1097 if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
1098 $best_encoding = $1;
1099 } elsif ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)"/i) {
1100 $best_encoding = $1;
1101 }
1102 if ($best_encoding) { # we extracted an encoding
1103 $best_encoding =~ s/-+/_/g;
1104 $best_encoding = lc($best_encoding); # lowercase
1105 if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
1106 $self->{'input_encoding'} = $best_encoding;
1107 }
1108
1109 # remove all HTML tags
1110 $text =~ s/<[^>]*>//sg;
1111 }
1112
1113 # get the language/encoding
1114 my $results = $self->{'textcat'}->classify(\$text);
1115
1116 # if textcat returns 3 or less possibilities we'll use the
1117 # first one in the list - otherwise use the defaults
1118 if (scalar @$results > 3) {
1119 if ($unicode_format) { # in case the first had a BOM
1120 $best_encoding=$unicode_format;
1121 } else {
1122 my %guessed_encodings = ();
1123 foreach my $result (@$results) {
1124 $result =~ /([^\-]+)$/;
1125 my $enc=$1;
1126 if (!defined($guessed_encodings{$enc})) {
1127 $guessed_encodings{$enc}=0;
1128 }
1129 $guessed_encodings{$enc}++;
1130 }
1131
1132 $guessed_encodings{""}=-1; # for default best_encoding of ""
1133 foreach my $enc (keys %guessed_encodings) {
1134 if ($guessed_encodings{$enc} >
1135 $guessed_encodings{$best_encoding}){
1136 $best_encoding=$enc;
1137 }
1138 }
1139 }
1140
1141 if ($self->{'input_encoding'} ne 'auto') {
1142 if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
1143 gsprintf($outhandle,
1144 "BasPlug: {BasPlug.could_not_extract_language}\n",
1145 $filename, $self->{'default_language'});
1146 }
1147 $best_language = $self->{'default_language'};
1148 $best_encoding = $self->{'input_encoding'};
1149
1150 } else {
1151 if ($self->{'verbosity'}>2) {
1152 gsprintf($outhandle,
1153 "BasPlug: {BasPlug.could_not_extract_language}\n",
1154 $filename, $self->{'default_language'});
1155 }
1156 $best_language = $self->{'default_language'};
1157 }
1158 } else { # <= 3 suggestions
1159 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
1160 if (!defined $language) {
1161 if ($self->{'verbosity'}>2) {
1162 gsprintf($outhandle,
1163 "BasPlug: {BasPlug.could_not_extract_language}\n",
1164 $filename, $self->{'default_language'});
1165 }
1166 $language = $self->{'default_language'};
1167 }
1168 if (!defined $encoding) {
1169 if ($self->{'verbosity'}>2) {
1170 gsprintf($outhandle,
1171 "BasPlug: {BasPlug.could_not_extract_encoding}\n",
1172 $filename, $self->{'default_encoding'});
1173 }
1174 $encoding = $self->{'default_encoding'};
1175 }
1176 $best_language = $language;
1177 if (! $best_encoding ) { # may already be set... eg from html meta tag
1178 $best_encoding = $encoding;
1179 }
1180 }
1181
1182 my $text_copy = $text;
1183 if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) {
1184 # the text is valid utf8, so assume that's the real encoding
1185 # (since textcat is based on probabilities)
1186 $best_encoding = 'utf8';
1187 }
1188
1189 # check for equivalents where textcat doesn't have some encodings...
1190 # eg MS versions of standard encodings
1191 if ($best_encoding =~ /^iso_8859_(\d+)/) {
1192 my $iso = $1; # which variant of the iso standard?
1193 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
1194 if ($text =~ /[\x80-\x9f]/) {
1195 # Western Europe
1196 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
1197 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
1198 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
1199 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
1200 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
1201 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
1202 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
1203 }
1204 }
1205
1206 if ($best_encoding !~ /^(ascii|utf8|unicode)$/ &&
1207 !defined $encodings::encodings->{$best_encoding}) {
1208 if ($self->{'verbosity'}) {
1209 gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1210 $filename, $best_encoding, $self->{'default_encoding'});
1211 }
1212 $best_encoding = $self->{'default_encoding'};
1213 }
1214
1215 return ($best_language, $best_encoding);
1216}
1217
1218# add any extra metadata that's been passed around from one
1219# plugin to another.
1220# extra_metadata uses add_utf8_metadata so it expects metadata values
1221# to already be in utf8
1222sub extra_metadata {
1223 my $self = shift (@_);
1224 my ($doc_obj, $cursection, $metadata) = @_;
1225
1226 my $associate_tail_re = $self->{'associate_tail_re'};
1227
1228 foreach my $field (keys(%$metadata)) {
1229 # $metadata->{$field} may be an array reference
1230 if ($field eq "gsdlassocfile_tobe") {
1231 # 'gsdlassocfile_tobe' is artificially introduced metadata
1232 # that is used to signal that certain additional files should
1233 # be tied to this document. Useful in situations where a
1234 # metadata pass in the plugin pipeline works out some files
1235 # need to be associated with a document, but the document hasn't
1236 # been formed yet.
1237
1238 my $equiv_form = "";
1239 foreach my $gaf (@{$metadata->{$field}}) {
1240 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1241 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1242 my $filename = $full_filename;
1243
1244 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1245
1246 # work out extended tail extension (i.e. matching tail re)
1247
1248 my ($file_prefix,$file_extended_ext)
1249 = $self->root_ext_split($tail_filename,$associate_tail_re);
1250 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1251
1252 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1253 my $start_doclink = "<a href=\"_httpcollection_/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1254 my $srcicon = "_icon".$doc_ext."_";
1255 my $end_doclink = "</a>";
1256
1257 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1258
1259 if (defined $pre_doc_ext) {
1260 # for metadata such as [mp3._edited] [mp3._full] ...
1261 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1262 }
1263
1264 # for multiple metadata such as [mp3.assoclink]
1265 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1266
1267 $equiv_form .= " $assoc_form";
1268 }
1269 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1270 }
1271 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1272 map {
1273 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1274 } @{$metadata->{$field}};
1275 } else {
1276 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1277 }
1278 }
1279}
1280
1281# initialise metadata extractors
1282sub initialise_extractors {
1283 my $self = shift (@_);
1284
1285 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1286 &acronym::initialise_acronyms();
1287 }
1288}
1289
1290# finalise metadata extractors
1291sub finalise_extractors {
1292 my $self = shift (@_);
1293
1294 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1295 &acronym::finalise_acronyms();
1296 }
1297}
1298
1299# FIRSTNNN: extract the first NNN characters as metadata
1300sub extract_first_NNNN_characters {
1301 my $self = shift (@_);
1302 my ($textref, $doc_obj, $thissection) = @_;
1303
1304 foreach my $size (split /,/, $self->{'first'}) {
1305 my $tmptext = $$textref;
1306 $tmptext =~ s/^\s+//;
1307 $tmptext =~ s/\s+$//;
1308 $tmptext =~ s/\s+/ /gs;
1309 $tmptext = substr ($tmptext, 0, $size);
1310 $tmptext =~ s/\s\S*$/&#8230;/;
1311 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1312 }
1313}
1314
1315sub extract_email {
1316 my $self = shift (@_);
1317 my ($textref, $doc_obj, $thissection) = @_;
1318 my $outhandle = $self->{'outhandle'};
1319
1320 gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1321 if ($self->{'verbosity'} > 2);
1322
1323 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
1324 @email = sort @email;
1325
1326# if($self->{"new_extract_email"} == 0)
1327# {
1328# my @email2 = ();
1329# foreach my $address (@email)
1330# {
1331# if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))
1332# {
1333# push @email2, $address;
1334# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1335# # print $outhandle " extracting $address\n"
1336# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1337# if ($self->{'verbosity'} > 3);
1338# }
1339# }
1340# }
1341# else
1342# {
1343 my $hashExistMail = {};
1344 foreach my $address (@email) {
1345 if (!(defined $hashExistMail->{$address}))
1346 {
1347 $hashExistMail->{$address} = 1;
1348 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1349 gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1350 if ($self->{'verbosity'} > 3);
1351 }
1352 }
1353 gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1354 if ($self->{'verbosity'} > 2);
1355}
1356
1357# extract metadata
1358sub auto_extract_metadata {
1359
1360 my $self = shift (@_);
1361 my ($doc_obj) = @_;
1362
1363 if ($self->{'extract_email'}) {
1364 my $thissection = $doc_obj->get_top_section();
1365 while (defined $thissection) {
1366 my $text = $doc_obj->get_text($thissection);
1367 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1368 $thissection = $doc_obj->get_next_section ($thissection);
1369 }
1370 }
1371 if ($self->{'extract_placenames'}) {
1372 my $thissection = $doc_obj->get_top_section();
1373 while (defined $thissection) {
1374 my $text = $doc_obj->get_text($thissection);
1375 $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1376 $thissection = $doc_obj->get_next_section ($thissection);
1377 }
1378 }
1379
1380 if ($self->{'extract_keyphrases'} || $self->{'extract_keyphrases_kea4'}) {
1381 $self->extract_keyphrases($doc_obj);
1382 }
1383
1384 if ($self->{'first'}) {
1385 my $thissection = $doc_obj->get_top_section();
1386 while (defined $thissection) {
1387 my $text = $doc_obj->get_text($thissection);
1388 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1389 $thissection = $doc_obj->get_next_section ($thissection);
1390 }
1391 }
1392
1393 if ($self->{'extract_acronyms'}) {
1394 my $thissection = $doc_obj->get_top_section();
1395 while (defined $thissection) {
1396 my $text = $doc_obj->get_text($thissection);
1397 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1398 $thissection = $doc_obj->get_next_section ($thissection);
1399 }
1400 }
1401
1402 if ($self->{'markup_acronyms'}) {
1403 my $thissection = $doc_obj->get_top_section();
1404 while (defined $thissection) {
1405 my $text = $doc_obj->get_text($thissection);
1406 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1407 $doc_obj->delete_text($thissection);
1408 $doc_obj->add_text($thissection, $text);
1409 $thissection = $doc_obj->get_next_section ($thissection);
1410 }
1411 }
1412
1413 if($self->{'extract_historical_years'}) {
1414 my $thissection = $doc_obj->get_top_section();
1415 while (defined $thissection) {
1416
1417 my $text = $doc_obj->get_text($thissection);
1418 &DateExtract::get_date_metadata($text, $doc_obj,
1419 $thissection,
1420 $self->{'no_bibliography'},
1421 $self->{'maximum_year'},
1422 $self->{'maximum_century'});
1423 $thissection = $doc_obj->get_next_section ($thissection);
1424 }
1425 }
1426}
1427
1428
1429#adding kea keyphrases
1430sub extract_keyphrases
1431{
1432 my $self = shift(@_);
1433 my $doc_obj = shift(@_);
1434
1435 # Use Kea 3.0 unless 4.0 has been specified
1436 my $kea_version = "3.0";
1437 if ($self->{'extract_keyphrases_kea4'}) {
1438 $kea_version = "4.0";
1439 }
1440
1441 # Check that Kea exists, and tell the user where to get it if not
1442 my $keahome = &Kea::get_Kea_directory($kea_version);
1443 if (!-e $keahome) {
1444 gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);
1445 return;
1446 }
1447
1448 my $thissection = $doc_obj->get_top_section();
1449 my $text = "";
1450 my $list;
1451
1452 #loop through sections to gather whole doc
1453 while (defined $thissection) {
1454 my $sectiontext = $doc_obj->get_text($thissection);
1455 $text = $text.$sectiontext;
1456 $thissection = $doc_obj->get_next_section ($thissection);
1457 }
1458
1459 if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
1460 $list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
1461 } else { #otherwise call Kea with no options
1462 $list = &Kea::extract_KeyPhrases ($kea_version, $text);
1463 }
1464
1465 if ($list){
1466 # if a list of kea keyphrases was returned (ie not empty)
1467 if ($self->{'verbosity'}) {
1468 gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1469 }
1470
1471 #add metadata to top section
1472 $thissection = $doc_obj->get_top_section();
1473
1474 # add all key phrases as one metadata
1475 $doc_obj->add_metadata($thissection, "Keyphrases", $list);
1476
1477 # add individual key phrases as multiple metadata
1478 foreach my $keyphrase (split(',', $list)) {
1479 $keyphrase =~ s/^\s+|\s+$//g;
1480 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1481 }
1482 }
1483}
1484
1485
1486# extract acronyms from a section in a document. progress is
1487# reported to outhandle based on the verbosity. both the Acronym
1488# and the AcronymKWIC metadata items are created.
1489
1490sub extract_acronyms {
1491 my $self = shift (@_);
1492 my ($textref, $doc_obj, $thissection) = @_;
1493 my $outhandle = $self->{'outhandle'};
1494
1495 # print $outhandle " extracting acronyms ...\n"
1496 gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1497 if ($self->{'verbosity'} > 2);
1498
1499 my $acro_array = &acronym::acronyms($textref);
1500
1501 foreach my $acro (@$acro_array) {
1502
1503 #check that this is the first time ...
1504 my $seen_before = "false";
1505 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1506 foreach my $thisAcro (@$previous_data) {
1507 if ($thisAcro eq $acro->to_string()) {
1508 $seen_before = "true";
1509 if ($self->{'verbosity'} >= 4) {
1510 gsprintf($outhandle, " {BasPlug.already_seen} " .
1511 $acro->to_string() . "\n");
1512 }
1513 }
1514 }
1515
1516 if ($seen_before eq "false") {
1517 #write it to the file ...
1518 $acro->write_to_file();
1519
1520 #do the normal acronym
1521 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1522 gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1523 if ($self->{'verbosity'} > 3);
1524 }
1525 }
1526
1527 gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1528 if ($self->{'verbosity'} > 2);
1529}
1530
1531sub markup_acronyms {
1532 my $self = shift (@_);
1533 my ($text, $doc_obj, $thissection) = @_;
1534 my $outhandle = $self->{'outhandle'};
1535
1536 gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1537 if ($self->{'verbosity'} > 2);
1538
1539 #self is passed in to check for verbosity ...
1540 $text = &acronym::markup_acronyms($text, $self);
1541
1542 gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1543 if ($self->{'verbosity'} > 2);
1544
1545 return $text;
1546}
1547
1548sub compile_stats {
1549 my $self = shift(@_);
1550 my ($stats) = @_;
1551
1552 $stats->{'num_processed'} += $self->{'num_processed'};
1553 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1554 $stats->{'num_archives'} += $self->{'num_archives'};
1555
1556}
1557
1558sub associate_cover_image {
1559 my $self = shift;
1560 my ($doc_obj, $filename) = @_;
1561
1562 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1563 if (exists $self->{'covers_missing_cache'}->{$filename}) {
1564 # don't stat() for existence eg for multiple document input files
1565 # (eg SplitPlug)
1566 return;
1567 }
1568
1569 my $top_section=$doc_obj->get_top_section();
1570
1571 if (-e $filename) {
1572 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1573 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1574 } else {
1575 my $upper_filename = $filename;
1576 $upper_filename =~ s/jpg$/JPG/;
1577 if (-e $upper_filename) {
1578 $doc_obj->associate_file($upper_filename, "cover.jpg",
1579 "image/jpeg");
1580 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1581 } else {
1582 # file doesn't exist, so record the fact that it's missing so
1583 # we don't stat() again (stat is slow)
1584 $self->{'covers_missing_cache'}->{$filename} = 1;
1585 }
1586 }
1587
1588}
1589
1590
1591# Overridden by exploding plugins (eg. ISISPlug)
1592sub clean_up_after_exploding
1593{
1594 my $self = shift(@_);
1595}
1596
1597
15981;
Note: See TracBrowser for help on using the repository browser.