source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 9706

Last change on this file since 9706 was 9703, checked in by mdewsnip, 19 years ago

Improvement to previous change so "file not processed" messages are seen in Expert mode.

  • Property svn:keywords set to Author Date Id Revision
File size: 39.0 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28BEGIN {
29 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30}
31
32eval {require bytes};
33
34# suppress the annoying "subroutine redefined" warning that various
35# plugins cause under perl 5.6
36$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38use strict; no strict 'subs';
39
40use File::Basename;
41
42use Kea;
43use parsargv;
44use multiread;
45use encodings;
46use cnseg;
47use acronym;
48use textcat;
49use doc;
50eval "require diagnostics"; # some perl distros (eg mac) don't have this
51use DateExtract;
52use ghtml;
53use gsprintf 'gsprintf';
54use printusage;
55
56use GISBasPlug;
57
58@BasPlug::ISA = ( GISBasPlug );
59
60my $unicode_list =
61 [ { 'name' => "auto",
62 'desc' => "{BasPlug.input_encoding.auto}" },
63 { 'name' => "ascii",
64 'desc' => "{BasPlug.input_encoding.ascii}" },
65 { 'name' => "utf8",
66 'desc' => "{BasPlug.input_encoding.utf8}" },
67 { 'name' => "unicode",
68 'desc' => "{BasPlug.input_encoding.unicode}" } ];
69
70my $arguments =
71 [ { 'name' => "process_exp",
72 'desc' => "{BasPlug.process_exp}",
73 'type' => "regexp",
74 'deft' => "",
75 'reqd' => "no" },
76 { 'name' => "block_exp",
77 'desc' => "{BasPlug.block_exp}",
78 'type' => "regexp",
79 'deft' => "",
80 'reqd' => "no" },
81 { 'name' => "smart_block",
82 'desc' => "{BasPlug.smart_block}",
83 'type' => "flag",
84 'reqd' => "no" },
85 { 'name' => "associate_ext",
86 'desc' => "{BasPlug.associate_ext}",
87 'type' => "string",
88 'reqd' => "no" },
89 { 'name' => "input_encoding",
90 'desc' => "{BasPlug.input_encoding}",
91 'type' => "enum",
92 'list' => $unicode_list,
93 'reqd' => "no" ,
94 'deft' => "auto" } ,
95 { 'name' => "default_encoding",
96 'desc' => "{BasPlug.default_encoding}",
97 'type' => "enum",
98 'list' => $unicode_list,
99 'reqd' => "no",
100 'deft' => "utf8" },
101 { 'name' => "extract_language",
102 'desc' => "{BasPlug.extract_language}",
103 'type' => "flag",
104 'reqd' => "no" },
105 { 'name' => "default_language",
106 'desc' => "{BasPlug.default_language}",
107 'type' => "language",
108 'deft' => "en",
109 'reqd' => "no" },
110 { 'name' => "extract_acronyms",
111 'desc' => "{BasPlug.extract_acronyms}",
112 'type' => "flag",
113 'reqd' => "no" },
114 { 'name' => "markup_acronyms",
115 'desc' => "{BasPlug.markup_acronyms}",
116 'type' => "flag",
117 'reqd' => "no" },
118 { 'name' => "extract_keyphrases",
119 'desc' => "{BasPlug.extract_keyphrases}",
120 'type' => "flag",
121 'reqd' => "no" },
122 { 'name' => "extract_keyphrase_options",
123 'desc' => "{BasPlug.extract_keyphrase_options}",
124 'type' => "string",
125 'deft' => "",
126 'reqd' => "no" },
127 { 'name' => "first",
128 'desc' => "{BasPlug.first}",
129 'type' => "string",
130 'reqd' => "no" },
131 { 'name' => "extract_email",
132 'desc' => "{BasPlug.extract_email}",
133 'type' => "flag",
134 'reqd' => "no" },
135 { 'name' => "extract_historical_years",
136 'desc' => "{BasPlug.extract_historical_years}",
137 'type' => "flag",
138 'reqd' => "no" },
139 { 'name' => "maximum_year",
140 'desc' => "{BasPlug.maximum_year}",
141 'type' => "int",
142 'deft' => (localtime)[5]+1900,
143 'reqd' => "no"},
144 { 'name' => "maximum_century",
145 'desc' => "{BasPlug.maximum_century}",
146 'type' => "string",
147 'deft' => "",
148 'reqd' => "no" },
149 { 'name' => "no_bibliography",
150 'desc' => "{BasPlug.no_bibliography}",
151 'type' => "flag",
152 'reqd' => "no"},
153 { 'name' => "no_cover_image",
154 'desc' => "{BasPlug.no_cover_image}",
155 'type' => "flag",
156 'reqd' => "no" } ];
157
158my $gis_arguments =
159 [ { 'name' => "extract_placenames",
160 'desc' => "{GISBasPlug.extract_placenames}",
161 'type' => "flag",
162 'reqd' => "no" },
163 { 'name' => "gazetteer",
164 'desc' => "{GISBasPlug.gazetteer}",
165 'type' => "string",
166 'reqd' => "no" },
167 { 'name' => "place_list",
168 'desc' => "{GISBasPlug.place_list}",
169 'type' => "flag",
170 'reqd' => "no" } ];
171
172
173my $options = { 'name' => "BasPlug",
174 'desc' => "{BasPlug.desc}",
175 'abstract' => "yes",
176 'inherits' => "no",
177 'args' => $arguments };
178
179
180
181
182sub get_arguments
183{
184 my $self = shift(@_);
185 my $optionlistref = $self->{'option_list'};
186 my @optionlist = @$optionlistref;
187 my $pluginoptions = pop(@$optionlistref);
188 my $pluginarguments = $pluginoptions->{'args'};
189 return $pluginarguments;
190}
191
192
193sub print_xml_usage
194{
195 my $self = shift(@_);
196
197 # XML output is always in UTF-8
198 gsprintf::output_strings_in_UTF8;
199
200 PrintUsage::print_xml_header();
201 $self->print_xml();
202}
203
204
205sub print_xml
206{
207 my $self = shift(@_);
208
209 my $optionlistref = $self->{'option_list'};
210 my @optionlist = @$optionlistref;
211 my $pluginoptions = pop(@$optionlistref);
212 return if (!defined($pluginoptions));
213
214 gsprintf(STDERR, "<PlugInfo>\n");
215 gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
216 my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
217 $desc =~ s/</&amp;lt;/g; # doubly escaped
218 $desc =~ s/>/&amp;gt;/g;
219
220 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
221 gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
222 gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
223 gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
224 gsprintf(STDERR, " <Arguments>\n");
225 if (defined($pluginoptions->{'args'})) {
226 &PrintUsage::print_options_xml($pluginoptions->{'args'});
227 }
228
229 # Recurse up the plugin hierarchy
230 $self->print_xml();
231
232 gsprintf(STDERR, " </Arguments>\n");
233 gsprintf(STDERR, "</PlugInfo>\n");
234}
235
236
237sub print_txt_usage
238{
239 my $self = shift(@_);
240
241 # Print the usage message for a plugin (recursively)
242 my $descoffset = $self->determine_description_offset(0);
243 $self->print_plugin_usage($descoffset, 1);
244}
245
246
247sub determine_description_offset
248{
249 my $self = shift(@_);
250 my $maxoffset = shift(@_);
251
252 my $optionlistref = $self->{'option_list'};
253 my @optionlist = @$optionlistref;
254 my $pluginoptions = pop(@$optionlistref);
255 return $maxoffset if (!defined($pluginoptions));
256
257 # Find the length of the longest option string of this plugin
258 my $pluginargs = $pluginoptions->{'args'};
259 if (defined($pluginargs)) {
260 my $longest = &PrintUsage::find_longest_option_string($pluginargs);
261 if ($longest > $maxoffset) {
262 $maxoffset = $longest;
263 }
264 }
265
266 # Recurse up the plugin hierarchy
267 $maxoffset = $self->determine_description_offset($maxoffset);
268 $self->{'option_list'} = \@optionlist;
269 return $maxoffset;
270}
271
272
273sub print_plugin_usage
274{
275 my $self = shift(@_);
276 my $descoffset = shift(@_);
277 my $isleafclass = shift(@_);
278
279 my $optionlistref = $self->{'option_list'};
280 my @optionlist = @$optionlistref;
281 my $pluginoptions = pop(@$optionlistref);
282 return if (!defined($pluginoptions));
283
284 my $pluginname = $pluginoptions->{'name'};
285 my $pluginargs = $pluginoptions->{'args'};
286 my $plugindesc = $pluginoptions->{'desc'};
287
288 # Produce the usage information using the data structure above
289 if ($isleafclass) {
290 if (defined($plugindesc)) {
291 gsprintf(STDERR, "$plugindesc\n\n");
292 }
293 gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
294 }
295
296 # Display the plugin options, if there are some
297 if (defined($pluginargs)) {
298 # Calculate the column offset of the option descriptions
299 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
300
301 if ($isleafclass) {
302 gsprintf(STDERR, " {common.specific_options}:\n");
303 }
304 else {
305 gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
306 }
307
308 # Display the plugin options
309 &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
310 }
311
312 # Recurse up the plugin hierarchy
313 $self->print_plugin_usage($descoffset, 0);
314 $self->{'option_list'} = \@optionlist;
315}
316
317
318sub new {
319 my $class = shift (@_);
320 my $plugin_name = shift (@_);
321 my $self = {};
322 $self->{'plugin_type'} = "BasPlug";
323
324 if (GISBasPlug::has_mapdata()) {
325 push(@$arguments,@$gis_arguments);
326 }
327
328 my $enc = "^(";
329 map {$enc .= "$_|";} keys %$encodings::encodings;
330 my $denc = $enc . "ascii|utf8|unicode)\$";
331 $enc .= "ascii|utf8|unicode|auto)\$";
332
333 $self->{'outhandle'} = STDERR;
334 my $year = (localtime)[5]+1900;
335
336 $self->{'textcat'} = new textcat();
337
338 $self->{'num_processed'} = 0;
339 $self->{'num_not_processed'} = 0;
340 $self->{'num_blocked'} = 0;
341 $self->{'num_archives'} = 0;
342 $self->{'cover_image'} = 1; # cover image is on by default
343
344 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
345 $self->{'option_list'} = [ $options ];
346
347 my $no_cover_image = 0;
348 # general options available to all plugins
349 if (!parsargv::parse(\@_,
350 q^process_exp/.*/^, \$self->{'process_exp'},
351 q^block_exp/.*/^, \$self->{'block_exp'},
352 q^associate_ext/.*/^, \$self->{'associate_ext'},
353 q^extract_language^, \$self->{'extract_language'},
354 q^extract_acronyms^, \$self->{'extract_acronyms'},
355 q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
356 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
357 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
358 qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
359 q^extract_email^, \$self->{'extract_email'},
360 q^extract_placenames^, \$self->{'extract_placenames'},
361 q^gazetteer/.*/^, \$self->{'gazetteer'},
362 q^place_list^, \$self->{'place_list'},
363 q^markup_acronyms^, \$self->{'markup_acronyms'},
364 q^default_language/.{2}/en^, \$self->{'default_language'},
365 q^first/.*/^, \$self->{'first'},
366 q^extract_historical_years^, \$self->{'date_extract'},
367 qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
368 q^no_bibliography^, \$self->{'no_biblio'},
369 qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
370 q^no_cover_image^, \$no_cover_image,
371 q^separate_cjk^, \$self->{'separate_cjk'},
372 q^smart_block^, \$self->{'smart_block'},
373 q^smart_block_BN^, \$self->{'smart_block_BN'},
374 "allow_extra_options")) {
375
376 gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
377 bless $self, $class;
378 $self->print_txt_usage(""); # Use default resource bundle
379 die "\n";
380 }
381
382 my $associate_ext = $self->{'associate_ext'};
383 if ((defined $associate_ext) && ($associate_ext ne "")) {
384 my @exts = split(/,/,$associate_ext);
385
386 my %associate_ext_lookup = ();
387 foreach my $e (@exts) {
388 $associate_ext_lookup{$e} = 1;
389 }
390
391 $self->{'associate_ext_lookup'} = \%associate_ext_lookup;
392 }
393
394 $self->{'shared_fileroot'} = {};
395 $self->{'file_blocks'} = {};
396
397 $self->{'cover_image'} = 0 if ($no_cover_image);
398
399 if ($self->{'extract_placenames'}) {
400
401 my $outhandle = $self->{'outhandle'};
402
403 my $places_ref
404 = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
405
406 if (!defined $places_ref) {
407 print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
408 print $outhandle " No placename extraction will take place.\n";
409 $self->{'extract_placenames'} = undef;
410 }
411 else {
412 $self->{'places'} = $places_ref;
413 }
414 }
415 return bless $self, $class;
416}
417
418# initialize BasPlug options
419# if init() is overridden in a sub-class, remember to call BasPlug::init()
420sub init {
421 my $self = shift (@_);
422 my ($verbosity, $outhandle, $failhandle) = @_;
423
424 # verbosity is passed through from the processor
425 $self->{'verbosity'} = $verbosity;
426
427 # as are the outhandle and failhandle
428 $self->{'outhandle'} = $outhandle if defined $outhandle;
429 $self->{'failhandle'} = $failhandle;
430
431 # set process_exp and block_exp to defaults unless they were
432 # explicitly set
433
434 if ((!$self->is_recursive()) and
435 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
436
437 $self->{'process_exp'} = $self->get_default_process_exp ();
438 if ($self->{'process_exp'} eq "") {
439 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
440 }
441 }
442
443 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
444 $self->{'block_exp'} = $self->get_default_block_exp ();
445 }
446}
447
448sub begin {
449 my $self = shift (@_);
450 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
451 $self->initialise_extractors();
452}
453
454sub end {
455 my ($self) = @_;
456 $self->finalise_extractors();
457}
458
459# this function should be overridden to return 1
460# in recursive plugins
461sub is_recursive {
462 my $self = shift (@_);
463
464 return 0;
465}
466
467sub get_default_block_exp {
468 my $self = shift (@_);
469
470 return "";
471}
472
473sub get_default_process_exp {
474 my $self = shift (@_);
475
476 return "";
477}
478
479# default implementation is to do nothing.
480sub store_block_files
481{
482 my $self =shift (@_);
483 my ($filename) = @_;
484 return;
485}
486
487#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
488sub block_cover_image
489{
490 my $self =shift (@_);
491 my ($filename) = @_;
492 if ($self->{'cover_image'}) {
493 my $coverfile = $filename;
494 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
495 if (!-e $coverfile) {
496 $coverfile =~ s/jpg$/JPG/;
497 }
498 if (-e $coverfile) {
499 $self->{'file_blocks'}->{$coverfile} = 1;
500 }
501 }
502
503 return;
504}
505
506sub metadata_read {
507 my $self = shift (@_);
508 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
509 # Keep track of filenames with same root but different extensions
510 # Used to support -associate_ext
511
512 my $associate_ext = $self->{'associate_ext'};
513 if ((defined $associate_ext) && ($associate_ext ne "")) {
514
515 my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
516 if ((defined $file_prefix) && (defined $file_ext)) {
517
518 my $shared_fileroot = $self->{'shared_fileroot'};
519 if (!defined $shared_fileroot->{$file_prefix}) {
520 my $file_prefix_rec = { 'tie_to' => undef, 'exts' => {} };
521 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
522 }
523
524 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
525
526 my $process_exp = $self->{'process_exp'};
527
528 if ($file =~ m/$self->{'process_exp'}/) {
529 # This is the document the others should be tied to
530 $file_prefix_rec->{'tie_to'} = $file_ext;
531 }
532 else {
533 if (defined $self->{'associate_ext_lookup'}->{$file_ext}) {
534 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
535 }
536 }
537 }
538 }
539
540 # now check whether we are actually processing this
541 my $filename = $file;
542 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
543 if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) {
544 return undef; # can't recognise
545 }
546
547 # do smart blocking if appropriate
548 if (defined $self->{'smart_block'}) {
549 $self->block_cover_image($filename);
550 $self->store_block_files($filename);
551 }
552
553 return 1;
554}
555
556sub tie_to_filename
557{
558 my $self = shift (@_);
559
560 my ($file_ext,$file_prefix_rec) = @_;
561
562 if (defined $file_prefix_rec) {
563 my $tie_to = $file_prefix_rec->{'tie_to'};
564
565 if (defined $tie_to) {
566 if ($tie_to eq $file_ext) {
567 return 1;
568 }
569 }
570 }
571
572 return 0;
573}
574
575sub tie_to_assoc_file
576{
577 my $self = shift (@_);
578 my ($file_ext,$file_prefix_rec) = @_;
579
580 if (defined $file_prefix_rec) {
581 my $tie_to = $file_prefix_rec->{'tie_to'};
582 if (defined $tie_to) {
583
584 my $exts = $file_prefix_rec->{'exts'};
585
586 my $has_file_ext = $exts->{$file_ext};
587
588 if ($has_file_ext) {
589 return 1;
590 }
591 }
592 }
593
594 return 0;
595}
596
597
598sub associate_with
599{
600 my $self = shift (@_);
601 my ($file, $filename, $metadata) = @_;
602
603 my $associate_ext = $self->{'associate_ext'};
604
605
606 return 0 if (!$associate_ext);
607
608 # If file, see if matches with "tie_to" doc or is one of the
609 # associated filename extensions.
610
611 my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
612 if ((defined $file_prefix) && (defined $file_ext)) {
613
614 my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
615
616 if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
617
618 # Set up gsdlassocfile_tobe
619
620 my $exts = $file_prefix_rec->{'exts'};
621
622 if (!defined $metadata->{'gsdlassocfile_tobe'}) {
623 $metadata->{'gsdlassocfile_tobe'} = [];
624 }
625
626 my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
627
628 my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/);
629 foreach my $e (keys %$exts) {
630 my $assoc_file = "$full_prefix.$e";
631 my $mime_type = ""; # let system auto detect this
632 push(@$assoc_tobe,"$assoc_file:$mime_type:");
633 }
634 }
635 elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
636 # a form of smart block
637
638 return 1;
639 }
640 }
641
642 return 0;
643}
644
645
646# The BasPlug read() function. This function does all the right things
647# to make general options work for a given plugin. It calls the process()
648# function which does all the work specific to a plugin (like the old
649# read functions used to do). Most plugins should define their own
650# process() function and let this read() function keep control.
651#
652# recursive plugins (e.g. RecPlug) and specialized plugins like those
653# capable of processing many documents within a single file (e.g.
654# GMLPlug) should normally implement their own version of read()
655#
656# Return number of files processed, undef if can't recognise, -1 if can't
657# process
658# Note that $base_dir might be "" and that $file might
659# include directories
660
661sub read {
662 my $self = shift (@_);
663
664 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
665
666 if ($self->is_recursive()) {
667 gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
668 }
669
670 my $outhandle = $self->{'outhandle'};
671 my $smart_block = $self->{'smart_block'};
672 my $smart_block_BN = $self->{'smart_block_BN'};
673
674 my $filename = $file;
675 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
676
677 if ($self->associate_with($file,$filename,$metadata)) {
678 # a form of smart block
679 $self->{'num_blocked'} ++;
680 return 0; # blocked
681 }
682
683 if ($smart_block || $smart_block_BN) {
684 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
685 $self->{'num_blocked'} ++;
686 return 0; # blocked
687 }
688 } elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
689 $self->{'num_blocked'} ++;
690 return 0; # blocked
691 }
692
693 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
694 return undef; # can't recognise
695 }
696 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
697
698 # Do encoding stuff
699 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
700
701 # create a new document
702 my $doc_obj = new doc ($filename, "indexed_doc");
703 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
704 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
705 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
706 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
707 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
708
709 my ($filemeta) = $file =~ /([^\\\/]+)$/;
710 # how do we know what encoding the filename is in?
711 $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
712 if ($self->{'cover_image'}) {
713 $self->associate_cover_image($doc_obj, $filename);
714 }
715
716 # read in file ($text will be in utf8)
717 my $text = "";
718 $self->read_file ($filename, $encoding, $language, \$text);
719
720 if (!length ($text)) {
721 my $plugin_name = ref ($self);
722 if ($gli) {
723 print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
724 }
725 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
726
727 my $failhandle = $self->{'failhandle'};
728 gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
729 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
730 $self->{'num_not_processed'} ++;
731
732 return 0; # what should we return here?? error but don't want to pass it on
733 }
734
735 # include any metadata passed in from previous plugins
736 # note that this metadata is associated with the top level section
737
738 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
739
740 # do plugin specific processing of doc_obj
741 unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
742 $text = '';
743 undef $text;
744 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
745 return -1;
746 }
747 $text='';
748 undef $text;
749
750 # do any automatic metadata extraction
751 $self->auto_extract_metadata ($doc_obj);
752
753 # add an OID
754 # see if there is a plugin-specific set_OID function...
755 if (defined ($self->can('set_OID'))) {
756 # it will need $doc_obj to set the Identifier metadata...
757 $self->set_OID($doc_obj);
758 } else {
759 # use the default set_OID() in doc.pm
760 $doc_obj->set_OID();
761 }
762
763 # process the document
764 $processor->process($doc_obj);
765
766 if(defined($self->{'places_filename'})){
767 &util::rm($self->{'places_filename'});
768 $self->{'places_filename'} = undef;
769 }
770
771 $self->{'num_processed'} ++;
772 undef $doc_obj;
773 return 1; # processed the file
774}
775
776# returns undef if file is rejected by the plugin
777sub process {
778 my $self = shift (@_);
779 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
780
781 gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
782 # die "Basplug::process function must be implemented in sub-class\n";
783
784 return undef; # never gets here
785}
786
787# uses the multiread package to read in the entire file pointed to
788# by filename and loads the resulting text into $$textref. Input text
789# may be in any of the encodings handled by multiread, output text
790# will be in utf8
791sub read_file {
792 my $self = shift (@_);
793 my ($filename, $encoding, $language, $textref) = @_;
794
795 if (!-r $filename)
796 {
797 my $outhandle = $self->{'outhandle'};
798 gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
799 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
800 return;
801 }
802
803 $$textref = "";
804
805 if (!open (FILE, $filename)) {
806 gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
807 die "\n";
808 }
809
810 if ($encoding eq "ascii") {
811 undef $/;
812 $$textref = <FILE>;
813 $/ = "\n";
814 } else {
815 my $reader = new multiread();
816 $reader->set_handle ('BasPlug::FILE');
817 $reader->set_encoding ($encoding);
818 $reader->read_file ($textref);
819
820 #Now segments chinese if the separate_cjk option is set
821 if ($self->{'separate_cjk'}) {
822 # segment the Chinese words
823 $$textref = &cnseg::segment($$textref);
824 }
825 }
826
827 close FILE;
828}
829
830sub filename_based_title
831{
832 my $self = shift (@_);
833 my ($file) = @_;
834
835 my $file_derived_title = $file;
836 $file_derived_title =~ s/_/ /g;
837 $file_derived_title =~ s/\..*?$//;
838
839 return $file_derived_title;
840}
841
842
843sub title_fallback
844{
845 my $self = shift (@_);
846 my ($doc_obj,$section,$file) = @_;
847
848 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
849
850 my $file_derived_title = $self->filename_based_title($file);
851 $doc_obj->add_metadata ($section, "Title", $file_derived_title);
852 }
853}
854
855sub textcat_get_language_encoding {
856 my $self = shift (@_);
857 my ($filename) = @_;
858
859 my ($language, $encoding, $extracted_encoding);
860 if ($self->{'input_encoding'} eq "auto") {
861 # use textcat to automatically work out the input encoding and language
862 ($language, $encoding) = $self->get_language_encoding ($filename);
863 } elsif ($self->{'extract_language'}) {
864 # use textcat to get language metadata
865 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
866 $encoding = $self->{'input_encoding'};
867 # don't print this message for english... english in utf8 is identical
868 # to english in iso-8859-1 (except for some punctuation). We don't have
869 # a language model for en_utf8, so textcat always says iso-8859-1!
870 if ($extracted_encoding ne $encoding && $language ne "en"
871 && $self->{'verbosity'}) {
872 my $plugin_name = ref ($self);
873 my $outhandle = $self->{'outhandle'};
874 gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
875 # print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
876 # print $outhandle "appears to be encoded as $extracted_encoding.\n";
877 }
878 } else {
879 $language = $self->{'default_language'};
880 $encoding = $self->{'input_encoding'};
881 }
882 return ($language, $encoding);
883}
884
885# Uses textcat to work out the encoding and language of the text in
886# $filename. All html tags are removed before processing.
887# returns an array containing "language" and "encoding"
888sub get_language_encoding {
889 my $self = shift (@_);
890 my ($filename) = @_;
891 my $outhandle = $self->{'outhandle'};
892 my $unicode_format = "";
893 # read in file
894 open (FILE, $filename) || (gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
895 undef $/;
896 my $text = <FILE>;
897 $/ = "\n";
898 close FILE;
899
900 # check if first few bytes have a Byte Order Marker
901 my $bom=substr($text,0,2); # check 16bit unicode
902 if ($bom eq "\xff\xfe") { # little endian 16bit unicode
903 $unicode_format="unicode";
904 } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
905 $unicode_format="unicode";
906 } else {
907 $bom=substr($text,0,3); # check utf-8
908 if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
909 $unicode_format="utf8";
910# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
911# $unicode_format="utf8";
912 }
913 }
914
915
916 # remove <title>stuff</title> -- as titles tend often to be in English
917 # for foreign language documents
918 $text =~ s/<title>(.|\n)*?<\/title>//i;
919
920 # remove all HTML tags
921 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
922 if (ref($self) eq 'HTMLPlug' ||
923 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
924 $text =~ s/<[^>]*>//sg;
925 }
926
927 # get the language/encoding
928 my $results = $self->{'textcat'}->classify(\$text);
929
930 # if textcat returns 3 or less possibilities we'll use the
931 # first one in the list - otherwise use the defaults
932 if (scalar @$results > 3) {
933 my $best_encoding="";
934 if ($unicode_format) { # in case the first had a BOM
935 $best_encoding=$unicode_format;
936 } else {
937 my %guessed_encodings = ();
938 foreach my $result (@$results) {
939 $result =~ /([^\-]+)$/;
940 my $enc=$1;
941 if (!defined($guessed_encodings{$enc})) {
942 $guessed_encodings{$enc}=0;
943 }
944 $guessed_encodings{$enc}++;
945 }
946
947 $guessed_encodings{""}=-1; # for default best_encoding of ""
948 foreach my $enc (keys %guessed_encodings) {
949 if ($guessed_encodings{$enc} >
950 $guessed_encodings{$best_encoding}){
951 $best_encoding=$enc;
952 }
953 }
954 }
955
956 if ($self->{'input_encoding'} ne 'auto') {
957 if ($self->{'extract_language'} && $self->{'verbosity'}) {
958 gsprintf($outhandle,
959 "BasPlug: {BasPlug.could_not_extract_language}\n",
960 $filename, $self->{'default_language'});
961 }
962 return ($self->{'default_language'}, $self->{'input_encoding'});
963
964 } else {
965 if ($self->{'verbosity'}) {
966 gsprintf($outhandle,
967 "BasPlug: {BasPlug.could_not_extract_language}\n",
968 $filename, $self->{'default_language'});
969 }
970 return ($self->{'default_language'}, $best_encoding);
971 }
972 }
973
974 # format language/encoding
975 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
976 if (!defined $language) {
977 if ($self->{'verbosity'}) {
978 gsprintf($outhandle,
979 "BasPlug: {BasPlug.could_not_extract_language}\n",
980 $filename, $self->{'default_language'});
981 }
982 $language = $self->{'default_language'};
983 }
984 if (!defined $encoding) {
985 if ($self->{'verbosity'}) {
986 gsprintf($outhandle,
987 "BasPlug: {BasPlug.could_not_extract_encoding}\n",
988 $filename, $self->{'default_encoding'});
989 }
990 $encoding = $self->{'default_encoding'};
991 }
992
993
994 # check for equivalents where textcat doesn't have some encodings...
995 # eg MS versions of standard encodings
996 if ($encoding =~ /^iso_8859_(\d+)/) {
997 my $iso = $1; # which variant of the iso standard?
998 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
999 if ($text =~ /[\x80-\x9f]/) {
1000 # Western Europe
1001 if ($iso == 1 or $iso == 15) { $encoding = 'windows_1252' }
1002 elsif ($iso == 2) { $encoding = 'windows_1250' } # Central Europe
1003 elsif ($iso == 5) { $encoding = 'windows_1251' } # Cyrillic
1004 elsif ($iso == 6) { $encoding = 'windows_1256' } # Arabic
1005 elsif ($iso == 7) { $encoding = 'windows_1253' } # Greek
1006 elsif ($iso == 8) { $encoding = 'windows_1255' } # Hebrew
1007 elsif ($iso == 9) { $encoding = 'windows_1254' } # Turkish
1008 }
1009 }
1010
1011 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
1012 !defined $encodings::encodings->{$encoding}) {
1013 if ($self->{'verbosity'}) {
1014 gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1015 $filename, $encoding, $self->{'default_encoding'});
1016 }
1017 $encoding = $self->{'default_encoding'};
1018 }
1019
1020 return ($language, $encoding);
1021}
1022
1023# add any extra metadata that's been passed around from one
1024# plugin to another.
1025# extra_metadata uses add_utf8_metadata so it expects metadata values
1026# to already be in utf8
1027sub extra_metadata {
1028 my $self = shift (@_);
1029 my ($doc_obj, $cursection, $metadata) = @_;
1030
1031 foreach my $field (keys(%$metadata)) {
1032 # $metadata->{$field} may be an array reference
1033 if ($field eq "gsdlassocfile_tobe") {
1034 # 'gsdlassocfile_tobe' is artificially introduced metadata
1035 # that is used to signal that certain additional files should
1036 # be tied to this document. Useful in situations where a
1037 # metadata pass in the plugin pipeline works out some files
1038 # need to be associated with a document, but the document hasn't
1039 # been formed yet.
1040
1041 my $equiv_form = "";
1042 foreach my $gaf (@{$metadata->{$field}}) {
1043 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1044 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1045 my $filename = $full_filename;
1046
1047 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1048
1049 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1050 my $start_doclink = "<a href=\"_httpcollection_/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1051 my $srcicon = "_icon".$doc_ext."_";
1052 my $end_doclink = "</a>";
1053
1054 $equiv_form .= " $start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1055 }
1056 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1057 }
1058 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1059 map {
1060 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1061 } @{$metadata->{$field}};
1062 } else {
1063 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1064 }
1065 }
1066}
1067
1068# initialise metadata extractors
1069sub initialise_extractors {
1070 my $self = shift (@_);
1071
1072 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1073 &acronym::initialise_acronyms();
1074 }
1075}
1076
1077# finalise metadata extractors
1078sub finalise_extractors {
1079 my $self = shift (@_);
1080
1081 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1082 &acronym::finalise_acronyms();
1083 }
1084}
1085
1086# FIRSTNNN: extract the first NNN characters as metadata
1087sub extract_first_NNNN_characters {
1088 my $self = shift (@_);
1089 my ($textref, $doc_obj, $thissection) = @_;
1090
1091 foreach my $size (split /,/, $self->{'first'}) {
1092 my $tmptext = $$textref;
1093 $tmptext =~ s/^\s+//;
1094 $tmptext =~ s/\s+$//;
1095 $tmptext =~ s/\s+/ /gs;
1096 $tmptext = substr ($tmptext, 0, $size);
1097 $tmptext =~ s/\s\S*$/&#8230;/;
1098 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1099 }
1100}
1101
1102sub extract_email {
1103 my $self = shift (@_);
1104 my ($textref, $doc_obj, $thissection) = @_;
1105 my $outhandle = $self->{'outhandle'};
1106
1107 gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1108 if ($self->{'verbosity'} > 2);
1109
1110 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
1111 @email = sort @email;
1112
1113 my @email2 = ();
1114 foreach my $address (@email) {
1115 if (!(join(" ",@email2) =~ m/$address/ )) {
1116 push @email2, $address;
1117 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1118 gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1119 if ($self->{'verbosity'} > 3);
1120 }
1121 }
1122 gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1123 if ($self->{'verbosity'} > 2);
1124}
1125
1126# extract metadata
1127sub auto_extract_metadata {
1128
1129 my $self = shift (@_);
1130 my ($doc_obj) = @_;
1131
1132 if ($self->{'extract_email'}) {
1133 my $thissection = $doc_obj->get_top_section();
1134 while (defined $thissection) {
1135 my $text = $doc_obj->get_text($thissection);
1136 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1137 $thissection = $doc_obj->get_next_section ($thissection);
1138 }
1139 }
1140 if ($self->{'extract_placenames'}) {
1141 my $thissection = $doc_obj->get_top_section();
1142 while (defined $thissection) {
1143 my $text = $doc_obj->get_text($thissection);
1144 $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1145 $thissection = $doc_obj->get_next_section ($thissection);
1146 }
1147 }
1148
1149 # adding kea keyphrases
1150 if ($self->{'kea'}) {
1151
1152 my $thissection = $doc_obj->get_top_section();
1153 my $text = "";
1154 my $list;
1155
1156 #loop through sections to gather whole doc
1157 while (defined $thissection) {
1158 my $sectiontext = $doc_obj->get_text($thissection);
1159 $text = $text.$sectiontext;
1160 $thissection = $doc_obj->get_next_section ($thissection);
1161 }
1162
1163 if ($self->{'kea_options'}) {
1164 #if kea options flag is set, call Kea with specified options
1165 $list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
1166 } else {
1167 #otherwise call Kea with no options
1168 $list = &Kea::extract_KeyPhrases ($text);
1169 }
1170 if ($list){
1171 # if a list of kea keyphrases was returned (ie not empty)
1172 if ($self->{'verbosity'}) {
1173 gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1174 }
1175
1176 #add metadata to top section
1177 $thissection = $doc_obj->get_top_section();
1178
1179 # add all key phrases as one metadata
1180 $doc_obj->add_metadata($thissection, "Keyphrases", $list);
1181
1182 # add individual key phrases as multiple metadata
1183 foreach my $keyphrase (split(',', $list)) {
1184 $keyphrase =~ s/^\s+|\s+$//g;
1185 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1186 }
1187 }
1188 } #end of kea
1189
1190 if ($self->{'first'}) {
1191 my $thissection = $doc_obj->get_top_section();
1192 while (defined $thissection) {
1193 my $text = $doc_obj->get_text($thissection);
1194 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1195 $thissection = $doc_obj->get_next_section ($thissection);
1196 }
1197 }
1198
1199 if ($self->{'extract_acronyms'}) {
1200 my $thissection = $doc_obj->get_top_section();
1201 while (defined $thissection) {
1202 my $text = $doc_obj->get_text($thissection);
1203 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1204 $thissection = $doc_obj->get_next_section ($thissection);
1205 }
1206 }
1207
1208 if ($self->{'markup_acronyms'}) {
1209 my $thissection = $doc_obj->get_top_section();
1210 while (defined $thissection) {
1211 my $text = $doc_obj->get_text($thissection);
1212 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1213 $doc_obj->delete_text($thissection);
1214 $doc_obj->add_text($thissection, $text);
1215 $thissection = $doc_obj->get_next_section ($thissection);
1216 }
1217 }
1218
1219 if($self->{'date_extract'}) {
1220 my $thissection = $doc_obj->get_top_section();
1221 while (defined $thissection) {
1222
1223 my $text = $doc_obj->get_text($thissection);
1224 &DateExtract::get_date_metadata($text, $doc_obj,
1225 $thissection,
1226 $self->{'no_biblio'},
1227 $self->{'max_year'},
1228 $self->{'max_century'});
1229 $thissection = $doc_obj->get_next_section ($thissection);
1230 }
1231 }
1232}
1233
1234# extract acronyms from a section in a document. progress is
1235# reported to outhandle based on the verbosity. both the Acronym
1236# and the AcronymKWIC metadata items are created.
1237
1238sub extract_acronyms {
1239 my $self = shift (@_);
1240 my ($textref, $doc_obj, $thissection) = @_;
1241 my $outhandle = $self->{'outhandle'};
1242
1243 # print $outhandle " extracting acronyms ...\n"
1244 gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1245 if ($self->{'verbosity'} > 2);
1246
1247 my $acro_array = &acronym::acronyms($textref);
1248
1249 foreach my $acro (@$acro_array) {
1250
1251 #check that this is the first time ...
1252 my $seen_before = "false";
1253 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1254 foreach my $thisAcro (@$previous_data) {
1255 if ($thisAcro eq $acro->to_string()) {
1256 $seen_before = "true";
1257 if ($self->{'verbosity'} >= 4) {
1258 gsprintf($outhandle, " {BasPlug.already_seen} " .
1259 $acro->to_string() . "\n");
1260 }
1261 }
1262 }
1263
1264 if ($seen_before eq "false") {
1265 #write it to the file ...
1266 $acro->write_to_file();
1267
1268 #do the normal acronym
1269 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1270 gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1271 if ($self->{'verbosity'} > 3);
1272 }
1273 }
1274
1275 gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1276 if ($self->{'verbosity'} > 2);
1277}
1278
1279sub markup_acronyms {
1280 my $self = shift (@_);
1281 my ($text, $doc_obj, $thissection) = @_;
1282 my $outhandle = $self->{'outhandle'};
1283
1284 gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1285 if ($self->{'verbosity'} > 2);
1286
1287 #self is passed in to check for verbosity ...
1288 $text = &acronym::markup_acronyms($text, $self);
1289
1290 gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1291 if ($self->{'verbosity'} > 2);
1292
1293 return $text;
1294}
1295
1296sub compile_stats {
1297 my $self = shift(@_);
1298 my ($stats) = @_;
1299
1300 $stats->{'num_processed'} += $self->{'num_processed'};
1301 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1302 $stats->{'num_archives'} += $self->{'num_archives'};
1303
1304}
1305
1306sub associate_cover_image {
1307 my $self = shift(@_);
1308 my ($doc_obj, $filename) = @_;
1309
1310 my $top_section=$doc_obj->get_top_section();
1311
1312 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1313 if (-e $filename) {
1314 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1315 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1316 } else {
1317 $filename =~ s/jpg$/JPG/;
1318 if (-e $filename) {
1319 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1320 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1321 }
1322 }
1323}
1324
13251;
Note: See TracBrowser for help on using the repository browser.