source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 7644

Last change on this file since 7644 was 7644, checked in by jrm21, 20 years ago

don't print "wrong encoding" message for text in english.
textcat thinks all english is in iso-8859-1, so basplug complains if the
file was read as utf8.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.0 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use Kea;
35use parsargv;
36use multiread;
37use encodings;
38use cnseg;
39use acronym;
40use textcat;
41use doc;
42use diagnostics;
43use DateExtract;
44use ghtml;
45use gsprintf;
46use printusage;
47
48
49my $unicode_list =
50 [ { 'name' => "auto",
51 'desc' => "{BasPlug.input_encoding.auto}" },
52 { 'name' => "ascii",
53 'desc' => "{BasPlug.input_encoding.ascii}" },
54 { 'name' => "utf8",
55 'desc' => "{BasPlug.input_encoding.utf8}" },
56 { 'name' => "unicode",
57 'desc' => "{BasPlug.input_encoding.unicode}" } ];
58
59my $arguments =
60 [ { 'name' => "process_exp",
61 'desc' => "{BasPlug.process_exp}",
62 'type' => "regexp",
63 'deft' => "",
64 'reqd' => "no" },
65 { 'name' => "block_exp",
66 'desc' => "{BasPlug.block_exp}",
67 'type' => "regexp",
68 'deft' => "",
69 'reqd' => "no" },
70 { 'name' => "input_encoding",
71 'desc' => "{BasPlug.input_encoding}",
72 'type' => "enum",
73 'list' => $unicode_list,
74 'reqd' => "no" ,
75 'deft' => "auto" } ,
76 { 'name' => "default_encoding",
77 'desc' => "{BasPlug.default_encoding}",
78 'type' => "enum",
79 'list' => $unicode_list,
80 'reqd' => "no",
81 'deft' => "utf8" },
82 { 'name' => "extract_language",
83 'desc' => "{BasPlug.extract_language}",
84 'type' => "flag",
85 'reqd' => "no" },
86 { 'name' => "default_language",
87 'desc' => "{BasPlug.default_language}",
88 'type' => "language",
89 'deft' => "en",
90 'reqd' => "no" },
91 { 'name' => "extract_acronyms",
92 'desc' => "{BasPlug.extract_acronyms}",
93 'type' => "flag",
94 'reqd' => "no" },
95 { 'name' => "markup_acronyms",
96 'desc' => "{BasPlug.markup_acronyms}",
97 'type' => "flag",
98 'reqd' => "no" },
99 { 'name' => "first",
100 'desc' => "{BasPlug.first}",
101 'type' => "string",
102 'reqd' => "no" },
103 { 'name' => "extract_email",
104 'desc' => "{BasPlug.extract_email}",
105 'type' => "flag",
106 'reqd' => "no" },
107 { 'name' => "extract_historical_years",
108 'desc' => "{BasPlug.extract_historical_years}",
109 'type' => "flag",
110 'reqd' => "no" },
111 { 'name' => "maximum_year",
112 'desc' => "{BasPlug.maximum_year}",
113 'type' => "int",
114 'deft' => (localtime)[5]+1900,
115 'reqd' => "no"},
116 { 'name' => "maximum_century",
117 'desc' => "{BasPlug.maximum_century}",
118 'type' => "string",
119 'deft' => "",
120 'reqd' => "no" },
121 { 'name' => "no_bibliography",
122 'desc' => "{BasPlug.no_bibliography}",
123 'type' => "flag",
124 'reqd' => "no"},
125 { 'name' => "cover_image",
126 'desc' => "{BasPlug.cover_image}",
127 'type' => "flag",
128 'reqd' => "no" } ];
129
130my $options = { 'name' => "BasPlug",
131 'desc' => "{BasPlug.desc}",
132 'abstract' => "yes",
133 'inherits' => "no",
134 'args' => $arguments };
135
136
137sub gsprintf
138{
139 return &gsprintf::gsprintf(@_);
140}
141
142
143sub get_arguments
144{
145 local $self = shift(@_);
146 local $optionlistref = $self->{'option_list'};
147 local @optionlist = @$optionlistref;
148 local $pluginoptions = pop(@$optionlistref);
149 local $pluginarguments = $pluginoptions->{'args'};
150 return $pluginarguments;
151}
152
153
154sub print_xml_usage
155{
156 local $self = shift(@_);
157
158 # XML output is always in UTF-8
159 &gsprintf::output_strings_in_UTF8;
160
161 &PrintUsage::print_xml_header();
162 $self->print_xml();
163}
164
165
166sub print_xml
167{
168 local $self = shift(@_);
169
170 local $optionlistref = $self->{'option_list'};
171 local @optionlist = @$optionlistref;
172 local $pluginoptions = pop(@$optionlistref);
173 return if (!defined($pluginoptions));
174
175 &gsprintf(STDERR, "<PlugInfo>\n");
176 &gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
177 my $desc = &gsprintf::lookup_string($pluginoptions->{'desc'});
178 $desc =~ s/</&amp;lt;/g; # doubly escaped
179 $desc =~ s/>/&amp;gt;/g;
180
181 &gsprintf(STDERR, " <Desc>$desc</Desc>\n");
182 &gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
183 &gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
184 &gsprintf(STDERR, " <Arguments>\n");
185 if (defined($pluginoptions->{'args'})) {
186 &PrintUsage::print_options_xml($pluginoptions->{'args'});
187 }
188
189 # Recurse up the plugin hierarchy
190 $self->print_xml();
191
192 &gsprintf(STDERR, " </Arguments>\n");
193 &gsprintf(STDERR, "</PlugInfo>\n");
194}
195
196
197sub print_txt_usage
198{
199 local $self = shift(@_);
200
201 # Print the usage message for a plugin (recursively)
202 local $descoffset = $self->determine_description_offset(0);
203 $self->print_plugin_usage($descoffset, 1);
204}
205
206
207sub determine_description_offset
208{
209 local $self = shift(@_);
210 local $maxoffset = shift(@_);
211
212 local $optionlistref = $self->{'option_list'};
213 local @optionlist = @$optionlistref;
214 local $pluginoptions = pop(@$optionlistref);
215 return $maxoffset if (!defined($pluginoptions));
216
217 # Find the length of the longest option string of this plugin
218 local $pluginargs = $pluginoptions->{'args'};
219 if (defined($pluginargs)) {
220 local $longest = &PrintUsage::find_longest_option_string($pluginargs);
221 if ($longest > $maxoffset) {
222 $maxoffset = $longest;
223 }
224 }
225
226 # Recurse up the plugin hierarchy
227 $maxoffset = $self->determine_description_offset($maxoffset);
228 $self->{'option_list'} = \@optionlist;
229 return $maxoffset;
230}
231
232
233sub print_plugin_usage
234{
235 local $self = shift(@_);
236 local $descoffset = shift(@_);
237 local $isleafclass = shift(@_);
238
239 local $optionlistref = $self->{'option_list'};
240 local @optionlist = @$optionlistref;
241 local $pluginoptions = pop(@$optionlistref);
242 return if (!defined($pluginoptions));
243
244 local $pluginname = $pluginoptions->{'name'};
245 local $pluginargs = $pluginoptions->{'args'};
246 local $plugindesc = $pluginoptions->{'desc'};
247
248 # Produce the usage information using the data structure above
249 if ($isleafclass) {
250 if (defined($plugindesc)) {
251 &gsprintf(STDERR, "$plugindesc\n\n");
252 }
253 &gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
254 }
255
256 # Display the plugin options, if there are some
257 if (defined($pluginargs)) {
258 # Calculate the column offset of the option descriptions
259 local $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
260
261 if ($isleafclass) {
262 &gsprintf(STDERR, " {common.specific_options}:\n");
263 }
264 else {
265 &gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
266 }
267
268 # Display the plugin options
269 &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
270 }
271
272 # Recurse up the plugin hierarchy
273 $self->print_plugin_usage($descoffset, 0);
274 $self->{'option_list'} = \@optionlist;
275}
276
277
278sub new {
279 my $class = shift (@_);
280 my $plugin_name = shift (@_);
281 my $self = {};
282 $self->{'plugin_type'} = "BasPlug";
283 my $enc = "^(";
284 map {$enc .= "$_|";} keys %$encodings::encodings;
285 my $denc = $enc . "ascii|utf8|unicode)\$";
286 $enc .= "ascii|utf8|unicode|auto)\$";
287
288 $self->{'outhandle'} = STDERR;
289 my $year = (localtime)[5]+1900;
290
291 $self->{'textcat'} = new textcat();
292
293 $self->{'num_processed'} = 0;
294 $self->{'num_not_processed'} = 0;
295 $self->{'num_blocked'} = 0;
296 $self->{'num_archives'} = 0;
297
298 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
299 $self->{'option_list'} = [ $options ];
300
301 # general options available to all plugins
302 if (!parsargv::parse(\@_,
303 q^process_exp/.*/^, \$self->{'process_exp'},
304 q^block_exp/.*/^, \$self->{'block_exp'},
305 q^extract_language^, \$self->{'extract_language'},
306 q^extract_acronyms^, \$self->{'extract_acronyms'},
307 q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
308 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
309 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
310 qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
311 q^extract_email^, \$self->{'extract_email'},
312 q^markup_acronyms^, \$self->{'markup_acronyms'},
313 q^default_language/.{2}/en^, \$self->{'default_language'},
314 q^first/.*/^, \$self->{'first'},
315 q^extract_historical_years^, \$self->{'date_extract'},
316 qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
317 q^no_bibliography^, \$self->{'no_biblio'},
318 qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
319 q^cover_image^, \$self->{'cover_image'},
320 q^separate_cjk^, \$self->{'separate_cjk'},
321 "allow_extra_options")) {
322
323 &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
324 $self->print_txt_usage(""); # Use default resource bundle
325 die "\n";
326 }
327
328 return bless $self, $class;
329}
330
331# initialize BasPlug options
332# if init() is overridden in a sub-class, remember to call BasPlug::init()
333sub init {
334 my $self = shift (@_);
335 my ($verbosity, $outhandle, $failhandle) = @_;
336
337 # verbosity is passed through from the processor
338 $self->{'verbosity'} = $verbosity;
339
340 # as are the outhandle and failhandle
341 $self->{'outhandle'} = $outhandle if defined $outhandle;
342 $self->{'failhandle'} = $failhandle;
343
344 # set process_exp and block_exp to defaults unless they were
345 # explicitly set
346
347 if ((!$self->is_recursive()) and
348 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
349
350 $self->{'process_exp'} = $self->get_default_process_exp ();
351 if ($self->{'process_exp'} eq "") {
352 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
353 }
354 }
355
356 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
357 $self->{'block_exp'} = $self->get_default_block_exp ();
358 }
359}
360
361sub begin {
362 my $self = shift (@_);
363 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
364 $self->initialise_extractors();
365}
366
367sub end {
368 my ($self) = @_;
369 $self->finalise_extractors();
370}
371
372# this function should be overridden to return 1
373# in recursive plugins
374sub is_recursive {
375 my $self = shift (@_);
376
377 return 0;
378}
379
380sub get_default_block_exp {
381 my $self = shift (@_);
382
383 return "";
384}
385
386sub get_default_process_exp {
387 my $self = shift (@_);
388
389 return "";
390}
391
392# The BasPlug read() function. This function does all the right things
393# to make general options work for a given plugin. It calls the process()
394# function which does all the work specific to a plugin (like the old
395# read functions used to do). Most plugins should define their own
396# process() function and let this read() function keep control.
397#
398# recursive plugins (e.g. RecPlug) and specialized plugins like those
399# capable of processing many documents within a single file (e.g.
400# GMLPlug) should normally implement their own version of read()
401#
402# Return number of files processed, undef if can't recognise, -1 if can't
403# process
404# Note that $base_dir might be "" and that $file might
405# include directories
406
407sub read {
408 my $self = shift (@_);
409
410 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
411
412 if ($self->is_recursive()) {
413 &gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
414 }
415
416 my $outhandle = $self->{'outhandle'};
417
418 my $filename = $file;
419 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
420
421 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
422 $self->{'num_blocked'} ++;
423 return 0; # blocked
424 }
425 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
426 return undef; # can't recognise
427 }
428 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
429
430 # Do encoding stuff
431 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
432
433 # create a new document
434 my $doc_obj = new doc ($filename, "indexed_doc");
435 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
436 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
437 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
438 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
439 my ($filemeta) = $file =~ /([^\\\/]+)$/;
440 # how do we know what encoding the filename is in?
441 $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
442 if ($self->{'cover_image'}) {
443 $self->associate_cover_image($doc_obj, $filename);
444 }
445
446 # read in file ($text will be in utf8)
447 my $text = "";
448 $self->read_file ($filename, $encoding, $language, \$text);
449
450 if (!length ($text)) {
451 my $plugin_name = ref ($self);
452 &gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
453
454 my $failhandle = $self->{'failhandle'};
455 &gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
456 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
457 $self->{'num_not_processed'} ++;
458
459 return 0; # what should we return here?? error but don't want to pass it on
460 }
461
462 # include any metadata passed in from previous plugins
463 # note that this metadata is associated with the top level section
464 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
465
466 # do plugin specific processing of doc_obj
467 return -1 unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli));
468
469 # do any automatic metadata extraction
470 $self->auto_extract_metadata ($doc_obj);
471
472 # add an OID
473 # see if there is a plugin-specific set_OID function...
474 if (defined ($self->can(set_OID))) {
475 # it will need $doc_obj to set the Identifier metadata...
476 $self->set_OID($doc_obj);
477 } else {
478 # use the default set_OID() in doc.pm
479 $doc_obj->set_OID();
480 }
481
482 # process the document
483 $processor->process($doc_obj);
484
485 $self->{'num_processed'} ++;
486
487 return 1; # processed the file
488}
489
490# returns undef if file is rejected by the plugin
491sub process {
492 my $self = shift (@_);
493 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
494
495 &gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
496 # die "Basplug::process function must be implemented in sub-class\n";
497
498 return undef; # never gets here
499}
500
501# uses the multiread package to read in the entire file pointed to
502# by filename and loads the resulting text into $$textref. Input text
503# may be in any of the encodings handled by multiread, output text
504# will be in utf8
505sub read_file {
506 my $self = shift (@_);
507 my ($filename, $encoding, $language, $textref) = @_;
508
509 if (!-r $filename)
510 {
511 my $outhandle = $self->{'outhandle'};
512 &gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
513 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
514 return;
515 }
516
517 $$textref = "";
518
519 open (FILE, $filename) || (&gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n");
520 # open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
521
522 if ($encoding eq "ascii") {
523 undef $/;
524 $$textref = <FILE>;
525 $/ = "\n";
526 } else {
527 my $reader = new multiread();
528 $reader->set_handle ('BasPlug::FILE');
529 $reader->set_encoding ($encoding);
530 $reader->read_file ($textref);
531
532 #Now segments chinese if the separate_cjk option is set
533 if ($self->{'separate_cjk'}) {
534 # segment the Chinese words
535 $$textref = &cnseg::segment($$textref);
536 }
537 }
538
539 close FILE;
540}
541
542sub filename_based_title
543{
544 my $self = shift (@_);
545 my ($file) = @_;
546
547 my $file_derived_title = $file;
548 $file_derived_title =~ s/_/ /g;
549 $file_derived_title =~ s/\..*?$//;
550
551 return $file_derived_title;
552}
553
554
555sub title_fallback
556{
557 my $self = shift (@_);
558 my ($doc_obj,$section,$file) = @_;
559
560 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
561
562 my $file_derived_title = $self->filename_based_title($file);
563 $doc_obj->add_metadata ($section, "Title", $file_derived_title);
564 }
565}
566
567sub textcat_get_language_encoding {
568 my $self = shift (@_);
569 my ($filename) = @_;
570
571 my ($language, $encoding, $extracted_encoding);
572 if ($self->{'input_encoding'} eq "auto") {
573 # use textcat to automatically work out the input encoding and language
574 ($language, $encoding) = $self->get_language_encoding ($filename);
575 } elsif ($self->{'extract_language'}) {
576 # use textcat to get language metadata
577 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
578 $encoding = $self->{'input_encoding'};
579 # don't print this message for english... english in utf8 is identical
580 # to english in iso-8859-1 (except for some punctuation). We don't have
581 # a language model for en_utf8, so textcat always says iso-8859-1!
582 if ($extracted_encoding ne $encoding && $language ne "en"
583 && $self->{'verbosity'}) {
584 my $plugin_name = ref ($self);
585 my $outhandle = $self->{'outhandle'};
586 &gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
587 # print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
588 # print $outhandle "appears to be encoded as $extracted_encoding.\n";
589 }
590 } else {
591 $language = $self->{'default_language'};
592 $encoding = $self->{'input_encoding'};
593 }
594 return ($language, $encoding);
595}
596
597# Uses textcat to work out the encoding and language of the text in
598# $filename. All html tags are removed before processing.
599# returns an array containing "language" and "encoding"
600sub get_language_encoding {
601 my $self = shift (@_);
602 my ($filename) = @_;
603 my $outhandle = $self->{'outhandle'};
604
605 # read in file
606 open (FILE, $filename) || (&gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
607 undef $/;
608 my $text = <FILE>;
609 $/ = "\n";
610 close FILE;
611
612 # remove <title>stuff</title> -- as titles tend often to be in English
613 # for foreign language documents
614 $text =~ s/<title>.*?<\/title>//i;
615
616 # remove all HTML tags
617 $text =~ s/<[^>]*>//sg;
618
619 # get the language/encoding
620 my $results = $self->{'textcat'}->classify(\$text);
621
622 # if textcat returns 3 or less possibilities we'll use the
623 # first one in the list - otherwise use the defaults
624 if (scalar @$results > 3) {
625 # changed 12 Feb 2003 by jrm21
626 # use the most popular encoding at least... otherwise we might
627 # generate invalid archive files!
628 my %guessed_encodings = ();
629 foreach my $result (@$results) {
630 $result =~ /([^\-]+)$/;
631 my $enc=$1;
632 if (!defined($guessed_encodings{$enc})) {
633 $guessed_encodings{$enc}=0;
634 }
635 $guessed_encodings{$enc}++;
636 }
637 my $best_encoding="";
638 $guessed_encodings{""}=-1;
639 foreach my $enc (keys %guessed_encodings) {
640 if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}){
641 $best_encoding=$enc;
642 }
643 }
644
645 if ($self->{'input_encoding'} ne 'auto') {
646 if ($self->{'extract_language'} && $self->{'verbosity'}) {
647 &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
648 # print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
649 # print $outhandle "defaulting to $self->{'default_language'}\n";
650 }
651 return ($self->{'default_language'}, $self->{'input_encoding'});
652
653 } else {
654 if ($self->{'verbosity'}) {
655 &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
656 # print $outhandle "BASPlug: WARNING: language could not be extracted from $filename - ";
657 # print $outhandle "defaulting to $self->{'default_language'}.\n";
658 }
659 return ($self->{'default_language'}, $best_encoding);
660 }
661 }
662
663 # format language/encoding
664 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
665 if (!defined $language) {
666 if ($self->{'verbosity'}) {
667 &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
668 # print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
669 # print $outhandle "defaulting to $self->{'default_language'}\n";
670 }
671 $language = $self->{'default_language'};
672 }
673 if (!defined $encoding) {
674 if ($self->{'verbosity'}) {
675 &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_encoding}\n", $filename, $self->{'default_encoding'});
676 # print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
677 # print $outhandle "defaulting to $self->{'default_encoding'}\n";
678 }
679 $encoding = $self->{'default_encoding'};
680 }
681
682 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
683 !defined $encodings::encodings->{$encoding}) {
684 if ($self->{'verbosity'}) {
685 &gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n", $filename, $encoding, $self->{'default_encoding'});
686 # print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
687 # print $outhandle "using $self->{'default_encoding'}\n";
688 }
689 $encoding = $self->{'default_encoding'};
690 }
691
692 return ($language, $encoding);
693}
694
695# add any extra metadata that's been passed around from one
696# plugin to another.
697# extra_metadata uses add_utf8_metadata so it expects metadata values
698# to already be in utf8
699sub extra_metadata {
700 my $self = shift (@_);
701 my ($doc_obj, $cursection, $metadata) = @_;
702
703 foreach my $field (keys(%$metadata)) {
704 # $metadata->{$field} may be an array reference
705 if (ref ($metadata->{$field}) eq "ARRAY") {
706 map {
707 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
708 } @{$metadata->{$field}};
709 } else {
710 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
711 }
712 }
713}
714
715# initialise metadata extractors
716sub initialise_extractors {
717 my $self = shift (@_);
718
719 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
720 &acronym::initialise_acronyms();
721 }
722}
723
724# finalise metadata extractors
725sub finalise_extractors {
726 my $self = shift (@_);
727
728 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
729 &acronym::finalise_acronyms();
730 }
731}
732
733# FIRSTNNN: extract the first NNN characters as metadata
734sub extract_first_NNNN_characters {
735 my $self = shift (@_);
736 my ($textref, $doc_obj, $thissection) = @_;
737
738 foreach my $size (split /,/, $self->{'first'}) {
739 my $tmptext = $$textref;
740 $tmptext =~ s/^\s+//;
741 $tmptext =~ s/\s+$//;
742 $tmptext =~ s/\s+/ /gs;
743 $tmptext = substr ($tmptext, 0, $size);
744 $tmptext =~ s/\s\S*$/&#8230;/;
745 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
746 }
747}
748
749sub extract_email {
750 my $self = shift (@_);
751 my ($textref, $doc_obj, $thissection) = @_;
752 my $outhandle = $self->{'outhandle'};
753
754 # print $outhandle " extracting email addresses ...\n"
755 &gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
756 if ($self->{'verbosity'} > 2);
757
758 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
759 @email = sort @email;
760
761 my @email2 = ();
762 foreach my $address (@email) {
763 if (!(join(" ",@email2) =~ m/$address/ )) {
764 push @email2, $address;
765 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
766 # print $outhandle " extracting $address\n"
767 &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
768 if ($self->{'verbosity'} > 3);
769 }
770 }
771 # print $outhandle " done extracting email addresses.\n"
772 &gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
773 if ($self->{'verbosity'} > 2);
774}
775
776# extract metadata
777sub auto_extract_metadata {
778
779 my $self = shift (@_);
780 my ($doc_obj) = @_;
781
782 if ($self->{'extract_email'}) {
783 my $thissection = $doc_obj->get_top_section();
784 while (defined $thissection) {
785 my $text = $doc_obj->get_text($thissection);
786 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
787 $thissection = $doc_obj->get_next_section ($thissection);
788 }
789 }
790
791
792#adding kea keyphrases
793 if ($self->{'kea'}) {
794
795 my $thissection = $doc_obj->get_top_section();
796 my $text = "";
797 my @list;
798
799 while (defined $thissection) { #loop through sections to gather whole doc
800 my $sectiontext = $doc_obj->get_text($thissection);
801 $text = $text.$sectiontext;
802 $thissection = $doc_obj->get_next_section ($thissection);
803 }
804
805 if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
806 @list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
807 } else { #otherwise call Kea with no options
808 @list = &Kea::extract_KeyPhrases ($text);
809 }
810
811 if(@list){ #if a list of kea keyphrases was returned (ie not empty)
812 my $keyphrases = $list[0]; #first arg is keyphrase list
813 my $stems = $list[1]; #second arg is stemmed keyphrase list
814 &gsprintf(STDERR, "{BasPlug.keyphrases}: $keyphrases\n");
815 # print STDERR "keyphrases: $keyphrases\n";
816 &gsprintf(STDERR, "{BasPlug.stems}: $stems\n");
817 # print STDERR "stems: $stems\n";
818 $thissection = $doc_obj->get_top_section(); #add metadata to top section
819 $doc_obj->add_metadata($thissection, "kea", $keyphrases);
820 $doc_obj->add_metadata($thissection, "stems", $stems);
821 }
822 } #end of kea
823
824 if ($self->{'first'}) {
825 my $thissection = $doc_obj->get_top_section();
826 while (defined $thissection) {
827 my $text = $doc_obj->get_text($thissection);
828 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
829 $thissection = $doc_obj->get_next_section ($thissection);
830 }
831 }
832
833 if ($self->{'extract_acronyms'}) {
834 my $thissection = $doc_obj->get_top_section();
835 while (defined $thissection) {
836 my $text = $doc_obj->get_text($thissection);
837 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
838 $thissection = $doc_obj->get_next_section ($thissection);
839 }
840 }
841
842 if ($self->{'markup_acronyms'}) {
843 my $thissection = $doc_obj->get_top_section();
844 while (defined $thissection) {
845 my $text = $doc_obj->get_text($thissection);
846 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
847 $doc_obj->delete_text($thissection);
848 $doc_obj->add_text($thissection, $text);
849 $thissection = $doc_obj->get_next_section ($thissection);
850 }
851 }
852
853 if($self->{'date_extract'}) {
854 my $thissection = $doc_obj->get_top_section();
855 while (defined $thissection) {
856
857 my $text = $doc_obj->get_text($thissection);
858 &DateExtract::get_date_metadata($text, $doc_obj,
859 $thissection,
860 $self->{'no_biblio'},
861 $self->{'max_year'},
862 $self->{'max_century'});
863 $thissection = $doc_obj->get_next_section ($thissection);
864 }
865 }
866}
867
868# extract acronyms from a section in a document. progress is
869# reported to outhandle based on the verbosity. both the Acronym
870# and the AcronymKWIC metadata items are created.
871
872sub extract_acronyms {
873 my $self = shift (@_);
874 my ($textref, $doc_obj, $thissection) = @_;
875 my $outhandle = $self->{'outhandle'};
876
877 # print $outhandle " extracting acronyms ...\n"
878 &gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
879 if ($self->{'verbosity'} > 2);
880
881 my $acro_array = &acronym::acronyms($textref);
882
883 foreach my $acro (@$acro_array) {
884
885 #check that this is the first time ...
886 my $seen_before = "false";
887 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
888 foreach my $thisAcro (@$previous_data) {
889 if ($thisAcro eq $acro->to_string()) {
890 $seen_before = "true";
891 # print $outhandle " already seen ". $acro->to_string() . "\n"
892 &gsprintf($outhandle, " {BasPlug.already_seen} " . $acro->to_string() . "\n")
893 if ($self->{'verbosity'} >= 4);
894 }
895 }
896
897 if ($seen_before eq "false") {
898 #write it to the file ...
899 $acro->write_to_file();
900
901 #do the normal acronym
902 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
903 # print $outhandle " adding ". $acro->to_string() . "\n"
904 &gsprintf($outhandle, " {BasPlug.adding} " . $acro->to_string() . "\n")
905 if ($self->{'verbosity'} > 3);
906 }
907 }
908
909 # print $outhandle " done extracting acronyms. \n"
910 &gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
911 if ($self->{'verbosity'} > 2);
912}
913
914sub markup_acronyms {
915 my $self = shift (@_);
916 my ($text, $doc_obj, $thissection) = @_;
917 my $outhandle = $self->{'outhandle'};
918
919 # print $outhandle " marking up acronyms ...\n"
920 &gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
921 if ($self->{'verbosity'} > 2);
922
923 #self is passed in to check for verbosity ...
924 $text = &acronym::markup_acronyms($text, $self);
925
926 # print $outhandle " done marking up acronyms. \n"
927 &gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
928 if ($self->{'verbosity'} > 2);
929
930 return $text;
931}
932
933sub compile_stats {
934 my $self = shift(@_);
935 my ($stats) = @_;
936
937 $stats->{'num_processed'} += $self->{'num_processed'};
938 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
939 $stats->{'num_archives'} += $self->{'num_archives'};
940
941}
942
943sub associate_cover_image {
944 my $self = shift(@_);
945 my ($doc_obj, $filename) = @_;
946
947 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
948 if (-e $filename) {
949 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
950 } else {
951 $filename =~ s/jpg$/JPG/;
952 if (-e $filename) {
953 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
954 }
955 }
956}
957
9581;
Note: See TracBrowser for help on using the repository browser.