source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 4873

Last change on this file since 4873 was 4873, checked in by mdewsnip, 21 years ago

Further work on standardising option descriptions. Specifically, in preparation for translating the option descriptions into other languages, all the option description strings have been moved in a "resource bundle" file (modelled on a Java resource bundle). (This also has the advantage of reducing the number of duplicate descriptions). The option descriptions in the plugins, classifiers, mkcol.pl, import.pl and buildcol.pl have been replaced with keys into this resource bundle (perllib/strings.rb). When translating the strings in this file into a new language, the new resource bundle should be named strings_<language-code>.rb (where <language-code> is a combination of language and country, eg. 'fr_FR' for the version of French spoken in France).

To support these changes, the PrintUsage module (perllib/printusage.pm) has new code for reading resource bundles and displaying the correct strings. Also, pluginfo.pl, classinfo.pl, mkcol.pl, import.pl and buildcol.pl have a new option (-language) for specifying the language code to display option descriptions in.

If a resource bundle for the specified language code does not exist, a generic resource bundle is used (strings.rb). This currently contains the English text descriptions. However, for users who always use Greenstone in another language, it would be easier to rename the standard file to strings_en_US.rb and rename the resource bundle of their desired language to strings.rb. This would mean they would not have to constantly specify their language with the -language option, since the default resource bundle will suit them.

Currently, the encoding names (in encodings.pm) are not part of this scheme. These are displayed as part of BasPlug's input_encoding option. It is debatable whether these names would be worth translating into other languages.

Parse errors in plugins and classifiers currently cause them to display the usage information using the default resource bundle. It is likely that BasPlug will soon have an option added to specify the language for the usage information in this case. (Note that this does not include using pluginfo.pl or classinfo.pl to display usage information - these have a -language option).

  • Property svn:keywords set to Author Date Id Revision
File size: 33.0 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use Kea;
35use parsargv;
36use multiread;
37use encodings;
38use cnseg;
39use acronym;
40use textcat;
41use doc;
42use diagnostics;
43use DateExtract;
44use ghtml;
45use printusage;
46
47my $unicode_list =
48 [ { 'name' => "auto",
49 'desc' => "{BasPlug.input_encoding.auto}" },
50 { 'name' => "ascii",
51 'desc' => "{BasPlug.input_encoding.ascii}" },
52 { 'name' => "utf8",
53 'desc' => "{BasPlug.input_encoding.utf8}" },
54 { 'name' => "unicode",
55 'desc' => "{BasPlug.input_encoding.unicode}" } ];
56
57my $arguments =
58 [ { 'name' => "process_exp",
59 'desc' => "{BasPlug.process_exp}",
60 'type' => "string",
61 'deft' => "",
62 'reqd' => "no" },
63 { 'name' => "block_exp",
64 'desc' => "{BasPlug.block_exp}",
65 'type' => "string",
66 'deft' => "",
67 'reqd' => "no" },
68 { 'name' => "input_encoding",
69 'desc' => "{BasPlug.input_encoding}",
70 'type' => "enum",
71 'list' => $unicode_list,
72 'reqd' => "no" ,
73 'deft' => "auto" } ,
74 { 'name' => "default_encoding",
75 'desc' => "{BasPlug.default_encoding}",
76 'type' => "enum",
77 'reqd' => "no",
78 'deft' => "utf8" },
79 { 'name' => "extract_language",
80 'desc' => "{BasPlug.extract_language}",
81 'type' => "flag",
82 'reqd' => "no" },
83 { 'name' => "default_language",
84 'desc' => "{BasPlug.default_language}",
85 'type' => "language",
86 'deft' => "en",
87 'reqd' => "no" },
88 { 'name' => "extract_acronyms",
89 'desc' => "{BasPlug.extract_acronyms}",
90 'type' => "flag",
91 'reqd' => "no" },
92 { 'name' => "markup_acronyms",
93 'desc' => "{BasPlug.markup_acronyms}",
94 'type' => "flag",
95 'reqd' => "no" },
96 { 'name' => "first",
97 'desc' => "{BasPlug.first}",
98 'type' => "string",
99 'reqd' => "no" },
100 { 'name' => "extract_email",
101 'desc' => "{BasPlug.extract_email}",
102 'type' => "flag",
103 'reqd' => "no" },
104 { 'name' => "extract_historical_years",
105 'desc' => "{BasPlug.extract_historical_years}",
106 'type' => "flag",
107 'reqd' => "no" },
108 { 'name' => "maximum_year",
109 'desc' => "{BasPlug.maximum_year}",
110 'type' => "int",
111 'deft' => (localtime)[5]+1900,
112 'reqd' => "no"},
113 { 'name' => "maximum_century",
114 'desc' => "{BasPlug.maximum_century}",
115 'type' => "int",
116 'deft' => "-1",
117 'reqd' => "no" },
118 { 'name' => "no_bibliography",
119 'desc' => "{BasPlug.no_bibliography}",
120 'type' => "flag",
121 'reqd' => "no"},
122 { 'name' => "cover_image",
123 'desc' => "{BasPlug.cover_image}",
124 'type' => "flag",
125 'reqd' => "no" } ];
126
127my $options = { 'name' => "BasPlug",
128 'desc' => "Base class for all the import plugins.",
129 'inherits' => "No",
130 'args' => $arguments };
131
132
133sub get_arguments
134{
135 local $self = shift(@_);
136 local $optionlistref = $self->{'option_list'};
137 local @optionlist = @$optionlistref;
138 local $pluginoptions = pop(@$optionlistref);
139 local $pluginarguments = $pluginoptions->{'args'};
140 return $pluginarguments;
141}
142
143
144sub print_xml_usage
145{
146 local $self = shift(@_);
147 local $language = shift(@_);
148
149 &PrintUsage::print_xml_header();
150 $self->print_xml($language);
151}
152
153
154sub print_xml
155{
156 local $self = shift(@_);
157 local $language = shift(@_);
158
159 local $optionlistref = $self->{'option_list'};
160 local @optionlist = @$optionlistref;
161 local $pluginoptions = pop(@$optionlistref);
162 return if (!defined($pluginoptions));
163
164 print STDERR "<PlugInfo>\n";
165 print STDERR " <Name>$pluginoptions->{'name'}</Name>\n";
166 print STDERR " <Desc>$pluginoptions->{'desc'}</Desc>\n";
167 print STDERR " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n";
168 print STDERR " <Arguments>\n";
169 if (defined($pluginoptions->{'args'})) {
170 &PrintUsage::print_options_xml($language, $pluginoptions->{'args'});
171 }
172
173 # Recurse up the plugin hierarchy
174 $self->print_xml($language);
175
176 print STDERR " </Arguments>\n";
177 print STDERR "</PlugInfo>\n";
178}
179
180
181sub print_txt_usage
182{
183 local $self = shift(@_);
184 local $language = shift(@_);
185
186 # Print the usage message for a plugin (recursively)
187 local $descoffset = $self->determine_description_offset(0);
188 $self->print_plugin_usage($language, $descoffset, 1);
189}
190
191
192sub determine_description_offset
193{
194 local $self = shift(@_);
195 local $maxoffset = shift(@_);
196
197 local $optionlistref = $self->{'option_list'};
198 local @optionlist = @$optionlistref;
199 local $pluginoptions = pop(@$optionlistref);
200 return $maxoffset if (!defined($pluginoptions));
201
202 # Find the length of the longest option string of this plugin
203 local $pluginargs = $pluginoptions->{'args'};
204 if (defined($pluginargs)) {
205 local $longest = &PrintUsage::find_longest_option_string($pluginargs);
206 if ($longest > $maxoffset) {
207 $maxoffset = $longest;
208 }
209 }
210
211 # Recurse up the plugin hierarchy
212 $maxoffset = $self->determine_description_offset($maxoffset);
213 $self->{'option_list'} = \@optionlist;
214 return $maxoffset;
215}
216
217
218sub print_plugin_usage
219{
220 local $self = shift(@_);
221 local $language = shift(@_);
222 local $descoffset = shift(@_);
223 local $isleafclass = shift(@_);
224
225 local $optionlistref = $self->{'option_list'};
226 local @optionlist = @$optionlistref;
227 local $pluginoptions = pop(@$optionlistref);
228 return if (!defined($pluginoptions));
229
230 local $pluginname = $pluginoptions->{'name'};
231 local $pluginargs = $pluginoptions->{'args'};
232
233 # Produce the usage information using the data structure above
234 if ($isleafclass) {
235 print STDERR " usage: plugin $pluginname [options]\n\n";
236 }
237
238 # Display the plugin options, if there are some
239 if (defined($pluginargs)) {
240 # Calculate the column offset of the option descriptions
241 local $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
242
243 if ($isleafclass) {
244 print STDERR " specific options:\n";
245 }
246 else {
247 print STDERR " general options (from $pluginname):\n";
248 }
249
250 # Display the plugin options
251 &PrintUsage::print_options_txt($language, $pluginargs, $optiondescoffset);
252 }
253
254 # Recurse up the plugin hierarchy
255 $self->print_plugin_usage($language, $descoffset, 0);
256 $self->{'option_list'} = \@optionlist;
257}
258
259
260# sub print_general_usage {
261# my ($plugin_name) = @_;
262
263# print STDERR "\n usage: plugin $plugin_name [options]\n\n";
264
265# print STDERR " -process_exp A perl regular expression to match against filenames.\n";
266# print STDERR " Matching filenames will be processed by this plugin.\n";
267# print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
268# print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
269# print STDERR " .htm or .html (case-insensitive).\n\n";
270
271# print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
272# print STDERR " being passed to any later plugins in the list. This has no\n";
273# print STDERR " real effect other than to prevent lots of warning messages\n";
274# print STDERR " about input files you don't care about. Each plugin might\n";
275# print STDERR " have a default block_exp. e.g. by default HTMLPlug blocks\n";
276# print STDERR " any files with .gif, .jpg, .jpeg, .png or .css\n";
277# print STDERR " file extensions.\n\n";
278
279
280# print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
281# print STDERR " converted from these encodings and stored internally as\n";
282# print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n";
283# print STDERR " are:\n";
284
285# print STDERR " auto: Use text categorization algorithm to automatically\n";
286# print STDERR " identify the encoding of each source document. This\n";
287# print STDERR " will be slower than explicitly setting the encoding\n";
288# print STDERR " but will work where more than one encoding is used\n";
289# print STDERR " within the same collection.\n";
290
291# print STDERR " ascii: Plain 7 bit ascii. This may be a bit faster than\n";
292# print STDERR " using iso_8859_1. Beware of using this on a collection\n";
293# print STDERR " of documents that may contain characters outside the\n";
294# print STDERR " plain 7 bit ascii set though (e.g. German or French\n";
295# print STDERR " documents containing accents), use iso_8859_1 instead.\n";
296
297# print STDERR " utf8: either utf8 or unicode -- automatically detected\n";
298# print STDERR " unicode: just unicode\n";
299
300# my $e = $encodings::encodings;
301# foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
302# print STDERR " $enc: $e->{$enc}->{'name'}\n";
303# }
304# print STDERR "\n";
305# print STDERR " -default_encoding Use this encoding if -input_encoding is set to 'auto' and\n";
306# print STDERR " the text categorization algorithm fails to extract the\n";
307# print STDERR " encoding or extracts an encoding unsupported by Greenstone.\n";
308# print STDERR " The default is iso_8859_1.\n\n";
309
310# print STDERR " -extract_language Identify the language of each document and set 'Language'\n";
311# print STDERR " metadata. Note that this will be done automatically if\n";
312# print STDERR " -input_encoding is 'auto'.\n\n";
313# print STDERR " -default_language If Greenstone fails to work out what language a document is\n";
314# print STDERR " the 'Language' metadata element will be set to this value.\n";
315# print STDERR " The default is 'en' (ISO 639 language symbols are used:\n";
316# print STDERR " en = English). Note that if -input_encoding is not set to\n";
317# print STDERR " 'auto' and -extract_language is not set, all documents will\n";
318# print STDERR " have their 'Language' metadata set to this value.\n\n";
319
320# print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n";
321
322# print STDERR " -markup_acronyms Add acronym metadata into document text\n\n";
323
324# print STDERR " -first Comma separated list of first sizes to extract from the\n";
325# print STDERR " text into a metadata field. The field is called 'FirstNNN'.\n\n";
326
327# print STDERR " -extract_email Extract email addresses as metadata\n\n";
328
329# print STDERR " -extract_historical_years Extract time-period information from historical\n";
330# print STDERR " documents. This is stored as metadata with the document.\n";
331# print STDERR " There is a search interface for this metadata, which you \n";
332# print STDERR " can include in your collection by adding the statement:\n";
333# print STDERR " format QueryInterface DateSearch\n";
334# print STDERR " to your collection configuration file\n";
335# print STDERR " -maximum_year The maximum historical date to be used as metadata (in a\n";
336# print STDERR " Common Era date, such as 1950)\n";
337# print STDERR " -maximum_century The maximum named century to be extracted as historical\n";
338# print STDERR " metadata (e.g. 14 will extract all references up to the\n";
339# print STDERR " 14th century)\n";
340# print STDERR " -no_bibliography Do not try and block bibliographic dates when extracting\n";
341# print STDERR " historical dates.\n";
342# print STDERR " -cover_image Will look for a prefix.jpg file (where prefix is the same\n";
343# print STDERR " prefix as the file being processed) and associate it as a\n";
344# print STDERR " cover image\n\n";
345# }
346
347# sub print_usage {
348# print STDERR "\nThis plugin has no plugin specific options\n\n";
349# }
350
351sub new {
352 my $class = shift (@_);
353 my $plugin_name = shift (@_);
354 my $self = {};
355
356 my $enc = "^(";
357 map {$enc .= "$_|";} keys %$encodings::encodings;
358 my $denc = $enc . "ascii|utf8|unicode)\$";
359 $enc .= "ascii|utf8|unicode|auto)\$";
360
361 $self->{'outhandle'} = STDERR;
362 my $year = (localtime)[5]+1900;
363
364 $self->{'textcat'} = new textcat();
365
366 $self->{'num_processed'} = 0;
367 $self->{'num_not_processed'} = 0;
368 $self->{'num_blocked'} = 0;
369 $self->{'num_archives'} = 0;
370
371 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
372 $self->{'option_list'} = [ $options ];
373
374 # general options available to all plugins
375 if (!parsargv::parse(\@_,
376 q^process_exp/.*/^, \$self->{'process_exp'},
377 q^block_exp/.*/^, \$self->{'block_exp'},
378 q^extract_language^, \$self->{'extract_language'},
379 q^extract_acronyms^, \$self->{'extract_acronyms'},
380 q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
381 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
382 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
383 qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
384 q^extract_email^, \$self->{'extract_email'},
385 q^markup_acronyms^, \$self->{'markup_acronyms'},
386 q^default_language/.{2}/en^, \$self->{'default_language'},
387 q^first/.*/^, \$self->{'first'},
388 q^extract_historical_years^, \$self->{'date_extract'},
389 qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
390 q^no_bibliography^, \$self->{'no_biblio'},
391 qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
392 q^cover_image^, \$self->{'cover_image'},
393 "allow_extra_options")) {
394
395 print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
396 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
397 $self->print_txt_usage(""); # Use default resource bundle
398 die "\n";
399 }
400
401 return bless $self, $class;
402}
403
404# initialize BasPlug options
405# if init() is overridden in a sub-class, remember to call BasPlug::init()
406sub init {
407 my $self = shift (@_);
408 my ($verbosity, $outhandle, $failhandle) = @_;
409
410 # verbosity is passed through from the processor
411 $self->{'verbosity'} = $verbosity;
412
413 # as are the outhandle and failhandle
414 $self->{'outhandle'} = $outhandle if defined $outhandle;
415 $self->{'failhandle'} = $failhandle;
416
417 # set process_exp and block_exp to defaults unless they were
418 # explicitly set
419
420 if ((!$self->is_recursive()) and
421 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
422
423 $self->{'process_exp'} = $self->get_default_process_exp ();
424 if ($self->{'process_exp'} eq "") {
425 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
426 }
427 }
428
429 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
430 $self->{'block_exp'} = $self->get_default_block_exp ();
431 }
432}
433
434sub begin {
435 my $self = shift (@_);
436 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
437 $self->initialise_extractors();
438}
439
440sub end {
441 my ($self) = @_;
442 $self->finalise_extractors();
443}
444
445# this function should be overridden to return 1
446# in recursive plugins
447sub is_recursive {
448 my $self = shift (@_);
449
450 return 0;
451}
452
453sub get_default_block_exp {
454 my $self = shift (@_);
455
456 return "";
457}
458
459sub get_default_process_exp {
460 my $self = shift (@_);
461
462 return "";
463}
464
465# The BasPlug read() function. This function does all the right things
466# to make general options work for a given plugin. It calls the process()
467# function which does all the work specific to a plugin (like the old
468# read functions used to do). Most plugins should define their own
469# process() function and let this read() function keep control.
470#
471# recursive plugins (e.g. RecPlug) and specialized plugins like those
472# capable of processing many documents within a single file (e.g.
473# GMLPlug) should normally implement their own version of read()
474#
475# Return number of files processed, undef if can't process
476# Note that $base_dir might be "" and that $file might
477# include directories
478
479sub read {
480 my $self = shift (@_);
481
482 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
483
484 if ($self->is_recursive()) {
485 die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
486 }
487
488 my $outhandle = $self->{'outhandle'};
489
490 my $filename = $file;
491 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
492
493 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
494 $self->{'num_blocked'} ++;
495 return 0;
496 }
497 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
498 return undef;
499 }
500 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
501
502 # Do encoding stuff
503 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
504
505 # create a new document
506 my $doc_obj = new doc ($filename, "indexed_doc");
507 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
508 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
509 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
510 my ($filemeta) = $file =~ /([^\\\/]+)$/;
511 # how do we know what encoding the filename is in?
512 $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
513 if ($self->{'cover_image'}) {
514 $self->associate_cover_image($doc_obj, $filename);
515 }
516
517 # read in file ($text will be in utf8)
518 my $text = "";
519 $self->read_file ($filename, $encoding, $language, \$text);
520
521 if (!length ($text)) {
522 my $plugin_name = ref ($self);
523 print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
524
525 my $failhandle = $self->{'failhandle'};
526 print $failhandle "$file: " . ref($self) . ": file contains no text\n";
527 $self->{'num_not_processed'} ++;
528
529 return 0;
530 }
531
532 # include any metadata passed in from previous plugins
533 # note that this metadata is associated with the top level section
534 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
535
536 # do plugin specific processing of doc_obj
537 return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
538
539 # do any automatic metadata extraction
540 $self->auto_extract_metadata ($doc_obj);
541
542 # add an OID
543 # see if there is a plugin-specific set_OID function...
544 if (defined ($self->can(set_OID))) {
545 # it will need $doc_obj to set the Identifier metadata...
546 $self->set_OID($doc_obj);
547 } else {
548 # use the default set_OID() in doc.pm
549 $doc_obj->set_OID();
550 }
551
552 # process the document
553 $processor->process($doc_obj);
554
555 $self->{'num_processed'} ++;
556
557 return 1; # processed the file
558}
559
560# returns undef if file is rejected by the plugin
561sub process {
562 my $self = shift (@_);
563 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
564
565 die "Basplug::process function must be implemented in sub-class\n";
566
567 return undef; # never gets here
568}
569
570# uses the multiread package to read in the entire file pointed to
571# by filename and loads the resulting text into $$textref. Input text
572# may be in any of the encodings handled by multiread, output text
573# will be in utf8
574sub read_file {
575 my $self = shift (@_);
576 my ($filename, $encoding, $language, $textref) = @_;
577
578 if (!-r $filename)
579 {
580 my $outhandle = $self->{'outhandle'};
581 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
582 return;
583 }
584
585 $$textref = "";
586
587 open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
588
589 if ($encoding eq "ascii") {
590 undef $/;
591 $$textref = <FILE>;
592 $/ = "\n";
593 } else {
594 my $reader = new multiread();
595 $reader->set_handle ('BasPlug::FILE');
596 $reader->set_encoding ($encoding);
597 $reader->read_file ($textref);
598
599 if ($language eq "zh") {
600 # segment the Chinese words
601 $$textref = &cnseg::segment($$textref);
602 }
603 }
604
605 close FILE;
606}
607
608sub textcat_get_language_encoding {
609 my $self = shift (@_);
610 my ($filename) = @_;
611
612 my ($language, $encoding, $extracted_encoding);
613 if ($self->{'input_encoding'} eq "auto") {
614 # use textcat to automatically work out the input encoding and language
615 ($language, $encoding) = $self->get_language_encoding ($filename);
616 } elsif ($self->{'extract_language'}) {
617 # use textcat to get language metadata
618 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
619 $encoding = $self->{'input_encoding'};
620 if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
621 my $plugin_name = ref ($self);
622 my $outhandle = $self->{'outhandle'};
623 print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
624 print $outhandle "appears to be encoded as $extracted_encoding.\n";
625 }
626 } else {
627 $language = $self->{'default_language'};
628 $encoding = $self->{'input_encoding'};
629 }
630 return ($language, $encoding);
631}
632
633# Uses textcat to work out the encoding and language of the text in
634# $filename. All html tags are removed before processing.
635# returns an array containing "language" and "encoding"
636sub get_language_encoding {
637 my $self = shift (@_);
638 my ($filename) = @_;
639 my $outhandle = $self->{'outhandle'};
640
641 # read in file
642 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
643 undef $/;
644 my $text = <FILE>;
645 $/ = "\n";
646 close FILE;
647
648 # remove <title>stuff</title> -- as titles tend often to be in English
649 # for foreign language documents
650 $text =~ s/<title>.*?<\/title>//i;
651
652 # remove all HTML tags
653 $text =~ s/<[^>]*>//sg;
654
655 # get the language/encoding
656 my $results = $self->{'textcat'}->classify(\$text);
657
658 # if textcat returns 3 or less possibilities we'll use the
659 # first one in the list - otherwise use the defaults
660 if (scalar @$results > 3) {
661 # changed 12 Feb 2003 by jrm21
662 # use the most popular encoding at least... otherwise we might
663 # generate invalid archive files!
664 my %guessed_encodings = ();
665 foreach my $result (@$results) {
666 $result =~ /([^\-]+)$/;
667 my $enc=$1;
668 if (!defined($guessed_encodings{$enc})) {
669 $guessed_encodings{$enc}=0;
670 }
671 $guessed_encodings{$enc}++;
672 }
673 my $best_encoding="";
674 $guessed_encodings{""}=-1;
675 foreach my $enc (keys %guessed_encodings) {
676 if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}){
677 $best_encoding=$enc;
678 }
679 }
680
681 if ($self->{'input_encoding'} ne 'auto') {
682 if ($self->{'extract_language'} && $self->{'verbosity'}) {
683 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
684 print $outhandle "defaulting to $self->{'default_language'}\n";
685 }
686 return ($self->{'default_language'}, $self->{'input_encoding'});
687
688 } else {
689 if ($self->{'verbosity'}) {
690 print $outhandle "BASPlug: WARNING: language could not be extracted from $filename - ";
691 print $outhandle "defaulting to $self->{'default_language'}.\n";
692 }
693 return ($self->{'default_language'}, $best_encoding);
694 }
695 }
696
697 # format language/encoding
698 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
699 if (!defined $language) {
700 if ($self->{'verbosity'}) {
701 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
702 print $outhandle "defaulting to $self->{'default_language'}\n";
703 }
704 $language = $self->{'default_language'};
705 }
706 if (!defined $encoding) {
707 if ($self->{'verbosity'}) {
708 print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
709 print $outhandle "defaulting to $self->{'default_encoding'}\n";
710 }
711 $encoding = $self->{'default_encoding'};
712 }
713
714 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
715 !defined $encodings::encodings->{$encoding}) {
716 if ($self->{'verbosity'}) {
717 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
718 print $outhandle "using $self->{'default_encoding'}\n";
719 }
720 $encoding = $self->{'default_encoding'};
721 }
722
723 return ($language, $encoding);
724}
725
726# add any extra metadata that's been passed around from one
727# plugin to another.
728# extra_metadata uses add_utf8_metadata so it expects metadata values
729# to already be in utf8
730sub extra_metadata {
731 my $self = shift (@_);
732 my ($doc_obj, $cursection, $metadata) = @_;
733
734 foreach my $field (keys(%$metadata)) {
735 # $metadata->{$field} may be an array reference
736 if (ref ($metadata->{$field}) eq "ARRAY") {
737 map {
738 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
739 } @{$metadata->{$field}};
740 } else {
741 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
742 }
743 }
744}
745
746# initialise metadata extractors
747sub initialise_extractors {
748 my $self = shift (@_);
749
750 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
751 &acronym::initialise_acronyms();
752 }
753}
754
755# finalise metadata extractors
756sub finalise_extractors {
757 my $self = shift (@_);
758
759 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
760 &acronym::finalise_acronyms();
761 }
762}
763
764# FIRSTNNN: extract the first NNN characters as metadata
765sub extract_first_NNNN_characters {
766 my $self = shift (@_);
767 my ($textref, $doc_obj, $thissection) = @_;
768
769 foreach my $size (split /,/, $self->{'first'}) {
770 my $tmptext = $$textref;
771 $tmptext =~ s/^\s+//;
772 $tmptext =~ s/\s+$//;
773 $tmptext =~ s/\s+/ /gs;
774 $tmptext = substr ($tmptext, 0, $size);
775 $tmptext =~ s/\s\S*$/&#8230;/;
776 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
777 }
778}
779
780sub extract_email {
781 my $self = shift (@_);
782 my ($textref, $doc_obj, $thissection) = @_;
783 my $outhandle = $self->{'outhandle'};
784
785 print $outhandle " extracting email addresses ...\n"
786 if ($self->{'verbosity'} > 2);
787
788 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
789 @email = sort @email;
790
791 my @email2 = ();
792 foreach my $address (@email) {
793 if (!(join(" ",@email2) =~ m/$address/ )) {
794 push @email2, $address;
795 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
796 print $outhandle " extracting $address\n"
797 if ($self->{'verbosity'} > 3);
798 }
799 }
800 print $outhandle " done extracting email addresses.\n"
801 if ($self->{'verbosity'} > 2);
802
803}
804
805# extract metadata
806sub auto_extract_metadata {
807
808
809 my $self = shift (@_);
810 my ($doc_obj) = @_;
811
812 if ($self->{'extract_email'}) {
813 my $thissection = $doc_obj->get_top_section();
814 while (defined $thissection) {
815 my $text = $doc_obj->get_text($thissection);
816 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
817 $thissection = $doc_obj->get_next_section ($thissection);
818 }
819 }
820
821
822#adding kea keyphrases
823 if ($self->{'kea'}) {
824
825 my $thissection = $doc_obj->get_top_section();
826 my $text = "";
827 my @list;
828
829 while (defined $thissection) { #loop through sections to gather whole doc
830 my $sectiontext = $doc_obj->get_text($thissection);
831 $text = $text.$sectiontext;
832 $thissection = $doc_obj->get_next_section ($thissection);
833 }
834
835 if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
836 @list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
837 } else { #otherwise call Kea with no options
838 @list = &Kea::extract_KeyPhrases ($text);
839 }
840
841 if(@list){ #if a list of kea keyphrases was returned (ie not empty)
842 my $keyphrases = $list[0]; #first arg is keyphrase list
843 my $stems = $list[1]; #second arg is stemmed keyphrase list
844 print STDERR "keyphrases: $keyphrases\n";
845 print STDERR "stems: $stems\n";
846 $thissection = $doc_obj->get_top_section(); #add metadata to top section
847 $doc_obj->add_metadata($thissection, "kea", $keyphrases);
848 $doc_obj->add_metadata($thissection, "stems", $stems);
849 }
850 } #end of kea
851
852 if ($self->{'first'}) {
853 my $thissection = $doc_obj->get_top_section();
854 while (defined $thissection) {
855 my $text = $doc_obj->get_text($thissection);
856 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
857 $thissection = $doc_obj->get_next_section ($thissection);
858 }
859 }
860
861 if ($self->{'extract_acronyms'}) {
862 my $thissection = $doc_obj->get_top_section();
863 while (defined $thissection) {
864 my $text = $doc_obj->get_text($thissection);
865 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
866 $thissection = $doc_obj->get_next_section ($thissection);
867 }
868 }
869
870 if ($self->{'markup_acronyms'}) {
871 my $thissection = $doc_obj->get_top_section();
872 while (defined $thissection) {
873 my $text = $doc_obj->get_text($thissection);
874 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
875 $doc_obj->delete_text($thissection);
876 $doc_obj->add_text($thissection, $text);
877 $thissection = $doc_obj->get_next_section ($thissection);
878 }
879 }
880
881 if($self->{'date_extract'}) {
882 my $thissection = $doc_obj->get_top_section();
883 while (defined $thissection) {
884
885 my $text = $doc_obj->get_text($thissection);
886 &DateExtract::get_date_metadata($text, $doc_obj,
887 $thissection,
888 $self->{'no_biblio'},
889 $self->{'max_year'},
890 $self->{'max_century'});
891 $thissection = $doc_obj->get_next_section ($thissection);
892 }
893 }
894}
895
896# extract acronyms from a section in a document. progress is
897# reported to outhandle based on the verbosity. both the Acronym
898# and the AcronymKWIC metadata items are created.
899
900sub extract_acronyms {
901 my $self = shift (@_);
902 my ($textref, $doc_obj, $thissection) = @_;
903 my $outhandle = $self->{'outhandle'};
904
905 print $outhandle " extracting acronyms ...\n"
906 if ($self->{'verbosity'} > 2);
907
908 my $acro_array = &acronym::acronyms($textref);
909
910 foreach my $acro (@$acro_array) {
911
912 #check that this is the first time ...
913 my $seen_before = "false";
914 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
915 foreach my $thisAcro (@$previous_data) {
916 if ($thisAcro eq $acro->to_string()) {
917 $seen_before = "true";
918 print $outhandle " already seen ". $acro->to_string() . "\n"
919 if ($self->{'verbosity'} >= 4);
920 }
921 }
922
923 if ($seen_before eq "false") {
924 #write it to the file ...
925 $acro->write_to_file();
926
927 #do the normal acronym
928 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
929 print $outhandle " adding ". $acro->to_string() . "\n"
930 if ($self->{'verbosity'} > 3);
931
932 }
933 }
934 print $outhandle " done extracting acronyms. \n"
935 if ($self->{'verbosity'} > 2);
936}
937
938sub markup_acronyms {
939 my $self = shift (@_);
940 my ($text, $doc_obj, $thissection) = @_;
941 my $outhandle = $self->{'outhandle'};
942
943 print $outhandle " marking up acronyms ...\n"
944 if ($self->{'verbosity'} > 2);
945
946 #self is passed in to check for verbosity ...
947 $text = &acronym::markup_acronyms($text, $self);
948
949 print $outhandle " done marking up acronyms. \n"
950 if ($self->{'verbosity'} > 2);
951
952 return $text;
953}
954
955sub compile_stats {
956 my $self = shift(@_);
957 my ($stats) = @_;
958
959 $stats->{'num_processed'} += $self->{'num_processed'};
960 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
961 $stats->{'num_archives'} += $self->{'num_archives'};
962
963}
964
965sub associate_cover_image {
966 my $self = shift(@_);
967 my ($doc_obj, $filename) = @_;
968
969 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
970 if (-e $filename) {
971 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
972 } else {
973 $filename =~ s/jpg$/JPG/;
974 if (-e $filename) {
975 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
976 }
977 }
978}
979
9801;
Note: See TracBrowser for help on using the repository browser.