source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 6408

Last change on this file since 6408 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:keywords set to Author Date Id Revision
File size: 35.3 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use Kea;
35use parsargv;
36use multiread;
37use encodings;
38use cnseg;
39use acronym;
40use textcat;
41use doc;
42use diagnostics;
43use DateExtract;
44use ghtml;
45use gsprintf;
46use printusage;
47
48
49my $unicode_list =
50 [ { 'name' => "auto",
51 'desc' => "{BasPlug.input_encoding.auto}" },
52 { 'name' => "ascii",
53 'desc' => "{BasPlug.input_encoding.ascii}" },
54 { 'name' => "utf8",
55 'desc' => "{BasPlug.input_encoding.utf8}" },
56 { 'name' => "unicode",
57 'desc' => "{BasPlug.input_encoding.unicode}" } ];
58
59my $arguments =
60 [ { 'name' => "process_exp",
61 'desc' => "{BasPlug.process_exp}",
62 'type' => "regexp",
63 'deft' => "",
64 'reqd' => "no" },
65 { 'name' => "block_exp",
66 'desc' => "{BasPlug.block_exp}",
67 'type' => "regexp",
68 'deft' => "",
69 'reqd' => "no" },
70 { 'name' => "input_encoding",
71 'desc' => "{BasPlug.input_encoding}",
72 'type' => "enum",
73 'list' => $unicode_list,
74 'reqd' => "no" ,
75 'deft' => "auto" } ,
76 { 'name' => "default_encoding",
77 'desc' => "{BasPlug.default_encoding}",
78 'type' => "enum",
79 'list' => $unicode_list,
80 'reqd' => "no",
81 'deft' => "utf8" },
82 { 'name' => "extract_language",
83 'desc' => "{BasPlug.extract_language}",
84 'type' => "flag",
85 'reqd' => "no" },
86 { 'name' => "default_language",
87 'desc' => "{BasPlug.default_language}",
88 'type' => "language",
89 'deft' => "en",
90 'reqd' => "no" },
91 { 'name' => "extract_acronyms",
92 'desc' => "{BasPlug.extract_acronyms}",
93 'type' => "flag",
94 'reqd' => "no" },
95 { 'name' => "markup_acronyms",
96 'desc' => "{BasPlug.markup_acronyms}",
97 'type' => "flag",
98 'reqd' => "no" },
99 { 'name' => "first",
100 'desc' => "{BasPlug.first}",
101 'type' => "string",
102 'reqd' => "no" },
103 { 'name' => "extract_email",
104 'desc' => "{BasPlug.extract_email}",
105 'type' => "flag",
106 'reqd' => "no" },
107 { 'name' => "extract_historical_years",
108 'desc' => "{BasPlug.extract_historical_years}",
109 'type' => "flag",
110 'reqd' => "no" },
111 { 'name' => "maximum_year",
112 'desc' => "{BasPlug.maximum_year}",
113 'type' => "int",
114 'deft' => (localtime)[5]+1900,
115 'reqd' => "no"},
116 { 'name' => "maximum_century",
117 'desc' => "{BasPlug.maximum_century}",
118 'type' => "int",
119 'deft' => "-1",
120 'reqd' => "no" },
121 { 'name' => "no_bibliography",
122 'desc' => "{BasPlug.no_bibliography}",
123 'type' => "flag",
124 'reqd' => "no"},
125 { 'name' => "cover_image",
126 'desc' => "{BasPlug.cover_image}",
127 'type' => "flag",
128 'reqd' => "no" } ];
129
130my $options = { 'name' => "BasPlug",
131 'desc' => "{BasPlug.desc}",
132 'abstract' => "yes",
133 'inherits' => "no",
134 'args' => $arguments };
135
136
137sub gsprintf
138{
139 return &gsprintf::gsprintf(@_);
140}
141
142
143sub get_arguments
144{
145 local $self = shift(@_);
146 local $optionlistref = $self->{'option_list'};
147 local @optionlist = @$optionlistref;
148 local $pluginoptions = pop(@$optionlistref);
149 local $pluginarguments = $pluginoptions->{'args'};
150 return $pluginarguments;
151}
152
153
154sub print_xml_usage
155{
156 local $self = shift(@_);
157 local $language = shift(@_);
158
159 &PrintUsage::print_xml_header();
160 $self->print_xml($language);
161}
162
163
164sub print_xml
165{
166 local $self = shift(@_);
167 local $language = shift(@_);
168
169 local $optionlistref = $self->{'option_list'};
170 local @optionlist = @$optionlistref;
171 local $pluginoptions = pop(@$optionlistref);
172 return if (!defined($pluginoptions));
173
174 print STDERR "<PlugInfo>\n";
175 print STDERR " <Name>$pluginoptions->{'name'}</Name>\n";
176 print STDERR " <Desc>$pluginoptions->{'desc'}</Desc>\n";
177 print STDERR " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n";
178 print STDERR " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n";
179 print STDERR " <Arguments>\n";
180 if (defined($pluginoptions->{'args'})) {
181 &PrintUsage::print_options_xml($language, $pluginoptions->{'args'});
182 }
183
184 # Recurse up the plugin hierarchy
185 $self->print_xml($language);
186
187 print STDERR " </Arguments>\n";
188 print STDERR "</PlugInfo>\n";
189}
190
191
192sub print_txt_usage
193{
194 local $self = shift(@_);
195 local $language = shift(@_);
196
197 # Print the usage message for a plugin (recursively)
198 local $descoffset = $self->determine_description_offset(0);
199 $self->print_plugin_usage($language, $descoffset, 1);
200}
201
202
203sub determine_description_offset
204{
205 local $self = shift(@_);
206 local $maxoffset = shift(@_);
207
208 local $optionlistref = $self->{'option_list'};
209 local @optionlist = @$optionlistref;
210 local $pluginoptions = pop(@$optionlistref);
211 return $maxoffset if (!defined($pluginoptions));
212
213 # Find the length of the longest option string of this plugin
214 local $pluginargs = $pluginoptions->{'args'};
215 if (defined($pluginargs)) {
216 local $longest = &PrintUsage::find_longest_option_string($pluginargs);
217 if ($longest > $maxoffset) {
218 $maxoffset = $longest;
219 }
220 }
221
222 # Recurse up the plugin hierarchy
223 $maxoffset = $self->determine_description_offset($maxoffset);
224 $self->{'option_list'} = \@optionlist;
225 return $maxoffset;
226}
227
228
229sub print_plugin_usage
230{
231 local $self = shift(@_);
232 local $language = shift(@_);
233 local $descoffset = shift(@_);
234 local $isleafclass = shift(@_);
235
236 local $optionlistref = $self->{'option_list'};
237 local @optionlist = @$optionlistref;
238 local $pluginoptions = pop(@$optionlistref);
239 return if (!defined($pluginoptions));
240
241 local $pluginname = $pluginoptions->{'name'};
242 local $pluginargs = $pluginoptions->{'args'};
243
244 # Produce the usage information using the data structure above
245 if ($isleafclass) {
246 &gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
247 # print STDERR " usage: classify $classifiername [options]\n\n";
248 }
249
250 # Display the plugin options, if there are some
251 if (defined($pluginargs)) {
252 # Calculate the column offset of the option descriptions
253 local $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
254
255 if ($isleafclass) {
256 &gsprintf(STDERR, " {common.specific_options}:\n");
257 # print STDERR " specific options:\n";
258 }
259 else {
260 &gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
261 # print STDERR " general options (from $classifiername):\n";
262 }
263
264 # Display the plugin options
265 &PrintUsage::print_options_txt($language, $pluginargs, $optiondescoffset);
266 }
267
268 # Recurse up the plugin hierarchy
269 $self->print_plugin_usage($language, $descoffset, 0);
270 $self->{'option_list'} = \@optionlist;
271}
272
273
274# sub print_general_usage {
275# my ($plugin_name) = @_;
276
277# print STDERR "\n usage: plugin $plugin_name [options]\n\n";
278
279# print STDERR " -process_exp A perl regular expression to match against filenames.\n";
280# print STDERR " Matching filenames will be processed by this plugin.\n";
281# print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
282# print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
283# print STDERR " .htm or .html (case-insensitive).\n\n";
284
285# print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
286# print STDERR " being passed to any later plugins in the list. This has no\n";
287# print STDERR " real effect other than to prevent lots of warning messages\n";
288# print STDERR " about input files you don't care about. Each plugin might\n";
289# print STDERR " have a default block_exp. e.g. by default HTMLPlug blocks\n";
290# print STDERR " any files with .gif, .jpg, .jpeg, .png or .css\n";
291# print STDERR " file extensions.\n\n";
292
293
294# print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
295# print STDERR " converted from these encodings and stored internally as\n";
296# print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n";
297# print STDERR " are:\n";
298
299# print STDERR " auto: Use text categorization algorithm to automatically\n";
300# print STDERR " identify the encoding of each source document. This\n";
301# print STDERR " will be slower than explicitly setting the encoding\n";
302# print STDERR " but will work where more than one encoding is used\n";
303# print STDERR " within the same collection.\n";
304
305# print STDERR " ascii: Plain 7 bit ascii. This may be a bit faster than\n";
306# print STDERR " using iso_8859_1. Beware of using this on a collection\n";
307# print STDERR " of documents that may contain characters outside the\n";
308# print STDERR " plain 7 bit ascii set though (e.g. German or French\n";
309# print STDERR " documents containing accents), use iso_8859_1 instead.\n";
310
311# print STDERR " utf8: either utf8 or unicode -- automatically detected\n";
312# print STDERR " unicode: just unicode\n";
313
314# my $e = $encodings::encodings;
315# foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
316# print STDERR " $enc: $e->{$enc}->{'name'}\n";
317# }
318# print STDERR "\n";
319# print STDERR " -default_encoding Use this encoding if -input_encoding is set to 'auto' and\n";
320# print STDERR " the text categorization algorithm fails to extract the\n";
321# print STDERR " encoding or extracts an encoding unsupported by Greenstone.\n";
322# print STDERR " The default is iso_8859_1.\n\n";
323
324# print STDERR " -extract_language Identify the language of each document and set 'Language'\n";
325# print STDERR " metadata. Note that this will be done automatically if\n";
326# print STDERR " -input_encoding is 'auto'.\n\n";
327# print STDERR " -default_language If Greenstone fails to work out what language a document is\n";
328# print STDERR " the 'Language' metadata element will be set to this value.\n";
329# print STDERR " The default is 'en' (ISO 639 language symbols are used:\n";
330# print STDERR " en = English). Note that if -input_encoding is not set to\n";
331# print STDERR " 'auto' and -extract_language is not set, all documents will\n";
332# print STDERR " have their 'Language' metadata set to this value.\n\n";
333
334# print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n";
335
336# print STDERR " -markup_acronyms Add acronym metadata into document text\n\n";
337
338# print STDERR " -first Comma separated list of first sizes to extract from the\n";
339# print STDERR " text into a metadata field. The field is called 'FirstNNN'.\n\n";
340
341# print STDERR " -extract_email Extract email addresses as metadata\n\n";
342
343# print STDERR " -extract_historical_years Extract time-period information from historical\n";
344# print STDERR " documents. This is stored as metadata with the document.\n";
345# print STDERR " There is a search interface for this metadata, which you \n";
346# print STDERR " can include in your collection by adding the statement:\n";
347# print STDERR " format QueryInterface DateSearch\n";
348# print STDERR " to your collection configuration file\n";
349# print STDERR " -maximum_year The maximum historical date to be used as metadata (in a\n";
350# print STDERR " Common Era date, such as 1950)\n";
351# print STDERR " -maximum_century The maximum named century to be extracted as historical\n";
352# print STDERR " metadata (e.g. 14 will extract all references up to the\n";
353# print STDERR " 14th century)\n";
354# print STDERR " -no_bibliography Do not try and block bibliographic dates when extracting\n";
355# print STDERR " historical dates.\n";
356# print STDERR " -cover_image Will look for a prefix.jpg file (where prefix is the same\n";
357# print STDERR " prefix as the file being processed) and associate it as a\n";
358# print STDERR " cover image\n\n";
359# }
360
361# sub print_usage {
362# print STDERR "\nThis plugin has no plugin specific options\n\n";
363# }
364
365sub new {
366 my $class = shift (@_);
367 my $plugin_name = shift (@_);
368 my $self = {};
369 $self->{'plugin_type'} = "BasPlug";
370 my $enc = "^(";
371 map {$enc .= "$_|";} keys %$encodings::encodings;
372 my $denc = $enc . "ascii|utf8|unicode)\$";
373 $enc .= "ascii|utf8|unicode|auto)\$";
374
375 $self->{'outhandle'} = STDERR;
376 my $year = (localtime)[5]+1900;
377
378 $self->{'textcat'} = new textcat();
379
380 $self->{'num_processed'} = 0;
381 $self->{'num_not_processed'} = 0;
382 $self->{'num_blocked'} = 0;
383 $self->{'num_archives'} = 0;
384
385 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
386 $self->{'option_list'} = [ $options ];
387
388 # general options available to all plugins
389 if (!parsargv::parse(\@_,
390 q^process_exp/.*/^, \$self->{'process_exp'},
391 q^block_exp/.*/^, \$self->{'block_exp'},
392 q^extract_language^, \$self->{'extract_language'},
393 q^extract_acronyms^, \$self->{'extract_acronyms'},
394 q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
395 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
396 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
397 qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
398 q^extract_email^, \$self->{'extract_email'},
399 q^markup_acronyms^, \$self->{'markup_acronyms'},
400 q^default_language/.{2}/en^, \$self->{'default_language'},
401 q^first/.*/^, \$self->{'first'},
402 q^extract_historical_years^, \$self->{'date_extract'},
403 qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
404 q^no_bibliography^, \$self->{'no_biblio'},
405 qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
406 q^cover_image^, \$self->{'cover_image'},
407 "allow_extra_options")) {
408
409 &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
410 $self->print_txt_usage(""); # Use default resource bundle
411 die "\n";
412 }
413
414 return bless $self, $class;
415}
416
417# initialize BasPlug options
418# if init() is overridden in a sub-class, remember to call BasPlug::init()
419sub init {
420 my $self = shift (@_);
421 my ($verbosity, $outhandle, $failhandle) = @_;
422
423 # verbosity is passed through from the processor
424 $self->{'verbosity'} = $verbosity;
425
426 # as are the outhandle and failhandle
427 $self->{'outhandle'} = $outhandle if defined $outhandle;
428 $self->{'failhandle'} = $failhandle;
429
430 # set process_exp and block_exp to defaults unless they were
431 # explicitly set
432
433 if ((!$self->is_recursive()) and
434 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
435
436 $self->{'process_exp'} = $self->get_default_process_exp ();
437 if ($self->{'process_exp'} eq "") {
438 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
439 }
440 }
441
442 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
443 $self->{'block_exp'} = $self->get_default_block_exp ();
444 }
445}
446
447sub begin {
448 my $self = shift (@_);
449 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
450 $self->initialise_extractors();
451}
452
453sub end {
454 my ($self) = @_;
455 $self->finalise_extractors();
456}
457
458# this function should be overridden to return 1
459# in recursive plugins
460sub is_recursive {
461 my $self = shift (@_);
462
463 return 0;
464}
465
466sub get_default_block_exp {
467 my $self = shift (@_);
468
469 return "";
470}
471
472sub get_default_process_exp {
473 my $self = shift (@_);
474
475 return "";
476}
477
478# The BasPlug read() function. This function does all the right things
479# to make general options work for a given plugin. It calls the process()
480# function which does all the work specific to a plugin (like the old
481# read functions used to do). Most plugins should define their own
482# process() function and let this read() function keep control.
483#
484# recursive plugins (e.g. RecPlug) and specialized plugins like those
485# capable of processing many documents within a single file (e.g.
486# GMLPlug) should normally implement their own version of read()
487#
488# Return number of files processed, undef if can't process
489# Note that $base_dir might be "" and that $file might
490# include directories
491
492sub read {
493 my $self = shift (@_);
494
495 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
496
497 if ($self->is_recursive()) {
498 &gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
499 }
500
501 my $outhandle = $self->{'outhandle'};
502
503 my $filename = $file;
504 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
505
506 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
507 $self->{'num_blocked'} ++;
508 return 0;
509 }
510 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
511 return undef;
512 }
513 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
514
515 # Do encoding stuff
516 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
517
518 # create a new document
519 my $doc_obj = new doc ($filename, "indexed_doc");
520 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
521 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
522 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
523 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "$self->{'plugin_type'}", "1");
524 my ($filemeta) = $file =~ /([^\\\/]+)$/;
525 # how do we know what encoding the filename is in?
526 $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
527 if ($self->{'cover_image'}) {
528 $self->associate_cover_image($doc_obj, $filename);
529 }
530
531 # read in file ($text will be in utf8)
532 my $text = "";
533 $self->read_file ($filename, $encoding, $language, \$text);
534
535 if (!length ($text)) {
536 my $plugin_name = ref ($self);
537 &gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
538
539 my $failhandle = $self->{'failhandle'};
540 &gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
541 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
542 $self->{'num_not_processed'} ++;
543
544 return 0;
545 }
546
547 # include any metadata passed in from previous plugins
548 # note that this metadata is associated with the top level section
549 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
550
551 # do plugin specific processing of doc_obj
552 return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli));
553
554 # do any automatic metadata extraction
555 $self->auto_extract_metadata ($doc_obj);
556
557 # add an OID
558 # see if there is a plugin-specific set_OID function...
559 if (defined ($self->can(set_OID))) {
560 # it will need $doc_obj to set the Identifier metadata...
561 $self->set_OID($doc_obj);
562 } else {
563 # use the default set_OID() in doc.pm
564 $doc_obj->set_OID();
565 }
566
567 # process the document
568 $processor->process($doc_obj);
569
570 $self->{'num_processed'} ++;
571
572 return 1; # processed the file
573}
574
575# returns undef if file is rejected by the plugin
576sub process {
577 my $self = shift (@_);
578 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
579
580 &gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
581 # die "Basplug::process function must be implemented in sub-class\n";
582
583 return undef; # never gets here
584}
585
586# uses the multiread package to read in the entire file pointed to
587# by filename and loads the resulting text into $$textref. Input text
588# may be in any of the encodings handled by multiread, output text
589# will be in utf8
590sub read_file {
591 my $self = shift (@_);
592 my ($filename, $encoding, $language, $textref) = @_;
593
594 if (!-r $filename)
595 {
596 my $outhandle = $self->{'outhandle'};
597 &gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
598 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
599 return;
600 }
601
602 $$textref = "";
603
604 open (FILE, $filename) || (&gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n");
605 # open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
606
607 if ($encoding eq "ascii") {
608 undef $/;
609 $$textref = <FILE>;
610 $/ = "\n";
611 } else {
612 my $reader = new multiread();
613 $reader->set_handle ('BasPlug::FILE');
614 $reader->set_encoding ($encoding);
615 $reader->read_file ($textref);
616
617 if ($language eq "zh") {
618 # segment the Chinese words
619 $$textref = &cnseg::segment($$textref);
620 }
621 }
622
623 close FILE;
624}
625
626sub textcat_get_language_encoding {
627 my $self = shift (@_);
628 my ($filename) = @_;
629
630 my ($language, $encoding, $extracted_encoding);
631 if ($self->{'input_encoding'} eq "auto") {
632 # use textcat to automatically work out the input encoding and language
633 ($language, $encoding) = $self->get_language_encoding ($filename);
634 } elsif ($self->{'extract_language'}) {
635 # use textcat to get language metadata
636 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
637 $encoding = $self->{'input_encoding'};
638 if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
639 my $plugin_name = ref ($self);
640 my $outhandle = $self->{'outhandle'};
641 &gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
642 # print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
643 # print $outhandle "appears to be encoded as $extracted_encoding.\n";
644 }
645 } else {
646 $language = $self->{'default_language'};
647 $encoding = $self->{'input_encoding'};
648 }
649 return ($language, $encoding);
650}
651
652# Uses textcat to work out the encoding and language of the text in
653# $filename. All html tags are removed before processing.
654# returns an array containing "language" and "encoding"
655sub get_language_encoding {
656 my $self = shift (@_);
657 my ($filename) = @_;
658 my $outhandle = $self->{'outhandle'};
659
660 # read in file
661 open (FILE, $filename) || (&gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
662 undef $/;
663 my $text = <FILE>;
664 $/ = "\n";
665 close FILE;
666
667 # remove <title>stuff</title> -- as titles tend often to be in English
668 # for foreign language documents
669 $text =~ s/<title>.*?<\/title>//i;
670
671 # remove all HTML tags
672 $text =~ s/<[^>]*>//sg;
673
674 # get the language/encoding
675 my $results = $self->{'textcat'}->classify(\$text);
676
677 # if textcat returns 3 or less possibilities we'll use the
678 # first one in the list - otherwise use the defaults
679 if (scalar @$results > 3) {
680 # changed 12 Feb 2003 by jrm21
681 # use the most popular encoding at least... otherwise we might
682 # generate invalid archive files!
683 my %guessed_encodings = ();
684 foreach my $result (@$results) {
685 $result =~ /([^\-]+)$/;
686 my $enc=$1;
687 if (!defined($guessed_encodings{$enc})) {
688 $guessed_encodings{$enc}=0;
689 }
690 $guessed_encodings{$enc}++;
691 }
692 my $best_encoding="";
693 $guessed_encodings{""}=-1;
694 foreach my $enc (keys %guessed_encodings) {
695 if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}){
696 $best_encoding=$enc;
697 }
698 }
699
700 if ($self->{'input_encoding'} ne 'auto') {
701 if ($self->{'extract_language'} && $self->{'verbosity'}) {
702 &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
703 # print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
704 # print $outhandle "defaulting to $self->{'default_language'}\n";
705 }
706 return ($self->{'default_language'}, $self->{'input_encoding'});
707
708 } else {
709 if ($self->{'verbosity'}) {
710 &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
711 # print $outhandle "BASPlug: WARNING: language could not be extracted from $filename - ";
712 # print $outhandle "defaulting to $self->{'default_language'}.\n";
713 }
714 return ($self->{'default_language'}, $best_encoding);
715 }
716 }
717
718 # format language/encoding
719 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
720 if (!defined $language) {
721 if ($self->{'verbosity'}) {
722 &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
723 # print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
724 # print $outhandle "defaulting to $self->{'default_language'}\n";
725 }
726 $language = $self->{'default_language'};
727 }
728 if (!defined $encoding) {
729 if ($self->{'verbosity'}) {
730 &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_encoding}\n", $filename, $self->{'default_encoding'});
731 # print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
732 # print $outhandle "defaulting to $self->{'default_encoding'}\n";
733 }
734 $encoding = $self->{'default_encoding'};
735 }
736
737 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
738 !defined $encodings::encodings->{$encoding}) {
739 if ($self->{'verbosity'}) {
740 &gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n", $filename, $encoding, $self->{'default_encoding'});
741 # print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
742 # print $outhandle "using $self->{'default_encoding'}\n";
743 }
744 $encoding = $self->{'default_encoding'};
745 }
746
747 return ($language, $encoding);
748}
749
750# add any extra metadata that's been passed around from one
751# plugin to another.
752# extra_metadata uses add_utf8_metadata so it expects metadata values
753# to already be in utf8
754sub extra_metadata {
755 my $self = shift (@_);
756 my ($doc_obj, $cursection, $metadata) = @_;
757
758 foreach my $field (keys(%$metadata)) {
759 # $metadata->{$field} may be an array reference
760 if (ref ($metadata->{$field}) eq "ARRAY") {
761 map {
762 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
763 } @{$metadata->{$field}};
764 } else {
765 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
766 }
767 }
768}
769
770# initialise metadata extractors
771sub initialise_extractors {
772 my $self = shift (@_);
773
774 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
775 &acronym::initialise_acronyms();
776 }
777}
778
779# finalise metadata extractors
780sub finalise_extractors {
781 my $self = shift (@_);
782
783 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
784 &acronym::finalise_acronyms();
785 }
786}
787
788# FIRSTNNN: extract the first NNN characters as metadata
789sub extract_first_NNNN_characters {
790 my $self = shift (@_);
791 my ($textref, $doc_obj, $thissection) = @_;
792
793 foreach my $size (split /,/, $self->{'first'}) {
794 my $tmptext = $$textref;
795 $tmptext =~ s/^\s+//;
796 $tmptext =~ s/\s+$//;
797 $tmptext =~ s/\s+/ /gs;
798 $tmptext = substr ($tmptext, 0, $size);
799 $tmptext =~ s/\s\S*$/&#8230;/;
800 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
801 }
802}
803
804sub extract_email {
805 my $self = shift (@_);
806 my ($textref, $doc_obj, $thissection) = @_;
807 my $outhandle = $self->{'outhandle'};
808
809 # print $outhandle " extracting email addresses ...\n"
810 &gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
811 if ($self->{'verbosity'} > 2);
812
813 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
814 @email = sort @email;
815
816 my @email2 = ();
817 foreach my $address (@email) {
818 if (!(join(" ",@email2) =~ m/$address/ )) {
819 push @email2, $address;
820 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
821 # print $outhandle " extracting $address\n"
822 &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
823 if ($self->{'verbosity'} > 3);
824 }
825 }
826 # print $outhandle " done extracting email addresses.\n"
827 &gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
828 if ($self->{'verbosity'} > 2);
829}
830
831# extract metadata
832sub auto_extract_metadata {
833
834 my $self = shift (@_);
835 my ($doc_obj) = @_;
836
837 if ($self->{'extract_email'}) {
838 my $thissection = $doc_obj->get_top_section();
839 while (defined $thissection) {
840 my $text = $doc_obj->get_text($thissection);
841 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
842 $thissection = $doc_obj->get_next_section ($thissection);
843 }
844 }
845
846
847#adding kea keyphrases
848 if ($self->{'kea'}) {
849
850 my $thissection = $doc_obj->get_top_section();
851 my $text = "";
852 my @list;
853
854 while (defined $thissection) { #loop through sections to gather whole doc
855 my $sectiontext = $doc_obj->get_text($thissection);
856 $text = $text.$sectiontext;
857 $thissection = $doc_obj->get_next_section ($thissection);
858 }
859
860 if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
861 @list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
862 } else { #otherwise call Kea with no options
863 @list = &Kea::extract_KeyPhrases ($text);
864 }
865
866 if(@list){ #if a list of kea keyphrases was returned (ie not empty)
867 my $keyphrases = $list[0]; #first arg is keyphrase list
868 my $stems = $list[1]; #second arg is stemmed keyphrase list
869 &gsprintf(STDERR, "{BasPlug.keyphrases}: $keyphrases\n");
870 # print STDERR "keyphrases: $keyphrases\n";
871 &gsprintf(STDERR, "{BasPlug.stems}: $stems\n");
872 # print STDERR "stems: $stems\n";
873 $thissection = $doc_obj->get_top_section(); #add metadata to top section
874 $doc_obj->add_metadata($thissection, "kea", $keyphrases);
875 $doc_obj->add_metadata($thissection, "stems", $stems);
876 }
877 } #end of kea
878
879 if ($self->{'first'}) {
880 my $thissection = $doc_obj->get_top_section();
881 while (defined $thissection) {
882 my $text = $doc_obj->get_text($thissection);
883 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
884 $thissection = $doc_obj->get_next_section ($thissection);
885 }
886 }
887
888 if ($self->{'extract_acronyms'}) {
889 my $thissection = $doc_obj->get_top_section();
890 while (defined $thissection) {
891 my $text = $doc_obj->get_text($thissection);
892 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
893 $thissection = $doc_obj->get_next_section ($thissection);
894 }
895 }
896
897 if ($self->{'markup_acronyms'}) {
898 my $thissection = $doc_obj->get_top_section();
899 while (defined $thissection) {
900 my $text = $doc_obj->get_text($thissection);
901 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
902 $doc_obj->delete_text($thissection);
903 $doc_obj->add_text($thissection, $text);
904 $thissection = $doc_obj->get_next_section ($thissection);
905 }
906 }
907
908 if($self->{'date_extract'}) {
909 my $thissection = $doc_obj->get_top_section();
910 while (defined $thissection) {
911
912 my $text = $doc_obj->get_text($thissection);
913 &DateExtract::get_date_metadata($text, $doc_obj,
914 $thissection,
915 $self->{'no_biblio'},
916 $self->{'max_year'},
917 $self->{'max_century'});
918 $thissection = $doc_obj->get_next_section ($thissection);
919 }
920 }
921}
922
923# extract acronyms from a section in a document. progress is
924# reported to outhandle based on the verbosity. both the Acronym
925# and the AcronymKWIC metadata items are created.
926
927sub extract_acronyms {
928 my $self = shift (@_);
929 my ($textref, $doc_obj, $thissection) = @_;
930 my $outhandle = $self->{'outhandle'};
931
932 # print $outhandle " extracting acronyms ...\n"
933 &gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
934 if ($self->{'verbosity'} > 2);
935
936 my $acro_array = &acronym::acronyms($textref);
937
938 foreach my $acro (@$acro_array) {
939
940 #check that this is the first time ...
941 my $seen_before = "false";
942 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
943 foreach my $thisAcro (@$previous_data) {
944 if ($thisAcro eq $acro->to_string()) {
945 $seen_before = "true";
946 # print $outhandle " already seen ". $acro->to_string() . "\n"
947 &gsprintf($outhandle, " {BasPlug.already_seen} " . $acro->to_string() . "\n")
948 if ($self->{'verbosity'} >= 4);
949 }
950 }
951
952 if ($seen_before eq "false") {
953 #write it to the file ...
954 $acro->write_to_file();
955
956 #do the normal acronym
957 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
958 # print $outhandle " adding ". $acro->to_string() . "\n"
959 &gsprintf($outhandle, " {BasPlug.adding} " . $acro->to_string() . "\n")
960 if ($self->{'verbosity'} > 3);
961 }
962 }
963
964 # print $outhandle " done extracting acronyms. \n"
965 &gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
966 if ($self->{'verbosity'} > 2);
967}
968
969sub markup_acronyms {
970 my $self = shift (@_);
971 my ($text, $doc_obj, $thissection) = @_;
972 my $outhandle = $self->{'outhandle'};
973
974 # print $outhandle " marking up acronyms ...\n"
975 &gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
976 if ($self->{'verbosity'} > 2);
977
978 #self is passed in to check for verbosity ...
979 $text = &acronym::markup_acronyms($text, $self);
980
981 # print $outhandle " done marking up acronyms. \n"
982 &gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
983 if ($self->{'verbosity'} > 2);
984
985 return $text;
986}
987
988sub compile_stats {
989 my $self = shift(@_);
990 my ($stats) = @_;
991
992 $stats->{'num_processed'} += $self->{'num_processed'};
993 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
994 $stats->{'num_archives'} += $self->{'num_archives'};
995
996}
997
998sub associate_cover_image {
999 my $self = shift(@_);
1000 my ($doc_obj, $filename) = @_;
1001
1002 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1003 if (-e $filename) {
1004 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1005 } else {
1006 $filename =~ s/jpg$/JPG/;
1007 if (-e $filename) {
1008 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1009 }
1010 }
1011}
1012
10131;
Note: See TracBrowser for help on using the repository browser.