source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 4785

Last change on this file since 4785 was 4785, checked in by mdewsnip, 21 years ago

Commented out print_usage functions - plugins should now call $self->print_txt_usage() to display their usage text. Updates to the options of a plugin should be made in the $options and $arguments data structures at the top of the plugin.

  • Property svn:keywords set to Author Date Id Revision
File size: 34.8 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use Kea;
35use parsargv;
36use multiread;
37use encodings;
38use cnseg;
39use acronym;
40use textcat;
41use doc;
42use diagnostics;
43use DateExtract;
44use ghtml;
45use printusage;
46
47my $unicode_list =
48 [ { 'name' => "auto",
49 'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } ,
50 { 'name' => "ascii",
51 'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." },
52 { 'name' => "utf8",
53 'desc' => "either utf8 or unicode -- automatically detected." },
54 { 'name' => "unicode",
55 'desc' => "just unicode" } ];
56
57my $arguments =
58 [ { 'name' => "process_exp",
59 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
60 'type' => "string",
61 'deft' => "",
62 'reqd' => "no" },
63 { 'name' => "block_exp",
64 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
65 'type' => 'string',
66 'deft' => "",
67 'reqd' => "no" },
68 { 'name' => "input_encoding",
69 'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8.",
70 'type' => "enum",
71 'list' => $unicode_list,
72 'reqd' => "no" ,
73 'deft' => "auto" } ,
74 { 'name' => "default_encoding",
75 'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone.",
76 'type' => "enum",
77 'reqd' => "no",
78 'deft' => "utf8" },
79 { 'name' => "extract_language",
80 'desc' => "Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.",
81 'type' => "flag",
82 'reqd' => "no" },
83 { 'name' => "default_language",
84 'desc' => "If Greenstone fails to work out what language a document is the 'Language' metadata element will be set to this value. The default is 'en' (ISO 639 language symbols are used: en = English). Note that if -input_encoding is not set to 'auto' and -extract_language is not set, all documents will have their 'Language' metadata set to this value.",
85 'type' => "language",
86 'deft' => "en",
87 'reqd' => "no" },
88 { 'name' => "extract_acronyms",
89 'desc' => "Extract acronyms from within text and set as metadata.",
90 'type' => "flag",
91 'reqd' => "no" },
92 { 'name' => "markup_acronyms",
93 'desc' => "Add acronym metadata into document text.",
94 'type' => "flag",
95 'reqd' => "no" },
96 { 'name' => "first",
97 'desc' => "Comma separated list of first sizes to extract from the text into a metadata field. The field is called 'FirstNNN'.",
98 'type' => "string",
99 'reqd' => "no" },
100 { 'name' => "extract_email",
101 'desc' => "Extract email addresses as metadata.",
102 'type' => "flag",
103 'reqd' => "no" },
104 { 'name' => "extract_historical_years",
105 'desc' => "Extract time-period information from historical documents. This is stored as metadata with the document. There is a search interface for this metadata, which you can include in your collection by adding the statement, \"format QueryInterface DateSearch\" to your collection configuration file.",
106 'type' => "flag",
107 'reqd' => "no" },
108 { 'name' => "maximum_year",
109 'desc' => "The maximum historical date to be used as metadata (in a Common Era date, such as 1950).",
110 'type' => "int",
111 'deft' => (localtime)[5]+1900,
112 'reqd' => "no"},
113 { 'name' => "maximum_century",
114 'desc' => "The maximum named century to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century).",
115 'type' => "int",
116 'deft' => "-1",
117 'reqd' => "no" },
118 { 'name' => "no_bibliography",
119 'desc' => "Do not try to block bibliographic dates when extracting historical dates.",
120 'type' => "flag",
121 'reqd' => "no"},
122 { 'name' => "cover_image",
123 'desc' => "Will look for a prefix.jpg file (where prefix is the same prefix as the file being processed) and associate it as a cover image.",
124 'type' => "flag",
125 'reqd' => "no" } ];
126
127my $options = { 'name' => "BasPlug",
128 'desc' => "Base class for all the import plugins.",
129 'inherits' => "No",
130 'args' => $arguments };
131
132
133sub print_xml_usage
134{
135 local $self = shift(@_);
136
137 print STDERR "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n";
138 $self->print_xml();
139}
140
141
142sub print_xml
143{
144 local $self = shift(@_);
145
146 local $optionlistref = $self->{'option_list'};
147 local @optionlist = @$optionlistref;
148 local $pluginoptions = pop(@$optionlistref);
149 return if (!defined($pluginoptions));
150
151 print STDERR "<PlugInfo>\n";
152 print STDERR " <Name>$pluginoptions->{'name'}</Name>\n";
153 print STDERR " <Desc>$pluginoptions->{'desc'}</Desc>\n";
154 print STDERR " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n";
155 print STDERR " <Arguments>\n";
156 if (defined($pluginoptions->{'args'})) {
157 &PrintUsage::print_options_xml($pluginoptions->{'args'});
158 }
159
160 # Recurse up the plugin hierarchy
161 $self->print_xml();
162
163 print STDERR " </Arguments>\n";
164 print STDERR "</PlugInfo>\n";
165}
166
167
168sub print_txt_usage
169{
170 local $self = shift(@_);
171
172 # Print the usage message for a plugin (recursively)
173 local $descoffset = $self->determine_description_offset(0);
174 $self->print_plugin_usage($descoffset, 1);
175}
176
177
178sub determine_description_offset
179{
180 local $self = shift(@_);
181 local $maxoffset = shift(@_);
182
183 local $optionlistref = $self->{'option_list'};
184 local @optionlist = @$optionlistref;
185 local $pluginoptions = pop(@$optionlistref);
186 return $maxoffset if (!defined($pluginoptions));
187
188 # Find the length of the longest option string of this plugin
189 local $pluginargs = $pluginoptions->{'args'};
190 if (defined($pluginargs)) {
191 local $longest = &PrintUsage::find_longest_option_string($pluginargs);
192 if ($longest > $maxoffset) {
193 $maxoffset = $longest;
194 }
195 }
196
197 # Recurse up the plugin hierarchy
198 $maxoffset = $self->determine_description_offset($maxoffset);
199 $self->{'option_list'} = \@optionlist;
200 return $maxoffset;
201}
202
203
204sub print_plugin_usage
205{
206 local $self = shift(@_);
207 local $descoffset = shift(@_);
208 local $isleafclass = shift(@_);
209
210 local $optionlistref = $self->{'option_list'};
211 local @optionlist = @$optionlistref;
212 local $pluginoptions = pop(@$optionlistref);
213 return if (!defined($pluginoptions));
214
215 local $pluginname = $pluginoptions->{'name'};
216 local $pluginargs = $pluginoptions->{'args'};
217
218 # Produce the usage information using the data structure above
219 if ($isleafclass) {
220 print STDERR " usage: plugin $pluginname [options]\n\n";
221 }
222
223 # Display the plugin options, if there are some
224 if (defined($pluginargs)) {
225 # Calculate the column offset of the option descriptions
226 local $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
227
228 if ($isleafclass) {
229 print STDERR " specific options:\n";
230 }
231 else {
232 print STDERR " general options (from $pluginname):\n";
233 }
234
235 # Display the plugin options
236 &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
237 }
238
239 # Recurse up the plugin hierarchy
240 $self->print_plugin_usage($descoffset, 0);
241 $self->{'option_list'} = \@optionlist;
242}
243
244
245# sub print_general_usage {
246# my ($plugin_name) = @_;
247
248# print STDERR "\n usage: plugin $plugin_name [options]\n\n";
249
250# print STDERR " -process_exp A perl regular expression to match against filenames.\n";
251# print STDERR " Matching filenames will be processed by this plugin.\n";
252# print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
253# print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
254# print STDERR " .htm or .html (case-insensitive).\n\n";
255
256# print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
257# print STDERR " being passed to any later plugins in the list. This has no\n";
258# print STDERR " real effect other than to prevent lots of warning messages\n";
259# print STDERR " about input files you don't care about. Each plugin might\n";
260# print STDERR " have a default block_exp. e.g. by default HTMLPlug blocks\n";
261# print STDERR " any files with .gif, .jpg, .jpeg, .png or .css\n";
262# print STDERR " file extensions.\n\n";
263
264
265# print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
266# print STDERR " converted from these encodings and stored internally as\n";
267# print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n";
268# print STDERR " are:\n";
269
270# print STDERR " auto: Use text categorization algorithm to automatically\n";
271# print STDERR " identify the encoding of each source document. This\n";
272# print STDERR " will be slower than explicitly setting the encoding\n";
273# print STDERR " but will work where more than one encoding is used\n";
274# print STDERR " within the same collection.\n";
275
276# print STDERR " ascii: Plain 7 bit ascii. This may be a bit faster than\n";
277# print STDERR " using iso_8859_1. Beware of using this on a collection\n";
278# print STDERR " of documents that may contain characters outside the\n";
279# print STDERR " plain 7 bit ascii set though (e.g. German or French\n";
280# print STDERR " documents containing accents), use iso_8859_1 instead.\n";
281
282# print STDERR " utf8: either utf8 or unicode -- automatically detected\n";
283# print STDERR " unicode: just unicode\n";
284
285# my $e = $encodings::encodings;
286# foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
287# print STDERR " $enc: $e->{$enc}->{'name'}\n";
288# }
289# print STDERR "\n";
290# print STDERR " -default_encoding Use this encoding if -input_encoding is set to 'auto' and\n";
291# print STDERR " the text categorization algorithm fails to extract the\n";
292# print STDERR " encoding or extracts an encoding unsupported by Greenstone.\n";
293# print STDERR " The default is iso_8859_1.\n\n";
294
295# print STDERR " -extract_language Identify the language of each document and set 'Language'\n";
296# print STDERR " metadata. Note that this will be done automatically if\n";
297# print STDERR " -input_encoding is 'auto'.\n\n";
298# print STDERR " -default_language If Greenstone fails to work out what language a document is\n";
299# print STDERR " the 'Language' metadata element will be set to this value.\n";
300# print STDERR " The default is 'en' (ISO 639 language symbols are used:\n";
301# print STDERR " en = English). Note that if -input_encoding is not set to\n";
302# print STDERR " 'auto' and -extract_language is not set, all documents will\n";
303# print STDERR " have their 'Language' metadata set to this value.\n\n";
304
305# print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n";
306
307# print STDERR " -markup_acronyms Add acronym metadata into document text\n\n";
308
309# print STDERR " -first Comma separated list of first sizes to extract from the\n";
310# print STDERR " text into a metadata field. The field is called 'FirstNNN'.\n\n";
311
312# print STDERR " -extract_email Extract email addresses as metadata\n\n";
313
314# print STDERR " -extract_historical_years Extract time-period information from historical\n";
315# print STDERR " documents. This is stored as metadata with the document.\n";
316# print STDERR " There is a search interface for this metadata, which you \n";
317# print STDERR " can include in your collection by adding the statement:\n";
318# print STDERR " format QueryInterface DateSearch\n";
319# print STDERR " to your collection configuration file\n";
320# print STDERR " -maximum_year The maximum historical date to be used as metadata (in a\n";
321# print STDERR " Common Era date, such as 1950)\n";
322# print STDERR " -maximum_century The maximum named century to be extracted as historical\n";
323# print STDERR " metadata (e.g. 14 will extract all references up to the\n";
324# print STDERR " 14th century)\n";
325# print STDERR " -no_bibliography Do not try and block bibliographic dates when extracting\n";
326# print STDERR " historical dates.\n";
327# print STDERR " -cover_image Will look for a prefix.jpg file (where prefix is the same\n";
328# print STDERR " prefix as the file being processed) and associate it as a\n";
329# print STDERR " cover image\n\n";
330# }
331
332# sub print_usage {
333# print STDERR "\nThis plugin has no plugin specific options\n\n";
334# }
335
336sub new {
337 my $class = shift (@_);
338 my $plugin_name = shift (@_);
339 my $self = {};
340
341 my $enc = "^(";
342 map {$enc .= "$_|";} keys %$encodings::encodings;
343 my $denc = $enc . "ascii|utf8|unicode)\$";
344 $enc .= "ascii|utf8|unicode|auto)\$";
345
346 $self->{'outhandle'} = STDERR;
347 my $year = (localtime)[5]+1900;
348
349 $self->{'textcat'} = new textcat();
350
351 $self->{'num_processed'} = 0;
352 $self->{'num_not_processed'} = 0;
353 $self->{'num_blocked'} = 0;
354 $self->{'num_archives'} = 0;
355
356 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
357 $self->{'option_list'} = [ $options ];
358
359 # general options available to all plugins
360 if (!parsargv::parse(\@_,
361 q^process_exp/.*/^, \$self->{'process_exp'},
362 q^block_exp/.*/^, \$self->{'block_exp'},
363 q^extract_language^, \$self->{'extract_language'},
364 q^extract_acronyms^, \$self->{'extract_acronyms'},
365 q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
366 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
367 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
368 qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
369 q^extract_email^, \$self->{'extract_email'},
370 q^markup_acronyms^, \$self->{'markup_acronyms'},
371 q^default_language/.{2}/en^, \$self->{'default_language'},
372 q^first/.*/^, \$self->{'first'},
373 q^extract_historical_years^, \$self->{'date_extract'},
374 qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
375 q^no_bibliography^, \$self->{'no_biblio'},
376 qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
377 q^cover_image^, \$self->{'cover_image'},
378 "allow_extra_options")) {
379
380 print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
381 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
382 # &print_general_usage($plugin_name);
383 $self->print_txt_usage();
384 die "\n";
385 }
386
387 return bless $self, $class;
388}
389
390# initialize BasPlug options
391# if init() is overridden in a sub-class, remember to call BasPlug::init()
392sub init {
393 my $self = shift (@_);
394 my ($verbosity, $outhandle, $failhandle) = @_;
395
396 # verbosity is passed through from the processor
397 $self->{'verbosity'} = $verbosity;
398
399 # as are the outhandle and failhandle
400 $self->{'outhandle'} = $outhandle if defined $outhandle;
401 $self->{'failhandle'} = $failhandle;
402
403 # set process_exp and block_exp to defaults unless they were
404 # explicitly set
405
406 if ((!$self->is_recursive()) and
407 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
408
409 $self->{'process_exp'} = $self->get_default_process_exp ();
410 if ($self->{'process_exp'} eq "") {
411 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
412 }
413 }
414
415 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
416 $self->{'block_exp'} = $self->get_default_block_exp ();
417 }
418}
419
420sub begin {
421 my $self = shift (@_);
422 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
423 $self->initialise_extractors();
424}
425
426sub end {
427 my ($self) = @_;
428 $self->finalise_extractors();
429}
430
431# this function should be overridden to return 1
432# in recursive plugins
433sub is_recursive {
434 my $self = shift (@_);
435
436 return 0;
437}
438
439sub get_default_block_exp {
440 my $self = shift (@_);
441
442 return "";
443}
444
445sub get_default_process_exp {
446 my $self = shift (@_);
447
448 return "";
449}
450
451# The BasPlug read() function. This function does all the right things
452# to make general options work for a given plugin. It calls the process()
453# function which does all the work specific to a plugin (like the old
454# read functions used to do). Most plugins should define their own
455# process() function and let this read() function keep control.
456#
457# recursive plugins (e.g. RecPlug) and specialized plugins like those
458# capable of processing many documents within a single file (e.g.
459# GMLPlug) should normally implement their own version of read()
460#
461# Return number of files processed, undef if can't process
462# Note that $base_dir might be "" and that $file might
463# include directories
464
465sub read {
466 my $self = shift (@_);
467
468 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
469
470 if ($self->is_recursive()) {
471 die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
472 }
473
474 my $outhandle = $self->{'outhandle'};
475
476 my $filename = $file;
477 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
478
479 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
480 $self->{'num_blocked'} ++;
481 return 0;
482 }
483 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
484 return undef;
485 }
486 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
487
488 # Do encoding stuff
489 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
490
491 # create a new document
492 my $doc_obj = new doc ($filename, "indexed_doc");
493 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
494 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
495 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
496 my ($filemeta) = $file =~ /([^\\\/]+)$/;
497 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
498 if ($self->{'cover_image'}) {
499 $self->associate_cover_image($doc_obj, $filename);
500 }
501
502 # read in file ($text will be in utf8)
503 my $text = "";
504 $self->read_file ($filename, $encoding, $language, \$text);
505
506 if (!length ($text)) {
507 my $plugin_name = ref ($self);
508 print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
509
510 my $failhandle = $self->{'failhandle'};
511 print $failhandle "$file: " . ref($self) . ": file contains no text\n";
512 $self->{'num_not_processed'} ++;
513
514 return 0;
515 }
516
517 # include any metadata passed in from previous plugins
518 # note that this metadata is associated with the top level section
519 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
520
521 # do plugin specific processing of doc_obj
522 return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
523
524 # do any automatic metadata extraction
525 $self->auto_extract_metadata ($doc_obj);
526
527 # add an OID
528 # see if there is a plugin-specific set_OID function...
529 if (defined ($self->can(set_OID))) {
530 # it will need $doc_obj to set the Identifier metadata...
531 $self->set_OID($doc_obj);
532 } else {
533 # use the default set_OID() in doc.pm
534 $doc_obj->set_OID();
535 }
536
537 # process the document
538 $processor->process($doc_obj);
539
540 $self->{'num_processed'} ++;
541
542 return 1; # processed the file
543}
544
545# returns undef if file is rejected by the plugin
546sub process {
547 my $self = shift (@_);
548 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
549
550 die "Basplug::process function must be implemented in sub-class\n";
551
552 return undef; # never gets here
553}
554
555# uses the multiread package to read in the entire file pointed to
556# by filename and loads the resulting text into $$textref. Input text
557# may be in any of the encodings handled by multiread, output text
558# will be in utf8
559sub read_file {
560 my $self = shift (@_);
561 my ($filename, $encoding, $language, $textref) = @_;
562
563 if (!-r $filename)
564 {
565 my $outhandle = $self->{'outhandle'};
566 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
567 return;
568 }
569
570 $$textref = "";
571
572 open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
573
574 if ($encoding eq "ascii") {
575 undef $/;
576 $$textref = <FILE>;
577 $/ = "\n";
578 } else {
579 my $reader = new multiread();
580 $reader->set_handle ('BasPlug::FILE');
581 $reader->set_encoding ($encoding);
582 $reader->read_file ($textref);
583
584 if ($language eq "zh") {
585 # segment the Chinese words
586 $$textref = &cnseg::segment($$textref);
587 }
588 }
589
590 close FILE;
591}
592
593sub textcat_get_language_encoding {
594 my $self = shift (@_);
595 my ($filename) = @_;
596
597 my ($language, $encoding, $extracted_encoding);
598 if ($self->{'input_encoding'} eq "auto") {
599 # use textcat to automatically work out the input encoding and language
600 ($language, $encoding) = $self->get_language_encoding ($filename);
601 } elsif ($self->{'extract_language'}) {
602 # use textcat to get language metadata
603 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
604 $encoding = $self->{'input_encoding'};
605 if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
606 my $plugin_name = ref ($self);
607 my $outhandle = $self->{'outhandle'};
608 print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
609 print $outhandle "appears to be encoded as $extracted_encoding.\n";
610 }
611 } else {
612 $language = $self->{'default_language'};
613 $encoding = $self->{'input_encoding'};
614 }
615 return ($language, $encoding);
616}
617
618# Uses textcat to work out the encoding and language of the text in
619# $filename. All html tags are removed before processing.
620# returns an array containing "language" and "encoding"
621sub get_language_encoding {
622 my $self = shift (@_);
623 my ($filename) = @_;
624 my $outhandle = $self->{'outhandle'};
625
626 # read in file
627 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
628 undef $/;
629 my $text = <FILE>;
630 $/ = "\n";
631 close FILE;
632
633 # remove <title>stuff</title> -- as titles tend often to be in English
634 # for foreign language documents
635 $text =~ s/<title>.*?<\/title>//i;
636
637 # remove all HTML tags
638 $text =~ s/<[^>]*>//sg;
639
640 # get the language/encoding
641 my $results = $self->{'textcat'}->classify(\$text);
642
643 # if textcat returns 3 or less possibilities we'll use the
644 # first one in the list - otherwise use the defaults
645 if (scalar @$results > 3) {
646 # changed 12 Feb 2003 by jrm21
647 # use the most popular encoding at least... otherwise we might
648 # generate invalid archive files!
649 my %guessed_encodings = ();
650 foreach my $result (@$results) {
651 $result =~ /([^\-]+)$/;
652 my $enc=$1;
653 if (!defined($guessed_encodings{$enc})) {
654 $guessed_encodings{$enc}=0;
655 }
656 $guessed_encodings{$enc}++;
657 }
658 my $best_encoding="";
659 $guessed_encodings{""}=-1;
660 foreach my $enc (keys %guessed_encodings) {
661 if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}){
662 $best_encoding=$enc;
663 }
664 }
665
666 if ($self->{'input_encoding'} ne 'auto') {
667 if ($self->{'extract_language'} && $self->{'verbosity'}) {
668 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
669 print $outhandle "defaulting to $self->{'default_language'}\n";
670 }
671 return ($self->{'default_language'}, $self->{'input_encoding'});
672
673 } else {
674 if ($self->{'verbosity'}) {
675 print $outhandle "BASPlug: WARNING: language could not be extracted from $filename - ";
676 print $outhandle "defaulting to $self->{'default_language'}.\n";
677 }
678 return ($self->{'default_language'}, $best_encoding);
679 }
680 }
681
682 # format language/encoding
683 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
684 if (!defined $language) {
685 if ($self->{'verbosity'}) {
686 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
687 print $outhandle "defaulting to $self->{'default_language'}\n";
688 }
689 $language = $self->{'default_language'};
690 }
691 if (!defined $encoding) {
692 if ($self->{'verbosity'}) {
693 print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
694 print $outhandle "defaulting to $self->{'default_encoding'}\n";
695 }
696 $encoding = $self->{'default_encoding'};
697 }
698
699 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
700 !defined $encodings::encodings->{$encoding}) {
701 if ($self->{'verbosity'}) {
702 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
703 print $outhandle "using $self->{'default_encoding'}\n";
704 }
705 $encoding = $self->{'default_encoding'};
706 }
707
708 return ($language, $encoding);
709}
710
711# add any extra metadata that's been passed around from one
712# plugin to another.
713# extra_metadata uses add_utf8_metadata so it expects metadata values
714# to already be in utf8
715sub extra_metadata {
716 my $self = shift (@_);
717 my ($doc_obj, $cursection, $metadata) = @_;
718
719 foreach my $field (keys(%$metadata)) {
720 # $metadata->{$field} may be an array reference
721 if (ref ($metadata->{$field}) eq "ARRAY") {
722 map {
723 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
724 } @{$metadata->{$field}};
725 } else {
726 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
727 }
728 }
729}
730
731# initialise metadata extractors
732sub initialise_extractors {
733 my $self = shift (@_);
734
735 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
736 &acronym::initialise_acronyms();
737 }
738}
739
740# finalise metadata extractors
741sub finalise_extractors {
742 my $self = shift (@_);
743
744 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
745 &acronym::finalise_acronyms();
746 }
747}
748
749# FIRSTNNN: extract the first NNN characters as metadata
750sub extract_first_NNNN_characters {
751 my $self = shift (@_);
752 my ($textref, $doc_obj, $thissection) = @_;
753
754 foreach my $size (split /,/, $self->{'first'}) {
755 my $tmptext = $$textref;
756 $tmptext =~ s/^\s+//;
757 $tmptext =~ s/\s+$//;
758 $tmptext =~ s/\s+/ /gs;
759 $tmptext = substr ($tmptext, 0, $size);
760 $tmptext =~ s/\s\S*$/&#8230;/;
761 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
762 }
763}
764
765sub extract_email {
766 my $self = shift (@_);
767 my ($textref, $doc_obj, $thissection) = @_;
768 my $outhandle = $self->{'outhandle'};
769
770 print $outhandle " extracting email addresses ...\n"
771 if ($self->{'verbosity'} > 2);
772
773 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
774 @email = sort @email;
775
776 my @email2 = ();
777 foreach my $address (@email) {
778 if (!(join(" ",@email2) =~ m/$address/ )) {
779 push @email2, $address;
780 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
781 print $outhandle " extracting $address\n"
782 if ($self->{'verbosity'} > 3);
783 }
784 }
785 print $outhandle " done extracting email addresses.\n"
786 if ($self->{'verbosity'} > 2);
787
788}
789
790# extract metadata
791sub auto_extract_metadata {
792
793
794 my $self = shift (@_);
795 my ($doc_obj) = @_;
796
797 if ($self->{'extract_email'}) {
798 my $thissection = $doc_obj->get_top_section();
799 while (defined $thissection) {
800 my $text = $doc_obj->get_text($thissection);
801 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
802 $thissection = $doc_obj->get_next_section ($thissection);
803 }
804 }
805
806
807#adding kea keyphrases
808 if ($self->{'kea'}) {
809
810 my $thissection = $doc_obj->get_top_section();
811 my $text = "";
812 my @list;
813
814 while (defined $thissection) { #loop through sections to gather whole doc
815 my $sectiontext = $doc_obj->get_text($thissection);
816 $text = $text.$sectiontext;
817 $thissection = $doc_obj->get_next_section ($thissection);
818 }
819
820 if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
821 @list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
822 } else { #otherwise call Kea with no options
823 @list = &Kea::extract_KeyPhrases ($text);
824 }
825
826 if(@list){ #if a list of kea keyphrases was returned (ie not empty)
827 my $keyphrases = $list[0]; #first arg is keyphrase list
828 my $stems = $list[1]; #second arg is stemmed keyphrase list
829 print STDERR "keyphrases: $keyphrases\n";
830 print STDERR "stems: $stems\n";
831 $thissection = $doc_obj->get_top_section(); #add metadata to top section
832 $doc_obj->add_metadata($thissection, "kea", $keyphrases);
833 $doc_obj->add_metadata($thissection, "stems", $stems);
834 }
835 } #end of kea
836
837 if ($self->{'first'}) {
838 my $thissection = $doc_obj->get_top_section();
839 while (defined $thissection) {
840 my $text = $doc_obj->get_text($thissection);
841 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
842 $thissection = $doc_obj->get_next_section ($thissection);
843 }
844 }
845
846 if ($self->{'extract_acronyms'}) {
847 my $thissection = $doc_obj->get_top_section();
848 while (defined $thissection) {
849 my $text = $doc_obj->get_text($thissection);
850 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
851 $thissection = $doc_obj->get_next_section ($thissection);
852 }
853 }
854
855 if ($self->{'markup_acronyms'}) {
856 my $thissection = $doc_obj->get_top_section();
857 while (defined $thissection) {
858 my $text = $doc_obj->get_text($thissection);
859 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
860 $doc_obj->delete_text($thissection);
861 $doc_obj->add_text($thissection, $text);
862 $thissection = $doc_obj->get_next_section ($thissection);
863 }
864 }
865
866 if($self->{'date_extract'}) {
867 my $thissection = $doc_obj->get_top_section();
868 while (defined $thissection) {
869
870 my $text = $doc_obj->get_text($thissection);
871 &DateExtract::get_date_metadata($text, $doc_obj,
872 $thissection,
873 $self->{'no_biblio'},
874 $self->{'max_year'},
875 $self->{'max_century'});
876 $thissection = $doc_obj->get_next_section ($thissection);
877 }
878 }
879}
880
881# extract acronyms from a section in a document. progress is
882# reported to outhandle based on the verbosity. both the Acronym
883# and the AcronymKWIC metadata items are created.
884
885sub extract_acronyms {
886 my $self = shift (@_);
887 my ($textref, $doc_obj, $thissection) = @_;
888 my $outhandle = $self->{'outhandle'};
889
890 print $outhandle " extracting acronyms ...\n"
891 if ($self->{'verbosity'} > 2);
892
893 my $acro_array = &acronym::acronyms($textref);
894
895 foreach my $acro (@$acro_array) {
896
897 #check that this is the first time ...
898 my $seen_before = "false";
899 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
900 foreach my $thisAcro (@$previous_data) {
901 if ($thisAcro eq $acro->to_string()) {
902 $seen_before = "true";
903 print $outhandle " already seen ". $acro->to_string() . "\n"
904 if ($self->{'verbosity'} >= 4);
905 }
906 }
907
908 if ($seen_before eq "false") {
909 #write it to the file ...
910 $acro->write_to_file();
911
912 #do the normal acronym
913 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
914 print $outhandle " adding ". $acro->to_string() . "\n"
915 if ($self->{'verbosity'} > 3);
916
917 }
918 }
919 print $outhandle " done extracting acronyms. \n"
920 if ($self->{'verbosity'} > 2);
921}
922
923sub markup_acronyms {
924 my $self = shift (@_);
925 my ($text, $doc_obj, $thissection) = @_;
926 my $outhandle = $self->{'outhandle'};
927
928 print $outhandle " marking up acronyms ...\n"
929 if ($self->{'verbosity'} > 2);
930
931 #self is passed in to check for verbosity ...
932 $text = &acronym::markup_acronyms($text, $self);
933
934 print $outhandle " done marking up acronyms. \n"
935 if ($self->{'verbosity'} > 2);
936
937 return $text;
938}
939
940sub compile_stats {
941 my $self = shift(@_);
942 my ($stats) = @_;
943
944 $stats->{'num_processed'} += $self->{'num_processed'};
945 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
946 $stats->{'num_archives'} += $self->{'num_archives'};
947
948}
949
950sub associate_cover_image {
951 my $self = shift(@_);
952 my ($doc_obj, $filename) = @_;
953
954 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
955 if (-e $filename) {
956 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
957 } else {
958 $filename =~ s/jpg$/JPG/;
959 if (-e $filename) {
960 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
961 }
962 }
963}
964
9651;
Note: See TracBrowser for help on using the repository browser.