source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 3614

Last change on this file since 3614 was 3540, checked in by kjdon, 22 years ago

added John T's changes into CVS - added info to enable retrieval of usage info in xml

  • Property svn:keywords set to Author Date Id Revision
File size: 33.1 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28# suppress the annoying "subroutine redefined" warning that various
29# plugins cause under perl 5.6
30$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
31
32use Kea;
33use parsargv;
34use multiread;
35use encodings;
36use cnseg;
37use acronym;
38use textcat;
39use doc;
40use diagnostics;
41use DateExtract;
42use ghtml;
43
44my $unicode_list =
45[ { 'name' => "auto",
46 'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } ,
47 { 'name' => "ascii",
48 'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." },
49 { 'name' => "utf8",
50 'desc' => "either utf8 or unicode -- automatically detected." },
51 { 'name' => "unicode",
52 'desc' => "just unicode" } ];
53
54my $arguments =
55 [ { 'name' => "process_exp",
56 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
57 'type' => "string",
58 'deft' => "",
59 'reqd' => "no" },
60 { 'name' => "block_exp",
61 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
62 'type' => 'string',
63 'deft' => "",
64 'reqd' => "no" },
65 { 'name' => "input_encoding",
66 'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8. The default input_encoding is 'auto'.",
67 'type' => "enum",
68 'list' => $unicode_list,
69 'reqd' => "no" ,
70 'deft' => "auto" } ,
71 { 'name' => "default_encoding",
72 'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone. The default is iso_8859_1.",
73 'type' => "flag",
74 'reqd' => "no" },
75 { 'name' => "extract_language",
76 'desc' => "Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.",
77 'type' => "flag",
78 'reqd' => "no" },
79 { 'name' => "default_language",
80 'desc' => "If Greenstone fails to work out what language a document is the 'Language' metadata element will be set to this value. The default is 'en' (ISO 639 language symbols are used: en = English). Note that if -input_encoding is not set to 'auto' and -extract_language is not set, all documents will have their 'Language' metadata set to this value.",
81 'type' => "language",
82 'deft' => "en",
83 'reqd' => "no" },
84 { 'name' => "extract_acronyms",
85 'desc' => "Extract acronyms from within text and set as metadata.",
86 'type' => "flag",
87 'reqd' => "no" },
88 { 'name' => "markup_acronyms",
89 'desc' => "Add acronym metadata into document text.",
90 'type' => "flag",
91 'reqd' => "no" },
92 { 'name' => "first",
93 'desc' => "Comma separated list of first sizes to extract from the text into a metadata field. The field is called 'FirstNNN'.",
94 'type' => "string",
95 'reqd' => "no" },
96 { 'name' => "extract_email",
97 'desc' => "Extract email addresses as metadata.",
98 'type' => "flag",
99 'reqd' => "no" },
100 { 'name' => "extract_historical_years",
101 'desc' => "Extract time-period information from historical documents. This is stored as metadata with the document. There is a search interface for this metadata, which you can include in your collection by adding the statement, \"format QueryInterface DateSearch\" to your collection configuration file.",
102 'type' => "flag",
103 'reqd' => "no" },
104 { 'name' => "maximum_year",
105 'desc' => "The maximum historical date to be used as metadata (in a Common Era date, such as 1950).",
106 'type' => "int",
107 'reqd' => "no"},
108 { 'name' => "maximum_century",
109 'desc' => "The maximum named century to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century).",
110 'type' => "int",
111 'reqd' => "no" },
112 { 'name' => "no_bibliography",
113 'desc' => "Do not try and block bibliographic dates when extracting historical dates.",
114 'type' => "flag",
115 'reqd' => "no"},
116 { 'name' => "cover_image",
117 'desc' => "Will look for a prefix.jpg file (where prefix is the same prefix as the file being processed) and associate it as a cover image.",
118 'type' => "flag",
119 'reqd' => "no" } ];
120
121my $options = { 'name' => "BasPlug",
122 'desc' => "Base class for all the import plugins.",
123 'inherits' => "No",
124 'args' => $arguments,
125 'process_exp' => "",
126 'block_exp' => "" };
127
128sub print_xml_usage {
129 my $self = shift (@_);
130 print STDERR "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n";
131 $self->print_xml();
132}
133
134sub print_xml {
135 my $self = shift (@_);
136 my $option_list = $self->{'option_list'};
137 my $option = pop( @{$option_list} );
138 if(defined $option)
139 {
140 print STDERR "<PlugInfo>\n";
141 print STDERR " <Name>$option->{'name'}</Name>\n";
142 print STDERR " <Desc>$option->{'desc'}</Desc>\n";
143 print STDERR " <Inherits>$option->{'inherits'}</Inherits>\n";
144 print STDERR " <Arguments>\n";
145 if(defined $option->{'args'})
146 {
147 my $args = $option->{'args'};
148 my $x;
149 foreach $x ( @{$args} )
150 {
151 print STDERR " <Option>\n";
152 print STDERR " <Name>$x->{'name'}</Name>\n";
153 print STDERR " <Desc>$x->{'desc'}</Desc>\n";
154 print STDERR " <Type>$x->{'type'}</Type>\n";
155 print STDERR " <Required>$x->{'reqd'}</Required>\n";
156 if(defined $x->{'list'})
157 {
158 print STDERR " <List>\n";
159 my $list = $x->{'list'};
160 my $y;
161 foreach $y ( @{$list} )
162 {
163 print STDERR " <Value>\n";
164 print STDERR " <Name>$y->{'name'}</Name>\n";
165 print STDERR " <Desc>$y->{'desc'}</Desc>\n";
166 print STDERR " </Value>\n";
167 }
168 # Special case of 'input_encoding'
169 if( $x->{'name'} =~ m/^input_encoding$/i ) {
170 my $e = $encodings::encodings;
171 foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
172 print STDERR " <Value>\n";
173 print STDERR " <Name>$enc</Name>\n";
174 print STDERR " <Desc>$e->{$enc}->{'name'}</Desc>\n";
175 print STDERR " </Value>\n";
176 }
177 }
178 print STDERR " </List>\n";
179 }
180 if(defined $x->{'deft'})
181 {
182 print STDERR " <Default>$x->{'deft'}</Default>\n";
183 }
184 print STDERR " </Option>\n";
185 }
186 }
187 if(defined $option_list) {
188 $self->print_xml();
189 }
190
191 print STDERR " </Arguments>\n";
192 print STDERR "</PlugInfo>\n";
193 }
194}
195
196sub print_general_usage {
197 my ($plugin_name) = @_;
198
199 print STDERR "\n usage: plugin $plugin_name [options]\n\n";
200
201 print STDERR " -process_exp A perl regular expression to match against filenames.\n";
202 print STDERR " Matching filenames will be processed by this plugin.\n";
203 print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
204 print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
205 print STDERR " .htm or .html (case-insensitive).\n\n";
206
207 print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
208 print STDERR " being passed to any later plugins in the list. This has no\n";
209 print STDERR " real effect other than to prevent lots of warning messages\n";
210 print STDERR " about input files you don't care about. Each plugin might\n";
211 print STDERR " have a default block_exp. e.g. by default HTMLPlug blocks\n";
212 print STDERR " any files with .gif, .jpg, .jpeg, .png or .css\n";
213 print STDERR " file extensions.\n\n";
214
215
216 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
217 print STDERR " converted from these encodings and stored internally as\n";
218 print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n";
219 print STDERR " are:\n";
220
221 print STDERR " auto: Use text categorization algorithm to automatically\n";
222 print STDERR " identify the encoding of each source document. This\n";
223 print STDERR " will be slower than explicitly setting the encoding\n";
224 print STDERR " but will work where more than one encoding is used\n";
225 print STDERR " within the same collection.\n";
226
227 print STDERR " ascii: Plain 7 bit ascii. This may be a bit faster than\n";
228 print STDERR " using iso_8859_1. Beware of using this on a collection\n";
229 print STDERR " of documents that may contain characters outside the\n";
230 print STDERR " plain 7 bit ascii set though (e.g. German or French\n";
231 print STDERR " documents containing accents), use iso_8859_1 instead.\n";
232
233 print STDERR " utf8: either utf8 or unicode -- automatically detected\n";
234 print STDERR " unicode: just unicode\n";
235
236 my $e = $encodings::encodings;
237 foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
238 print STDERR " $enc: $e->{$enc}->{'name'}\n";
239 }
240 print STDERR "\n";
241 print STDERR " -default_encoding Use this encoding if -input_encoding is set to 'auto' and\n";
242 print STDERR " the text categorization algorithm fails to extract the\n";
243 print STDERR " encoding or extracts an encoding unsupported by Greenstone.\n";
244 print STDERR " The default is iso_8859_1.\n\n";
245
246 print STDERR " -extract_language Identify the language of each document and set 'Language'\n";
247 print STDERR " metadata. Note that this will be done automatically if\n";
248 print STDERR " -input_encoding is 'auto'.\n\n";
249 print STDERR " -default_language If Greenstone fails to work out what language a document is\n";
250 print STDERR " the 'Language' metadata element will be set to this value.\n";
251 print STDERR " The default is 'en' (ISO 639 language symbols are used:\n";
252 print STDERR " en = English). Note that if -input_encoding is not set to\n";
253 print STDERR " 'auto' and -extract_language is not set, all documents will\n";
254 print STDERR " have their 'Language' metadata set to this value.\n\n";
255
256 print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n";
257
258 print STDERR " -markup_acronyms Add acronym metadata into document text\n\n";
259
260 print STDERR " -first Comma separated list of first sizes to extract from the\n";
261 print STDERR " text into a metadata field. The field is called 'FirstNNN'.\n\n";
262
263 print STDERR " -extract_email Extract email addresses as metadata\n\n";
264
265 print STDERR " -extract_historical_years Extract time-period information from historical\n";
266 print STDERR " documents. This is stored as metadata with the document.\n";
267 print STDERR " There is a search interface for this metadata, which you \n";
268 print STDERR " can include in your collection by adding the statement:\n";
269 print STDERR " format QueryInterface DateSearch\n";
270 print STDERR " to your collection configuration file\n";
271 print STDERR " -maximum_year The maximum historical date to be used as metadata (in a\n";
272 print STDERR " Common Era date, such as 1950)\n";
273 print STDERR " -maximum_century The maximum named century to be extracted as historical\n";
274 print STDERR " metadata (e.g. 14 will extract all references up to the\n";
275 print STDERR " 14th century)\n";
276 print STDERR " -no_bibliography Do not try and block bibliographic dates when extracting\n";
277 print STDERR " historical dates.\n";
278 print STDERR " -cover_image Will look for a prefix.jpg file (where prefix is the same\n";
279 print STDERR " prefix as the file being processed) and associate it as a\n";
280 print STDERR " cover image\n\n";
281}
282
283# print_usage should be overridden for any sub-classes having
284# their own plugin specific options
285sub print_usage {
286 print STDERR "\nThis plugin has no plugin specific options\n\n";
287}
288
289sub new {
290 my $class = shift (@_);
291 my $plugin_name = shift (@_);
292 my $self = {};
293
294 my $enc = "^(";
295 map {$enc .= "$_|";} keys %$encodings::encodings;
296 my $denc = $enc . "ascii|utf8|unicode)\$";
297 $enc .= "ascii|utf8|unicode|auto)\$";
298
299 $self->{'outhandle'} = STDERR;
300 my $year = (localtime)[5]+1900;
301
302 $self->{'textcat'} = new textcat();
303
304 $self->{'num_processed'} = 0;
305 $self->{'num_not_processed'} = 0;
306 $self->{'num_blocked'} = 0;
307 $self->{'num_archives'} = 0;
308
309 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
310 $self->{'option_list'} = [ $options ];
311
312 # general options available to all plugins
313 if (!parsargv::parse(\@_,
314 q^process_exp/.*/^, \$self->{'process_exp'},
315 q^block_exp/.*/^, \$self->{'block_exp'},
316 q^extract_acronyms^, \$self->{'extract_acronyms'},
317 q^extract_keyphrases^, \$self->{'kea'}, #with extra options
318 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options
319 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
320 qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
321 q^extract_email^, \$self->{'extract_email'},
322 q^markup_acronyms^, \$self->{'markup_acronyms'},
323 q^default_language/.{2}/en^, \$self->{'default_language'},
324 q^first/.*/^, \$self->{'first'},
325 q^extract_historical_years^, \$self->{'date_extract'},
326 qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
327 q^no_bibliography^, \$self->{'no_biblio'},
328 qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
329 q^cover_image^, \$self->{'cover_image'},
330 "allow_extra_options")) {
331
332 print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
333 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
334 &print_general_usage($plugin_name);
335 die "\n";
336 }
337
338 return bless $self, $class;
339}
340
341# initialize BasPlug options
342# if init() is overridden in a sub-class, remember to call BasPlug::init()
343sub init {
344 my $self = shift (@_);
345 my ($verbosity, $outhandle, $failhandle) = @_;
346
347 # verbosity is passed through from the processor
348 $self->{'verbosity'} = $verbosity;
349
350 # as are the outhandle and failhandle
351 $self->{'outhandle'} = $outhandle if defined $outhandle;
352 $self->{'failhandle'} = $failhandle;
353
354 # set process_exp and block_exp to defaults unless they were
355 # explicitly set
356
357 if ((!$self->is_recursive()) and
358 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
359
360 $self->{'process_exp'} = $self->get_default_process_exp ();
361 if ($self->{'process_exp'} eq "") {
362 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
363 }
364 }
365
366 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
367 $self->{'block_exp'} = $self->get_default_block_exp ();
368 }
369}
370
371sub begin {
372 my $self = shift (@_);
373 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
374 $self->initialise_extractors();
375}
376
377sub end {
378 my ($self) = @_;
379 $self->finalise_extractors();
380}
381
382# this function should be overridden to return 1
383# in recursive plugins
384sub is_recursive {
385 my $self = shift (@_);
386
387 return 0;
388}
389
390sub get_default_block_exp {
391 my $self = shift (@_);
392
393 return "";
394}
395
396sub get_default_process_exp {
397 my $self = shift (@_);
398
399 return "";
400}
401
402# The BasPlug read() function. This function does all the right things
403# to make general options work for a given plugin. It calls the process()
404# function which does all the work specific to a plugin (like the old
405# read functions used to do). Most plugins should define their own
406# process() function and let this read() function keep control.
407#
408# recursive plugins (e.g. RecPlug) and specialized plugins like those
409# capable of processing many documents within a single file (e.g.
410# GMLPlug) should normally implement their own version of read()
411#
412# Return number of files processed, undef if can't process
413# Note that $base_dir might be "" and that $file might
414# include directories
415
416sub read {
417 my $self = shift (@_);
418
419 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
420
421 if ($self->is_recursive()) {
422 die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
423 }
424
425 my $outhandle = $self->{'outhandle'};
426
427 my $filename = $file;
428 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
429
430 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
431 $self->{'num_blocked'} ++;
432 return 0;
433 }
434 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
435 return undef;
436 }
437 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
438
439 # Do encoding stuff
440 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
441
442 # create a new document
443 my $doc_obj = new doc ($filename, "indexed_doc");
444 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
445 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
446 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
447 my ($filemeta) = $file =~ /([^\\\/]+)$/;
448 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
449 if ($self->{'cover_image'}) {
450 $self->associate_cover_image($doc_obj, $filename);
451 }
452
453 # read in file ($text will be in utf8)
454 my $text = "";
455 $self->read_file ($filename, $encoding, $language, \$text);
456
457 if (!length ($text)) {
458 my $plugin_name = ref ($self);
459 print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
460
461 my $failhandle = $self->{'failhandle'};
462 print $failhandle "$file: " . ref($self) . ": file contains no text\n";
463 $self->{'num_not_processed'} ++;
464
465 return 0;
466 }
467
468 # include any metadata passed in from previous plugins
469 # note that this metadata is associated with the top level section
470 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
471
472 # do plugin specific processing of doc_obj
473 return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
474
475 # do any automatic metadata extraction
476 $self->auto_extract_metadata ($doc_obj);
477
478 # add an OID
479 # see if there is a plugin-specific set_OID function...
480 if (defined ($self->can(set_OID))) {
481 # it will need $doc_obj to set the Identifier metadata...
482 $self->set_OID($doc_obj);
483 } else {
484 # use the default set_OID() in doc.pm
485 $doc_obj->set_OID();
486 }
487
488 # process the document
489 $processor->process($doc_obj);
490
491 $self->{'num_processed'} ++;
492
493 return 1; # processed the file
494}
495
496# returns undef if file is rejected by the plugin
497sub process {
498 my $self = shift (@_);
499 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
500
501 die "Basplug::process function must be implemented in sub-class\n";
502
503 return undef; # never gets here
504}
505
506# uses the multiread package to read in the entire file pointed to
507# by filename and loads the resulting text into $$textref. Input text
508# may be in any of the encodings handled by multiread, output text
509# will be in utf8
510sub read_file {
511 my $self = shift (@_);
512 my ($filename, $encoding, $language, $textref) = @_;
513
514 if (!-r $filename)
515 {
516 my $outhandle = $self->{'outhandle'};
517 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
518 return;
519 }
520
521 $$textref = "";
522
523 open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
524
525 if ($encoding eq "ascii") {
526 undef $/;
527 $$textref = <FILE>;
528 $/ = "\n";
529 } else {
530 my $reader = new multiread();
531 $reader->set_handle ('BasPlug::FILE');
532 $reader->set_encoding ($encoding);
533 $reader->read_file ($textref);
534
535 if ($language eq "zh") {
536 # segment the Chinese words
537 $$textref = &cnseg::segment($$textref);
538 }
539 }
540
541 close FILE;
542}
543
544sub textcat_get_language_encoding {
545 my $self = shift (@_);
546 my ($filename) = @_;
547
548 my ($language, $encoding, $extracted_encoding);
549 if ($self->{'input_encoding'} eq "auto") {
550 # use textcat to automatically work out the input encoding and language
551 ($language, $encoding) = $self->get_language_encoding ($filename);
552 } elsif ($self->{'extract_language'}) {
553 # use textcat to get language metadata
554 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
555 $encoding = $self->{'input_encoding'};
556 if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
557 my $plugin_name = ref ($self);
558 my $outhandle = $self->{'outhandle'};
559 print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
560 print $outhandle "appears to be encoded as $extracted_encoding.\n";
561 }
562 } else {
563 $language = $self->{'default_language'};
564 $encoding = $self->{'input_encoding'};
565 }
566 return ($language, $encoding);
567}
568
569# Uses textcat to work out the encoding and language of the text in
570# $filename. All html tags are removed before processing.
571# returns an array containing "language" and "encoding"
572sub get_language_encoding {
573 my $self = shift (@_);
574 my ($filename) = @_;
575 my $outhandle = $self->{'outhandle'};
576
577 # read in file
578 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
579 undef $/;
580 my $text = <FILE>;
581 $/ = "\n";
582 close FILE;
583
584 # remove <title>stuff</title> -- as titles tend often to be in English
585 # for foreign language documents
586 $text =~ s/<title>.*?<\/title>//i;
587
588 # remove all HTML tags
589 $text =~ s/<[^>]*>//sg;
590
591 # get the language/encoding
592 my $results = $self->{'textcat'}->classify(\$text);
593
594 # if textcat returns 3 or less possibilities we'll use the
595 # first one in the list - otherwise use the defaults
596 if (scalar @$results > 3) {
597
598 if ($self->{'input_encoding'} ne 'auto') {
599 if ($self->{'extract_language'} && $self->{'verbosity'}) {
600 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
601 print $outhandle "defaulting to $self->{'default_language'}\n";
602 }
603 return ($self->{'default_language'}, $self->{'input_encoding'});
604
605 } else {
606 if ($self->{'verbosity'}) {
607 print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
608 print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
609 }
610 return ($self->{'default_language'}, $self->{'default_encoding'});
611 }
612 }
613
614 # format language/encoding
615 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
616 if (!defined $language) {
617 if ($self->{'verbosity'}) {
618 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
619 print $outhandle "defaulting to $self->{'default_language'}\n";
620 }
621 $language = $self->{'default_language'};
622 }
623 if (!defined $encoding) {
624 if ($self->{'verbosity'}) {
625 print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
626 print $outhandle "defaulting to $self->{'default_encoding'}\n";
627 }
628 $encoding = $self->{'default_encoding'};
629 }
630
631 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
632 !defined $encodings::encodings->{$encoding}) {
633 if ($self->{'verbosity'}) {
634 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
635 print $outhandle "using $self->{'default_encoding'}\n";
636 }
637 $encoding = $self->{'default_encoding'};
638 }
639
640 return ($language, $encoding);
641}
642
643# add any extra metadata that's been passed around from one
644# plugin to another.
645# extra_metadata uses add_utf8_metadata so it expects metadata values
646# to already be in utf8
647sub extra_metadata {
648 my $self = shift (@_);
649 my ($doc_obj, $cursection, $metadata) = @_;
650
651 foreach my $field (keys(%$metadata)) {
652 # $metadata->{$field} may be an array reference
653 if (ref ($metadata->{$field}) eq "ARRAY") {
654 map {
655 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
656 } @{$metadata->{$field}};
657 } else {
658 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
659 }
660 }
661}
662
663# initialise metadata extractors
664sub initialise_extractors {
665 my $self = shift (@_);
666
667 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
668 &acronym::initialise_acronyms();
669 }
670}
671
672# finalise metadata extractors
673sub finalise_extractors {
674 my $self = shift (@_);
675
676 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
677 &acronym::finalise_acronyms();
678 }
679}
680
681# FIRSTNNN: extract the first NNN characters as metadata
682sub extract_first_NNNN_characters {
683 my $self = shift (@_);
684 my ($textref, $doc_obj, $thissection) = @_;
685
686 foreach my $size (split /,/, $self->{'first'}) {
687 my $tmptext = $$textref;
688 $tmptext =~ s/^\s+//;
689 $tmptext =~ s/\s+$//;
690 $tmptext =~ s/\s+/ /gs;
691 $tmptext = substr ($tmptext, 0, $size);
692 $tmptext =~ s/\s\S*$/&#8230;/;
693 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
694 }
695}
696
697sub extract_email {
698 my $self = shift (@_);
699 my ($textref, $doc_obj, $thissection) = @_;
700 my $outhandle = $self->{'outhandle'};
701
702 print $outhandle " extracting email addresses ...\n"
703 if ($self->{'verbosity'} > 2);
704
705 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
706 @email = sort @email;
707
708 my @email2 = ();
709 foreach my $address (@email) {
710 if (!(join(" ",@email2) =~ m/$address/ )) {
711 push @email2, $address;
712 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
713 print $outhandle " extracting $address\n"
714 if ($self->{'verbosity'} > 3);
715 }
716 }
717 print $outhandle " done extracting email addresses.\n"
718 if ($self->{'verbosity'} > 2);
719
720}
721
722# extract metadata
723sub auto_extract_metadata {
724
725
726 my $self = shift (@_);
727 my ($doc_obj) = @_;
728
729 if ($self->{'extract_email'}) {
730 my $thissection = $doc_obj->get_top_section();
731 while (defined $thissection) {
732 my $text = $doc_obj->get_text($thissection);
733 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
734 $thissection = $doc_obj->get_next_section ($thissection);
735 }
736 }
737
738
739#adding kea keyphrases
740 if ($self->{'kea'}) {
741
742 my $thissection = $doc_obj->get_top_section();
743 my $text = "";
744 my @list;
745
746 while (defined $thissection) { #loop through sections to gather whole doc
747 my $sectiontext = $doc_obj->get_text($thissection);
748 $text = $text.$sectiontext;
749 $thissection = $doc_obj->get_next_section ($thissection);
750 }
751
752 if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
753 @list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
754 } else { #otherwise call Kea with no options
755 @list = &Kea::extract_KeyPhrases ($text);
756 }
757
758 if(@list){ #if a list of kea keyphrases was returned (ie not empty)
759 my $keyphrases = $list[0]; #first arg is keyphrase list
760 my $stems = $list[1]; #second arg is stemmed keyphrase list
761 print STDERR "keyphrases: $keyphrases\n";
762 print STDERR "stems: $stems\n";
763 $thissection = $doc_obj->get_top_section(); #add metadata to top section
764 $doc_obj->add_metadata($thissection, "kea", $keyphrases);
765 $doc_obj->add_metadata($thissection, "stems", $stems);
766 }
767 } #end of kea
768
769 if ($self->{'first'}) {
770 my $thissection = $doc_obj->get_top_section();
771 while (defined $thissection) {
772 my $text = $doc_obj->get_text($thissection);
773 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
774 $thissection = $doc_obj->get_next_section ($thissection);
775 }
776 }
777
778 if ($self->{'extract_acronyms'}) {
779 my $thissection = $doc_obj->get_top_section();
780 while (defined $thissection) {
781 my $text = $doc_obj->get_text($thissection);
782 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
783 $thissection = $doc_obj->get_next_section ($thissection);
784 }
785 }
786
787 if ($self->{'markup_acronyms'}) {
788 my $thissection = $doc_obj->get_top_section();
789 while (defined $thissection) {
790 my $text = $doc_obj->get_text($thissection);
791 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
792 $doc_obj->delete_text($thissection);
793 $doc_obj->add_text($thissection, $text);
794 $thissection = $doc_obj->get_next_section ($thissection);
795 }
796 }
797
798 if($self->{'date_extract'}) {
799 my $thissection = $doc_obj->get_top_section();
800 while (defined $thissection) {
801
802 my $text = $doc_obj->get_text($thissection);
803 &DateExtract::get_date_metadata($text, $doc_obj,
804 $thissection,
805 $self->{'no_biblio'},
806 $self->{'max_year'},
807 $self->{'max_century'});
808 $thissection = $doc_obj->get_next_section ($thissection);
809 }
810 }
811}
812
813# extract acronyms from a section in a document. progress is
814# reported to outhandle based on the verbosity. both the Acronym
815# and the AcronymKWIC metadata items are created.
816
817sub extract_acronyms {
818 my $self = shift (@_);
819 my ($textref, $doc_obj, $thissection) = @_;
820 my $outhandle = $self->{'outhandle'};
821
822 print $outhandle " extracting acronyms ...\n"
823 if ($self->{'verbosity'} > 2);
824
825 my $acro_array = &acronym::acronyms($textref);
826
827 foreach my $acro (@$acro_array) {
828
829 #check that this is the first time ...
830 my $seen_before = "false";
831 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
832 foreach my $thisAcro (@$previous_data) {
833 if ($thisAcro eq $acro->to_string()) {
834 $seen_before = "true";
835 print $outhandle " already seen ". $acro->to_string() . "\n"
836 if ($self->{'verbosity'} >= 4);
837 }
838 }
839
840 if ($seen_before eq "false") {
841 #write it to the file ...
842 $acro->write_to_file();
843
844 #do the normal acronym
845 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
846 print $outhandle " adding ". $acro->to_string() . "\n"
847 if ($self->{'verbosity'} > 3);
848
849 }
850 }
851 print $outhandle " done extracting acronyms. \n"
852 if ($self->{'verbosity'} > 2);
853}
854
855sub markup_acronyms {
856 my $self = shift (@_);
857 my ($text, $doc_obj, $thissection) = @_;
858 my $outhandle = $self->{'outhandle'};
859
860 print $outhandle " marking up acronyms ...\n"
861 if ($self->{'verbosity'} > 2);
862
863 #self is passed in to check for verbosity ...
864 $text = &acronym::markup_acronyms($text, $self);
865
866 print $outhandle " done marking up acronyms. \n"
867 if ($self->{'verbosity'} > 2);
868
869 return $text;
870}
871
872sub compile_stats {
873 my $self = shift(@_);
874 my ($stats) = @_;
875
876 $stats->{'num_processed'} += $self->{'num_processed'};
877 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
878 $stats->{'num_archives'} += $self->{'num_archives'};
879
880}
881
882sub associate_cover_image {
883 my $self = shift(@_);
884 my ($doc_obj, $filename) = @_;
885
886 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
887 if (-e $filename) {
888 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
889 } else {
890 $filename =~ s/jpg$/JPG/;
891 if (-e $filename) {
892 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
893 }
894 }
895}
896
8971;
Note: See TracBrowser for help on using the repository browser.