Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 7644

Last change on this file since 7644 was 7644, checked in by jrm21, 20 years ago
don't print "wrong encoding" message for text in english. textcat thinks all english is in iso-8859-1, so basplug complains if the file was read as utf8.
Property svn:keywords set to `Author Date Id Revision`
File size: 30.0 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	eval {require bytes};
29
30	# suppress the annoying "subroutine redefined" warning that various
31	# plugins cause under perl 5.6
32	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34	use Kea;
35	use parsargv;
36	use multiread;
37	use encodings;
38	use cnseg;
39	use acronym;
40	use textcat;
41	use doc;
42	use diagnostics;
43	use DateExtract;
44	use ghtml;
45	use gsprintf;
46	use printusage;
47
48
49	my $unicode_list =
50	[ { 'name' => "auto",
51	'desc' => "{BasPlug.input_encoding.auto}" },
52	{ 'name' => "ascii",
53	'desc' => "{BasPlug.input_encoding.ascii}" },
54	{ 'name' => "utf8",
55	'desc' => "{BasPlug.input_encoding.utf8}" },
56	{ 'name' => "unicode",
57	'desc' => "{BasPlug.input_encoding.unicode}" } ];
58
59	my $arguments =
60	[ { 'name' => "process_exp",
61	'desc' => "{BasPlug.process_exp}",
62	'type' => "regexp",
63	'deft' => "",
64	'reqd' => "no" },
65	{ 'name' => "block_exp",
66	'desc' => "{BasPlug.block_exp}",
67	'type' => "regexp",
68	'deft' => "",
69	'reqd' => "no" },
70	{ 'name' => "input_encoding",
71	'desc' => "{BasPlug.input_encoding}",
72	'type' => "enum",
73	'list' => $unicode_list,
74	'reqd' => "no" ,
75	'deft' => "auto" } ,
76	{ 'name' => "default_encoding",
77	'desc' => "{BasPlug.default_encoding}",
78	'type' => "enum",
79	'list' => $unicode_list,
80	'reqd' => "no",
81	'deft' => "utf8" },
82	{ 'name' => "extract_language",
83	'desc' => "{BasPlug.extract_language}",
84	'type' => "flag",
85	'reqd' => "no" },
86	{ 'name' => "default_language",
87	'desc' => "{BasPlug.default_language}",
88	'type' => "language",
89	'deft' => "en",
90	'reqd' => "no" },
91	{ 'name' => "extract_acronyms",
92	'desc' => "{BasPlug.extract_acronyms}",
93	'type' => "flag",
94	'reqd' => "no" },
95	{ 'name' => "markup_acronyms",
96	'desc' => "{BasPlug.markup_acronyms}",
97	'type' => "flag",
98	'reqd' => "no" },
99	{ 'name' => "first",
100	'desc' => "{BasPlug.first}",
101	'type' => "string",
102	'reqd' => "no" },
103	{ 'name' => "extract_email",
104	'desc' => "{BasPlug.extract_email}",
105	'type' => "flag",
106	'reqd' => "no" },
107	{ 'name' => "extract_historical_years",
108	'desc' => "{BasPlug.extract_historical_years}",
109	'type' => "flag",
110	'reqd' => "no" },
111	{ 'name' => "maximum_year",
112	'desc' => "{BasPlug.maximum_year}",
113	'type' => "int",
114	'deft' => (localtime)[5]+1900,
115	'reqd' => "no"},
116	{ 'name' => "maximum_century",
117	'desc' => "{BasPlug.maximum_century}",
118	'type' => "string",
119	'deft' => "",
120	'reqd' => "no" },
121	{ 'name' => "no_bibliography",
122	'desc' => "{BasPlug.no_bibliography}",
123	'type' => "flag",
124	'reqd' => "no"},
125	{ 'name' => "cover_image",
126	'desc' => "{BasPlug.cover_image}",
127	'type' => "flag",
128	'reqd' => "no" } ];
129
130	my $options = { 'name' => "BasPlug",
131	'desc' => "{BasPlug.desc}",
132	'abstract' => "yes",
133	'inherits' => "no",
134	'args' => $arguments };
135
136
137	sub gsprintf
138	{
139	return &gsprintf::gsprintf(@_);
140	}
141
142
143	sub get_arguments
144	{
145	local $self = shift(@_);
146	local $optionlistref = $self->{'option_list'};
147	local @optionlist = @$optionlistref;
148	local $pluginoptions = pop(@$optionlistref);
149	local $pluginarguments = $pluginoptions->{'args'};
150	return $pluginarguments;
151	}
152
153
154	sub print_xml_usage
155	{
156	local $self = shift(@_);
157
158	# XML output is always in UTF-8
159	&gsprintf::output_strings_in_UTF8;
160
161	&PrintUsage::print_xml_header();
162	$self->print_xml();
163	}
164
165
166	sub print_xml
167	{
168	local $self = shift(@_);
169
170	local $optionlistref = $self->{'option_list'};
171	local @optionlist = @$optionlistref;
172	local $pluginoptions = pop(@$optionlistref);
173	return if (!defined($pluginoptions));
174
175	&gsprintf(STDERR, "<PlugInfo>\n");
176	&gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
177	my $desc = &gsprintf::lookup_string($pluginoptions->{'desc'});
178	$desc =~ s/</&lt;/g; # doubly escaped
179	$desc =~ s/>/&gt;/g;
180
181	&gsprintf(STDERR, " <Desc>$desc</Desc>\n");
182	&gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
183	&gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
184	&gsprintf(STDERR, " <Arguments>\n");
185	if (defined($pluginoptions->{'args'})) {
186	&PrintUsage::print_options_xml($pluginoptions->{'args'});
187	}
188
189	# Recurse up the plugin hierarchy
190	$self->print_xml();
191
192	&gsprintf(STDERR, " </Arguments>\n");
193	&gsprintf(STDERR, "</PlugInfo>\n");
194	}
195
196
197	sub print_txt_usage
198	{
199	local $self = shift(@_);
200
201	# Print the usage message for a plugin (recursively)
202	local $descoffset = $self->determine_description_offset(0);
203	$self->print_plugin_usage($descoffset, 1);
204	}
205
206
207	sub determine_description_offset
208	{
209	local $self = shift(@_);
210	local $maxoffset = shift(@_);
211
212	local $optionlistref = $self->{'option_list'};
213	local @optionlist = @$optionlistref;
214	local $pluginoptions = pop(@$optionlistref);
215	return $maxoffset if (!defined($pluginoptions));
216
217	# Find the length of the longest option string of this plugin
218	local $pluginargs = $pluginoptions->{'args'};
219	if (defined($pluginargs)) {
220	local $longest = &PrintUsage::find_longest_option_string($pluginargs);
221	if ($longest > $maxoffset) {
222	$maxoffset = $longest;
223	}
224	}
225
226	# Recurse up the plugin hierarchy
227	$maxoffset = $self->determine_description_offset($maxoffset);
228	$self->{'option_list'} = \@optionlist;
229	return $maxoffset;
230	}
231
232
233	sub print_plugin_usage
234	{
235	local $self = shift(@_);
236	local $descoffset = shift(@_);
237	local $isleafclass = shift(@_);
238
239	local $optionlistref = $self->{'option_list'};
240	local @optionlist = @$optionlistref;
241	local $pluginoptions = pop(@$optionlistref);
242	return if (!defined($pluginoptions));
243
244	local $pluginname = $pluginoptions->{'name'};
245	local $pluginargs = $pluginoptions->{'args'};
246	local $plugindesc = $pluginoptions->{'desc'};
247
248	# Produce the usage information using the data structure above
249	if ($isleafclass) {
250	if (defined($plugindesc)) {
251	&gsprintf(STDERR, "$plugindesc\n\n");
252	}
253	&gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
254	}
255
256	# Display the plugin options, if there are some
257	if (defined($pluginargs)) {
258	# Calculate the column offset of the option descriptions
259	local $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
260
261	if ($isleafclass) {
262	&gsprintf(STDERR, " {common.specific_options}:\n");
263	}
264	else {
265	&gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
266	}
267
268	# Display the plugin options
269	&PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
270	}
271
272	# Recurse up the plugin hierarchy
273	$self->print_plugin_usage($descoffset, 0);
274	$self->{'option_list'} = \@optionlist;
275	}
276
277
278	sub new {
279	my $class = shift (@_);
280	my $plugin_name = shift (@_);
281	my $self = {};
282	$self->{'plugin_type'} = "BasPlug";
283	my $enc = "^(";
284	map {$enc .= "$_\|";} keys %$encodings::encodings;
285	my $denc = $enc . "ascii\|utf8\|unicode)\$";
286	$enc .= "ascii\|utf8\|unicode\|auto)\$";
287
288	$self->{'outhandle'} = STDERR;
289	my $year = (localtime)[5]+1900;
290
291	$self->{'textcat'} = new textcat();
292
293	$self->{'num_processed'} = 0;
294	$self->{'num_not_processed'} = 0;
295	$self->{'num_blocked'} = 0;
296	$self->{'num_archives'} = 0;
297
298	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
299	$self->{'option_list'} = [ $options ];
300
301	# general options available to all plugins
302	if (!parsargv::parse(\@_,
303	q^process_exp/.*/^, \$self->{'process_exp'},
304	q^block_exp/.*/^, \$self->{'block_exp'},
305	q^extract_language^, \$self->{'extract_language'},
306	q^extract_acronyms^, \$self->{'extract_acronyms'},
307	q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
308	q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
309	qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
310	qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
311	q^extract_email^, \$self->{'extract_email'},
312	q^markup_acronyms^, \$self->{'markup_acronyms'},
313	q^default_language/.{2}/en^, \$self->{'default_language'},
314	q^first/.*/^, \$self->{'first'},
315	q^extract_historical_years^, \$self->{'date_extract'},
316	qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
317	q^no_bibliography^, \$self->{'no_biblio'},
318	qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
319	q^cover_image^, \$self->{'cover_image'},
320	q^separate_cjk^, \$self->{'separate_cjk'},
321	"allow_extra_options")) {
322
323	&gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
324	$self->print_txt_usage(""); # Use default resource bundle
325	die "\n";
326	}
327
328	return bless $self, $class;
329	}
330
331	# initialize BasPlug options
332	# if init() is overridden in a sub-class, remember to call BasPlug::init()
333	sub init {
334	my $self = shift (@_);
335	my ($verbosity, $outhandle, $failhandle) = @_;
336
337	# verbosity is passed through from the processor
338	$self->{'verbosity'} = $verbosity;
339
340	# as are the outhandle and failhandle
341	$self->{'outhandle'} = $outhandle if defined $outhandle;
342	$self->{'failhandle'} = $failhandle;
343
344	# set process_exp and block_exp to defaults unless they were
345	# explicitly set
346
347	if ((!$self->is_recursive()) and
348	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
349
350	$self->{'process_exp'} = $self->get_default_process_exp ();
351	if ($self->{'process_exp'} eq "") {
352	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
353	}
354	}
355
356	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
357	$self->{'block_exp'} = $self->get_default_block_exp ();
358	}
359	}
360
361	sub begin {
362	my $self = shift (@_);
363	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
364	$self->initialise_extractors();
365	}
366
367	sub end {
368	my ($self) = @_;
369	$self->finalise_extractors();
370	}
371
372	# this function should be overridden to return 1
373	# in recursive plugins
374	sub is_recursive {
375	my $self = shift (@_);
376
377	return 0;
378	}
379
380	sub get_default_block_exp {
381	my $self = shift (@_);
382
383	return "";
384	}
385
386	sub get_default_process_exp {
387	my $self = shift (@_);
388
389	return "";
390	}
391
392	# The BasPlug read() function. This function does all the right things
393	# to make general options work for a given plugin. It calls the process()
394	# function which does all the work specific to a plugin (like the old
395	# read functions used to do). Most plugins should define their own
396	# process() function and let this read() function keep control.
397	#
398	# recursive plugins (e.g. RecPlug) and specialized plugins like those
399	# capable of processing many documents within a single file (e.g.
400	# GMLPlug) should normally implement their own version of read()
401	#
402	# Return number of files processed, undef if can't recognise, -1 if can't
403	# process
404	# Note that $base_dir might be "" and that $file might
405	# include directories
406
407	sub read {
408	my $self = shift (@_);
409
410	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
411
412	if ($self->is_recursive()) {
413	&gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
414	}
415
416	my $outhandle = $self->{'outhandle'};
417
418	my $filename = $file;
419	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
420
421	if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
422	$self->{'num_blocked'} ++;
423	return 0; # blocked
424	}
425	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
426	return undef; # can't recognise
427	}
428	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
429
430	# Do encoding stuff
431	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
432
433	# create a new document
434	my $doc_obj = new doc ($filename, "indexed_doc");
435	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
436	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
437	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
438	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
439	my ($filemeta) = $file =~ /([^\\\/]+)$/;
440	# how do we know what encoding the filename is in?
441	$doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
442	if ($self->{'cover_image'}) {
443	$self->associate_cover_image($doc_obj, $filename);
444	}
445
446	# read in file ($text will be in utf8)
447	my $text = "";
448	$self->read_file ($filename, $encoding, $language, \$text);
449
450	if (!length ($text)) {
451	my $plugin_name = ref ($self);
452	&gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
453
454	my $failhandle = $self->{'failhandle'};
455	&gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
456	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
457	$self->{'num_not_processed'} ++;
458
459	return 0; # what should we return here?? error but don't want to pass it on
460	}
461
462	# include any metadata passed in from previous plugins
463	# note that this metadata is associated with the top level section
464	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
465
466	# do plugin specific processing of doc_obj
467	return -1 unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli));
468
469	# do any automatic metadata extraction
470	$self->auto_extract_metadata ($doc_obj);
471
472	# add an OID
473	# see if there is a plugin-specific set_OID function...
474	if (defined ($self->can(set_OID))) {
475	# it will need $doc_obj to set the Identifier metadata...
476	$self->set_OID($doc_obj);
477	} else {
478	# use the default set_OID() in doc.pm
479	$doc_obj->set_OID();
480	}
481
482	# process the document
483	$processor->process($doc_obj);
484
485	$self->{'num_processed'} ++;
486
487	return 1; # processed the file
488	}
489
490	# returns undef if file is rejected by the plugin
491	sub process {
492	my $self = shift (@_);
493	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
494
495	&gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
496	# die "Basplug::process function must be implemented in sub-class\n";
497
498	return undef; # never gets here
499	}
500
501	# uses the multiread package to read in the entire file pointed to
502	# by filename and loads the resulting text into $$textref. Input text
503	# may be in any of the encodings handled by multiread, output text
504	# will be in utf8
505	sub read_file {
506	my $self = shift (@_);
507	my ($filename, $encoding, $language, $textref) = @_;
508
509	if (!-r $filename)
510	{
511	my $outhandle = $self->{'outhandle'};
512	&gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
513	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
514	return;
515	}
516
517	$$textref = "";
518
519	open (FILE, $filename) \|\| (&gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n");
520	# open (FILE, $filename) \|\| die "BasPlug::read_file could not open $filename for reading ($!)\n";
521
522	if ($encoding eq "ascii") {
523	undef $/;
524	$$textref = <FILE>;
525	$/ = "\n";
526	} else {
527	my $reader = new multiread();
528	$reader->set_handle ('BasPlug::FILE');
529	$reader->set_encoding ($encoding);
530	$reader->read_file ($textref);
531
532	#Now segments chinese if the separate_cjk option is set
533	if ($self->{'separate_cjk'}) {
534	# segment the Chinese words
535	$$textref = &cnseg::segment($$textref);
536	}
537	}
538
539	close FILE;
540	}
541
542	sub filename_based_title
543	{
544	my $self = shift (@_);
545	my ($file) = @_;
546
547	my $file_derived_title = $file;
548	$file_derived_title =~ s/_/ /g;
549	$file_derived_title =~ s/\..*?$//;
550
551	return $file_derived_title;
552	}
553
554
555	sub title_fallback
556	{
557	my $self = shift (@_);
558	my ($doc_obj,$section,$file) = @_;
559
560	if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
561
562	my $file_derived_title = $self->filename_based_title($file);
563	$doc_obj->add_metadata ($section, "Title", $file_derived_title);
564	}
565	}
566
567	sub textcat_get_language_encoding {
568	my $self = shift (@_);
569	my ($filename) = @_;
570
571	my ($language, $encoding, $extracted_encoding);
572	if ($self->{'input_encoding'} eq "auto") {
573	# use textcat to automatically work out the input encoding and language
574	($language, $encoding) = $self->get_language_encoding ($filename);
575	} elsif ($self->{'extract_language'}) {
576	# use textcat to get language metadata
577	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
578	$encoding = $self->{'input_encoding'};
579	# don't print this message for english... english in utf8 is identical
580	# to english in iso-8859-1 (except for some punctuation). We don't have
581	# a language model for en_utf8, so textcat always says iso-8859-1!
582	if ($extracted_encoding ne $encoding && $language ne "en"
583	&& $self->{'verbosity'}) {
584	my $plugin_name = ref ($self);
585	my $outhandle = $self->{'outhandle'};
586	&gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
587	# print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
588	# print $outhandle "appears to be encoded as $extracted_encoding.\n";
589	}
590	} else {
591	$language = $self->{'default_language'};
592	$encoding = $self->{'input_encoding'};
593	}
594	return ($language, $encoding);
595	}
596
597	# Uses textcat to work out the encoding and language of the text in
598	# $filename. All html tags are removed before processing.
599	# returns an array containing "language" and "encoding"
600	sub get_language_encoding {
601	my $self = shift (@_);
602	my ($filename) = @_;
603	my $outhandle = $self->{'outhandle'};
604
605	# read in file
606	open (FILE, $filename) \|\| (&gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
607	undef $/;
608	my $text = <FILE>;
609	$/ = "\n";
610	close FILE;
611
612	# remove <title>stuff</title> -- as titles tend often to be in English
613	# for foreign language documents
614	$text =~ s/<title>.*?<\/title>//i;
615
616	# remove all HTML tags
617	$text =~ s/<[^>]*>//sg;
618
619	# get the language/encoding
620	my $results = $self->{'textcat'}->classify(\$text);
621
622	# if textcat returns 3 or less possibilities we'll use the
623	# first one in the list - otherwise use the defaults
624	if (scalar @$results > 3) {
625	# changed 12 Feb 2003 by jrm21
626	# use the most popular encoding at least... otherwise we might
627	# generate invalid archive files!
628	my %guessed_encodings = ();
629	foreach my $result (@$results) {
630	$result =~ /([^\-]+)$/;
631	my $enc=$1;
632	if (!defined($guessed_encodings{$enc})) {
633	$guessed_encodings{$enc}=0;
634	}
635	$guessed_encodings{$enc}++;
636	}
637	my $best_encoding="";
638	$guessed_encodings{""}=-1;
639	foreach my $enc (keys %guessed_encodings) {
640	if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}){
641	$best_encoding=$enc;
642	}
643	}
644
645	if ($self->{'input_encoding'} ne 'auto') {
646	if ($self->{'extract_language'} && $self->{'verbosity'}) {
647	&gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
648	# print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
649	# print $outhandle "defaulting to $self->{'default_language'}\n";
650	}
651	return ($self->{'default_language'}, $self->{'input_encoding'});
652
653	} else {
654	if ($self->{'verbosity'}) {
655	&gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
656	# print $outhandle "BASPlug: WARNING: language could not be extracted from $filename - ";
657	# print $outhandle "defaulting to $self->{'default_language'}.\n";
658	}
659	return ($self->{'default_language'}, $best_encoding);
660	}
661	}
662
663	# format language/encoding
664	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
665	if (!defined $language) {
666	if ($self->{'verbosity'}) {
667	&gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
668	# print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
669	# print $outhandle "defaulting to $self->{'default_language'}\n";
670	}
671	$language = $self->{'default_language'};
672	}
673	if (!defined $encoding) {
674	if ($self->{'verbosity'}) {
675	&gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_encoding}\n", $filename, $self->{'default_encoding'});
676	# print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
677	# print $outhandle "defaulting to $self->{'default_encoding'}\n";
678	}
679	$encoding = $self->{'default_encoding'};
680	}
681
682	if ($encoding !~ /^(ascii\|utf8\|unicode)$/ &&
683	!defined $encodings::encodings->{$encoding}) {
684	if ($self->{'verbosity'}) {
685	&gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n", $filename, $encoding, $self->{'default_encoding'});
686	# print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
687	# print $outhandle "using $self->{'default_encoding'}\n";
688	}
689	$encoding = $self->{'default_encoding'};
690	}
691
692	return ($language, $encoding);
693	}
694
695	# add any extra metadata that's been passed around from one
696	# plugin to another.
697	# extra_metadata uses add_utf8_metadata so it expects metadata values
698	# to already be in utf8
699	sub extra_metadata {
700	my $self = shift (@_);
701	my ($doc_obj, $cursection, $metadata) = @_;
702
703	foreach my $field (keys(%$metadata)) {
704	# $metadata->{$field} may be an array reference
705	if (ref ($metadata->{$field}) eq "ARRAY") {
706	map {
707	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
708	} @{$metadata->{$field}};
709	} else {
710	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
711	}
712	}
713	}
714
715	# initialise metadata extractors
716	sub initialise_extractors {
717	my $self = shift (@_);
718
719	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
720	&acronym::initialise_acronyms();
721	}
722	}
723
724	# finalise metadata extractors
725	sub finalise_extractors {
726	my $self = shift (@_);
727
728	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
729	&acronym::finalise_acronyms();
730	}
731	}
732
733	# FIRSTNNN: extract the first NNN characters as metadata
734	sub extract_first_NNNN_characters {
735	my $self = shift (@_);
736	my ($textref, $doc_obj, $thissection) = @_;
737
738	foreach my $size (split /,/, $self->{'first'}) {
739	my $tmptext = $$textref;
740	$tmptext =~ s/^\s+//;
741	$tmptext =~ s/\s+$//;
742	$tmptext =~ s/\s+/ /gs;
743	$tmptext = substr ($tmptext, 0, $size);
744	$tmptext =~ s/\s\S*$/…/;
745	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
746	}
747	}
748
749	sub extract_email {
750	my $self = shift (@_);
751	my ($textref, $doc_obj, $thissection) = @_;
752	my $outhandle = $self->{'outhandle'};
753
754	# print $outhandle " extracting email addresses ...\n"
755	&gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
756	if ($self->{'verbosity'} > 2);
757
758	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
759	@email = sort @email;
760
761	my @email2 = ();
762	foreach my $address (@email) {
763	if (!(join(" ",@email2) =~ m/$address/ )) {
764	push @email2, $address;
765	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
766	# print $outhandle " extracting $address\n"
767	&gsprintf($outhandle, " {BasPlug.extracting} $address\n")
768	if ($self->{'verbosity'} > 3);
769	}
770	}
771	# print $outhandle " done extracting email addresses.\n"
772	&gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
773	if ($self->{'verbosity'} > 2);
774	}
775
776	# extract metadata
777	sub auto_extract_metadata {
778
779	my $self = shift (@_);
780	my ($doc_obj) = @_;
781
782	if ($self->{'extract_email'}) {
783	my $thissection = $doc_obj->get_top_section();
784	while (defined $thissection) {
785	my $text = $doc_obj->get_text($thissection);
786	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
787	$thissection = $doc_obj->get_next_section ($thissection);
788	}
789	}
790
791
792	#adding kea keyphrases
793	if ($self->{'kea'}) {
794
795	my $thissection = $doc_obj->get_top_section();
796	my $text = "";
797	my @list;
798
799	while (defined $thissection) { #loop through sections to gather whole doc
800	my $sectiontext = $doc_obj->get_text($thissection);
801	$text = $text.$sectiontext;
802	$thissection = $doc_obj->get_next_section ($thissection);
803	}
804
805	if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
806	@list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
807	} else { #otherwise call Kea with no options
808	@list = &Kea::extract_KeyPhrases ($text);
809	}
810
811	if(@list){ #if a list of kea keyphrases was returned (ie not empty)
812	my $keyphrases = $list[0]; #first arg is keyphrase list
813	my $stems = $list[1]; #second arg is stemmed keyphrase list
814	&gsprintf(STDERR, "{BasPlug.keyphrases}: $keyphrases\n");
815	# print STDERR "keyphrases: $keyphrases\n";
816	&gsprintf(STDERR, "{BasPlug.stems}: $stems\n");
817	# print STDERR "stems: $stems\n";
818	$thissection = $doc_obj->get_top_section(); #add metadata to top section
819	$doc_obj->add_metadata($thissection, "kea", $keyphrases);
820	$doc_obj->add_metadata($thissection, "stems", $stems);
821	}
822	} #end of kea
823
824	if ($self->{'first'}) {
825	my $thissection = $doc_obj->get_top_section();
826	while (defined $thissection) {
827	my $text = $doc_obj->get_text($thissection);
828	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
829	$thissection = $doc_obj->get_next_section ($thissection);
830	}
831	}
832
833	if ($self->{'extract_acronyms'}) {
834	my $thissection = $doc_obj->get_top_section();
835	while (defined $thissection) {
836	my $text = $doc_obj->get_text($thissection);
837	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
838	$thissection = $doc_obj->get_next_section ($thissection);
839	}
840	}
841
842	if ($self->{'markup_acronyms'}) {
843	my $thissection = $doc_obj->get_top_section();
844	while (defined $thissection) {
845	my $text = $doc_obj->get_text($thissection);
846	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
847	$doc_obj->delete_text($thissection);
848	$doc_obj->add_text($thissection, $text);
849	$thissection = $doc_obj->get_next_section ($thissection);
850	}
851	}
852
853	if($self->{'date_extract'}) {
854	my $thissection = $doc_obj->get_top_section();
855	while (defined $thissection) {
856
857	my $text = $doc_obj->get_text($thissection);
858	&DateExtract::get_date_metadata($text, $doc_obj,
859	$thissection,
860	$self->{'no_biblio'},
861	$self->{'max_year'},
862	$self->{'max_century'});
863	$thissection = $doc_obj->get_next_section ($thissection);
864	}
865	}
866	}
867
868	# extract acronyms from a section in a document. progress is
869	# reported to outhandle based on the verbosity. both the Acronym
870	# and the AcronymKWIC metadata items are created.
871
872	sub extract_acronyms {
873	my $self = shift (@_);
874	my ($textref, $doc_obj, $thissection) = @_;
875	my $outhandle = $self->{'outhandle'};
876
877	# print $outhandle " extracting acronyms ...\n"
878	&gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
879	if ($self->{'verbosity'} > 2);
880
881	my $acro_array = &acronym::acronyms($textref);
882
883	foreach my $acro (@$acro_array) {
884
885	#check that this is the first time ...
886	my $seen_before = "false";
887	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
888	foreach my $thisAcro (@$previous_data) {
889	if ($thisAcro eq $acro->to_string()) {
890	$seen_before = "true";
891	# print $outhandle " already seen ". $acro->to_string() . "\n"
892	&gsprintf($outhandle, " {BasPlug.already_seen} " . $acro->to_string() . "\n")
893	if ($self->{'verbosity'} >= 4);
894	}
895	}
896
897	if ($seen_before eq "false") {
898	#write it to the file ...
899	$acro->write_to_file();
900
901	#do the normal acronym
902	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
903	# print $outhandle " adding ". $acro->to_string() . "\n"
904	&gsprintf($outhandle, " {BasPlug.adding} " . $acro->to_string() . "\n")
905	if ($self->{'verbosity'} > 3);
906	}
907	}
908
909	# print $outhandle " done extracting acronyms. \n"
910	&gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
911	if ($self->{'verbosity'} > 2);
912	}
913
914	sub markup_acronyms {
915	my $self = shift (@_);
916	my ($text, $doc_obj, $thissection) = @_;
917	my $outhandle = $self->{'outhandle'};
918
919	# print $outhandle " marking up acronyms ...\n"
920	&gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
921	if ($self->{'verbosity'} > 2);
922
923	#self is passed in to check for verbosity ...
924	$text = &acronym::markup_acronyms($text, $self);
925
926	# print $outhandle " done marking up acronyms. \n"
927	&gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
928	if ($self->{'verbosity'} > 2);
929
930	return $text;
931	}
932
933	sub compile_stats {
934	my $self = shift(@_);
935	my ($stats) = @_;
936
937	$stats->{'num_processed'} += $self->{'num_processed'};
938	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
939	$stats->{'num_archives'} += $self->{'num_archives'};
940
941	}
942
943	sub associate_cover_image {
944	my $self = shift(@_);
945	my ($doc_obj, $filename) = @_;
946
947	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
948	if (-e $filename) {
949	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
950	} else {
951	$filename =~ s/jpg$/JPG/;
952	if (-e $filename) {
953	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
954	}
955	}
956	}
957
958	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: