Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: tags/gsdl-2_40-distribution/gsdl/perllib/plugins/BasPlug.pm@ 4846

Last change on this file since 4846 was 4846, checked in by (none), 21 years ago
This commit was manufactured by cvs2svn to create tag 'gsdl-2_40-distribution'.
Property svn:keywords set to `Author Date Id Revision`
File size: 34.9 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	eval {require bytes};
29
30	# suppress the annoying "subroutine redefined" warning that various
31	# plugins cause under perl 5.6
32	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34	use Kea;
35	use parsargv;
36	use multiread;
37	use encodings;
38	use cnseg;
39	use acronym;
40	use textcat;
41	use doc;
42	use diagnostics;
43	use DateExtract;
44	use ghtml;
45	use printusage;
46
47	my $unicode_list =
48	[ { 'name' => "auto",
49	'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } ,
50	{ 'name' => "ascii",
51	'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." },
52	{ 'name' => "utf8",
53	'desc' => "either utf8 or unicode -- automatically detected." },
54	{ 'name' => "unicode",
55	'desc' => "just unicode" } ];
56
57	my $arguments =
58	[ { 'name' => "process_exp",
59	'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
60	'type' => "string",
61	'deft' => "",
62	'reqd' => "no" },
63	{ 'name' => "block_exp",
64	'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
65	'type' => 'string',
66	'deft' => "",
67	'reqd' => "no" },
68	{ 'name' => "input_encoding",
69	'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8.",
70	'type' => "enum",
71	'list' => $unicode_list,
72	'reqd' => "no" ,
73	'deft' => "auto" } ,
74	{ 'name' => "default_encoding",
75	'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone.",
76	'type' => "enum",
77	'reqd' => "no",
78	'deft' => "utf8" },
79	{ 'name' => "extract_language",
80	'desc' => "Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.",
81	'type' => "flag",
82	'reqd' => "no" },
83	{ 'name' => "default_language",
84	'desc' => "If Greenstone fails to work out what language a document is the 'Language' metadata element will be set to this value. The default is 'en' (ISO 639 language symbols are used: en = English). Note that if -input_encoding is not set to 'auto' and -extract_language is not set, all documents will have their 'Language' metadata set to this value.",
85	'type' => "language",
86	'deft' => "en",
87	'reqd' => "no" },
88	{ 'name' => "extract_acronyms",
89	'desc' => "Extract acronyms from within text and set as metadata.",
90	'type' => "flag",
91	'reqd' => "no" },
92	{ 'name' => "markup_acronyms",
93	'desc' => "Add acronym metadata into document text.",
94	'type' => "flag",
95	'reqd' => "no" },
96	{ 'name' => "first",
97	'desc' => "Comma separated list of first sizes to extract from the text into a metadata field. The field is called 'FirstNNN'.",
98	'type' => "string",
99	'reqd' => "no" },
100	{ 'name' => "extract_email",
101	'desc' => "Extract email addresses as metadata.",
102	'type' => "flag",
103	'reqd' => "no" },
104	{ 'name' => "extract_historical_years",
105	'desc' => "Extract time-period information from historical documents. This is stored as metadata with the document. There is a search interface for this metadata, which you can include in your collection by adding the statement, \"format QueryInterface DateSearch\" to your collection configuration file.",
106	'type' => "flag",
107	'reqd' => "no" },
108	{ 'name' => "maximum_year",
109	'desc' => "The maximum historical date to be used as metadata (in a Common Era date, such as 1950).",
110	'type' => "int",
111	'deft' => (localtime)[5]+1900,
112	'reqd' => "no"},
113	{ 'name' => "maximum_century",
114	'desc' => "The maximum named century to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century).",
115	'type' => "int",
116	'deft' => "-1",
117	'reqd' => "no" },
118	{ 'name' => "no_bibliography",
119	'desc' => "Do not try to block bibliographic dates when extracting historical dates.",
120	'type' => "flag",
121	'reqd' => "no"},
122	{ 'name' => "cover_image",
123	'desc' => "Will look for a prefix.jpg file (where prefix is the same prefix as the file being processed) and associate it as a cover image.",
124	'type' => "flag",
125	'reqd' => "no" } ];
126
127	my $options = { 'name' => "BasPlug",
128	'desc' => "Base class for all the import plugins.",
129	'inherits' => "No",
130	'args' => $arguments };
131
132
133	sub print_xml_usage
134	{
135	local $self = shift(@_);
136
137	print STDERR "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n";
138	$self->print_xml();
139	}
140
141
142	sub print_xml
143	{
144	local $self = shift(@_);
145
146	local $optionlistref = $self->{'option_list'};
147	local @optionlist = @$optionlistref;
148	local $pluginoptions = pop(@$optionlistref);
149	return if (!defined($pluginoptions));
150
151	print STDERR "<PlugInfo>\n";
152	print STDERR " <Name>$pluginoptions->{'name'}</Name>\n";
153	print STDERR " <Desc>$pluginoptions->{'desc'}</Desc>\n";
154	print STDERR " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n";
155	print STDERR " <Arguments>\n";
156	if (defined($pluginoptions->{'args'})) {
157	&PrintUsage::print_options_xml($pluginoptions->{'args'});
158	}
159
160	# Recurse up the plugin hierarchy
161	$self->print_xml();
162
163	print STDERR " </Arguments>\n";
164	print STDERR "</PlugInfo>\n";
165	}
166
167
168	sub print_txt_usage
169	{
170	local $self = shift(@_);
171
172	# Print the usage message for a plugin (recursively)
173	local $descoffset = $self->determine_description_offset(0);
174	$self->print_plugin_usage($descoffset, 1);
175	}
176
177
178	sub determine_description_offset
179	{
180	local $self = shift(@_);
181	local $maxoffset = shift(@_);
182
183	local $optionlistref = $self->{'option_list'};
184	local @optionlist = @$optionlistref;
185	local $pluginoptions = pop(@$optionlistref);
186	return $maxoffset if (!defined($pluginoptions));
187
188	# Find the length of the longest option string of this plugin
189	local $pluginargs = $pluginoptions->{'args'};
190	if (defined($pluginargs)) {
191	local $longest = &PrintUsage::find_longest_option_string($pluginargs);
192	if ($longest > $maxoffset) {
193	$maxoffset = $longest;
194	}
195	}
196
197	# Recurse up the plugin hierarchy
198	$maxoffset = $self->determine_description_offset($maxoffset);
199	$self->{'option_list'} = \@optionlist;
200	return $maxoffset;
201	}
202
203
204	sub print_plugin_usage
205	{
206	local $self = shift(@_);
207	local $descoffset = shift(@_);
208	local $isleafclass = shift(@_);
209
210	local $optionlistref = $self->{'option_list'};
211	local @optionlist = @$optionlistref;
212	local $pluginoptions = pop(@$optionlistref);
213	return if (!defined($pluginoptions));
214
215	local $pluginname = $pluginoptions->{'name'};
216	local $pluginargs = $pluginoptions->{'args'};
217
218	# Produce the usage information using the data structure above
219	if ($isleafclass) {
220	print STDERR " usage: plugin $pluginname [options]\n\n";
221	}
222
223	# Display the plugin options, if there are some
224	if (defined($pluginargs)) {
225	# Calculate the column offset of the option descriptions
226	local $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
227
228	if ($isleafclass) {
229	print STDERR " specific options:\n";
230	}
231	else {
232	print STDERR " general options (from $pluginname):\n";
233	}
234
235	# Display the plugin options
236	&PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
237	}
238
239	# Recurse up the plugin hierarchy
240	$self->print_plugin_usage($descoffset, 0);
241	$self->{'option_list'} = \@optionlist;
242	}
243
244
245	# sub print_general_usage {
246	# my ($plugin_name) = @_;
247
248	# print STDERR "\n usage: plugin $plugin_name [options]\n\n";
249
250	# print STDERR " -process_exp A perl regular expression to match against filenames.\n";
251	# print STDERR " Matching filenames will be processed by this plugin.\n";
252	# print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
253	# print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
254	# print STDERR " .htm or .html (case-insensitive).\n\n";
255
256	# print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
257	# print STDERR " being passed to any later plugins in the list. This has no\n";
258	# print STDERR " real effect other than to prevent lots of warning messages\n";
259	# print STDERR " about input files you don't care about. Each plugin might\n";
260	# print STDERR " have a default block_exp. e.g. by default HTMLPlug blocks\n";
261	# print STDERR " any files with .gif, .jpg, .jpeg, .png or .css\n";
262	# print STDERR " file extensions.\n\n";
263
264
265	# print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
266	# print STDERR " converted from these encodings and stored internally as\n";
267	# print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n";
268	# print STDERR " are:\n";
269
270	# print STDERR " auto: Use text categorization algorithm to automatically\n";
271	# print STDERR " identify the encoding of each source document. This\n";
272	# print STDERR " will be slower than explicitly setting the encoding\n";
273	# print STDERR " but will work where more than one encoding is used\n";
274	# print STDERR " within the same collection.\n";
275
276	# print STDERR " ascii: Plain 7 bit ascii. This may be a bit faster than\n";
277	# print STDERR " using iso_8859_1. Beware of using this on a collection\n";
278	# print STDERR " of documents that may contain characters outside the\n";
279	# print STDERR " plain 7 bit ascii set though (e.g. German or French\n";
280	# print STDERR " documents containing accents), use iso_8859_1 instead.\n";
281
282	# print STDERR " utf8: either utf8 or unicode -- automatically detected\n";
283	# print STDERR " unicode: just unicode\n";
284
285	# my $e = $encodings::encodings;
286	# foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
287	# print STDERR " $enc: $e->{$enc}->{'name'}\n";
288	# }
289	# print STDERR "\n";
290	# print STDERR " -default_encoding Use this encoding if -input_encoding is set to 'auto' and\n";
291	# print STDERR " the text categorization algorithm fails to extract the\n";
292	# print STDERR " encoding or extracts an encoding unsupported by Greenstone.\n";
293	# print STDERR " The default is iso_8859_1.\n\n";
294
295	# print STDERR " -extract_language Identify the language of each document and set 'Language'\n";
296	# print STDERR " metadata. Note that this will be done automatically if\n";
297	# print STDERR " -input_encoding is 'auto'.\n\n";
298	# print STDERR " -default_language If Greenstone fails to work out what language a document is\n";
299	# print STDERR " the 'Language' metadata element will be set to this value.\n";
300	# print STDERR " The default is 'en' (ISO 639 language symbols are used:\n";
301	# print STDERR " en = English). Note that if -input_encoding is not set to\n";
302	# print STDERR " 'auto' and -extract_language is not set, all documents will\n";
303	# print STDERR " have their 'Language' metadata set to this value.\n\n";
304
305	# print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n";
306
307	# print STDERR " -markup_acronyms Add acronym metadata into document text\n\n";
308
309	# print STDERR " -first Comma separated list of first sizes to extract from the\n";
310	# print STDERR " text into a metadata field. The field is called 'FirstNNN'.\n\n";
311
312	# print STDERR " -extract_email Extract email addresses as metadata\n\n";
313
314	# print STDERR " -extract_historical_years Extract time-period information from historical\n";
315	# print STDERR " documents. This is stored as metadata with the document.\n";
316	# print STDERR " There is a search interface for this metadata, which you \n";
317	# print STDERR " can include in your collection by adding the statement:\n";
318	# print STDERR " format QueryInterface DateSearch\n";
319	# print STDERR " to your collection configuration file\n";
320	# print STDERR " -maximum_year The maximum historical date to be used as metadata (in a\n";
321	# print STDERR " Common Era date, such as 1950)\n";
322	# print STDERR " -maximum_century The maximum named century to be extracted as historical\n";
323	# print STDERR " metadata (e.g. 14 will extract all references up to the\n";
324	# print STDERR " 14th century)\n";
325	# print STDERR " -no_bibliography Do not try and block bibliographic dates when extracting\n";
326	# print STDERR " historical dates.\n";
327	# print STDERR " -cover_image Will look for a prefix.jpg file (where prefix is the same\n";
328	# print STDERR " prefix as the file being processed) and associate it as a\n";
329	# print STDERR " cover image\n\n";
330	# }
331
332	# sub print_usage {
333	# print STDERR "\nThis plugin has no plugin specific options\n\n";
334	# }
335
336	sub new {
337	my $class = shift (@_);
338	my $plugin_name = shift (@_);
339	my $self = {};
340
341	my $enc = "^(";
342	map {$enc .= "$_\|";} keys %$encodings::encodings;
343	my $denc = $enc . "ascii\|utf8\|unicode)\$";
344	$enc .= "ascii\|utf8\|unicode\|auto)\$";
345
346	$self->{'outhandle'} = STDERR;
347	my $year = (localtime)[5]+1900;
348
349	$self->{'textcat'} = new textcat();
350
351	$self->{'num_processed'} = 0;
352	$self->{'num_not_processed'} = 0;
353	$self->{'num_blocked'} = 0;
354	$self->{'num_archives'} = 0;
355
356	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
357	$self->{'option_list'} = [ $options ];
358
359	# general options available to all plugins
360	if (!parsargv::parse(\@_,
361	q^process_exp/.*/^, \$self->{'process_exp'},
362	q^block_exp/.*/^, \$self->{'block_exp'},
363	q^extract_language^, \$self->{'extract_language'},
364	q^extract_acronyms^, \$self->{'extract_acronyms'},
365	q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
366	q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
367	qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
368	qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
369	q^extract_email^, \$self->{'extract_email'},
370	q^markup_acronyms^, \$self->{'markup_acronyms'},
371	q^default_language/.{2}/en^, \$self->{'default_language'},
372	q^first/.*/^, \$self->{'first'},
373	q^extract_historical_years^, \$self->{'date_extract'},
374	qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
375	q^no_bibliography^, \$self->{'no_biblio'},
376	qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
377	q^cover_image^, \$self->{'cover_image'},
378	"allow_extra_options")) {
379
380	print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
381	print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
382	# &print_general_usage($plugin_name);
383	$self->print_txt_usage();
384	die "\n";
385	}
386
387	return bless $self, $class;
388	}
389
390	# initialize BasPlug options
391	# if init() is overridden in a sub-class, remember to call BasPlug::init()
392	sub init {
393	my $self = shift (@_);
394	my ($verbosity, $outhandle, $failhandle) = @_;
395
396	# verbosity is passed through from the processor
397	$self->{'verbosity'} = $verbosity;
398
399	# as are the outhandle and failhandle
400	$self->{'outhandle'} = $outhandle if defined $outhandle;
401	$self->{'failhandle'} = $failhandle;
402
403	# set process_exp and block_exp to defaults unless they were
404	# explicitly set
405
406	if ((!$self->is_recursive()) and
407	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
408
409	$self->{'process_exp'} = $self->get_default_process_exp ();
410	if ($self->{'process_exp'} eq "") {
411	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
412	}
413	}
414
415	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
416	$self->{'block_exp'} = $self->get_default_block_exp ();
417	}
418	}
419
420	sub begin {
421	my $self = shift (@_);
422	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
423	$self->initialise_extractors();
424	}
425
426	sub end {
427	my ($self) = @_;
428	$self->finalise_extractors();
429	}
430
431	# this function should be overridden to return 1
432	# in recursive plugins
433	sub is_recursive {
434	my $self = shift (@_);
435
436	return 0;
437	}
438
439	sub get_default_block_exp {
440	my $self = shift (@_);
441
442	return "";
443	}
444
445	sub get_default_process_exp {
446	my $self = shift (@_);
447
448	return "";
449	}
450
451	# The BasPlug read() function. This function does all the right things
452	# to make general options work for a given plugin. It calls the process()
453	# function which does all the work specific to a plugin (like the old
454	# read functions used to do). Most plugins should define their own
455	# process() function and let this read() function keep control.
456	#
457	# recursive plugins (e.g. RecPlug) and specialized plugins like those
458	# capable of processing many documents within a single file (e.g.
459	# GMLPlug) should normally implement their own version of read()
460	#
461	# Return number of files processed, undef if can't process
462	# Note that $base_dir might be "" and that $file might
463	# include directories
464
465	sub read {
466	my $self = shift (@_);
467
468	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
469
470	if ($self->is_recursive()) {
471	die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
472	}
473
474	my $outhandle = $self->{'outhandle'};
475
476	my $filename = $file;
477	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
478
479	if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
480	$self->{'num_blocked'} ++;
481	return 0;
482	}
483	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
484	return undef;
485	}
486	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
487
488	# Do encoding stuff
489	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
490
491	# create a new document
492	my $doc_obj = new doc ($filename, "indexed_doc");
493	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
494	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
495	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
496	my ($filemeta) = $file =~ /([^\\\/]+)$/;
497	# how do we know what encoding the filename is in?
498	$doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
499	if ($self->{'cover_image'}) {
500	$self->associate_cover_image($doc_obj, $filename);
501	}
502
503	# read in file ($text will be in utf8)
504	my $text = "";
505	$self->read_file ($filename, $encoding, $language, \$text);
506
507	if (!length ($text)) {
508	my $plugin_name = ref ($self);
509	print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
510
511	my $failhandle = $self->{'failhandle'};
512	print $failhandle "$file: " . ref($self) . ": file contains no text\n";
513	$self->{'num_not_processed'} ++;
514
515	return 0;
516	}
517
518	# include any metadata passed in from previous plugins
519	# note that this metadata is associated with the top level section
520	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
521
522	# do plugin specific processing of doc_obj
523	return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
524
525	# do any automatic metadata extraction
526	$self->auto_extract_metadata ($doc_obj);
527
528	# add an OID
529	# see if there is a plugin-specific set_OID function...
530	if (defined ($self->can(set_OID))) {
531	# it will need $doc_obj to set the Identifier metadata...
532	$self->set_OID($doc_obj);
533	} else {
534	# use the default set_OID() in doc.pm
535	$doc_obj->set_OID();
536	}
537
538	# process the document
539	$processor->process($doc_obj);
540
541	$self->{'num_processed'} ++;
542
543	return 1; # processed the file
544	}
545
546	# returns undef if file is rejected by the plugin
547	sub process {
548	my $self = shift (@_);
549	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
550
551	die "Basplug::process function must be implemented in sub-class\n";
552
553	return undef; # never gets here
554	}
555
556	# uses the multiread package to read in the entire file pointed to
557	# by filename and loads the resulting text into $$textref. Input text
558	# may be in any of the encodings handled by multiread, output text
559	# will be in utf8
560	sub read_file {
561	my $self = shift (@_);
562	my ($filename, $encoding, $language, $textref) = @_;
563
564	if (!-r $filename)
565	{
566	my $outhandle = $self->{'outhandle'};
567	print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
568	return;
569	}
570
571	$$textref = "";
572
573	open (FILE, $filename) \|\| die "BasPlug::read_file could not open $filename for reading ($!)\n";
574
575	if ($encoding eq "ascii") {
576	undef $/;
577	$$textref = <FILE>;
578	$/ = "\n";
579	} else {
580	my $reader = new multiread();
581	$reader->set_handle ('BasPlug::FILE');
582	$reader->set_encoding ($encoding);
583	$reader->read_file ($textref);
584
585	if ($language eq "zh") {
586	# segment the Chinese words
587	$$textref = &cnseg::segment($$textref);
588	}
589	}
590
591	close FILE;
592	}
593
594	sub textcat_get_language_encoding {
595	my $self = shift (@_);
596	my ($filename) = @_;
597
598	my ($language, $encoding, $extracted_encoding);
599	if ($self->{'input_encoding'} eq "auto") {
600	# use textcat to automatically work out the input encoding and language
601	($language, $encoding) = $self->get_language_encoding ($filename);
602	} elsif ($self->{'extract_language'}) {
603	# use textcat to get language metadata
604	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
605	$encoding = $self->{'input_encoding'};
606	if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
607	my $plugin_name = ref ($self);
608	my $outhandle = $self->{'outhandle'};
609	print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
610	print $outhandle "appears to be encoded as $extracted_encoding.\n";
611	}
612	} else {
613	$language = $self->{'default_language'};
614	$encoding = $self->{'input_encoding'};
615	}
616	return ($language, $encoding);
617	}
618
619	# Uses textcat to work out the encoding and language of the text in
620	# $filename. All html tags are removed before processing.
621	# returns an array containing "language" and "encoding"
622	sub get_language_encoding {
623	my $self = shift (@_);
624	my ($filename) = @_;
625	my $outhandle = $self->{'outhandle'};
626
627	# read in file
628	open (FILE, $filename) \|\| die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
629	undef $/;
630	my $text = <FILE>;
631	$/ = "\n";
632	close FILE;
633
634	# remove <title>stuff</title> -- as titles tend often to be in English
635	# for foreign language documents
636	$text =~ s/<title>.*?<\/title>//i;
637
638	# remove all HTML tags
639	$text =~ s/<[^>]*>//sg;
640
641	# get the language/encoding
642	my $results = $self->{'textcat'}->classify(\$text);
643
644	# if textcat returns 3 or less possibilities we'll use the
645	# first one in the list - otherwise use the defaults
646	if (scalar @$results > 3) {
647	# changed 12 Feb 2003 by jrm21
648	# use the most popular encoding at least... otherwise we might
649	# generate invalid archive files!
650	my %guessed_encodings = ();
651	foreach my $result (@$results) {
652	$result =~ /([^\-]+)$/;
653	my $enc=$1;
654	if (!defined($guessed_encodings{$enc})) {
655	$guessed_encodings{$enc}=0;
656	}
657	$guessed_encodings{$enc}++;
658	}
659	my $best_encoding="";
660	$guessed_encodings{""}=-1;
661	foreach my $enc (keys %guessed_encodings) {
662	if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}){
663	$best_encoding=$enc;
664	}
665	}
666
667	if ($self->{'input_encoding'} ne 'auto') {
668	if ($self->{'extract_language'} && $self->{'verbosity'}) {
669	print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
670	print $outhandle "defaulting to $self->{'default_language'}\n";
671	}
672	return ($self->{'default_language'}, $self->{'input_encoding'});
673
674	} else {
675	if ($self->{'verbosity'}) {
676	print $outhandle "BASPlug: WARNING: language could not be extracted from $filename - ";
677	print $outhandle "defaulting to $self->{'default_language'}.\n";
678	}
679	return ($self->{'default_language'}, $best_encoding);
680	}
681	}
682
683	# format language/encoding
684	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
685	if (!defined $language) {
686	if ($self->{'verbosity'}) {
687	print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
688	print $outhandle "defaulting to $self->{'default_language'}\n";
689	}
690	$language = $self->{'default_language'};
691	}
692	if (!defined $encoding) {
693	if ($self->{'verbosity'}) {
694	print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
695	print $outhandle "defaulting to $self->{'default_encoding'}\n";
696	}
697	$encoding = $self->{'default_encoding'};
698	}
699
700	if ($encoding !~ /^(ascii\|utf8\|unicode)$/ &&
701	!defined $encodings::encodings->{$encoding}) {
702	if ($self->{'verbosity'}) {
703	print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
704	print $outhandle "using $self->{'default_encoding'}\n";
705	}
706	$encoding = $self->{'default_encoding'};
707	}
708
709	return ($language, $encoding);
710	}
711
712	# add any extra metadata that's been passed around from one
713	# plugin to another.
714	# extra_metadata uses add_utf8_metadata so it expects metadata values
715	# to already be in utf8
716	sub extra_metadata {
717	my $self = shift (@_);
718	my ($doc_obj, $cursection, $metadata) = @_;
719
720	foreach my $field (keys(%$metadata)) {
721	# $metadata->{$field} may be an array reference
722	if (ref ($metadata->{$field}) eq "ARRAY") {
723	map {
724	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
725	} @{$metadata->{$field}};
726	} else {
727	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
728	}
729	}
730	}
731
732	# initialise metadata extractors
733	sub initialise_extractors {
734	my $self = shift (@_);
735
736	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
737	&acronym::initialise_acronyms();
738	}
739	}
740
741	# finalise metadata extractors
742	sub finalise_extractors {
743	my $self = shift (@_);
744
745	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
746	&acronym::finalise_acronyms();
747	}
748	}
749
750	# FIRSTNNN: extract the first NNN characters as metadata
751	sub extract_first_NNNN_characters {
752	my $self = shift (@_);
753	my ($textref, $doc_obj, $thissection) = @_;
754
755	foreach my $size (split /,/, $self->{'first'}) {
756	my $tmptext = $$textref;
757	$tmptext =~ s/^\s+//;
758	$tmptext =~ s/\s+$//;
759	$tmptext =~ s/\s+/ /gs;
760	$tmptext = substr ($tmptext, 0, $size);
761	$tmptext =~ s/\s\S*$/…/;
762	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
763	}
764	}
765
766	sub extract_email {
767	my $self = shift (@_);
768	my ($textref, $doc_obj, $thissection) = @_;
769	my $outhandle = $self->{'outhandle'};
770
771	print $outhandle " extracting email addresses ...\n"
772	if ($self->{'verbosity'} > 2);
773
774	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
775	@email = sort @email;
776
777	my @email2 = ();
778	foreach my $address (@email) {
779	if (!(join(" ",@email2) =~ m/$address/ )) {
780	push @email2, $address;
781	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
782	print $outhandle " extracting $address\n"
783	if ($self->{'verbosity'} > 3);
784	}
785	}
786	print $outhandle " done extracting email addresses.\n"
787	if ($self->{'verbosity'} > 2);
788
789	}
790
791	# extract metadata
792	sub auto_extract_metadata {
793
794
795	my $self = shift (@_);
796	my ($doc_obj) = @_;
797
798	if ($self->{'extract_email'}) {
799	my $thissection = $doc_obj->get_top_section();
800	while (defined $thissection) {
801	my $text = $doc_obj->get_text($thissection);
802	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
803	$thissection = $doc_obj->get_next_section ($thissection);
804	}
805	}
806
807
808	#adding kea keyphrases
809	if ($self->{'kea'}) {
810
811	my $thissection = $doc_obj->get_top_section();
812	my $text = "";
813	my @list;
814
815	while (defined $thissection) { #loop through sections to gather whole doc
816	my $sectiontext = $doc_obj->get_text($thissection);
817	$text = $text.$sectiontext;
818	$thissection = $doc_obj->get_next_section ($thissection);
819	}
820
821	if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
822	@list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
823	} else { #otherwise call Kea with no options
824	@list = &Kea::extract_KeyPhrases ($text);
825	}
826
827	if(@list){ #if a list of kea keyphrases was returned (ie not empty)
828	my $keyphrases = $list[0]; #first arg is keyphrase list
829	my $stems = $list[1]; #second arg is stemmed keyphrase list
830	print STDERR "keyphrases: $keyphrases\n";
831	print STDERR "stems: $stems\n";
832	$thissection = $doc_obj->get_top_section(); #add metadata to top section
833	$doc_obj->add_metadata($thissection, "kea", $keyphrases);
834	$doc_obj->add_metadata($thissection, "stems", $stems);
835	}
836	} #end of kea
837
838	if ($self->{'first'}) {
839	my $thissection = $doc_obj->get_top_section();
840	while (defined $thissection) {
841	my $text = $doc_obj->get_text($thissection);
842	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
843	$thissection = $doc_obj->get_next_section ($thissection);
844	}
845	}
846
847	if ($self->{'extract_acronyms'}) {
848	my $thissection = $doc_obj->get_top_section();
849	while (defined $thissection) {
850	my $text = $doc_obj->get_text($thissection);
851	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
852	$thissection = $doc_obj->get_next_section ($thissection);
853	}
854	}
855
856	if ($self->{'markup_acronyms'}) {
857	my $thissection = $doc_obj->get_top_section();
858	while (defined $thissection) {
859	my $text = $doc_obj->get_text($thissection);
860	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
861	$doc_obj->delete_text($thissection);
862	$doc_obj->add_text($thissection, $text);
863	$thissection = $doc_obj->get_next_section ($thissection);
864	}
865	}
866
867	if($self->{'date_extract'}) {
868	my $thissection = $doc_obj->get_top_section();
869	while (defined $thissection) {
870
871	my $text = $doc_obj->get_text($thissection);
872	&DateExtract::get_date_metadata($text, $doc_obj,
873	$thissection,
874	$self->{'no_biblio'},
875	$self->{'max_year'},
876	$self->{'max_century'});
877	$thissection = $doc_obj->get_next_section ($thissection);
878	}
879	}
880	}
881
882	# extract acronyms from a section in a document. progress is
883	# reported to outhandle based on the verbosity. both the Acronym
884	# and the AcronymKWIC metadata items are created.
885
886	sub extract_acronyms {
887	my $self = shift (@_);
888	my ($textref, $doc_obj, $thissection) = @_;
889	my $outhandle = $self->{'outhandle'};
890
891	print $outhandle " extracting acronyms ...\n"
892	if ($self->{'verbosity'} > 2);
893
894	my $acro_array = &acronym::acronyms($textref);
895
896	foreach my $acro (@$acro_array) {
897
898	#check that this is the first time ...
899	my $seen_before = "false";
900	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
901	foreach my $thisAcro (@$previous_data) {
902	if ($thisAcro eq $acro->to_string()) {
903	$seen_before = "true";
904	print $outhandle " already seen ". $acro->to_string() . "\n"
905	if ($self->{'verbosity'} >= 4);
906	}
907	}
908
909	if ($seen_before eq "false") {
910	#write it to the file ...
911	$acro->write_to_file();
912
913	#do the normal acronym
914	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
915	print $outhandle " adding ". $acro->to_string() . "\n"
916	if ($self->{'verbosity'} > 3);
917
918	}
919	}
920	print $outhandle " done extracting acronyms. \n"
921	if ($self->{'verbosity'} > 2);
922	}
923
924	sub markup_acronyms {
925	my $self = shift (@_);
926	my ($text, $doc_obj, $thissection) = @_;
927	my $outhandle = $self->{'outhandle'};
928
929	print $outhandle " marking up acronyms ...\n"
930	if ($self->{'verbosity'} > 2);
931
932	#self is passed in to check for verbosity ...
933	$text = &acronym::markup_acronyms($text, $self);
934
935	print $outhandle " done marking up acronyms. \n"
936	if ($self->{'verbosity'} > 2);
937
938	return $text;
939	}
940
941	sub compile_stats {
942	my $self = shift(@_);
943	my ($stats) = @_;
944
945	$stats->{'num_processed'} += $self->{'num_processed'};
946	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
947	$stats->{'num_archives'} += $self->{'num_archives'};
948
949	}
950
951	sub associate_cover_image {
952	my $self = shift(@_);
953	my ($doc_obj, $filename) = @_;
954
955	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
956	if (-e $filename) {
957	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
958	} else {
959	$filename =~ s/jpg$/JPG/;
960	if (-e $filename) {
961	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
962	}
963	}
964	}
965
966	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: