Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 3614

Last change on this file since 3614 was 3540, checked in by kjdon, 22 years ago
added John T's changes into CVS - added info to enable retrieval of usage info in xml
Property svn:keywords set to `Author Date Id Revision`
File size: 33.1 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	# suppress the annoying "subroutine redefined" warning that various
29	# plugins cause under perl 5.6
30	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
31
32	use Kea;
33	use parsargv;
34	use multiread;
35	use encodings;
36	use cnseg;
37	use acronym;
38	use textcat;
39	use doc;
40	use diagnostics;
41	use DateExtract;
42	use ghtml;
43
44	my $unicode_list =
45	[ { 'name' => "auto",
46	'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } ,
47	{ 'name' => "ascii",
48	'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." },
49	{ 'name' => "utf8",
50	'desc' => "either utf8 or unicode -- automatically detected." },
51	{ 'name' => "unicode",
52	'desc' => "just unicode" } ];
53
54	my $arguments =
55	[ { 'name' => "process_exp",
56	'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
57	'type' => "string",
58	'deft' => "",
59	'reqd' => "no" },
60	{ 'name' => "block_exp",
61	'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
62	'type' => 'string',
63	'deft' => "",
64	'reqd' => "no" },
65	{ 'name' => "input_encoding",
66	'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8. The default input_encoding is 'auto'.",
67	'type' => "enum",
68	'list' => $unicode_list,
69	'reqd' => "no" ,
70	'deft' => "auto" } ,
71	{ 'name' => "default_encoding",
72	'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone. The default is iso_8859_1.",
73	'type' => "flag",
74	'reqd' => "no" },
75	{ 'name' => "extract_language",
76	'desc' => "Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.",
77	'type' => "flag",
78	'reqd' => "no" },
79	{ 'name' => "default_language",
80	'desc' => "If Greenstone fails to work out what language a document is the 'Language' metadata element will be set to this value. The default is 'en' (ISO 639 language symbols are used: en = English). Note that if -input_encoding is not set to 'auto' and -extract_language is not set, all documents will have their 'Language' metadata set to this value.",
81	'type' => "language",
82	'deft' => "en",
83	'reqd' => "no" },
84	{ 'name' => "extract_acronyms",
85	'desc' => "Extract acronyms from within text and set as metadata.",
86	'type' => "flag",
87	'reqd' => "no" },
88	{ 'name' => "markup_acronyms",
89	'desc' => "Add acronym metadata into document text.",
90	'type' => "flag",
91	'reqd' => "no" },
92	{ 'name' => "first",
93	'desc' => "Comma separated list of first sizes to extract from the text into a metadata field. The field is called 'FirstNNN'.",
94	'type' => "string",
95	'reqd' => "no" },
96	{ 'name' => "extract_email",
97	'desc' => "Extract email addresses as metadata.",
98	'type' => "flag",
99	'reqd' => "no" },
100	{ 'name' => "extract_historical_years",
101	'desc' => "Extract time-period information from historical documents. This is stored as metadata with the document. There is a search interface for this metadata, which you can include in your collection by adding the statement, \"format QueryInterface DateSearch\" to your collection configuration file.",
102	'type' => "flag",
103	'reqd' => "no" },
104	{ 'name' => "maximum_year",
105	'desc' => "The maximum historical date to be used as metadata (in a Common Era date, such as 1950).",
106	'type' => "int",
107	'reqd' => "no"},
108	{ 'name' => "maximum_century",
109	'desc' => "The maximum named century to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century).",
110	'type' => "int",
111	'reqd' => "no" },
112	{ 'name' => "no_bibliography",
113	'desc' => "Do not try and block bibliographic dates when extracting historical dates.",
114	'type' => "flag",
115	'reqd' => "no"},
116	{ 'name' => "cover_image",
117	'desc' => "Will look for a prefix.jpg file (where prefix is the same prefix as the file being processed) and associate it as a cover image.",
118	'type' => "flag",
119	'reqd' => "no" } ];
120
121	my $options = { 'name' => "BasPlug",
122	'desc' => "Base class for all the import plugins.",
123	'inherits' => "No",
124	'args' => $arguments,
125	'process_exp' => "",
126	'block_exp' => "" };
127
128	sub print_xml_usage {
129	my $self = shift (@_);
130	print STDERR "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n";
131	$self->print_xml();
132	}
133
134	sub print_xml {
135	my $self = shift (@_);
136	my $option_list = $self->{'option_list'};
137	my $option = pop( @{$option_list} );
138	if(defined $option)
139	{
140	print STDERR "<PlugInfo>\n";
141	print STDERR " <Name>$option->{'name'}</Name>\n";
142	print STDERR " <Desc>$option->{'desc'}</Desc>\n";
143	print STDERR " <Inherits>$option->{'inherits'}</Inherits>\n";
144	print STDERR " <Arguments>\n";
145	if(defined $option->{'args'})
146	{
147	my $args = $option->{'args'};
148	my $x;
149	foreach $x ( @{$args} )
150	{
151	print STDERR " <Option>\n";
152	print STDERR " <Name>$x->{'name'}</Name>\n";
153	print STDERR " <Desc>$x->{'desc'}</Desc>\n";
154	print STDERR " <Type>$x->{'type'}</Type>\n";
155	print STDERR " <Required>$x->{'reqd'}</Required>\n";
156	if(defined $x->{'list'})
157	{
158	print STDERR " <List>\n";
159	my $list = $x->{'list'};
160	my $y;
161	foreach $y ( @{$list} )
162	{
163	print STDERR " <Value>\n";
164	print STDERR " <Name>$y->{'name'}</Name>\n";
165	print STDERR " <Desc>$y->{'desc'}</Desc>\n";
166	print STDERR " </Value>\n";
167	}
168	# Special case of 'input_encoding'
169	if( $x->{'name'} =~ m/^input_encoding$/i ) {
170	my $e = $encodings::encodings;
171	foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
172	print STDERR " <Value>\n";
173	print STDERR " <Name>$enc</Name>\n";
174	print STDERR " <Desc>$e->{$enc}->{'name'}</Desc>\n";
175	print STDERR " </Value>\n";
176	}
177	}
178	print STDERR " </List>\n";
179	}
180	if(defined $x->{'deft'})
181	{
182	print STDERR " <Default>$x->{'deft'}</Default>\n";
183	}
184	print STDERR " </Option>\n";
185	}
186	}
187	if(defined $option_list) {
188	$self->print_xml();
189	}
190
191	print STDERR " </Arguments>\n";
192	print STDERR "</PlugInfo>\n";
193	}
194	}
195
196	sub print_general_usage {
197	my ($plugin_name) = @_;
198
199	print STDERR "\n usage: plugin $plugin_name [options]\n\n";
200
201	print STDERR " -process_exp A perl regular expression to match against filenames.\n";
202	print STDERR " Matching filenames will be processed by this plugin.\n";
203	print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
204	print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
205	print STDERR " .htm or .html (case-insensitive).\n\n";
206
207	print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
208	print STDERR " being passed to any later plugins in the list. This has no\n";
209	print STDERR " real effect other than to prevent lots of warning messages\n";
210	print STDERR " about input files you don't care about. Each plugin might\n";
211	print STDERR " have a default block_exp. e.g. by default HTMLPlug blocks\n";
212	print STDERR " any files with .gif, .jpg, .jpeg, .png or .css\n";
213	print STDERR " file extensions.\n\n";
214
215
216	print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
217	print STDERR " converted from these encodings and stored internally as\n";
218	print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n";
219	print STDERR " are:\n";
220
221	print STDERR " auto: Use text categorization algorithm to automatically\n";
222	print STDERR " identify the encoding of each source document. This\n";
223	print STDERR " will be slower than explicitly setting the encoding\n";
224	print STDERR " but will work where more than one encoding is used\n";
225	print STDERR " within the same collection.\n";
226
227	print STDERR " ascii: Plain 7 bit ascii. This may be a bit faster than\n";
228	print STDERR " using iso_8859_1. Beware of using this on a collection\n";
229	print STDERR " of documents that may contain characters outside the\n";
230	print STDERR " plain 7 bit ascii set though (e.g. German or French\n";
231	print STDERR " documents containing accents), use iso_8859_1 instead.\n";
232
233	print STDERR " utf8: either utf8 or unicode -- automatically detected\n";
234	print STDERR " unicode: just unicode\n";
235
236	my $e = $encodings::encodings;
237	foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
238	print STDERR " $enc: $e->{$enc}->{'name'}\n";
239	}
240	print STDERR "\n";
241	print STDERR " -default_encoding Use this encoding if -input_encoding is set to 'auto' and\n";
242	print STDERR " the text categorization algorithm fails to extract the\n";
243	print STDERR " encoding or extracts an encoding unsupported by Greenstone.\n";
244	print STDERR " The default is iso_8859_1.\n\n";
245
246	print STDERR " -extract_language Identify the language of each document and set 'Language'\n";
247	print STDERR " metadata. Note that this will be done automatically if\n";
248	print STDERR " -input_encoding is 'auto'.\n\n";
249	print STDERR " -default_language If Greenstone fails to work out what language a document is\n";
250	print STDERR " the 'Language' metadata element will be set to this value.\n";
251	print STDERR " The default is 'en' (ISO 639 language symbols are used:\n";
252	print STDERR " en = English). Note that if -input_encoding is not set to\n";
253	print STDERR " 'auto' and -extract_language is not set, all documents will\n";
254	print STDERR " have their 'Language' metadata set to this value.\n\n";
255
256	print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n";
257
258	print STDERR " -markup_acronyms Add acronym metadata into document text\n\n";
259
260	print STDERR " -first Comma separated list of first sizes to extract from the\n";
261	print STDERR " text into a metadata field. The field is called 'FirstNNN'.\n\n";
262
263	print STDERR " -extract_email Extract email addresses as metadata\n\n";
264
265	print STDERR " -extract_historical_years Extract time-period information from historical\n";
266	print STDERR " documents. This is stored as metadata with the document.\n";
267	print STDERR " There is a search interface for this metadata, which you \n";
268	print STDERR " can include in your collection by adding the statement:\n";
269	print STDERR " format QueryInterface DateSearch\n";
270	print STDERR " to your collection configuration file\n";
271	print STDERR " -maximum_year The maximum historical date to be used as metadata (in a\n";
272	print STDERR " Common Era date, such as 1950)\n";
273	print STDERR " -maximum_century The maximum named century to be extracted as historical\n";
274	print STDERR " metadata (e.g. 14 will extract all references up to the\n";
275	print STDERR " 14th century)\n";
276	print STDERR " -no_bibliography Do not try and block bibliographic dates when extracting\n";
277	print STDERR " historical dates.\n";
278	print STDERR " -cover_image Will look for a prefix.jpg file (where prefix is the same\n";
279	print STDERR " prefix as the file being processed) and associate it as a\n";
280	print STDERR " cover image\n\n";
281	}
282
283	# print_usage should be overridden for any sub-classes having
284	# their own plugin specific options
285	sub print_usage {
286	print STDERR "\nThis plugin has no plugin specific options\n\n";
287	}
288
289	sub new {
290	my $class = shift (@_);
291	my $plugin_name = shift (@_);
292	my $self = {};
293
294	my $enc = "^(";
295	map {$enc .= "$_\|";} keys %$encodings::encodings;
296	my $denc = $enc . "ascii\|utf8\|unicode)\$";
297	$enc .= "ascii\|utf8\|unicode\|auto)\$";
298
299	$self->{'outhandle'} = STDERR;
300	my $year = (localtime)[5]+1900;
301
302	$self->{'textcat'} = new textcat();
303
304	$self->{'num_processed'} = 0;
305	$self->{'num_not_processed'} = 0;
306	$self->{'num_blocked'} = 0;
307	$self->{'num_archives'} = 0;
308
309	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
310	$self->{'option_list'} = [ $options ];
311
312	# general options available to all plugins
313	if (!parsargv::parse(\@_,
314	q^process_exp/.*/^, \$self->{'process_exp'},
315	q^block_exp/.*/^, \$self->{'block_exp'},
316	q^extract_acronyms^, \$self->{'extract_acronyms'},
317	q^extract_keyphrases^, \$self->{'kea'}, #with extra options
318	q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options
319	qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
320	qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
321	q^extract_email^, \$self->{'extract_email'},
322	q^markup_acronyms^, \$self->{'markup_acronyms'},
323	q^default_language/.{2}/en^, \$self->{'default_language'},
324	q^first/.*/^, \$self->{'first'},
325	q^extract_historical_years^, \$self->{'date_extract'},
326	qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
327	q^no_bibliography^, \$self->{'no_biblio'},
328	qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
329	q^cover_image^, \$self->{'cover_image'},
330	"allow_extra_options")) {
331
332	print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
333	print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
334	&print_general_usage($plugin_name);
335	die "\n";
336	}
337
338	return bless $self, $class;
339	}
340
341	# initialize BasPlug options
342	# if init() is overridden in a sub-class, remember to call BasPlug::init()
343	sub init {
344	my $self = shift (@_);
345	my ($verbosity, $outhandle, $failhandle) = @_;
346
347	# verbosity is passed through from the processor
348	$self->{'verbosity'} = $verbosity;
349
350	# as are the outhandle and failhandle
351	$self->{'outhandle'} = $outhandle if defined $outhandle;
352	$self->{'failhandle'} = $failhandle;
353
354	# set process_exp and block_exp to defaults unless they were
355	# explicitly set
356
357	if ((!$self->is_recursive()) and
358	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
359
360	$self->{'process_exp'} = $self->get_default_process_exp ();
361	if ($self->{'process_exp'} eq "") {
362	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
363	}
364	}
365
366	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
367	$self->{'block_exp'} = $self->get_default_block_exp ();
368	}
369	}
370
371	sub begin {
372	my $self = shift (@_);
373	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
374	$self->initialise_extractors();
375	}
376
377	sub end {
378	my ($self) = @_;
379	$self->finalise_extractors();
380	}
381
382	# this function should be overridden to return 1
383	# in recursive plugins
384	sub is_recursive {
385	my $self = shift (@_);
386
387	return 0;
388	}
389
390	sub get_default_block_exp {
391	my $self = shift (@_);
392
393	return "";
394	}
395
396	sub get_default_process_exp {
397	my $self = shift (@_);
398
399	return "";
400	}
401
402	# The BasPlug read() function. This function does all the right things
403	# to make general options work for a given plugin. It calls the process()
404	# function which does all the work specific to a plugin (like the old
405	# read functions used to do). Most plugins should define their own
406	# process() function and let this read() function keep control.
407	#
408	# recursive plugins (e.g. RecPlug) and specialized plugins like those
409	# capable of processing many documents within a single file (e.g.
410	# GMLPlug) should normally implement their own version of read()
411	#
412	# Return number of files processed, undef if can't process
413	# Note that $base_dir might be "" and that $file might
414	# include directories
415
416	sub read {
417	my $self = shift (@_);
418
419	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
420
421	if ($self->is_recursive()) {
422	die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
423	}
424
425	my $outhandle = $self->{'outhandle'};
426
427	my $filename = $file;
428	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
429
430	if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
431	$self->{'num_blocked'} ++;
432	return 0;
433	}
434	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
435	return undef;
436	}
437	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
438
439	# Do encoding stuff
440	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
441
442	# create a new document
443	my $doc_obj = new doc ($filename, "indexed_doc");
444	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
445	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
446	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
447	my ($filemeta) = $file =~ /([^\\\/]+)$/;
448	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
449	if ($self->{'cover_image'}) {
450	$self->associate_cover_image($doc_obj, $filename);
451	}
452
453	# read in file ($text will be in utf8)
454	my $text = "";
455	$self->read_file ($filename, $encoding, $language, \$text);
456
457	if (!length ($text)) {
458	my $plugin_name = ref ($self);
459	print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
460
461	my $failhandle = $self->{'failhandle'};
462	print $failhandle "$file: " . ref($self) . ": file contains no text\n";
463	$self->{'num_not_processed'} ++;
464
465	return 0;
466	}
467
468	# include any metadata passed in from previous plugins
469	# note that this metadata is associated with the top level section
470	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
471
472	# do plugin specific processing of doc_obj
473	return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
474
475	# do any automatic metadata extraction
476	$self->auto_extract_metadata ($doc_obj);
477
478	# add an OID
479	# see if there is a plugin-specific set_OID function...
480	if (defined ($self->can(set_OID))) {
481	# it will need $doc_obj to set the Identifier metadata...
482	$self->set_OID($doc_obj);
483	} else {
484	# use the default set_OID() in doc.pm
485	$doc_obj->set_OID();
486	}
487
488	# process the document
489	$processor->process($doc_obj);
490
491	$self->{'num_processed'} ++;
492
493	return 1; # processed the file
494	}
495
496	# returns undef if file is rejected by the plugin
497	sub process {
498	my $self = shift (@_);
499	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
500
501	die "Basplug::process function must be implemented in sub-class\n";
502
503	return undef; # never gets here
504	}
505
506	# uses the multiread package to read in the entire file pointed to
507	# by filename and loads the resulting text into $$textref. Input text
508	# may be in any of the encodings handled by multiread, output text
509	# will be in utf8
510	sub read_file {
511	my $self = shift (@_);
512	my ($filename, $encoding, $language, $textref) = @_;
513
514	if (!-r $filename)
515	{
516	my $outhandle = $self->{'outhandle'};
517	print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
518	return;
519	}
520
521	$$textref = "";
522
523	open (FILE, $filename) \|\| die "BasPlug::read_file could not open $filename for reading ($!)\n";
524
525	if ($encoding eq "ascii") {
526	undef $/;
527	$$textref = <FILE>;
528	$/ = "\n";
529	} else {
530	my $reader = new multiread();
531	$reader->set_handle ('BasPlug::FILE');
532	$reader->set_encoding ($encoding);
533	$reader->read_file ($textref);
534
535	if ($language eq "zh") {
536	# segment the Chinese words
537	$$textref = &cnseg::segment($$textref);
538	}
539	}
540
541	close FILE;
542	}
543
544	sub textcat_get_language_encoding {
545	my $self = shift (@_);
546	my ($filename) = @_;
547
548	my ($language, $encoding, $extracted_encoding);
549	if ($self->{'input_encoding'} eq "auto") {
550	# use textcat to automatically work out the input encoding and language
551	($language, $encoding) = $self->get_language_encoding ($filename);
552	} elsif ($self->{'extract_language'}) {
553	# use textcat to get language metadata
554	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
555	$encoding = $self->{'input_encoding'};
556	if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
557	my $plugin_name = ref ($self);
558	my $outhandle = $self->{'outhandle'};
559	print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
560	print $outhandle "appears to be encoded as $extracted_encoding.\n";
561	}
562	} else {
563	$language = $self->{'default_language'};
564	$encoding = $self->{'input_encoding'};
565	}
566	return ($language, $encoding);
567	}
568
569	# Uses textcat to work out the encoding and language of the text in
570	# $filename. All html tags are removed before processing.
571	# returns an array containing "language" and "encoding"
572	sub get_language_encoding {
573	my $self = shift (@_);
574	my ($filename) = @_;
575	my $outhandle = $self->{'outhandle'};
576
577	# read in file
578	open (FILE, $filename) \|\| die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
579	undef $/;
580	my $text = <FILE>;
581	$/ = "\n";
582	close FILE;
583
584	# remove <title>stuff</title> -- as titles tend often to be in English
585	# for foreign language documents
586	$text =~ s/<title>.*?<\/title>//i;
587
588	# remove all HTML tags
589	$text =~ s/<[^>]*>//sg;
590
591	# get the language/encoding
592	my $results = $self->{'textcat'}->classify(\$text);
593
594	# if textcat returns 3 or less possibilities we'll use the
595	# first one in the list - otherwise use the defaults
596	if (scalar @$results > 3) {
597
598	if ($self->{'input_encoding'} ne 'auto') {
599	if ($self->{'extract_language'} && $self->{'verbosity'}) {
600	print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
601	print $outhandle "defaulting to $self->{'default_language'}\n";
602	}
603	return ($self->{'default_language'}, $self->{'input_encoding'});
604
605	} else {
606	if ($self->{'verbosity'}) {
607	print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
608	print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
609	}
610	return ($self->{'default_language'}, $self->{'default_encoding'});
611	}
612	}
613
614	# format language/encoding
615	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
616	if (!defined $language) {
617	if ($self->{'verbosity'}) {
618	print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
619	print $outhandle "defaulting to $self->{'default_language'}\n";
620	}
621	$language = $self->{'default_language'};
622	}
623	if (!defined $encoding) {
624	if ($self->{'verbosity'}) {
625	print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
626	print $outhandle "defaulting to $self->{'default_encoding'}\n";
627	}
628	$encoding = $self->{'default_encoding'};
629	}
630
631	if ($encoding !~ /^(ascii\|utf8\|unicode)$/ &&
632	!defined $encodings::encodings->{$encoding}) {
633	if ($self->{'verbosity'}) {
634	print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
635	print $outhandle "using $self->{'default_encoding'}\n";
636	}
637	$encoding = $self->{'default_encoding'};
638	}
639
640	return ($language, $encoding);
641	}
642
643	# add any extra metadata that's been passed around from one
644	# plugin to another.
645	# extra_metadata uses add_utf8_metadata so it expects metadata values
646	# to already be in utf8
647	sub extra_metadata {
648	my $self = shift (@_);
649	my ($doc_obj, $cursection, $metadata) = @_;
650
651	foreach my $field (keys(%$metadata)) {
652	# $metadata->{$field} may be an array reference
653	if (ref ($metadata->{$field}) eq "ARRAY") {
654	map {
655	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
656	} @{$metadata->{$field}};
657	} else {
658	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
659	}
660	}
661	}
662
663	# initialise metadata extractors
664	sub initialise_extractors {
665	my $self = shift (@_);
666
667	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
668	&acronym::initialise_acronyms();
669	}
670	}
671
672	# finalise metadata extractors
673	sub finalise_extractors {
674	my $self = shift (@_);
675
676	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
677	&acronym::finalise_acronyms();
678	}
679	}
680
681	# FIRSTNNN: extract the first NNN characters as metadata
682	sub extract_first_NNNN_characters {
683	my $self = shift (@_);
684	my ($textref, $doc_obj, $thissection) = @_;
685
686	foreach my $size (split /,/, $self->{'first'}) {
687	my $tmptext = $$textref;
688	$tmptext =~ s/^\s+//;
689	$tmptext =~ s/\s+$//;
690	$tmptext =~ s/\s+/ /gs;
691	$tmptext = substr ($tmptext, 0, $size);
692	$tmptext =~ s/\s\S*$/…/;
693	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
694	}
695	}
696
697	sub extract_email {
698	my $self = shift (@_);
699	my ($textref, $doc_obj, $thissection) = @_;
700	my $outhandle = $self->{'outhandle'};
701
702	print $outhandle " extracting email addresses ...\n"
703	if ($self->{'verbosity'} > 2);
704
705	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
706	@email = sort @email;
707
708	my @email2 = ();
709	foreach my $address (@email) {
710	if (!(join(" ",@email2) =~ m/$address/ )) {
711	push @email2, $address;
712	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
713	print $outhandle " extracting $address\n"
714	if ($self->{'verbosity'} > 3);
715	}
716	}
717	print $outhandle " done extracting email addresses.\n"
718	if ($self->{'verbosity'} > 2);
719
720	}
721
722	# extract metadata
723	sub auto_extract_metadata {
724
725
726	my $self = shift (@_);
727	my ($doc_obj) = @_;
728
729	if ($self->{'extract_email'}) {
730	my $thissection = $doc_obj->get_top_section();
731	while (defined $thissection) {
732	my $text = $doc_obj->get_text($thissection);
733	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
734	$thissection = $doc_obj->get_next_section ($thissection);
735	}
736	}
737
738
739	#adding kea keyphrases
740	if ($self->{'kea'}) {
741
742	my $thissection = $doc_obj->get_top_section();
743	my $text = "";
744	my @list;
745
746	while (defined $thissection) { #loop through sections to gather whole doc
747	my $sectiontext = $doc_obj->get_text($thissection);
748	$text = $text.$sectiontext;
749	$thissection = $doc_obj->get_next_section ($thissection);
750	}
751
752	if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
753	@list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
754	} else { #otherwise call Kea with no options
755	@list = &Kea::extract_KeyPhrases ($text);
756	}
757
758	if(@list){ #if a list of kea keyphrases was returned (ie not empty)
759	my $keyphrases = $list[0]; #first arg is keyphrase list
760	my $stems = $list[1]; #second arg is stemmed keyphrase list
761	print STDERR "keyphrases: $keyphrases\n";
762	print STDERR "stems: $stems\n";
763	$thissection = $doc_obj->get_top_section(); #add metadata to top section
764	$doc_obj->add_metadata($thissection, "kea", $keyphrases);
765	$doc_obj->add_metadata($thissection, "stems", $stems);
766	}
767	} #end of kea
768
769	if ($self->{'first'}) {
770	my $thissection = $doc_obj->get_top_section();
771	while (defined $thissection) {
772	my $text = $doc_obj->get_text($thissection);
773	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
774	$thissection = $doc_obj->get_next_section ($thissection);
775	}
776	}
777
778	if ($self->{'extract_acronyms'}) {
779	my $thissection = $doc_obj->get_top_section();
780	while (defined $thissection) {
781	my $text = $doc_obj->get_text($thissection);
782	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
783	$thissection = $doc_obj->get_next_section ($thissection);
784	}
785	}
786
787	if ($self->{'markup_acronyms'}) {
788	my $thissection = $doc_obj->get_top_section();
789	while (defined $thissection) {
790	my $text = $doc_obj->get_text($thissection);
791	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
792	$doc_obj->delete_text($thissection);
793	$doc_obj->add_text($thissection, $text);
794	$thissection = $doc_obj->get_next_section ($thissection);
795	}
796	}
797
798	if($self->{'date_extract'}) {
799	my $thissection = $doc_obj->get_top_section();
800	while (defined $thissection) {
801
802	my $text = $doc_obj->get_text($thissection);
803	&DateExtract::get_date_metadata($text, $doc_obj,
804	$thissection,
805	$self->{'no_biblio'},
806	$self->{'max_year'},
807	$self->{'max_century'});
808	$thissection = $doc_obj->get_next_section ($thissection);
809	}
810	}
811	}
812
813	# extract acronyms from a section in a document. progress is
814	# reported to outhandle based on the verbosity. both the Acronym
815	# and the AcronymKWIC metadata items are created.
816
817	sub extract_acronyms {
818	my $self = shift (@_);
819	my ($textref, $doc_obj, $thissection) = @_;
820	my $outhandle = $self->{'outhandle'};
821
822	print $outhandle " extracting acronyms ...\n"
823	if ($self->{'verbosity'} > 2);
824
825	my $acro_array = &acronym::acronyms($textref);
826
827	foreach my $acro (@$acro_array) {
828
829	#check that this is the first time ...
830	my $seen_before = "false";
831	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
832	foreach my $thisAcro (@$previous_data) {
833	if ($thisAcro eq $acro->to_string()) {
834	$seen_before = "true";
835	print $outhandle " already seen ". $acro->to_string() . "\n"
836	if ($self->{'verbosity'} >= 4);
837	}
838	}
839
840	if ($seen_before eq "false") {
841	#write it to the file ...
842	$acro->write_to_file();
843
844	#do the normal acronym
845	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
846	print $outhandle " adding ". $acro->to_string() . "\n"
847	if ($self->{'verbosity'} > 3);
848
849	}
850	}
851	print $outhandle " done extracting acronyms. \n"
852	if ($self->{'verbosity'} > 2);
853	}
854
855	sub markup_acronyms {
856	my $self = shift (@_);
857	my ($text, $doc_obj, $thissection) = @_;
858	my $outhandle = $self->{'outhandle'};
859
860	print $outhandle " marking up acronyms ...\n"
861	if ($self->{'verbosity'} > 2);
862
863	#self is passed in to check for verbosity ...
864	$text = &acronym::markup_acronyms($text, $self);
865
866	print $outhandle " done marking up acronyms. \n"
867	if ($self->{'verbosity'} > 2);
868
869	return $text;
870	}
871
872	sub compile_stats {
873	my $self = shift(@_);
874	my ($stats) = @_;
875
876	$stats->{'num_processed'} += $self->{'num_processed'};
877	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
878	$stats->{'num_archives'} += $self->{'num_archives'};
879
880	}
881
882	sub associate_cover_image {
883	my $self = shift(@_);
884	my ($doc_obj, $filename) = @_;
885
886	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
887	if (-e $filename) {
888	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
889	} else {
890	$filename =~ s/jpg$/JPG/;
891	if (-e $filename) {
892	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
893	}
894	}
895	}
896
897	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: