Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 6408

Last change on this file since 6408 was 6408, checked in by jmt12, 20 years ago
Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)
Property svn:keywords set to `Author Date Id Revision`
File size: 35.3 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	eval {require bytes};
29
30	# suppress the annoying "subroutine redefined" warning that various
31	# plugins cause under perl 5.6
32	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34	use Kea;
35	use parsargv;
36	use multiread;
37	use encodings;
38	use cnseg;
39	use acronym;
40	use textcat;
41	use doc;
42	use diagnostics;
43	use DateExtract;
44	use ghtml;
45	use gsprintf;
46	use printusage;
47
48
49	my $unicode_list =
50	[ { 'name' => "auto",
51	'desc' => "{BasPlug.input_encoding.auto}" },
52	{ 'name' => "ascii",
53	'desc' => "{BasPlug.input_encoding.ascii}" },
54	{ 'name' => "utf8",
55	'desc' => "{BasPlug.input_encoding.utf8}" },
56	{ 'name' => "unicode",
57	'desc' => "{BasPlug.input_encoding.unicode}" } ];
58
59	my $arguments =
60	[ { 'name' => "process_exp",
61	'desc' => "{BasPlug.process_exp}",
62	'type' => "regexp",
63	'deft' => "",
64	'reqd' => "no" },
65	{ 'name' => "block_exp",
66	'desc' => "{BasPlug.block_exp}",
67	'type' => "regexp",
68	'deft' => "",
69	'reqd' => "no" },
70	{ 'name' => "input_encoding",
71	'desc' => "{BasPlug.input_encoding}",
72	'type' => "enum",
73	'list' => $unicode_list,
74	'reqd' => "no" ,
75	'deft' => "auto" } ,
76	{ 'name' => "default_encoding",
77	'desc' => "{BasPlug.default_encoding}",
78	'type' => "enum",
79	'list' => $unicode_list,
80	'reqd' => "no",
81	'deft' => "utf8" },
82	{ 'name' => "extract_language",
83	'desc' => "{BasPlug.extract_language}",
84	'type' => "flag",
85	'reqd' => "no" },
86	{ 'name' => "default_language",
87	'desc' => "{BasPlug.default_language}",
88	'type' => "language",
89	'deft' => "en",
90	'reqd' => "no" },
91	{ 'name' => "extract_acronyms",
92	'desc' => "{BasPlug.extract_acronyms}",
93	'type' => "flag",
94	'reqd' => "no" },
95	{ 'name' => "markup_acronyms",
96	'desc' => "{BasPlug.markup_acronyms}",
97	'type' => "flag",
98	'reqd' => "no" },
99	{ 'name' => "first",
100	'desc' => "{BasPlug.first}",
101	'type' => "string",
102	'reqd' => "no" },
103	{ 'name' => "extract_email",
104	'desc' => "{BasPlug.extract_email}",
105	'type' => "flag",
106	'reqd' => "no" },
107	{ 'name' => "extract_historical_years",
108	'desc' => "{BasPlug.extract_historical_years}",
109	'type' => "flag",
110	'reqd' => "no" },
111	{ 'name' => "maximum_year",
112	'desc' => "{BasPlug.maximum_year}",
113	'type' => "int",
114	'deft' => (localtime)[5]+1900,
115	'reqd' => "no"},
116	{ 'name' => "maximum_century",
117	'desc' => "{BasPlug.maximum_century}",
118	'type' => "int",
119	'deft' => "-1",
120	'reqd' => "no" },
121	{ 'name' => "no_bibliography",
122	'desc' => "{BasPlug.no_bibliography}",
123	'type' => "flag",
124	'reqd' => "no"},
125	{ 'name' => "cover_image",
126	'desc' => "{BasPlug.cover_image}",
127	'type' => "flag",
128	'reqd' => "no" } ];
129
130	my $options = { 'name' => "BasPlug",
131	'desc' => "{BasPlug.desc}",
132	'abstract' => "yes",
133	'inherits' => "no",
134	'args' => $arguments };
135
136
137	sub gsprintf
138	{
139	return &gsprintf::gsprintf(@_);
140	}
141
142
143	sub get_arguments
144	{
145	local $self = shift(@_);
146	local $optionlistref = $self->{'option_list'};
147	local @optionlist = @$optionlistref;
148	local $pluginoptions = pop(@$optionlistref);
149	local $pluginarguments = $pluginoptions->{'args'};
150	return $pluginarguments;
151	}
152
153
154	sub print_xml_usage
155	{
156	local $self = shift(@_);
157	local $language = shift(@_);
158
159	&PrintUsage::print_xml_header();
160	$self->print_xml($language);
161	}
162
163
164	sub print_xml
165	{
166	local $self = shift(@_);
167	local $language = shift(@_);
168
169	local $optionlistref = $self->{'option_list'};
170	local @optionlist = @$optionlistref;
171	local $pluginoptions = pop(@$optionlistref);
172	return if (!defined($pluginoptions));
173
174	print STDERR "<PlugInfo>\n";
175	print STDERR " <Name>$pluginoptions->{'name'}</Name>\n";
176	print STDERR " <Desc>$pluginoptions->{'desc'}</Desc>\n";
177	print STDERR " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n";
178	print STDERR " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n";
179	print STDERR " <Arguments>\n";
180	if (defined($pluginoptions->{'args'})) {
181	&PrintUsage::print_options_xml($language, $pluginoptions->{'args'});
182	}
183
184	# Recurse up the plugin hierarchy
185	$self->print_xml($language);
186
187	print STDERR " </Arguments>\n";
188	print STDERR "</PlugInfo>\n";
189	}
190
191
192	sub print_txt_usage
193	{
194	local $self = shift(@_);
195	local $language = shift(@_);
196
197	# Print the usage message for a plugin (recursively)
198	local $descoffset = $self->determine_description_offset(0);
199	$self->print_plugin_usage($language, $descoffset, 1);
200	}
201
202
203	sub determine_description_offset
204	{
205	local $self = shift(@_);
206	local $maxoffset = shift(@_);
207
208	local $optionlistref = $self->{'option_list'};
209	local @optionlist = @$optionlistref;
210	local $pluginoptions = pop(@$optionlistref);
211	return $maxoffset if (!defined($pluginoptions));
212
213	# Find the length of the longest option string of this plugin
214	local $pluginargs = $pluginoptions->{'args'};
215	if (defined($pluginargs)) {
216	local $longest = &PrintUsage::find_longest_option_string($pluginargs);
217	if ($longest > $maxoffset) {
218	$maxoffset = $longest;
219	}
220	}
221
222	# Recurse up the plugin hierarchy
223	$maxoffset = $self->determine_description_offset($maxoffset);
224	$self->{'option_list'} = \@optionlist;
225	return $maxoffset;
226	}
227
228
229	sub print_plugin_usage
230	{
231	local $self = shift(@_);
232	local $language = shift(@_);
233	local $descoffset = shift(@_);
234	local $isleafclass = shift(@_);
235
236	local $optionlistref = $self->{'option_list'};
237	local @optionlist = @$optionlistref;
238	local $pluginoptions = pop(@$optionlistref);
239	return if (!defined($pluginoptions));
240
241	local $pluginname = $pluginoptions->{'name'};
242	local $pluginargs = $pluginoptions->{'args'};
243
244	# Produce the usage information using the data structure above
245	if ($isleafclass) {
246	&gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
247	# print STDERR " usage: classify $classifiername [options]\n\n";
248	}
249
250	# Display the plugin options, if there are some
251	if (defined($pluginargs)) {
252	# Calculate the column offset of the option descriptions
253	local $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
254
255	if ($isleafclass) {
256	&gsprintf(STDERR, " {common.specific_options}:\n");
257	# print STDERR " specific options:\n";
258	}
259	else {
260	&gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
261	# print STDERR " general options (from $classifiername):\n";
262	}
263
264	# Display the plugin options
265	&PrintUsage::print_options_txt($language, $pluginargs, $optiondescoffset);
266	}
267
268	# Recurse up the plugin hierarchy
269	$self->print_plugin_usage($language, $descoffset, 0);
270	$self->{'option_list'} = \@optionlist;
271	}
272
273
274	# sub print_general_usage {
275	# my ($plugin_name) = @_;
276
277	# print STDERR "\n usage: plugin $plugin_name [options]\n\n";
278
279	# print STDERR " -process_exp A perl regular expression to match against filenames.\n";
280	# print STDERR " Matching filenames will be processed by this plugin.\n";
281	# print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
282	# print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
283	# print STDERR " .htm or .html (case-insensitive).\n\n";
284
285	# print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
286	# print STDERR " being passed to any later plugins in the list. This has no\n";
287	# print STDERR " real effect other than to prevent lots of warning messages\n";
288	# print STDERR " about input files you don't care about. Each plugin might\n";
289	# print STDERR " have a default block_exp. e.g. by default HTMLPlug blocks\n";
290	# print STDERR " any files with .gif, .jpg, .jpeg, .png or .css\n";
291	# print STDERR " file extensions.\n\n";
292
293
294	# print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
295	# print STDERR " converted from these encodings and stored internally as\n";
296	# print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n";
297	# print STDERR " are:\n";
298
299	# print STDERR " auto: Use text categorization algorithm to automatically\n";
300	# print STDERR " identify the encoding of each source document. This\n";
301	# print STDERR " will be slower than explicitly setting the encoding\n";
302	# print STDERR " but will work where more than one encoding is used\n";
303	# print STDERR " within the same collection.\n";
304
305	# print STDERR " ascii: Plain 7 bit ascii. This may be a bit faster than\n";
306	# print STDERR " using iso_8859_1. Beware of using this on a collection\n";
307	# print STDERR " of documents that may contain characters outside the\n";
308	# print STDERR " plain 7 bit ascii set though (e.g. German or French\n";
309	# print STDERR " documents containing accents), use iso_8859_1 instead.\n";
310
311	# print STDERR " utf8: either utf8 or unicode -- automatically detected\n";
312	# print STDERR " unicode: just unicode\n";
313
314	# my $e = $encodings::encodings;
315	# foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
316	# print STDERR " $enc: $e->{$enc}->{'name'}\n";
317	# }
318	# print STDERR "\n";
319	# print STDERR " -default_encoding Use this encoding if -input_encoding is set to 'auto' and\n";
320	# print STDERR " the text categorization algorithm fails to extract the\n";
321	# print STDERR " encoding or extracts an encoding unsupported by Greenstone.\n";
322	# print STDERR " The default is iso_8859_1.\n\n";
323
324	# print STDERR " -extract_language Identify the language of each document and set 'Language'\n";
325	# print STDERR " metadata. Note that this will be done automatically if\n";
326	# print STDERR " -input_encoding is 'auto'.\n\n";
327	# print STDERR " -default_language If Greenstone fails to work out what language a document is\n";
328	# print STDERR " the 'Language' metadata element will be set to this value.\n";
329	# print STDERR " The default is 'en' (ISO 639 language symbols are used:\n";
330	# print STDERR " en = English). Note that if -input_encoding is not set to\n";
331	# print STDERR " 'auto' and -extract_language is not set, all documents will\n";
332	# print STDERR " have their 'Language' metadata set to this value.\n\n";
333
334	# print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n";
335
336	# print STDERR " -markup_acronyms Add acronym metadata into document text\n\n";
337
338	# print STDERR " -first Comma separated list of first sizes to extract from the\n";
339	# print STDERR " text into a metadata field. The field is called 'FirstNNN'.\n\n";
340
341	# print STDERR " -extract_email Extract email addresses as metadata\n\n";
342
343	# print STDERR " -extract_historical_years Extract time-period information from historical\n";
344	# print STDERR " documents. This is stored as metadata with the document.\n";
345	# print STDERR " There is a search interface for this metadata, which you \n";
346	# print STDERR " can include in your collection by adding the statement:\n";
347	# print STDERR " format QueryInterface DateSearch\n";
348	# print STDERR " to your collection configuration file\n";
349	# print STDERR " -maximum_year The maximum historical date to be used as metadata (in a\n";
350	# print STDERR " Common Era date, such as 1950)\n";
351	# print STDERR " -maximum_century The maximum named century to be extracted as historical\n";
352	# print STDERR " metadata (e.g. 14 will extract all references up to the\n";
353	# print STDERR " 14th century)\n";
354	# print STDERR " -no_bibliography Do not try and block bibliographic dates when extracting\n";
355	# print STDERR " historical dates.\n";
356	# print STDERR " -cover_image Will look for a prefix.jpg file (where prefix is the same\n";
357	# print STDERR " prefix as the file being processed) and associate it as a\n";
358	# print STDERR " cover image\n\n";
359	# }
360
361	# sub print_usage {
362	# print STDERR "\nThis plugin has no plugin specific options\n\n";
363	# }
364
365	sub new {
366	my $class = shift (@_);
367	my $plugin_name = shift (@_);
368	my $self = {};
369	$self->{'plugin_type'} = "BasPlug";
370	my $enc = "^(";
371	map {$enc .= "$_\|";} keys %$encodings::encodings;
372	my $denc = $enc . "ascii\|utf8\|unicode)\$";
373	$enc .= "ascii\|utf8\|unicode\|auto)\$";
374
375	$self->{'outhandle'} = STDERR;
376	my $year = (localtime)[5]+1900;
377
378	$self->{'textcat'} = new textcat();
379
380	$self->{'num_processed'} = 0;
381	$self->{'num_not_processed'} = 0;
382	$self->{'num_blocked'} = 0;
383	$self->{'num_archives'} = 0;
384
385	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
386	$self->{'option_list'} = [ $options ];
387
388	# general options available to all plugins
389	if (!parsargv::parse(\@_,
390	q^process_exp/.*/^, \$self->{'process_exp'},
391	q^block_exp/.*/^, \$self->{'block_exp'},
392	q^extract_language^, \$self->{'extract_language'},
393	q^extract_acronyms^, \$self->{'extract_acronyms'},
394	q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
395	q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
396	qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
397	qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
398	q^extract_email^, \$self->{'extract_email'},
399	q^markup_acronyms^, \$self->{'markup_acronyms'},
400	q^default_language/.{2}/en^, \$self->{'default_language'},
401	q^first/.*/^, \$self->{'first'},
402	q^extract_historical_years^, \$self->{'date_extract'},
403	qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
404	q^no_bibliography^, \$self->{'no_biblio'},
405	qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
406	q^cover_image^, \$self->{'cover_image'},
407	"allow_extra_options")) {
408
409	&gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
410	$self->print_txt_usage(""); # Use default resource bundle
411	die "\n";
412	}
413
414	return bless $self, $class;
415	}
416
417	# initialize BasPlug options
418	# if init() is overridden in a sub-class, remember to call BasPlug::init()
419	sub init {
420	my $self = shift (@_);
421	my ($verbosity, $outhandle, $failhandle) = @_;
422
423	# verbosity is passed through from the processor
424	$self->{'verbosity'} = $verbosity;
425
426	# as are the outhandle and failhandle
427	$self->{'outhandle'} = $outhandle if defined $outhandle;
428	$self->{'failhandle'} = $failhandle;
429
430	# set process_exp and block_exp to defaults unless they were
431	# explicitly set
432
433	if ((!$self->is_recursive()) and
434	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
435
436	$self->{'process_exp'} = $self->get_default_process_exp ();
437	if ($self->{'process_exp'} eq "") {
438	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
439	}
440	}
441
442	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
443	$self->{'block_exp'} = $self->get_default_block_exp ();
444	}
445	}
446
447	sub begin {
448	my $self = shift (@_);
449	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
450	$self->initialise_extractors();
451	}
452
453	sub end {
454	my ($self) = @_;
455	$self->finalise_extractors();
456	}
457
458	# this function should be overridden to return 1
459	# in recursive plugins
460	sub is_recursive {
461	my $self = shift (@_);
462
463	return 0;
464	}
465
466	sub get_default_block_exp {
467	my $self = shift (@_);
468
469	return "";
470	}
471
472	sub get_default_process_exp {
473	my $self = shift (@_);
474
475	return "";
476	}
477
478	# The BasPlug read() function. This function does all the right things
479	# to make general options work for a given plugin. It calls the process()
480	# function which does all the work specific to a plugin (like the old
481	# read functions used to do). Most plugins should define their own
482	# process() function and let this read() function keep control.
483	#
484	# recursive plugins (e.g. RecPlug) and specialized plugins like those
485	# capable of processing many documents within a single file (e.g.
486	# GMLPlug) should normally implement their own version of read()
487	#
488	# Return number of files processed, undef if can't process
489	# Note that $base_dir might be "" and that $file might
490	# include directories
491
492	sub read {
493	my $self = shift (@_);
494
495	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
496
497	if ($self->is_recursive()) {
498	&gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
499	}
500
501	my $outhandle = $self->{'outhandle'};
502
503	my $filename = $file;
504	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
505
506	if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
507	$self->{'num_blocked'} ++;
508	return 0;
509	}
510	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
511	return undef;
512	}
513	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
514
515	# Do encoding stuff
516	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
517
518	# create a new document
519	my $doc_obj = new doc ($filename, "indexed_doc");
520	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
521	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
522	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
523	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "$self->{'plugin_type'}", "1");
524	my ($filemeta) = $file =~ /([^\\\/]+)$/;
525	# how do we know what encoding the filename is in?
526	$doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
527	if ($self->{'cover_image'}) {
528	$self->associate_cover_image($doc_obj, $filename);
529	}
530
531	# read in file ($text will be in utf8)
532	my $text = "";
533	$self->read_file ($filename, $encoding, $language, \$text);
534
535	if (!length ($text)) {
536	my $plugin_name = ref ($self);
537	&gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
538
539	my $failhandle = $self->{'failhandle'};
540	&gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
541	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
542	$self->{'num_not_processed'} ++;
543
544	return 0;
545	}
546
547	# include any metadata passed in from previous plugins
548	# note that this metadata is associated with the top level section
549	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
550
551	# do plugin specific processing of doc_obj
552	return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli));
553
554	# do any automatic metadata extraction
555	$self->auto_extract_metadata ($doc_obj);
556
557	# add an OID
558	# see if there is a plugin-specific set_OID function...
559	if (defined ($self->can(set_OID))) {
560	# it will need $doc_obj to set the Identifier metadata...
561	$self->set_OID($doc_obj);
562	} else {
563	# use the default set_OID() in doc.pm
564	$doc_obj->set_OID();
565	}
566
567	# process the document
568	$processor->process($doc_obj);
569
570	$self->{'num_processed'} ++;
571
572	return 1; # processed the file
573	}
574
575	# returns undef if file is rejected by the plugin
576	sub process {
577	my $self = shift (@_);
578	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
579
580	&gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
581	# die "Basplug::process function must be implemented in sub-class\n";
582
583	return undef; # never gets here
584	}
585
586	# uses the multiread package to read in the entire file pointed to
587	# by filename and loads the resulting text into $$textref. Input text
588	# may be in any of the encodings handled by multiread, output text
589	# will be in utf8
590	sub read_file {
591	my $self = shift (@_);
592	my ($filename, $encoding, $language, $textref) = @_;
593
594	if (!-r $filename)
595	{
596	my $outhandle = $self->{'outhandle'};
597	&gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
598	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
599	return;
600	}
601
602	$$textref = "";
603
604	open (FILE, $filename) \|\| (&gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n");
605	# open (FILE, $filename) \|\| die "BasPlug::read_file could not open $filename for reading ($!)\n";
606
607	if ($encoding eq "ascii") {
608	undef $/;
609	$$textref = <FILE>;
610	$/ = "\n";
611	} else {
612	my $reader = new multiread();
613	$reader->set_handle ('BasPlug::FILE');
614	$reader->set_encoding ($encoding);
615	$reader->read_file ($textref);
616
617	if ($language eq "zh") {
618	# segment the Chinese words
619	$$textref = &cnseg::segment($$textref);
620	}
621	}
622
623	close FILE;
624	}
625
626	sub textcat_get_language_encoding {
627	my $self = shift (@_);
628	my ($filename) = @_;
629
630	my ($language, $encoding, $extracted_encoding);
631	if ($self->{'input_encoding'} eq "auto") {
632	# use textcat to automatically work out the input encoding and language
633	($language, $encoding) = $self->get_language_encoding ($filename);
634	} elsif ($self->{'extract_language'}) {
635	# use textcat to get language metadata
636	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
637	$encoding = $self->{'input_encoding'};
638	if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
639	my $plugin_name = ref ($self);
640	my $outhandle = $self->{'outhandle'};
641	&gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
642	# print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
643	# print $outhandle "appears to be encoded as $extracted_encoding.\n";
644	}
645	} else {
646	$language = $self->{'default_language'};
647	$encoding = $self->{'input_encoding'};
648	}
649	return ($language, $encoding);
650	}
651
652	# Uses textcat to work out the encoding and language of the text in
653	# $filename. All html tags are removed before processing.
654	# returns an array containing "language" and "encoding"
655	sub get_language_encoding {
656	my $self = shift (@_);
657	my ($filename) = @_;
658	my $outhandle = $self->{'outhandle'};
659
660	# read in file
661	open (FILE, $filename) \|\| (&gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
662	undef $/;
663	my $text = <FILE>;
664	$/ = "\n";
665	close FILE;
666
667	# remove <title>stuff</title> -- as titles tend often to be in English
668	# for foreign language documents
669	$text =~ s/<title>.*?<\/title>//i;
670
671	# remove all HTML tags
672	$text =~ s/<[^>]*>//sg;
673
674	# get the language/encoding
675	my $results = $self->{'textcat'}->classify(\$text);
676
677	# if textcat returns 3 or less possibilities we'll use the
678	# first one in the list - otherwise use the defaults
679	if (scalar @$results > 3) {
680	# changed 12 Feb 2003 by jrm21
681	# use the most popular encoding at least... otherwise we might
682	# generate invalid archive files!
683	my %guessed_encodings = ();
684	foreach my $result (@$results) {
685	$result =~ /([^\-]+)$/;
686	my $enc=$1;
687	if (!defined($guessed_encodings{$enc})) {
688	$guessed_encodings{$enc}=0;
689	}
690	$guessed_encodings{$enc}++;
691	}
692	my $best_encoding="";
693	$guessed_encodings{""}=-1;
694	foreach my $enc (keys %guessed_encodings) {
695	if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}){
696	$best_encoding=$enc;
697	}
698	}
699
700	if ($self->{'input_encoding'} ne 'auto') {
701	if ($self->{'extract_language'} && $self->{'verbosity'}) {
702	&gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
703	# print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
704	# print $outhandle "defaulting to $self->{'default_language'}\n";
705	}
706	return ($self->{'default_language'}, $self->{'input_encoding'});
707
708	} else {
709	if ($self->{'verbosity'}) {
710	&gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
711	# print $outhandle "BASPlug: WARNING: language could not be extracted from $filename - ";
712	# print $outhandle "defaulting to $self->{'default_language'}.\n";
713	}
714	return ($self->{'default_language'}, $best_encoding);
715	}
716	}
717
718	# format language/encoding
719	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
720	if (!defined $language) {
721	if ($self->{'verbosity'}) {
722	&gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
723	# print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
724	# print $outhandle "defaulting to $self->{'default_language'}\n";
725	}
726	$language = $self->{'default_language'};
727	}
728	if (!defined $encoding) {
729	if ($self->{'verbosity'}) {
730	&gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_encoding}\n", $filename, $self->{'default_encoding'});
731	# print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
732	# print $outhandle "defaulting to $self->{'default_encoding'}\n";
733	}
734	$encoding = $self->{'default_encoding'};
735	}
736
737	if ($encoding !~ /^(ascii\|utf8\|unicode)$/ &&
738	!defined $encodings::encodings->{$encoding}) {
739	if ($self->{'verbosity'}) {
740	&gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n", $filename, $encoding, $self->{'default_encoding'});
741	# print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
742	# print $outhandle "using $self->{'default_encoding'}\n";
743	}
744	$encoding = $self->{'default_encoding'};
745	}
746
747	return ($language, $encoding);
748	}
749
750	# add any extra metadata that's been passed around from one
751	# plugin to another.
752	# extra_metadata uses add_utf8_metadata so it expects metadata values
753	# to already be in utf8
754	sub extra_metadata {
755	my $self = shift (@_);
756	my ($doc_obj, $cursection, $metadata) = @_;
757
758	foreach my $field (keys(%$metadata)) {
759	# $metadata->{$field} may be an array reference
760	if (ref ($metadata->{$field}) eq "ARRAY") {
761	map {
762	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
763	} @{$metadata->{$field}};
764	} else {
765	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
766	}
767	}
768	}
769
770	# initialise metadata extractors
771	sub initialise_extractors {
772	my $self = shift (@_);
773
774	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
775	&acronym::initialise_acronyms();
776	}
777	}
778
779	# finalise metadata extractors
780	sub finalise_extractors {
781	my $self = shift (@_);
782
783	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
784	&acronym::finalise_acronyms();
785	}
786	}
787
788	# FIRSTNNN: extract the first NNN characters as metadata
789	sub extract_first_NNNN_characters {
790	my $self = shift (@_);
791	my ($textref, $doc_obj, $thissection) = @_;
792
793	foreach my $size (split /,/, $self->{'first'}) {
794	my $tmptext = $$textref;
795	$tmptext =~ s/^\s+//;
796	$tmptext =~ s/\s+$//;
797	$tmptext =~ s/\s+/ /gs;
798	$tmptext = substr ($tmptext, 0, $size);
799	$tmptext =~ s/\s\S*$/…/;
800	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
801	}
802	}
803
804	sub extract_email {
805	my $self = shift (@_);
806	my ($textref, $doc_obj, $thissection) = @_;
807	my $outhandle = $self->{'outhandle'};
808
809	# print $outhandle " extracting email addresses ...\n"
810	&gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
811	if ($self->{'verbosity'} > 2);
812
813	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
814	@email = sort @email;
815
816	my @email2 = ();
817	foreach my $address (@email) {
818	if (!(join(" ",@email2) =~ m/$address/ )) {
819	push @email2, $address;
820	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
821	# print $outhandle " extracting $address\n"
822	&gsprintf($outhandle, " {BasPlug.extracting} $address\n")
823	if ($self->{'verbosity'} > 3);
824	}
825	}
826	# print $outhandle " done extracting email addresses.\n"
827	&gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
828	if ($self->{'verbosity'} > 2);
829	}
830
831	# extract metadata
832	sub auto_extract_metadata {
833
834	my $self = shift (@_);
835	my ($doc_obj) = @_;
836
837	if ($self->{'extract_email'}) {
838	my $thissection = $doc_obj->get_top_section();
839	while (defined $thissection) {
840	my $text = $doc_obj->get_text($thissection);
841	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
842	$thissection = $doc_obj->get_next_section ($thissection);
843	}
844	}
845
846
847	#adding kea keyphrases
848	if ($self->{'kea'}) {
849
850	my $thissection = $doc_obj->get_top_section();
851	my $text = "";
852	my @list;
853
854	while (defined $thissection) { #loop through sections to gather whole doc
855	my $sectiontext = $doc_obj->get_text($thissection);
856	$text = $text.$sectiontext;
857	$thissection = $doc_obj->get_next_section ($thissection);
858	}
859
860	if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
861	@list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
862	} else { #otherwise call Kea with no options
863	@list = &Kea::extract_KeyPhrases ($text);
864	}
865
866	if(@list){ #if a list of kea keyphrases was returned (ie not empty)
867	my $keyphrases = $list[0]; #first arg is keyphrase list
868	my $stems = $list[1]; #second arg is stemmed keyphrase list
869	&gsprintf(STDERR, "{BasPlug.keyphrases}: $keyphrases\n");
870	# print STDERR "keyphrases: $keyphrases\n";
871	&gsprintf(STDERR, "{BasPlug.stems}: $stems\n");
872	# print STDERR "stems: $stems\n";
873	$thissection = $doc_obj->get_top_section(); #add metadata to top section
874	$doc_obj->add_metadata($thissection, "kea", $keyphrases);
875	$doc_obj->add_metadata($thissection, "stems", $stems);
876	}
877	} #end of kea
878
879	if ($self->{'first'}) {
880	my $thissection = $doc_obj->get_top_section();
881	while (defined $thissection) {
882	my $text = $doc_obj->get_text($thissection);
883	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
884	$thissection = $doc_obj->get_next_section ($thissection);
885	}
886	}
887
888	if ($self->{'extract_acronyms'}) {
889	my $thissection = $doc_obj->get_top_section();
890	while (defined $thissection) {
891	my $text = $doc_obj->get_text($thissection);
892	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
893	$thissection = $doc_obj->get_next_section ($thissection);
894	}
895	}
896
897	if ($self->{'markup_acronyms'}) {
898	my $thissection = $doc_obj->get_top_section();
899	while (defined $thissection) {
900	my $text = $doc_obj->get_text($thissection);
901	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
902	$doc_obj->delete_text($thissection);
903	$doc_obj->add_text($thissection, $text);
904	$thissection = $doc_obj->get_next_section ($thissection);
905	}
906	}
907
908	if($self->{'date_extract'}) {
909	my $thissection = $doc_obj->get_top_section();
910	while (defined $thissection) {
911
912	my $text = $doc_obj->get_text($thissection);
913	&DateExtract::get_date_metadata($text, $doc_obj,
914	$thissection,
915	$self->{'no_biblio'},
916	$self->{'max_year'},
917	$self->{'max_century'});
918	$thissection = $doc_obj->get_next_section ($thissection);
919	}
920	}
921	}
922
923	# extract acronyms from a section in a document. progress is
924	# reported to outhandle based on the verbosity. both the Acronym
925	# and the AcronymKWIC metadata items are created.
926
927	sub extract_acronyms {
928	my $self = shift (@_);
929	my ($textref, $doc_obj, $thissection) = @_;
930	my $outhandle = $self->{'outhandle'};
931
932	# print $outhandle " extracting acronyms ...\n"
933	&gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
934	if ($self->{'verbosity'} > 2);
935
936	my $acro_array = &acronym::acronyms($textref);
937
938	foreach my $acro (@$acro_array) {
939
940	#check that this is the first time ...
941	my $seen_before = "false";
942	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
943	foreach my $thisAcro (@$previous_data) {
944	if ($thisAcro eq $acro->to_string()) {
945	$seen_before = "true";
946	# print $outhandle " already seen ". $acro->to_string() . "\n"
947	&gsprintf($outhandle, " {BasPlug.already_seen} " . $acro->to_string() . "\n")
948	if ($self->{'verbosity'} >= 4);
949	}
950	}
951
952	if ($seen_before eq "false") {
953	#write it to the file ...
954	$acro->write_to_file();
955
956	#do the normal acronym
957	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
958	# print $outhandle " adding ". $acro->to_string() . "\n"
959	&gsprintf($outhandle, " {BasPlug.adding} " . $acro->to_string() . "\n")
960	if ($self->{'verbosity'} > 3);
961	}
962	}
963
964	# print $outhandle " done extracting acronyms. \n"
965	&gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
966	if ($self->{'verbosity'} > 2);
967	}
968
969	sub markup_acronyms {
970	my $self = shift (@_);
971	my ($text, $doc_obj, $thissection) = @_;
972	my $outhandle = $self->{'outhandle'};
973
974	# print $outhandle " marking up acronyms ...\n"
975	&gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
976	if ($self->{'verbosity'} > 2);
977
978	#self is passed in to check for verbosity ...
979	$text = &acronym::markup_acronyms($text, $self);
980
981	# print $outhandle " done marking up acronyms. \n"
982	&gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
983	if ($self->{'verbosity'} > 2);
984
985	return $text;
986	}
987
988	sub compile_stats {
989	my $self = shift(@_);
990	my ($stats) = @_;
991
992	$stats->{'num_processed'} += $self->{'num_processed'};
993	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
994	$stats->{'num_archives'} += $self->{'num_archives'};
995
996	}
997
998	sub associate_cover_image {
999	my $self = shift(@_);
1000	my ($doc_obj, $filename) = @_;
1001
1002	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1003	if (-e $filename) {
1004	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1005	} else {
1006	$filename =~ s/jpg$/JPG/;
1007	if (-e $filename) {
1008	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1009	}
1010	}
1011	}
1012
1013	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: