Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 7644

Last change on this file since 7644 was 7644, checked in by jrm21, 20 years ago
don't print "wrong encoding" message for text in english. textcat thinks all english is in iso-8859-1, so basplug complains if the file was read as utf8.
Property svn:keywords set to `Author Date Id Revision`
File size: 30.0 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# BasPlug.pm -- base class for all the import plugins
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
[4]	25
	26	package BasPlug;
[2219]	27
# Load the bytes module when this perl provides it. The require is wrapped
# in eval so that a perl without it does not abort compilation.
eval { require bytes; };

# Warning filter: swallow the "Subroutine ... redefined" messages that
# various plugins cause under perl 5.6 and pass everything else through.
$SIG{__WARN__} = sub {
    my ($message) = @_;
    return if $message =~ /Subroutine\s+\S+\sredefined/;
    warn($message);
};
	33
[1954]	34	use Kea;
[1219]	35	use parsargv;
	36	use multiread;
[1870]	37	use encodings;
[1219]	38	use cnseg;
[1242]	39	use acronym;
[1317]	40	use textcat;
[1242]	41	use doc;
[1360]	42	use diagnostics;
[1411]	43	use DateExtract;
[2751]	44	use ghtml;
[5681]	45	use gsprintf;
[4778]	46	use printusage;
[4]	47
[5681]	48
# Choices offered by the input_encoding and default_encoding enum arguments
# declared below. Each 'desc' is a "{...}" placeholder string (presumably a
# resource key resolved when usage text is printed -- confirm in gsprintf).
my $unicode_list = [
    { 'name' => 'auto',    'desc' => '{BasPlug.input_encoding.auto}' },
    { 'name' => 'ascii',   'desc' => '{BasPlug.input_encoding.ascii}' },
    { 'name' => 'utf8',    'desc' => '{BasPlug.input_encoding.utf8}' },
    { 'name' => 'unicode', 'desc' => '{BasPlug.input_encoding.unicode}' },
];
[3540]	58
# Declarative description of the general options understood by BasPlug.
# Each entry names one option and gives its description placeholder, its
# type, and, where applicable, its default ('deft'), its list of allowed
# values ('list') and whether it is required ('reqd').
my $arguments = [
    {   'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => '',
        'reqd' => 'no',
    },
    {   'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'regexp',
        'deft' => '',
        'reqd' => 'no',
    },
    {   'name' => 'input_encoding',
        'desc' => '{BasPlug.input_encoding}',
        'type' => 'enum',
        'list' => $unicode_list,
        'reqd' => 'no',
        'deft' => 'auto',
    },
    {   'name' => 'default_encoding',
        'desc' => '{BasPlug.default_encoding}',
        'type' => 'enum',
        'list' => $unicode_list,
        'reqd' => 'no',
        'deft' => 'utf8',
    },
    {   'name' => 'extract_language',
        'desc' => '{BasPlug.extract_language}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name' => 'default_language',
        'desc' => '{BasPlug.default_language}',
        'type' => 'language',
        'deft' => 'en',
        'reqd' => 'no',
    },
    {   'name' => 'extract_acronyms',
        'desc' => '{BasPlug.extract_acronyms}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name' => 'markup_acronyms',
        'desc' => '{BasPlug.markup_acronyms}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name' => 'first',
        'desc' => '{BasPlug.first}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {   'name' => 'extract_email',
        'desc' => '{BasPlug.extract_email}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name' => 'extract_historical_years',
        'desc' => '{BasPlug.extract_historical_years}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name' => 'maximum_year',
        'desc' => '{BasPlug.maximum_year}',
        'type' => 'int',
        # the current calendar year, computed when the module is loaded
        'deft' => (localtime)[5]+1900,
        'reqd' => 'no',
    },
    {   'name' => 'maximum_century',
        'desc' => '{BasPlug.maximum_century}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
    {   'name' => 'no_bibliography',
        'desc' => '{BasPlug.no_bibliography}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name' => 'cover_image',
        'desc' => '{BasPlug.cover_image}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];
	129
	130	my $options = { 'name' => "BasPlug",
[5681]	131	'desc' => "{BasPlug.desc}",
[6408]	132	'abstract' => "yes",
	133	'inherits' => "no",
[4750]	134	'args' => $arguments };
[3540]	135
[4778]	136
# Package-local shorthand: hands every argument, unchanged, to
# gsprintf::gsprintf and returns whatever that call returns.
sub gsprintf {
    return &gsprintf::gsprintf(@_);
}
	141
	142
# Remove the last entry from $self->{'option_list'} and return that entry's
# 'args' value (undef when the entry has none or the list is empty).
#
# Note that the entry is consumed: the option list is one element shorter
# after the call.
#
# The working variables are lexicals; the earlier `local` declarations
# overwrote package globals of the same names for the duration of the call.
sub get_arguments
{
    my $self = shift(@_);

    my $optionlistref   = $self->{'option_list'};
    my $pluginoptions   = pop(@$optionlistref);
    my $pluginarguments = $pluginoptions->{'args'};

    return $pluginarguments;
}
	152
	153
# Print the plugin's usage information as XML: switch gsprintf output to
# UTF-8, emit the XML header, then let print_xml() write the plugin
# description.
#
# $self is a lexical here; the earlier `local $self` overwrote the package
# global $self while the call was running.
sub print_xml_usage
{
    my $self = shift(@_);

    # XML output is always in UTF-8
    # (called with `&name;` so the remaining @_ is passed along, as before)
    &gsprintf::output_strings_in_UTF8;

    &PrintUsage::print_xml_header();
    $self->print_xml();
}
	164
[4778]	165
	166	sub print_xml
	167	{
	168	local $self = shift(@_);
	169
	170	local $optionlistref = $self->{'option_list'};
	171	local @optionlist = @$optionlistref;
	172	local $pluginoptions = pop(@$optionlistref);
	173	return if (!defined($pluginoptions));
	174
[6987]	175	&gsprintf(STDERR, "<PlugInfo>\n");
	176	&gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
[7023]	177	my $desc = &gsprintf::lookup_string($pluginoptions->{'desc'});
	178	$desc =~ s/</&lt;/g; # doubly escaped
	179	$desc =~ s/>/&gt;/g;
	180
	181	&gsprintf(STDERR, " <Desc>$desc</Desc>\n");
[6987]	182	&gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
	183	&gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
	184	&gsprintf(STDERR, " <Arguments>\n");
[4778]	185	if (defined($pluginoptions->{'args'})) {
[6925]	186	&PrintUsage::print_options_xml($pluginoptions->{'args'});
[3540]	187	}
[4778]	188
	189	# Recurse up the plugin hierarchy
[6925]	190	$self->print_xml();
[4778]	191
[6987]	192	&gsprintf(STDERR, " </Arguments>\n");
	193	&gsprintf(STDERR, "</PlugInfo>\n");
[3540]	194	}
	195
[4744]	196
# Print the plain-text usage message for a plugin (recursively).
#
# First asks determine_description_offset() for the width of the longest
# option string, then calls print_plugin_usage() with that offset and the
# "leaf class" flag set.
#
# Variables are lexicals; the earlier `local` declarations overwrote
# package globals of the same names while the call was running.
sub print_txt_usage
{
    my $self = shift(@_);

    my $descoffset = $self->determine_description_offset(0);
    $self->print_plugin_usage($descoffset, 1);
}
[4744]	205
	206
# Return the length of the longest option string found in any entry of
# $self->{'option_list'}, or $maxoffset if that is larger.
#
# The entries are visited by popping one per recursive call; every level
# restores the list from its own snapshot, so the list is unchanged when
# the outermost call returns.
#
# Variables are lexicals; the earlier `local` declarations overwrote
# package globals of the same names while the call was running.
sub determine_description_offset
{
    my $self = shift(@_);
    my $maxoffset = shift(@_);

    my $optionlistref = $self->{'option_list'};
    my @optionlist    = @$optionlistref;    # snapshot used to restore the list
    my $pluginoptions = pop(@$optionlistref);
    return $maxoffset if (!defined($pluginoptions));

    # Find the length of the longest option string of this plugin
    my $pluginargs = $pluginoptions->{'args'};
    if (defined($pluginargs)) {
        my $longest = &PrintUsage::find_longest_option_string($pluginargs);
        if ($longest > $maxoffset) {
            $maxoffset = $longest;
        }
    }

    # Recurse up the plugin hierarchy
    $maxoffset = $self->determine_description_offset($maxoffset);
    $self->{'option_list'} = \@optionlist;
    return $maxoffset;
}
	231
	232
	233	sub print_plugin_usage
	234	{
	235	local $self = shift(@_);
	236	local $descoffset = shift(@_);
	237	local $isleafclass = shift(@_);
	238
	239	local $optionlistref = $self->{'option_list'};
	240	local @optionlist = @$optionlistref;
	241	local $pluginoptions = pop(@$optionlistref);
	242	return if (!defined($pluginoptions));
	243
	244	local $pluginname = $pluginoptions->{'name'};
	245	local $pluginargs = $pluginoptions->{'args'};
[6932]	246	local $plugindesc = $pluginoptions->{'desc'};
[4750]	247
	248	# Produce the usage information using the data structure above
	249	if ($isleafclass) {
[6932]	250	if (defined($plugindesc)) {
	251	&gsprintf(STDERR, "$plugindesc\n\n");
	252	}
[5681]	253	&gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
[4750]	254	}
	255
	256	# Display the plugin options, if there are some
	257	if (defined($pluginargs)) {
[4744]	258	# Calculate the column offset of the option descriptions
[4750]	259	local $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
[4744]	260
[4750]	261	if ($isleafclass) {
[5681]	262	&gsprintf(STDERR, " {common.specific_options}:\n");
[4750]	263	}
	264	else {
[5681]	265	&gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
[4750]	266	}
	267
[4744]	268	# Display the plugin options
[6925]	269	&PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
[4744]	270	}
	271
[4750]	272	# Recurse up the plugin hierarchy
[6925]	273	$self->print_plugin_usage($descoffset, 0);
[4750]	274	$self->{'option_list'} = \@optionlist;
[4744]	275	}
	276
	277
# Constructor.
#
#   $class       - class to bless the new object into
#   $plugin_name - name reported in the "bad general option" message
#   remaining @_ - the option list handed to parsargv::parse
#
# Sets up the counters, the textcat classifier and the option list, then
# parses the general options into $self. If parsing fails the usage text
# is printed and the constructor dies with "\n".
#
# The object is blessed before the options are parsed: the failure path
# calls $self->print_txt_usage(), which cannot work on an unblessed hash.
sub new {
    my $class = shift (@_);
    my $plugin_name = shift (@_);

    my $self = bless {}, $class;
    $self->{'plugin_type'} = "BasPlug";

    # Anchored alternations of every known encoding name; only the
    # input_encoding pattern additionally accepts "auto".
    my $known = join ('', map { "$_|" } keys %$encodings::encodings);
    my $denc = "^(" . $known . "ascii|utf8|unicode)\$";
    my $enc  = "^(" . $known . "ascii|utf8|unicode|auto)\$";

    # bareword kept on purpose: the handle is stored by name, as before
    $self->{'outhandle'} = STDERR;
    my $year = (localtime)[5]+1900;

    $self->{'textcat'} = textcat->new();

    $self->{'num_processed'} = 0;
    $self->{'num_not_processed'} = 0;
    $self->{'num_blocked'} = 0;
    $self->{'num_archives'} = 0;

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    $self->{'option_list'} = [ $options ];

    # general options available to all plugins
    if (!parsargv::parse(\@_,
            q^process_exp/.*/^, \$self->{'process_exp'},
            q^block_exp/.*/^, \$self->{'block_exp'},
            q^extract_language^, \$self->{'extract_language'},
            q^extract_acronyms^, \$self->{'extract_acronyms'},
            q^extract_keyphrases^, \$self->{'kea'}, # no extra options (UNDOCUMENTED)
            q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, # with extra options (UNDOCUMENTED)
            qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
            qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
            q^extract_email^, \$self->{'extract_email'},
            q^markup_acronyms^, \$self->{'markup_acronyms'},
            q^default_language/.{2}/en^, \$self->{'default_language'},
            q^first/.*/^, \$self->{'first'},
            q^extract_historical_years^, \$self->{'date_extract'},
            qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
            q^no_bibliography^, \$self->{'no_biblio'},
            qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
            q^cover_image^, \$self->{'cover_image'},
            q^separate_cjk^, \$self->{'separate_cjk'},
            "allow_extra_options")) {

        &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
        $self->print_txt_usage(""); # Use default resource bundle
        die "\n";
    }

    return $self;
}
	330
# initialize BasPlug options
# if init() is overridden in a sub-class, remember to call BasPlug::init()
#
#   $verbosity  - stored as $self->{'verbosity'}
#   $outhandle  - replaces $self->{'outhandle'} only when defined
#   $failhandle - stored as $self->{'failhandle'} (may be undef)
#
# Afterwards process_exp and block_exp are filled in from the plugin's
# defaults when they were not set explicitly. process_exp is only
# defaulted for non-recursive plugins, and a warning is issued when such a
# plugin still ends up with an empty process_exp.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # verbosity is passed through from the processor
    $self->{'verbosity'} = $verbosity;

    # as are the outhandle and failhandle
    $self->{'outhandle'} = $outhandle if defined $outhandle;
    $self->{'failhandle'} = $failhandle;

    # set process_exp and block_exp to defaults unless they were
    # explicitly set
    if ((!$self->is_recursive()) and
        ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq ""))) {

        $self->{'process_exp'} = $self->get_default_process_exp ();
        if ($self->{'process_exp'} eq "") {
            warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
        }
    }

    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
        $self->{'block_exp'} = $self->get_default_block_exp ();
    }
}
	360
# Start-of-run hook: sets up the metadata extractors. The arguments after
# $self are accepted but not used here.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    $self->initialise_extractors();
}
	366
	367	sub end {
	368	my ($self) = @_;
[1396]	369	$self->finalise_extractors();
[839]	370	}
	371
# Tells whether this plugin is recursive. The base class answers 0;
# recursive plugins should override this to return 1.
sub is_recursive {
    my ($self) = @_;

    return 0;
}
	379
# Default block_exp used by init() when none was given explicitly.
# The base class has none and returns the empty string.
sub get_default_block_exp {
    my ($self) = @_;

    return "";
}
	385
	386	sub get_default_process_exp {
	387	my $self = shift (@_);
	388
	389	return "";
	390	}
	391
	392	# The BasPlug read() function. This function does all the right things
	393	# to make general options work for a given plugin. It calls the process()
	394	# function which does all the work specific to a plugin (like the old
	395	# read functions used to do). Most plugins should define their own
	396	# process() function and let this read() function keep control.
	397	#
[1244]	398	# recursive plugins (e.g. RecPlug) and specialized plugins like those
	399	# capable of processing many documents within a single file (e.g.
	400	# GMLPlug) should normally implement their own version of read()
	401	#
[7362]	402	# Return number of files processed, undef if can't recognise, -1 if can't
	403	# process
[4]	404	# Note that $base_dir might be "" and that $file might
	405	# include directories
[1242]	406
[4]	407	sub read {
[1954]	408	my $self = shift (@_);
	409
[6332]	410	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
[4]	411
[1242]	412	if ($self->is_recursive()) {
[5681]	413	&gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
[1242]	414	}
[4]	415
[1844]	416	my $outhandle = $self->{'outhandle'};
	417
[2795]	418	my $filename = $file;
	419	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
	420
[2785]	421	if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
	422	$self->{'num_blocked'} ++;
[7362]	423	return 0; # blocked
[2785]	424	}
[1242]	425	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
[7362]	426	return undef; # can't recognise
[1242]	427	}
	428	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[1844]	429
[2811]	430	# Do encoding stuff
	431	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
[1844]	432
[1242]	433	# create a new document
[1379]	434	my $doc_obj = new doc ($filename, "indexed_doc");
[2327]	435	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
[1844]	436	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
[1868]	437	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
[7508]	438	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[2796]	439	my ($filemeta) = $file =~ /([^\\\/]+)$/;
[4845]	440	# how do we know what encoding the filename is in?
	441	$doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
[2816]	442	if ($self->{'cover_image'}) {
	443	$self->associate_cover_image($doc_obj, $filename);
	444	}
[1242]	445
	446	# read in file ($text will be in utf8)
	447	my $text = "";
[2734]	448	$self->read_file ($filename, $encoding, $language, \$text);
[1242]	449
[1844]	450	if (!length ($text)) {
[2811]	451	my $plugin_name = ref ($self);
[5681]	452	&gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
[2785]	453
	454	my $failhandle = $self->{'failhandle'};
[5681]	455	&gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
	456	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
[2785]	457	$self->{'num_not_processed'} ++;
	458
[7362]	459	return 0; # what should we return here?? error but don't want to pass it on
[1242]	460	}
[1954]	461
[1242]	462	# include any metadata passed in from previous plugins
	463	# note that this metadata is associated with the top level section
	464	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
	465
	466	# do plugin specific processing of doc_obj
[7362]	467	return -1 unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli));
[1954]	468
[1242]	469	# do any automatic metadata extraction
	470	$self->auto_extract_metadata ($doc_obj);
[1954]	471
[1242]	472	# add an OID
[3515]	473	# see if there is a plugin-specific set_OID function...
	474	if (defined ($self->can(set_OID))) {
	475	# it will need $doc_obj to set the Identifier metadata...
	476	$self->set_OID($doc_obj);
	477	} else {
	478	# use the default set_OID() in doc.pm
	479	$doc_obj->set_OID();
	480	}
[1242]	481
	482	# process the document
	483	$processor->process($doc_obj);
	484
[2785]	485	$self->{'num_processed'} ++;
	486
[1242]	487	return 1; # processed the file
[4]	488	}
	489
# Plugin-specific processing of one document; must be implemented by the
# sub-class. An implementation returns undef when it rejects the file.
#
# The base version only reports that it has to be overridden and dies.
# The die is unconditional; it used to be chained to gsprintf with && and
# was skipped whenever gsprintf returned a false value.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    &gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n");
    die "\n";
}
	500
# uses the multiread package to read in the entire file pointed to
# by filename and loads the resulting text into $$textref. Input text
# may be in any of the encodings handled by multiread, output text
# will be in utf8
#
#   $filename - file to read
#   $encoding - "ascii" reads the file as-is in one go; anything else is
#               handed to multiread as the input encoding
#   $language - accepted but not used here
#   $textref  - reference to the scalar that receives the text
#
# Returns without touching $$textref when the file is not readable (a
# message is printed if verbosity is set). Dies with "\n" when the file
# cannot be opened.
#
# The file is opened with three-argument open so that characters in the
# file name are never interpreted as an open mode, and $/ is changed with
# `local` so the caller's record separator is left alone.
sub read_file {
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    if (!-r $filename) {
        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
        return;
    }

    $$textref = "";

    # The bareword handle is kept: multiread is given the handle by name below
    if (!open (FILE, '<', $filename)) {
        &gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
        die "\n";
    }

    if ($encoding eq "ascii") {
        local $/;    # slurp mode, for this block only
        $$textref = <FILE>;
    } else {
        my $reader = multiread->new();
        $reader->set_handle ('BasPlug::FILE');
        $reader->set_encoding ($encoding);
        $reader->read_file ($textref);

        # Now segments chinese if the separate_cjk option is set
        if ($self->{'separate_cjk'}) {
            # segment the Chinese words
            $$textref = &cnseg::segment($$textref);
        }
    }

    close FILE;
}
	541
# Derive a title from a file name: every underscore becomes a space and
# everything from the first "." to the end is dropped.
sub filename_based_title {
    my ($self, $file) = @_;

    (my $title = $file) =~ tr/_/ /;
    $title =~ s/\..*?$//;

    return $title;
}
	553
	554
	555	sub title_fallback
	556	{
	557	my $self = shift (@_);
	558	my ($doc_obj,$section,$file) = @_;
	559
	560	if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
	561
	562	my $file_derived_title = $self->filename_based_title($file);
	563	$doc_obj->add_metadata ($section, "Title", $file_derived_title);
	564	}
	565	}
	566
# Decide which language and encoding to use for $filename and return them
# as the list ($language, $encoding).
#
#  - input_encoding "auto": both values come from get_language_encoding().
#  - otherwise, with extract_language set: the language comes from
#    get_language_encoding() and the encoding is the configured
#    input_encoding. A warning is printed when the detected encoding
#    differs, verbosity is set and the language is not "en".
#  - otherwise: default_language and the configured input_encoding.
sub textcat_get_language_encoding {
    my ($self, $filename) = @_;

    my $configured = $self->{'input_encoding'};

    if ($configured eq "auto") {
        # use textcat to automatically work out the input encoding and language
        my ($language, $encoding) = $self->get_language_encoding ($filename);
        return ($language, $encoding);
    }

    if (!$self->{'extract_language'}) {
        return ($self->{'default_language'}, $configured);
    }

    # use textcat to get language metadata
    my ($language, $detected) = $self->get_language_encoding ($filename);

    # don't print this message for english... english in utf8 is identical
    # to english in iso-8859-1 (except for some punctuation). We don't have
    # a language model for en_utf8, so textcat always says iso-8859-1!
    if ($detected ne $configured && $language ne "en" && $self->{'verbosity'}) {
        my $plugin_name = ref ($self);
        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $configured, $detected);
    }

    return ($language, $configured);
}
	596
# Uses textcat to work out the encoding and language of the text in
# $filename. The first <title>...</title> and all HTML tags are removed
# before classifying.
# returns an array containing "language" and "encoding"
#
# When textcat gives more than three candidates the language is taken from
# default_language; the encoding is then input_encoding if that is not
# "auto", otherwise the encoding named by most candidates.
# With three or fewer candidates the first one is split at its first "-"
# into language and encoding; a missing or unsupported part falls back to
# default_language / default_encoding.
#
# Dies with "\n" when the file cannot be opened.
sub get_language_encoding {
    my $self = shift (@_);
    my ($filename) = @_;
    my $outhandle = $self->{'outhandle'};

    # read in file (three-argument open; $/ is only changed locally)
    if (!open (FILE, '<', $filename)) {
        &gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
        die "\n";
    }
    my $text = do { local $/; <FILE> };
    close FILE;

    # remove <title>stuff</title> -- as titles tend often to be in English
    # for foreign language documents
    $text =~ s/<title>.*?<\/title>//i;

    # remove all HTML tags
    $text =~ s/<[^>]*>//sg;

    # get the language/encoding
    my $results = $self->{'textcat'}->classify(\$text);

    # if textcat returns 3 or less possibilities we'll use the
    # first one in the list - otherwise use the defaults
    if (scalar @$results > 3) {
        # changed 12 Feb 2003 by jrm21
        # use the most popular encoding at least... otherwise we might
        # generate invalid archive files!
        my %guessed_encodings = ();
        foreach my $result (@$results) {
            # skip a candidate without an encoding part instead of
            # reusing a stale $1 from an earlier match
            next unless $result =~ /([^\-]+)$/;
            $guessed_encodings{$1}++;
        }
        my $best_encoding = "";
        $guessed_encodings{""} = -1;
        foreach my $enc (keys %guessed_encodings) {
            if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}) {
                $best_encoding = $enc;
            }
        }

        if ($self->{'input_encoding'} ne 'auto') {
            if ($self->{'extract_language'} && $self->{'verbosity'}) {
                &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
            }
            return ($self->{'default_language'}, $self->{'input_encoding'});
        }

        if ($self->{'verbosity'}) {
            &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
        }
        return ($self->{'default_language'}, $best_encoding);
    }

    # format language/encoding: "<language>" or "<language>-<encoding>"
    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
    if (!defined $language) {
        if ($self->{'verbosity'}) {
            &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
        }
        $language = $self->{'default_language'};
    }
    if (!defined $encoding) {
        if ($self->{'verbosity'}) {
            &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_encoding}\n", $filename, $self->{'default_encoding'});
        }
        $encoding = $self->{'default_encoding'};
    }

    if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
        !defined $encodings::encodings->{$encoding}) {
        if ($self->{'verbosity'}) {
            &gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n", $filename, $encoding, $self->{'default_encoding'});
        }
        $encoding = $self->{'default_encoding'};
    }

    return ($language, $encoding);
}
	694
# add any extra metadata that's been passed around from one
# plugin to another.
# extra_metadata uses add_utf8_metadata so it expects metadata values
# to already be in utf8
#
# A field whose value is an array reference is added once per element;
# any other value is added as it is.
sub extra_metadata {
    my ($self, $doc_obj, $cursection, $metadata) = @_;

    foreach my $field (keys(%$metadata)) {
        my $value = $metadata->{$field};

        # a plain value is handled as a one-element list
        my @values = (ref ($value) eq "ARRAY") ? @$value : ($value);

        foreach my $item (@values) {
            $doc_obj->add_utf8_metadata ($cursection, $field, $item);
        }
    }
}
	714
# initialise metadata extractors
#
# Only the acronym extractor needs setting up, and only when
# extract_acronyms or markup_acronyms was requested.
sub initialise_extractors {
    my $self = shift (@_);

    if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
        &acronym::initialise_acronyms();
    }
}
	723
	724	# finalise metadata extractors
	725	sub finalise_extractors {
	726	my $self = shift (@_);
	727
	728	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
	729	&acronym::finalise_acronyms();
	730	}
	731	}
	732
# FIRSTNNN: extract the first NNN characters as metadata
#
# $self->{'first'} is a comma separated list of sizes. For every size NNN
# a "FirstNNN" metadata item is added to $thissection holding the first
# NNN characters of the text, with leading/trailing whitespace removed and
# runs of whitespace collapsed to a single space.
#
# When the text had to be cut, the (possibly partial) last word is replaced
# by an ellipsis. Text that already fits is stored unchanged; it used to
# lose its last word to the ellipsis as well.
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    # The clean-up does not depend on the size, so do it once
    my $cleantext = $$textref;
    $cleantext =~ s/^\s+//;
    $cleantext =~ s/\s+$//;
    $cleantext =~ s/\s+/ /gs;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = substr ($cleantext, 0, $size);
        if (length ($tmptext) < length ($cleantext)) {
            $tmptext =~ s/\s\S*$/…/;
        }
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
	748
	749	sub extract_email {
	750	my $self = shift (@_);
	751	my ($textref, $doc_obj, $thissection) = @_;
	752	my $outhandle = $self->{'outhandle'};
	753
[5681]	754	# print $outhandle " extracting email addresses ...\n"
	755	&gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
[1844]	756	if ($self->{'verbosity'} > 2);
[1602]	757
[2604]	758	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
[1602]	759	@email = sort @email;
	760
	761	my @email2 = ();
	762	foreach my $address (@email) {
	763	if (!(join(" ",@email2) =~ m/$address/ )) {
	764	push @email2, $address;
	765	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
[5681]	766	# print $outhandle " extracting $address\n"
	767	&gsprintf($outhandle, " {BasPlug.extracting} $address\n")
[1844]	768	if ($self->{'verbosity'} > 3);
[1602]	769	}
	770	}
[5681]	771	# print $outhandle " done extracting email addresses.\n"
	772	&gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
[1844]	773	if ($self->{'verbosity'} > 2);
[1602]	774	}
	775
	776	# extract metadata
[5681]	777	sub auto_extract_metadata {
[1954]	778
[1242]	779	my $self = shift (@_);
	780	my ($doc_obj) = @_;
[1602]	781
	782	if ($self->{'extract_email'}) {
	783	my $thissection = $doc_obj->get_top_section();
	784	while (defined $thissection) {
	785	my $text = $doc_obj->get_text($thissection);
	786	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
	787	$thissection = $doc_obj->get_next_section ($thissection);
	788	}
[1954]	789	}
	790
	791
	792	#adding kea keyphrases
	793	if ($self->{'kea'}) {
	794
	795	my $thissection = $doc_obj->get_top_section();
	796	my $text = "";
	797	my @list;
	798
	799	while (defined $thissection) { #loop through sections to gather whole doc
	800	my $sectiontext = $doc_obj->get_text($thissection);
	801	$text = $text.$sectiontext;
	802	$thissection = $doc_obj->get_next_section ($thissection);
	803	}
	804
	805	if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
	806	@list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
	807	} else { #otherwise call Kea with no options
	808	@list = &Kea::extract_KeyPhrases ($text);
	809	}
	810
	811	if(@list){ #if a list of kea keyphrases was returned (ie not empty)
	812	my $keyphrases = $list[0]; #first arg is keyphrase list
	813	my $stems = $list[1]; #second arg is stemmed keyphrase list
[5681]	814	&gsprintf(STDERR, "{BasPlug.keyphrases}: $keyphrases\n");
	815	# print STDERR "keyphrases: $keyphrases\n";
	816	&gsprintf(STDERR, "{BasPlug.stems}: $stems\n");
	817	# print STDERR "stems: $stems\n";
[1954]	818	$thissection = $doc_obj->get_top_section(); #add metadata to top section
	819	$doc_obj->add_metadata($thissection, "kea", $keyphrases);
	820	$doc_obj->add_metadata($thissection, "stems", $stems);
	821	}
	822	} #end of kea
	823
[1602]	824	if ($self->{'first'}) {
	825	my $thissection = $doc_obj->get_top_section();
	826	while (defined $thissection) {
	827	my $text = $doc_obj->get_text($thissection);
	828	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
	829	$thissection = $doc_obj->get_next_section ($thissection);
	830	}
	831	}
	832
[1242]	833	if ($self->{'extract_acronyms'}) {
	834	my $thissection = $doc_obj->get_top_section();
	835	while (defined $thissection) {
	836	my $text = $doc_obj->get_text($thissection);
	837	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
	838	$thissection = $doc_obj->get_next_section ($thissection);
	839	}
	840	}
[1602]	841
[1393]	842	if ($self->{'markup_acronyms'}) {
	843	my $thissection = $doc_obj->get_top_section();
	844	while (defined $thissection) {
	845	my $text = $doc_obj->get_text($thissection);
	846	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
	847	$doc_obj->delete_text($thissection);
	848	$doc_obj->add_text($thissection, $text);
	849	$thissection = $doc_obj->get_next_section ($thissection);
	850	}
	851	}
	852
[1846]	853	if($self->{'date_extract'}) {
[1317]	854	my $thissection = $doc_obj->get_top_section();
	855	while (defined $thissection) {
[1846]	856
[1317]	857	my $text = $doc_obj->get_text($thissection);
[1846]	858	&DateExtract::get_date_metadata($text, $doc_obj,
	859	$thissection,
	860	$self->{'no_biblio'},
	861	$self->{'max_year'},
	862	$self->{'max_century'});
[1317]	863	$thissection = $doc_obj->get_next_section ($thissection);
	864	}
	865	}
[1242]	866	}
	867
# extract acronyms from a section in a document. progress is
# reported to outhandle based on the verbosity.
#
# Every acronym returned by acronym::acronyms() whose string form is not
# yet among the section's "Acronym" metadata is written to file (via its
# write_to_file method) and added as "Acronym" metadata.
sub extract_acronyms {
    my ($self, $textref, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    &gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
        if ($self->{'verbosity'} > 2);

    my $acro_array = &acronym::acronyms($textref);

    foreach my $acro (@$acro_array) {

        # check that this is the first time ...
        my $already_recorded = 0;
        my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
        foreach my $thisAcro (@$previous_data) {
            next unless ($thisAcro eq $acro->to_string());

            $already_recorded = 1;
            &gsprintf($outhandle, " {BasPlug.already_seen} " . $acro->to_string() . "\n")
                if ($self->{'verbosity'} >= 4);
        }
        next if $already_recorded;

        # write it to the file ...
        $acro->write_to_file();

        # do the normal acronym
        $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
        &gsprintf($outhandle, " {BasPlug.adding} " . $acro->to_string() . "\n")
            if ($self->{'verbosity'} > 3);
    }

    &gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
        if ($self->{'verbosity'} > 2);
}
	913
# Return $text with its acronyms marked up by acronym::markup_acronyms().
# Progress is reported to outhandle when verbosity is above 2.
# $doc_obj and $thissection are accepted but not used here.
sub markup_acronyms {
    my ($self, $text, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    &gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
        if ($self->{'verbosity'} > 2);

    # self is passed in to check for verbosity ...
    my $marked_up = &acronym::markup_acronyms($text, $self);

    &gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
        if ($self->{'verbosity'} > 2);

    return $marked_up;
}
	932
# Add this plugin's num_processed, num_not_processed and num_archives
# counters to the entries of the same name in the $stats hash.
sub compile_stats {
    my ($self, $stats) = @_;

    foreach my $counter ('num_processed', 'num_not_processed', 'num_archives') {
        $stats->{$counter} += $self->{$counter};
    }
}
	942
# Look for a cover image next to the document and, if one exists,
# associate it with $doc_obj as "cover.jpg" (mime type image/jpeg).
#
# The image name is the document's file name with its extension replaced
# by ".jpg"; if that file does not exist ".JPG" is tried as well.
#
# A file name without an extension has no cover image name to derive.
# Previously the name was then left unchanged and, since the document
# exists, the document itself was associated as the cover image.
sub associate_cover_image {
    my $self = shift(@_);
    my ($doc_obj, $filename) = @_;

    return unless $filename =~ s/\.[^\\\/\.]+$/.jpg/;

    (my $uppercase = $filename) =~ s/jpg$/JPG/;
    foreach my $candidate ($filename, $uppercase) {
        if (-e $candidate) {
            $doc_obj->associate_file($candidate, "cover.jpg", "image/jpeg");
            return;
        }
    }
}
	957
[4]	958	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: