Context Navigation

source: gsdl/trunk/perllib/plugins/BasPlug.pm@ 14961

Last change on this file since 14961 was 14961, checked in by davidb, 16 years ago
Setting filename metadata (Source) in BasPlug.pm looks to user its locale, in the first instance, to resolve what character encoding the file system uses.
Property svn:keywords set to `Author Date Id Revision`
File size: 48.6 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# BasPlug.pm -- base class for all the import plugins
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
[9413]	8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
[537]	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
[4]	25
	26	package BasPlug;
[2219]	27
[9398]	28	BEGIN {
	29	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	30	}
	31
[3834]	32	eval {require bytes};
[3767]	33
[2219]	34	# suppress the annoying "subroutine redefined" warning that various
	35	# plugins cause under perl 5.6
	36	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
	37
[10254]	38	use strict;
	39	no strict 'subs';
	40	no strict 'refs'; # allow filehandles to be variables and viceversa
[9413]	41
[8892]	42	use File::Basename;
	43
[1954]	44	use Kea;
[1219]	45	use multiread;
[1870]	46	use encodings;
[11389]	47	use unicode;
[1219]	48	use cnseg;
[1242]	49	use acronym;
[1317]	50	use textcat;
[1242]	51	use doc;
[7645]	52	eval "require diagnostics"; # some perl distros (eg mac) don't have this
[1411]	53	use DateExtract;
[2751]	54	use ghtml;
[9413]	55	use gsprintf 'gsprintf';
[4778]	56	use printusage;
[10218]	57	use parse2;
[4]	58
[10218]	59
[9398]	60	use GISBasPlug;
[5681]	61
[9413]	62	@BasPlug::ISA = ( GISBasPlug );
[9398]	63
[4873]	64	my $unicode_list =
[10218]	65	[ { 'name' => "ascii",
[4873]	66	'desc' => "{BasPlug.input_encoding.ascii}" },
[4744]	67	{ 'name' => "utf8",
[4873]	68	'desc' => "{BasPlug.input_encoding.utf8}" },
[4744]	69	{ 'name' => "unicode",
[4873]	70	'desc' => "{BasPlug.input_encoding.unicode}" } ];
[3540]	71
[10218]	72	my $auto_unicode_list =
	73	[ { 'name' => "auto",
	74	'desc' => "{BasPlug.input_encoding.auto}" } ];
	75
[10620]	76	my $e = $encodings::encodings;
	77	foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
	78	{
	79	my $hashEncode =
	80	{'name' => $enc,
	81	'desc' => $e->{$enc}->{'name'}};
	82
	83	push(@{$unicode_list},$hashEncode);
	84	}
	85
	86	push(@{$auto_unicode_list},@{$unicode_list});
	87
[4873]	88	my $arguments =
[3540]	89	[ { 'name' => "process_exp",
[4873]	90	'desc' => "{BasPlug.process_exp}",
[6408]	91	'type' => "regexp",
[3540]	92	'deft' => "",
	93	'reqd' => "no" },
	94	{ 'name' => "block_exp",
[4873]	95	'desc' => "{BasPlug.block_exp}",
[6408]	96	'type' => "regexp",
[3540]	97	'deft' => "",
	98	'reqd' => "no" },
[9067]	99	{ 'name' => "smart_block",
	100	'desc' => "{BasPlug.smart_block}",
	101	'type' => "flag",
[11368]	102	'reqd' => "no" },
[8892]	103	{ 'name' => "associate_ext",
	104	'desc' => "{BasPlug.associate_ext}",
	105	'type' => "string",
	106	'reqd' => "no" },
[11122]	107	{ 'name' => "associate_tail_re",
	108	'desc' => "{BasPlug.associate_tail_re}",
	109	'type' => "string",
	110	'reqd' => "no" },
[3540]	111	{ 'name' => "input_encoding",
[4873]	112	'desc' => "{BasPlug.input_encoding}",
[3540]	113	'type' => "enum",
[10218]	114	'list' => $auto_unicode_list,
[3540]	115	'reqd' => "no" ,
	116	'deft' => "auto" } ,
	117	{ 'name' => "default_encoding",
[4873]	118	'desc' => "{BasPlug.default_encoding}",
[4744]	119	'type' => "enum",
[6332]	120	'list' => $unicode_list,
[4744]	121	'reqd' => "no",
	122	'deft' => "utf8" },
[3540]	123	{ 'name' => "extract_language",
[4873]	124	'desc' => "{BasPlug.extract_language}",
[3540]	125	'type' => "flag",
	126	'reqd' => "no" },
	127	{ 'name' => "default_language",
[4873]	128	'desc' => "{BasPlug.default_language}",
[10329]	129	'type' => "string",
[3540]	130	'deft' => "en",
	131	'reqd' => "no" },
	132	{ 'name' => "extract_acronyms",
[4873]	133	'desc' => "{BasPlug.extract_acronyms}",
[3540]	134	'type' => "flag",
	135	'reqd' => "no" },
	136	{ 'name' => "markup_acronyms",
[4873]	137	'desc' => "{BasPlug.markup_acronyms}",
[3540]	138	'type' => "flag",
[9398]	139	'reqd' => "no" },
[8789]	140	{ 'name' => "extract_keyphrases",
	141	'desc' => "{BasPlug.extract_keyphrases}",
	142	'type' => "flag",
[8814]	143	'reqd' => "no" },
[11069]	144	{ 'name' => "extract_keyphrases_kea4",
	145	'desc' => "{BasPlug.extract_keyphrases_kea4}",
	146	'type' => "flag",
	147	'reqd' => "no" },
[8789]	148	{ 'name' => "extract_keyphrase_options",
	149	'desc' => "{BasPlug.extract_keyphrase_options}",
	150	'type' => "string",
	151	'deft' => "",
[8814]	152	'reqd' => "no" },
[3540]	153	{ 'name' => "first",
[4873]	154	'desc' => "{BasPlug.first}",
[3540]	155	'type' => "string",
	156	'reqd' => "no" },
	157	{ 'name' => "extract_email",
[4873]	158	'desc' => "{BasPlug.extract_email}",
[3540]	159	'type' => "flag",
	160	'reqd' => "no" },
	161	{ 'name' => "extract_historical_years",
[4873]	162	'desc' => "{BasPlug.extract_historical_years}",
[3540]	163	'type' => "flag",
	164	'reqd' => "no" },
	165	{ 'name' => "maximum_year",
[4873]	166	'desc' => "{BasPlug.maximum_year}",
[3540]	167	'type' => "int",
[4744]	168	'deft' => (localtime)[5]+1900,
[10218]	169	'char_length' => "4",
	170	#'range' => "2,100",
[3540]	171	'reqd' => "no"},
	172	{ 'name' => "maximum_century",
[4873]	173	'desc' => "{BasPlug.maximum_century}",
[7105]	174	'type' => "string",
[10218]	175	'deft' => "-1",
[3540]	176	'reqd' => "no" },
	177	{ 'name' => "no_bibliography",
[4873]	178	'desc' => "{BasPlug.no_bibliography}",
[3540]	179	'type' => "flag",
	180	'reqd' => "no"},
[8678]	181	{ 'name' => "no_cover_image",
	182	'desc' => "{BasPlug.no_cover_image}",
[3540]	183	'type' => "flag",
[10218]	184	'reqd' => "no" },
	185	{ 'name' => "separate_cjk",
	186	'desc' => "{BasPlug.separate_cjk}",
	187	'type' => "flag",
	188	'reqd' => "no",
	189	'hiddengli' => "yes" },
	190	{ 'name' => "new_extract_email",
	191	'desc' => "",
	192	'type' => "flag",
	193	'reqd' => "no",
	194	'hiddengli' => "yes" } ];
[3540]	195
[9398]	196	my $gis_arguments =
	197	[ { 'name' => "extract_placenames",
	198	'desc' => "{GISBasPlug.extract_placenames}",
	199	'type' => "flag",
	200	'reqd' => "no" },
	201	{ 'name' => "gazetteer",
	202	'desc' => "{GISBasPlug.gazetteer}",
	203	'type' => "string",
	204	'reqd' => "no" },
	205	{ 'name' => "place_list",
	206	'desc' => "{GISBasPlug.place_list}",
	207	'type' => "flag",
	208	'reqd' => "no" } ];
	209
	210
[3540]	211	my $options = { 'name' => "BasPlug",
[5681]	212	'desc' => "{BasPlug.desc}",
[6408]	213	'abstract' => "yes",
	214	'inherits' => "no",
[4750]	215	'args' => $arguments };
[3540]	216
[4778]	217
[12970]	218	sub set_incremental {
[10478]	219	my $self = shift(@_);
[12970]	220	my ($incremental) = @_;
[10478]	221
[12970]	222	$self->{'incremental'} = $incremental;
[10478]	223	}
	224
[4873]	225	sub get_arguments
	226	{
[8716]	227	my $self = shift(@_);
	228	my $optionlistref = $self->{'option_list'};
	229	my @optionlist = @$optionlistref;
	230	my $pluginoptions = pop(@$optionlistref);
	231	my $pluginarguments = $pluginoptions->{'args'};
[4873]	232	return $pluginarguments;
	233	}
	234
	235
[4778]	236	sub print_xml_usage
	237	{
[8716]	238	my $self = shift(@_);
[11681]	239	my $header = shift(@_);
[12624]	240	my $high_level_information_only = shift(@_);
[11681]	241
[6945]	242	# XML output is always in UTF-8
[9413]	243	gsprintf::output_strings_in_UTF8;
[6945]	244
[11681]	245	if ($header) {
	246	&PrintUsage::print_xml_header("plugin");
	247	}
[12624]	248	$self->print_xml($high_level_information_only);
[3540]	249	}
	250
[4778]	251
	252	sub print_xml
	253	{
[8716]	254	my $self = shift(@_);
[12624]	255	my $high_level_information_only = shift(@_);
[4778]	256
[8716]	257	my $optionlistref = $self->{'option_list'};
	258	my @optionlist = @$optionlistref;
[10229]	259	my $pluginoptions = shift(@$optionlistref);
[4778]	260	return if (!defined($pluginoptions));
	261
[12630]	262	# Find the process and block default expressions in the plugin arguments
	263	my $process_exp = "";
	264	my $block_exp = "";
	265	if (defined($pluginoptions->{'args'})) {
	266	foreach my $option (@{$pluginoptions->{'args'}}) {
	267	if ($option->{'name'} eq "process_exp") {
	268	$process_exp = $option->{'deft'};
	269	}
	270	if ($option->{'name'} eq "block_exp") {
	271	$block_exp = $option->{'deft'};
	272	}
	273	}
	274	}
	275
[9413]	276	gsprintf(STDERR, "<PlugInfo>\n");
	277	gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
	278	my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
[7023]	279	$desc =~ s/</&lt;/g; # doubly escaped
	280	$desc =~ s/>/&gt;/g;
[9413]	281	gsprintf(STDERR, " <Desc>$desc</Desc>\n");
	282	gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
	283	gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
[12630]	284	gsprintf(STDERR, " <Processes>$process_exp</Processes>\n");
	285	gsprintf(STDERR, " <Blocks>$block_exp</Blocks>\n");
[9413]	286	gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} \|\| "no") . "</Explodes>\n");
[12624]	287	unless (defined($high_level_information_only)) {
	288	gsprintf(STDERR, " <Arguments>\n");
	289	if (defined($pluginoptions->{'args'})) {
	290	&PrintUsage::print_options_xml($pluginoptions->{'args'});
	291	}
	292	gsprintf(STDERR, " </Arguments>\n");
	293
	294	# Recurse up the plugin hierarchy
	295	$self->print_xml();
[3540]	296	}
[9413]	297	gsprintf(STDERR, "</PlugInfo>\n");
[3540]	298	}
	299
[4744]	300
[4778]	301	sub print_txt_usage
[4744]	302	{
[8716]	303	my $self = shift(@_);
[4750]	304	# Print the usage message for a plugin (recursively)
[8716]	305	my $descoffset = $self->determine_description_offset(0);
[6925]	306	$self->print_plugin_usage($descoffset, 1);
[4750]	307	}
[4744]	308
	309
[4750]	310	sub determine_description_offset
	311	{
[8716]	312	my $self = shift(@_);
	313	my $maxoffset = shift(@_);
[4750]	314
[8716]	315	my $optionlistref = $self->{'option_list'};
	316	my @optionlist = @$optionlistref;
[10229]	317	my $pluginoptions = shift(@$optionlistref);
[4750]	318	return $maxoffset if (!defined($pluginoptions));
	319
	320	# Find the length of the longest option string of this plugin
[8716]	321	my $pluginargs = $pluginoptions->{'args'};
[4744]	322	if (defined($pluginargs)) {
[8716]	323	my $longest = &PrintUsage::find_longest_option_string($pluginargs);
[4778]	324	if ($longest > $maxoffset) {
	325	$maxoffset = $longest;
[4744]	326	}
[4750]	327	}
[4744]	328
[4750]	329	# Recurse up the plugin hierarchy
	330	$maxoffset = $self->determine_description_offset($maxoffset);
	331	$self->{'option_list'} = \@optionlist;
	332	return $maxoffset;
	333	}
	334
	335
	336	sub print_plugin_usage
	337	{
[8716]	338	my $self = shift(@_);
	339	my $descoffset = shift(@_);
	340	my $isleafclass = shift(@_);
[4750]	341
[8716]	342	my $optionlistref = $self->{'option_list'};
	343	my @optionlist = @$optionlistref;
[10229]	344	my $pluginoptions = shift(@$optionlistref);
[4750]	345	return if (!defined($pluginoptions));
	346
[8716]	347	my $pluginname = $pluginoptions->{'name'};
	348	my $pluginargs = $pluginoptions->{'args'};
	349	my $plugindesc = $pluginoptions->{'desc'};
[4750]	350
	351	# Produce the usage information using the data structure above
	352	if ($isleafclass) {
[6932]	353	if (defined($plugindesc)) {
[9413]	354	gsprintf(STDERR, "$plugindesc\n\n");
[6932]	355	}
[9413]	356	gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
[4750]	357	}
	358
	359	# Display the plugin options, if there are some
	360	if (defined($pluginargs)) {
[4744]	361	# Calculate the column offset of the option descriptions
[8716]	362	my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
[4744]	363
[4750]	364	if ($isleafclass) {
[9413]	365	gsprintf(STDERR, " {common.specific_options}:\n");
[4750]	366	}
	367	else {
[9413]	368	gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
[4750]	369	}
	370
[4744]	371	# Display the plugin options
[6925]	372	&PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
[4744]	373	}
	374
[4750]	375	# Recurse up the plugin hierarchy
[6925]	376	$self->print_plugin_usage($descoffset, 0);
[4750]	377	$self->{'option_list'} = \@optionlist;
[4744]	378	}
	379
	380
[4]	381	sub new {
[10218]	382	# Set Encodings to the list!!
	383
	384
	385	# Start the BasPlug Constructor
[1219]	386	my $class = shift (@_);
[10218]	387	my ($pluginlist,$args,$hashArgOptLists) = @_;
	388	push(@$pluginlist, $class);
	389	my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
[9398]	390
[10218]	391	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
	392	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
	393
[9398]	394	if (GISBasPlug::has_mapdata()) {
	395	push(@$arguments,@$gis_arguments);
	396	}
[10218]	397
	398	my $self = {};
[10579]	399	$self->{'outhandle'} = STDERR;
	400	$self->{'option_list'} = $hashArgOptLists->{"OptList"};
	401	$self->{"info_only"} = 0;
	402
	403	# Check if gsdlinfo is in the argument list or not - if it is, don't parse
	404	# the args, just return the object.
	405	foreach my $strArg (@{$args})
	406	{
	407	if($strArg eq "-gsdlinfo")
	408	{
	409	$self->{"info_only"} = 1;
	410	return bless $self, $class;
	411	}
	412	}
	413
[12546]	414	if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
[10218]	415	{
	416	my $classTempClass = bless $self, $class;
[10620]	417	print STDERR "<BadPlugin p=$plugin_name>\n";
[10218]	418	&gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
	419	$classTempClass->print_txt_usage(""); # Use default resource bundle
	420	die "\n";
	421	}
[9398]	422
[10280]	423
[10579]	424	delete $self->{"info_only"};
[10218]	425	# else parsing was successful.
	426
	427	$self->{'plugin_type'} = $plugin_name;
[10579]	428	#$self->{'outhandle'} = STDERR;
[2785]	429	$self->{'num_processed'} = 0;
	430	$self->{'num_not_processed'} = 0;
	431	$self->{'num_blocked'} = 0;
	432	$self->{'num_archives'} = 0;
[8678]	433	$self->{'cover_image'} = 1; # cover image is on by default
[10218]	434	$self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
[10579]	435	#$self->{'option_list'} = $hashArgOptLists->{"OptList"};
[3540]	436
[8892]	437	my $associate_ext = $self->{'associate_ext'};
	438	if ((defined $associate_ext) && ($associate_ext ne "")) {
[9351]	439
[11122]	440	my $associate_tail_re = $self->{'associate_tail_re'};
	441	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
	442	my $outhandle = $self->{'outhandle'};
	443	print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
	444	print $outhandle " defaulting to 'associate_tail_re'\n";
[8892]	445	}
[11122]	446	else {
	447	my @exts = split(/,/,$associate_ext);
[8892]	448
[11122]	449	my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
	450	my $associate_tail_re = join("\|",@exts_bracketed);
	451	$self->{'associate_tail_re'} = $associate_tail_re;
	452	}
	453
	454	delete $self->{'associate_ext'};
[8892]	455	}
	456
	457	$self->{'shared_fileroot'} = {};
[8510]	458	$self->{'file_blocks'} = {};
[1219]	459
[9398]	460	if ($self->{'extract_placenames'}) {
	461
	462	my $outhandle = $self->{'outhandle'};
[10218]	463
[9398]	464	my $places_ref
	465	= GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
[10218]	466
[9398]	467	if (!defined $places_ref) {
	468	print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
	469	print $outhandle " No placename extraction will take place.\n";
	470	$self->{'extract_placenames'} = undef;
	471	}
	472	else {
	473	$self->{'places'} = $places_ref;
	474	}
[10280]	475	}
[11089]	476
[1219]	477	return bless $self, $class;
[10218]	478
[4]	479	}
	480
[1242]	481	# initialize BasPlug options
	482	# if init() is overridden in a sub-class, remember to call BasPlug::init()
	483	sub init {
	484	my $self = shift (@_);
[2785]	485	my ($verbosity, $outhandle, $failhandle) = @_;
[1242]	486
	487	# verbosity is passed through from the processor
	488	$self->{'verbosity'} = $verbosity;
	489
[2785]	490	# as are the outhandle and failhandle
[1424]	491	$self->{'outhandle'} = $outhandle if defined $outhandle;
[2785]	492	$self->{'failhandle'} = $failhandle;
[1424]	493
[1242]	494	# set process_exp and block_exp to defaults unless they were
	495	# explicitly set
[1244]	496
	497	if ((!$self->is_recursive()) and
[1242]	498	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
[1244]	499
[1242]	500	$self->{'process_exp'} = $self->get_default_process_exp ();
	501	if ($self->{'process_exp'} eq "") {
[1244]	502	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
[1242]	503	}
	504	}
	505
	506	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
	507	$self->{'block_exp'} = $self->get_default_block_exp ();
	508	}
[11089]	509
[1242]	510	}
	511
[839]	512	sub begin {
	513	my $self = shift (@_);
	514	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
[1396]	515	$self->initialise_extractors();
[839]	516	}
	517
	518	sub end {
[10155]	519	# potentially called at the end of each plugin pass
	520	# import.pl only has one plugin pass, but buildcol.pl has multiple ones
	521
[839]	522	my ($self) = @_;
[1396]	523	$self->finalise_extractors();
[839]	524	}
	525
[10155]	526	sub deinit {
	527	# called only once, after all plugin passes have been done
	528
	529	my ($self) = @_;
	530	}
	531
[1242]	532	# this function should be overridden to return 1
	533	# in recursive plugins
[4]	534	sub is_recursive {
	535	my $self = shift (@_);
	536
[1242]	537	return 0;
[4]	538	}
	539
[1242]	540	sub get_default_block_exp {
	541	my $self = shift (@_);
	542
	543	return "";
	544	}
	545
	546	sub get_default_process_exp {
	547	my $self = shift (@_);
	548
	549	return "";
	550	}
	551
[9067]	552	# default implementation is to do nothing.
	553	sub store_block_files
	554	{
	555	my $self =shift (@_);
	556	my ($filename) = @_;
	557	return;
	558	}
	559
	560	#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
	561	sub block_cover_image
	562	{
[10833]	563	my $self =shift;
	564	my $filename = shift;
	565
[9067]	566	if ($self->{'cover_image'}) {
	567	my $coverfile = $filename;
	568	$coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
	569	if (!-e $coverfile) {
	570	$coverfile =~ s/jpg$/JPG/;
	571	}
	572	if (-e $coverfile) {
	573	$self->{'file_blocks'}->{$coverfile} = 1;
[11089]	574	}
[9067]	575	}
	576
	577	return;
	578	}
[11122]	579
	580	sub root_ext_split
	581	{
	582	my $self = shift (@_);
	583	my ($filename,$tail_re) = @_;
[9067]	584
[11122]	585	my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
	586
	587	if ((!defined $file_prefix) \|\| (!defined $file_ext)) {
	588	($file_prefix,$file_ext) = ($filename =~ m/^(.)(\..?)$/);
	589	}
	590
	591	return ($file_prefix,$file_ext);
	592	}
	593
[8510]	594	sub metadata_read {
	595	my $self = shift (@_);
	596	my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
[8892]	597	# Keep track of filenames with same root but different extensions
[11122]	598	# Used to support -associate_ext and the more generalised
	599	# -associate_tail_re
[8892]	600
[11122]	601	my $associate_tail_re = $self->{'associate_tail_re'};
	602	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
[8892]	603
[11122]	604	my ($file_prefix,$file_ext)
	605	= $self->root_ext_split($file,$associate_tail_re);
	606
[8892]	607	if ((defined $file_prefix) && (defined $file_ext)) {
[11122]	608
[8892]	609	my $shared_fileroot = $self->{'shared_fileroot'};
	610	if (!defined $shared_fileroot->{$file_prefix}) {
[11122]	611	my $file_prefix_rec = { 'tie_to' => undef,
	612	'exts' => {} };
[8892]	613	$shared_fileroot->{$file_prefix} = $file_prefix_rec;
	614	}
	615
	616	my $file_prefix_rec = $shared_fileroot->{$file_prefix};
	617
	618	my $process_exp = $self->{'process_exp'};
	619
[11122]	620	if ($file =~ m/$process_exp/) {
[8892]	621	# This is the document the others should be tied to
	622	$file_prefix_rec->{'tie_to'} = $file_ext;
	623	}
	624	else {
[11122]	625	if ($file_ext =~ m/$associate_tail_re$/) {
[9351]	626	$file_prefix_rec->{'exts'}->{$file_ext} = 1;
	627	}
[8892]	628	}
[11122]	629
[8892]	630	}
	631	}
[11122]	632
[9067]	633	# now check whether we are actually processing this
	634	my $filename = $file;
	635	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
	636	if ($self->{'process_exp'} eq "" \|\| $filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
	637	return undef; # can't recognise
	638	}
[8892]	639
[9067]	640	# do smart blocking if appropriate
[11089]	641	if ($self->{'smart_block'}) {
[9067]	642	$self->store_block_files($filename);
	643	}
[11089]	644	# block the cover image if there is one
	645	if ($self->{'cover_image'}) {
	646	$self->block_cover_image($filename);
	647	}
[9067]	648
	649	return 1;
[8510]	650	}
	651
[8892]	652	sub tie_to_filename
	653	{
	654	my $self = shift (@_);
	655
	656	my ($file_ext,$file_prefix_rec) = @_;
	657
	658	if (defined $file_prefix_rec) {
	659	my $tie_to = $file_prefix_rec->{'tie_to'};
[9351]	660
[8892]	661	if (defined $tie_to) {
	662	if ($tie_to eq $file_ext) {
	663	return 1;
	664	}
	665	}
	666	}
	667
	668	return 0;
	669	}
	670
	671	sub tie_to_assoc_file
	672	{
	673	my $self = shift (@_);
	674	my ($file_ext,$file_prefix_rec) = @_;
	675
	676	if (defined $file_prefix_rec) {
	677	my $tie_to = $file_prefix_rec->{'tie_to'};
	678	if (defined $tie_to) {
	679
	680	my $exts = $file_prefix_rec->{'exts'};
	681
	682	my $has_file_ext = $exts->{$file_ext};
[11122]	683
[8892]	684	if ($has_file_ext) {
	685	return 1;
	686	}
	687	}
	688	}
	689
	690	return 0;
	691	}
	692
	693
	694	sub associate_with
	695	{
	696	my $self = shift (@_);
	697	my ($file, $filename, $metadata) = @_;
	698
[11122]	699	my $associate_tail_re = $self->{'associate_tail_re'};
	700	return 0 if (!$associate_tail_re);
[8892]	701
	702	# If file, see if matches with "tie_to" doc or is one of the
	703	# associated filename extensions.
	704
[11122]	705	my ($file_prefix,$file_ext) = $self->root_ext_split($file,$associate_tail_re);
	706
[8892]	707	if ((defined $file_prefix) && (defined $file_ext)) {
	708
	709	my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
[9351]	710
[8892]	711	if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
	712
	713	# Set up gsdlassocfile_tobe
	714
	715	my $exts = $file_prefix_rec->{'exts'};
	716
	717	if (!defined $metadata->{'gsdlassocfile_tobe'}) {
	718	$metadata->{'gsdlassocfile_tobe'} = [];
	719	}
	720
	721	my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
	722
	723	my ($full_prefix) = ($filename =~ m/^(.)\..?$/);
	724	foreach my $e (keys %$exts) {
[11122]	725	my $assoc_file = "$full_prefix$e";
	726	print STDERR " $self->{'plugin_type'}: Associating $file_prefix$e with $file_prefix_rec->{'tie_to'} version\n";
[8892]	727	my $mime_type = ""; # let system auto detect this
	728	push(@$assoc_tobe,"$assoc_file:$mime_type:");
	729	}
[11122]	730
[8892]	731	}
	732	elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
[11122]	733
	734
	735	# a form of smart block
[8892]	736	return 1;
	737	}
	738	}
	739
	740	return 0;
	741	}
	742
	743
[10280]	744	sub read_block {
[1954]	745	my $self = shift (@_);
	746
[9853]	747	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[4]	748
[8908]	749
[2795]	750	my $filename = $file;
	751	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
	752
[8892]	753	if ($self->associate_with($file,$filename,$metadata)) {
	754	# a form of smart block
	755	$self->{'num_blocked'} ++;
[10280]	756	return (0,undef); # blocked
[8892]	757	}
[8510]	758
[10280]	759	my $smart_block = $self->{'smart_block'};
	760	my $smart_block_BN = $self->{'smart_block_BN'};
[11089]	761
[8915]	762	if ($smart_block \|\| $smart_block_BN) {
[8510]	763	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
[8892]	764	$self->{'num_blocked'} ++;
[10280]	765	return (0,undef); # blocked
[8510]	766	}
[11089]	767	} else {
	768	if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
	769	$self->{'num_blocked'} ++;
	770	return (0,undef); # blocked
	771	}
	772	if ($self->{'cover_image'}) {
	773	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
	774	$self->{'num_blocked'} ++;
	775	return (0,undef); # blocked
	776	}
	777	}
[2785]	778	}
[11122]	779
[1242]	780	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
[10280]	781	return (undef,undef); # can't recognise
[1242]	782	}
[10280]	783
	784	return (1,$filename);
	785	}
	786
	787	sub read_tidy_file {
	788
	789	my $self = shift (@_);
	790
	791	my ($file) = @_;
	792
[1242]	793	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[1844]	794
[10280]	795	return $file;
	796	}
	797
	798
[14961]	799	sub filename_to_metadata
	800	{
	801	my $self = shift (@_);
	802	my ($file, $encoding) = @_;
[10280]	803
[14961]	804	my $outhandle = $self->{'outhandle'};
	805
	806	my $filesystem_encoding = undef;
	807
	808	eval {
	809	use POSIX qw(locale_h);
	810
	811	# With only one parameter, setlocale retrieves the current value
	812	my $current_locale = setlocale(LC_CTYPE);
	813
	814	if ($current_locale =~ m/^.\.(.?)$/) {
	815	my $char_encoding = lc($1);
	816	$char_encoding =~ s/-/_/g;
	817	$char_encoding =~ s/^utf_8$/utf8/;
	818
	819	if ($char_encoding =~ m/^\d+$/) {
	820	if (defined $encodings::encoding->{"windows_$char_encoding"}) {
	821	$char_encoding = "windows_$char_encoding";
	822	}
	823	elsif (defined $encodings::encoding->{"dos_$char_encoding"}) {
	824	$char_encoding = "dos_$char_encoding";
	825	}
	826	}
	827
	828	if (($char_encoding =~ m/(?:ascii\|utf8\|unicode)/)
	829	\|\| (defined $encodings::encoding->{$char_encoding})) {
	830	$filesystem_encoding = $char_encoding;
	831	}
	832	else {
	833	print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
	834	}
	835	}
	836
	837
	838	};
	839	if ($@) {
	840	print $outhandle "$@\n";
	841	print $outhandle "Warning: Unable to establish locale. Will assume filesytem is UTF-8\n";
	842
	843	}
	844
	845	my ($filemeta) = $file =~ /([^\\\/]+)$/;
	846
	847	# how do we know what encoding the filename is in?
	848	# => one answer is to check the locale
	849
	850	if (defined $filesystem_encoding) {
	851	if ($filesystem_encoding !~ /(?:ascii\|utf8\|unicode)/) {
	852	$filemeta = unicode::unicode2utf8(
	853	unicode::convert2unicode($filesystem_encoding, \$filemeta)
	854	);
	855	}
	856	}
	857	# assume it is in the same encoding as its contents
	858	elsif ((defined $encoding) && ($encoding !~ /(?:ascii\|utf8\|unicode)/)) {
	859	$filemeta = unicode::unicode2utf8(
	860	unicode::convert2unicode($encoding, \$filemeta)
	861	);
	862	}
	863
	864	my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
	865
	866	return $dmsafe_filemeta;
	867	}
	868
	869
[10280]	870	# The BasPlug read_into_doc_obj() function. This function does all the
	871	# right things to make general options work for a given plugin. It reads in
	872	# a file and sets up a slew of metadata all saved in doc_obj, which
	873	# it then returns as part of a tuple (process_status,doc_obj)
	874	#
	875	# Much of this functionality used to reside in read, but it was broken
	876	# down into a supporting routine to make the code more flexible.
	877	#
	878	# recursive plugins (e.g. RecPlug) and specialized plugins like those
	879	# capable of processing many documents within a single file (e.g.
	880	# GMLPlug) will normally want to implement their own version of
	881	# read_into_doc_obj()
	882	#
	883	# Note that $base_dir might be "" and that $file might
	884	# include directories
	885	sub read_into_doc_obj {
	886	my $self = shift (@_);
	887	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
	888
	889	if ($self->is_recursive()) {
	890	gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
	891	}
	892
	893	my $outhandle = $self->{'outhandle'};
	894
	895	my ($block_status,$filename) = $self->read_block(@_);
	896	return $block_status if ((!defined $block_status) \|\| ($block_status==0));
	897	$file = $self->read_tidy_file($file);
	898
[2811]	899	# Do encoding stuff
	900	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
[11389]	901	if ($self->{'verbosity'} > 2) {
	902	print $outhandle "BasPlug: reading $file as ($encoding,$language)\n";
	903	}
[1844]	904
[1242]	905	# create a new document
[1379]	906	my $doc_obj = new doc ($filename, "indexed_doc");
[14961]	907	my $top_section = $doc_obj->get_top_section();
	908
[12270]	909	$doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
[14961]	910	$doc_obj->add_utf8_metadata($top_section, "Language", $language);
	911	$doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
	912	$doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
	913	$doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename));
[8166]	914
[14961]	915	my $filemeta = $self->filename_to_metadata($file,$encoding);
	916	$doc_obj->add_utf8_metadata($top_section, "Source", $filemeta);
[2816]	917	if ($self->{'cover_image'}) {
	918	$self->associate_cover_image($doc_obj, $filename);
	919	}
[1242]	920
	921	# read in file ($text will be in utf8)
	922	my $text = "";
[2734]	923	$self->read_file ($filename, $encoding, $language, \$text);
[1242]	924
[1844]	925	if (!length ($text)) {
[2811]	926	my $plugin_name = ref ($self);
[9586]	927	if ($gli) {
	928	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
	929	}
[9703]	930	gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
[2785]	931
	932	my $failhandle = $self->{'failhandle'};
[9413]	933	gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
[5681]	934	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
[2785]	935	$self->{'num_not_processed'} ++;
	936
[10280]	937	return (0,undef); # what should we return here?? error but don't want to pass it on
[1242]	938	}
[1954]	939
[1242]	940	# include any metadata passed in from previous plugins
	941	# note that this metadata is associated with the top level section
[8510]	942
[11122]	943	my $associate_tail_re = $self->{'associate_tail_re'};
	944
[1242]	945	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
[9398]	946
[8716]	947	# do plugin specific processing of doc_obj
	948	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
	949	$text = '';
	950	undef $text;
[9584]	951	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
[10280]	952	return (-1,undef);
[8716]	953	}
	954	$text='';
	955	undef $text;
[9398]	956
[1242]	957	# do any automatic metadata extraction
	958	$self->auto_extract_metadata ($doc_obj);
[1954]	959
[1242]	960	# add an OID
[3515]	961	# see if there is a plugin-specific set_OID function...
[8716]	962	if (defined ($self->can('set_OID'))) {
[3515]	963	# it will need $doc_obj to set the Identifier metadata...
	964	$self->set_OID($doc_obj);
	965	} else {
	966	# use the default set_OID() in doc.pm
	967	$doc_obj->set_OID();
	968	}
[10280]	969
	970	return (1,$doc_obj);
	971	}
[1242]	972
	973
[10280]	974	# The BasPlug read() function. This function calls read_into_doc_obj()
	975	# to ensure all the right things to make general options work for a
	976	# given plugin are done. It then calls the process() function which
	977	# does all the work specific to a plugin (like the old read functions
	978	# used to do). Most plugins should define their own process() function
	979	# and let this read() function keep control.
	980	#
	981	# recursive plugins (e.g. RecPlug) and specialized plugins like those
	982	# capable of processing many documents within a single file (e.g.
	983	# GMLPlug) might want to implement their own version of read(), but
	984	# more likely need to implement their own version of read_into_doc_obj()
	985	#
	986	# Return number of files processed, undef if can't recognise, -1 if can't
	987	# process
	988
	989	sub read {
	990	my $self = shift (@_);
	991	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
	992
	993	my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
	994
	995	if ((defined $process_status) && ($process_status == 1)) {
	996	# process the document
	997	$processor->process($doc_obj);
	998
	999	if(defined($self->{'places_filename'})){
	1000	&util::rm($self->{'places_filename'});
	1001	$self->{'places_filename'} = undef;
	1002	}
	1003
	1004	$self->{'num_processed'} ++;
	1005	undef $doc_obj;
[9398]	1006	}
	1007
[10280]	1008	# if process_status == 1, then the file has been processed.
	1009	return $process_status;
	1010
[4]	1011	}
	1012
[1244]	1013	# returns undef if file is rejected by the plugin
[1242]	1014	sub process {
	1015	my $self = shift (@_);
[11089]	1016	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1242]	1017
[9413]	1018	gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
[5681]	1019	# die "Basplug::process function must be implemented in sub-class\n";
[1244]	1020
	1021	return undef; # never gets here
[1242]	1022	}
	1023
[1219]	1024	# uses the multiread package to read in the entire file pointed to
	1025	# by filename and loads the resulting text into $$textref. Input text
	1026	# may be in any of the encodings handled by multiread, output text
	1027	# will be in utf8
	1028	sub read_file {
	1029	my $self = shift (@_);
[2734]	1030	my ($filename, $encoding, $language, $textref) = @_;
[4]	1031
[1756]	1032	if (!-r $filename)
	1033	{
[1844]	1034	my $outhandle = $self->{'outhandle'};
[9413]	1035	gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
[5681]	1036	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
[1756]	1037	return;
	1038	}
[1219]	1039	$$textref = "";
[9413]	1040	if (!open (FILE, $filename)) {
	1041	gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
[10280]	1042	die "\n";
	1043	}
	1044
[1844]	1045	if ($encoding eq "ascii") {
[1219]	1046	undef $/;
	1047	$$textref = <FILE>;
	1048	$/ = "\n";
	1049	} else {
	1050	my $reader = new multiread();
	1051	$reader->set_handle ('BasPlug::FILE');
[1844]	1052	$reader->set_encoding ($encoding);
[1219]	1053	$reader->read_file ($textref);
[10280]	1054	#Now segments chinese if the separate_cjk option is set
[6584]	1055	if ($self->{'separate_cjk'}) {
[1219]	1056	# segment the Chinese words
	1057	$$textref = &cnseg::segment($$textref);
	1058	}
	1059	}
[10280]	1060	close FILE;
	1061	}
[1219]	1062
[10280]	1063	# write_file -- used by ConvertToPlug, for example in post processing
	1064	#
	1065	sub utf8_write_file {
	1066	my $self = shift (@_);
	1067	my ($textref, $filename) = @_;
	1068
	1069	if (!open (FILE, ">$filename")) {
	1070	gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
	1071	die "\n";
	1072	}
	1073	print FILE $$textref;
	1074
[1219]	1075	close FILE;
	1076	}
	1077
[10280]	1078
[7504]	1079	sub filename_based_title
	1080	{
	1081	my $self = shift (@_);
	1082	my ($file) = @_;
	1083
	1084	my $file_derived_title = $file;
	1085	$file_derived_title =~ s/_/ /g;
	1086	$file_derived_title =~ s/\..*?$//;
	1087
	1088	return $file_derived_title;
	1089	}
	1090
[9398]	1091
[7504]	1092	sub title_fallback
	1093	{
	1094	my $self = shift (@_);
	1095	my ($doc_obj,$section,$file) = @_;
	1096
	1097	if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
	1098
	1099	my $file_derived_title = $self->filename_based_title($file);
	1100	$doc_obj->add_metadata ($section, "Title", $file_derived_title);
	1101	}
	1102	}
	1103
[2811]	1104	sub textcat_get_language_encoding {
	1105	my $self = shift (@_);
	1106	my ($filename) = @_;
	1107
[10280]	1108
[2811]	1109	my ($language, $encoding, $extracted_encoding);
	1110	if ($self->{'input_encoding'} eq "auto") {
	1111	# use textcat to automatically work out the input encoding and language
	1112	($language, $encoding) = $self->get_language_encoding ($filename);
	1113	} elsif ($self->{'extract_language'}) {
	1114	# use textcat to get language metadata
	1115	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
	1116	$encoding = $self->{'input_encoding'};
[7644]	1117	# don't print this message for english... english in utf8 is identical
	1118	# to english in iso-8859-1 (except for some punctuation). We don't have
	1119	# a language model for en_utf8, so textcat always says iso-8859-1!
	1120	if ($extracted_encoding ne $encoding && $language ne "en"
	1121	&& $self->{'verbosity'}) {
[2811]	1122	my $plugin_name = ref ($self);
	1123	my $outhandle = $self->{'outhandle'};
[9413]	1124	gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
[2811]	1125	}
	1126	} else {
	1127	$language = $self->{'default_language'};
	1128	$encoding = $self->{'input_encoding'};
	1129	}
[10280]	1130
[2811]	1131	return ($language, $encoding);
	1132	}
	1133
[1844]	1134	# Uses textcat to work out the encoding and language of the text in
	1135	# $filename. All html tags are removed before processing.
	1136	# returns an array containing "language" and "encoding"
	1137	sub get_language_encoding {
	1138	my $self = shift (@_);
	1139	my ($filename) = @_;
	1140	my $outhandle = $self->{'outhandle'};
[9413]	1141	my $unicode_format = "";
[11389]	1142	my $best_language = "";
	1143	my $best_encoding = "";
	1144
[1844]	1145	# read in file
[11389]	1146	if (!open (FILE, $filename)) {
	1147	gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
	1148	# this is a pretty bad error, but try to continue anyway
	1149	return ($self->{'default_language'}, $self->{'input_encoding'});
	1150	}
[1844]	1151	undef $/;
	1152	my $text = <FILE>;
	1153	$/ = "\n";
	1154	close FILE;
	1155
[9413]	1156	# check if first few bytes have a Byte Order Marker
	1157	my $bom=substr($text,0,2); # check 16bit unicode
	1158	if ($bom eq "\xff\xfe") { # little endian 16bit unicode
	1159	$unicode_format="unicode";
	1160	} elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
	1161	$unicode_format="unicode";
	1162	} else {
	1163	$bom=substr($text,0,3); # check utf-8
	1164	if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
	1165	$unicode_format="utf8";
	1166	# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
	1167	# $unicode_format="utf8";
	1168	}
	1169	}
[10442]	1170
[1999]	1171
[11389]	1172	# handle html files specially
[9413]	1173	# XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
	1174	if (ref($self) eq 'HTMLPlug' \|\|
	1175	(exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
[11389]	1176
	1177	# remove <title>stuff</title> -- as titles tend often to be in English
	1178	# for foreign language documents
	1179	$text =~ s!<title>.*?</title>!!si;
	1180
	1181	# see if this html file specifies its encoding
	1182	if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
	1183	$best_encoding = $1;
[11880]	1184	} elsif ($text =~ /<meta http-equiv.content-type.charset=(.+?)"/i) {#"
[11389]	1185	$best_encoding = $1;
	1186	}
	1187	if ($best_encoding) { # we extracted an encoding
	1188	$best_encoding =~ s/-+/_/g;
	1189	$best_encoding = lc($best_encoding); # lowercase
	1190	if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
	1191	$self->{'input_encoding'} = $best_encoding;
	1192	}
	1193
	1194	# remove all HTML tags
[9413]	1195	$text =~ s/<[^>]*>//sg;
	1196	}
[1844]	1197
	1198	# get the language/encoding
[11966]	1199	$self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
[2235]	1200	my $results = $self->{'textcat'}->classify(\$text);
[1844]	1201
[1903]	1202	# if textcat returns 3 or less possibilities we'll use the
	1203	# first one in the list - otherwise use the defaults
[2235]	1204	if (scalar @$results > 3) {
[9413]	1205	if ($unicode_format) { # in case the first had a BOM
	1206	$best_encoding=$unicode_format;
	1207	} else {
	1208	my %guessed_encodings = ();
	1209	foreach my $result (@$results) {
	1210	$result =~ /([^\-]+)$/;
	1211	my $enc=$1;
	1212	if (!defined($guessed_encodings{$enc})) {
	1213	$guessed_encodings{$enc}=0;
	1214	}
	1215	$guessed_encodings{$enc}++;
[3731]	1216	}
[9413]	1217
	1218	$guessed_encodings{""}=-1; # for default best_encoding of ""
	1219	foreach my $enc (keys %guessed_encodings) {
	1220	if ($guessed_encodings{$enc} >
	1221	$guessed_encodings{$best_encoding}){
	1222	$best_encoding=$enc;
	1223	}
	1224	}
[3731]	1225	}
	1226
[1844]	1227	if ($self->{'input_encoding'} ne 'auto') {
[9961]	1228	if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
[9413]	1229	gsprintf($outhandle,
	1230	"BasPlug: {BasPlug.could_not_extract_language}\n",
	1231	$filename, $self->{'default_language'});
[1844]	1232	}
[11389]	1233	$best_language = $self->{'default_language'};
	1234	$best_encoding = $self->{'input_encoding'};
[1844]	1235
	1236	} else {
[9961]	1237	if ($self->{'verbosity'}>2) {
[9413]	1238	gsprintf($outhandle,
	1239	"BasPlug: {BasPlug.could_not_extract_language}\n",
	1240	$filename, $self->{'default_language'});
[1844]	1241	}
[11389]	1242	$best_language = $self->{'default_language'};
[1844]	1243	}
[11389]	1244	} else { # <= 3 suggestions
	1245	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
	1246	if (!defined $language) {
	1247	if ($self->{'verbosity'}>2) {
	1248	gsprintf($outhandle,
	1249	"BasPlug: {BasPlug.could_not_extract_language}\n",
	1250	$filename, $self->{'default_language'});
	1251	}
	1252	$language = $self->{'default_language'};
[1870]	1253	}
[11389]	1254	if (!defined $encoding) {
	1255	if ($self->{'verbosity'}>2) {
	1256	gsprintf($outhandle,
	1257	"BasPlug: {BasPlug.could_not_extract_encoding}\n",
	1258	$filename, $self->{'default_encoding'});
	1259	}
	1260	$encoding = $self->{'default_encoding'};
[1870]	1261	}
[11389]	1262	$best_language = $language;
	1263	if (! $best_encoding ) { # may already be set... eg from html meta tag
	1264	$best_encoding = $encoding;
	1265	}
[1844]	1266	}
	1267
[11389]	1268	my $text_copy = $text;
	1269	if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) {
	1270	# the text is valid utf8, so assume that's the real encoding
	1271	# (since textcat is based on probabilities)
	1272	$best_encoding = 'utf8';
	1273	}
[7818]	1274
	1275	# check for equivalents where textcat doesn't have some encodings...
	1276	# eg MS versions of standard encodings
[11389]	1277	if ($best_encoding =~ /^iso_8859_(\d+)/) {
[7818]	1278	my $iso = $1; # which variant of the iso standard?
	1279	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
	1280	if ($text =~ /[\x80-\x9f]/) {
	1281	# Western Europe
[11389]	1282	if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
	1283	elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
	1284	elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
	1285	elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
	1286	elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
	1287	elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
	1288	elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
[7818]	1289	}
	1290	}
	1291
[11389]	1292	if ($best_encoding !~ /^(ascii\|utf8\|unicode)$/ &&
	1293	!defined $encodings::encodings->{$best_encoding}) {
[1844]	1294	if ($self->{'verbosity'}) {
[7818]	1295	gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
[11389]	1296	$filename, $best_encoding, $self->{'default_encoding'});
[1844]	1297	}
[11389]	1298	$best_encoding = $self->{'default_encoding'};
[1844]	1299	}
	1300
[11389]	1301	return ($best_language, $best_encoding);
[1844]	1302	}
	1303
[1219]	1304	# add any extra metadata that's been passed around from one
	1305	# plugin to another.
	1306	# extra_metadata uses add_utf8_metadata so it expects metadata values
	1307	# to already be in utf8
	1308	sub extra_metadata {
	1309	my $self = shift (@_);
	1310	my ($doc_obj, $cursection, $metadata) = @_;
	1311
[11122]	1312	my $associate_tail_re = $self->{'associate_tail_re'};
	1313
[1219]	1314	foreach my $field (keys(%$metadata)) {
[839]	1315	# $metadata->{$field} may be an array reference
[8510]	1316	if ($field eq "gsdlassocfile_tobe") {
	1317	# 'gsdlassocfile_tobe' is artificially introduced metadata
	1318	# that is used to signal that certain additional files should
	1319	# be tied to this document. Useful in situations where a
	1320	# metadata pass in the plugin pipeline works out some files
	1321	# need to be associated with a document, but the document hasn't
	1322	# been formed yet.
	1323
	1324	my $equiv_form = "";
	1325	foreach my $gaf (@{$metadata->{$field}}) {
	1326	my ($full_filename,$mimetype) = ($gaf =~ m/^(.):(.):$/);
	1327	my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
	1328	my $filename = $full_filename;
	1329
	1330	$doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
[11122]	1331
	1332	# work out extended tail extension (i.e. matching tail re)
	1333
	1334	my ($file_prefix,$file_extended_ext)
	1335	= $self->root_ext_split($tail_filename,$associate_tail_re);
	1336	my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.)\..$/);
	1337
[8510]	1338	my ($doc_ext) = ($tail_filename =~ m/^.\.(.)$/);
[11834]	1339	my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
[8510]	1340	my $srcicon = "_icon".$doc_ext."_";
	1341	my $end_doclink = "</a>";
	1342
[11122]	1343	my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
	1344
	1345	if (defined $pre_doc_ext) {
	1346	# for metadata such as [mp3._edited] [mp3._full] ...
	1347	$doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
	1348	}
	1349
	1350	# for multiple metadata such as [mp3.assoclink]
	1351	$doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
	1352
	1353	$equiv_form .= " $assoc_form";
[8510]	1354	}
	1355	$doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
	1356	}
	1357	elsif (ref ($metadata->{$field}) eq "ARRAY") {
[839]	1358	map {
[1219]	1359	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
[839]	1360	} @{$metadata->{$field}};
	1361	} else {
[1219]	1362	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
[839]	1363	}
	1364	}
	1365	}
	1366
[1396]	1367	# initialise metadata extractors
	1368	sub initialise_extractors {
	1369	my $self = shift (@_);
	1370
	1371	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
	1372	&acronym::initialise_acronyms();
	1373	}
	1374	}
	1375
	1376	# finalise metadata extractors
	1377	sub finalise_extractors {
	1378	my $self = shift (@_);
	1379
	1380	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
	1381	&acronym::finalise_acronyms();
	1382	}
	1383	}
	1384
[1602]	1385	# FIRSTNNN: extract the first NNN characters as metadata
	1386	sub extract_first_NNNN_characters {
	1387	my $self = shift (@_);
	1388	my ($textref, $doc_obj, $thissection) = @_;
	1389
	1390	foreach my $size (split /,/, $self->{'first'}) {
	1391	my $tmptext = $$textref;
	1392	$tmptext =~ s/^\s+//;
	1393	$tmptext =~ s/\s+$//;
	1394	$tmptext =~ s/\s+/ /gs;
	1395	$tmptext = substr ($tmptext, 0, $size);
	1396	$tmptext =~ s/\s\S*$/…/;
	1397	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
	1398	}
	1399	}
	1400
	1401	sub extract_email {
	1402	my $self = shift (@_);
	1403	my ($textref, $doc_obj, $thissection) = @_;
	1404	my $outhandle = $self->{'outhandle'};
	1405
[9413]	1406	gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
[1844]	1407	if ($self->{'verbosity'} > 2);
[1602]	1408
[2604]	1409	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
[1602]	1410	@email = sort @email;
	1411
[10218]	1412	# if($self->{"new_extract_email"} == 0)
	1413	# {
	1414	# my @email2 = ();
	1415	# foreach my $address (@email)
	1416	# {
	1417	# if (!(join(" ",@email2) =~ m/(^\| )$address( \|$)/ ))
	1418	# {
	1419	# push @email2, $address;
	1420	# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
	1421	# # print $outhandle " extracting $address\n"
	1422	# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
	1423	# if ($self->{'verbosity'} > 3);
	1424	# }
	1425	# }
	1426	# }
	1427	# else
	1428	# {
	1429	my $hashExistMail = {};
[1602]	1430	foreach my $address (@email) {
[10218]	1431	if (!(defined $hashExistMail->{$address}))
	1432	{
	1433	$hashExistMail->{$address} = 1;
[1602]	1434	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
[9413]	1435	gsprintf($outhandle, " {BasPlug.extracting} $address\n")
[1844]	1436	if ($self->{'verbosity'} > 3);
[1602]	1437	}
	1438	}
[9413]	1439	gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
[1844]	1440	if ($self->{'verbosity'} > 2);
[1602]	1441	}
	1442
	1443	# extract metadata
[5681]	1444	sub auto_extract_metadata {
[1954]	1445
[1242]	1446	my $self = shift (@_);
	1447	my ($doc_obj) = @_;
[1602]	1448
	1449	if ($self->{'extract_email'}) {
	1450	my $thissection = $doc_obj->get_top_section();
	1451	while (defined $thissection) {
	1452	my $text = $doc_obj->get_text($thissection);
	1453	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
	1454	$thissection = $doc_obj->get_next_section ($thissection);
	1455	}
[1954]	1456	}
[9398]	1457	if ($self->{'extract_placenames'}) {
	1458	my $thissection = $doc_obj->get_top_section();
	1459	while (defined $thissection) {
	1460	my $text = $doc_obj->get_text($thissection);
	1461	$self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
	1462	$thissection = $doc_obj->get_next_section ($thissection);
	1463	}
	1464	}
[1954]	1465
[11069]	1466	if ($self->{'extract_keyphrases'} \|\| $self->{'extract_keyphrases_kea4'}) {
	1467	$self->extract_keyphrases($doc_obj);
	1468	}
[1954]	1469
[1602]	1470	if ($self->{'first'}) {
	1471	my $thissection = $doc_obj->get_top_section();
	1472	while (defined $thissection) {
	1473	my $text = $doc_obj->get_text($thissection);
	1474	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
	1475	$thissection = $doc_obj->get_next_section ($thissection);
	1476	}
	1477	}
	1478
[1242]	1479	if ($self->{'extract_acronyms'}) {
	1480	my $thissection = $doc_obj->get_top_section();
	1481	while (defined $thissection) {
	1482	my $text = $doc_obj->get_text($thissection);
	1483	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
	1484	$thissection = $doc_obj->get_next_section ($thissection);
	1485	}
	1486	}
[1602]	1487
[1393]	1488	if ($self->{'markup_acronyms'}) {
	1489	my $thissection = $doc_obj->get_top_section();
	1490	while (defined $thissection) {
	1491	my $text = $doc_obj->get_text($thissection);
	1492	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
	1493	$doc_obj->delete_text($thissection);
	1494	$doc_obj->add_text($thissection, $text);
	1495	$thissection = $doc_obj->get_next_section ($thissection);
	1496	}
	1497	}
	1498
[10218]	1499	if($self->{'extract_historical_years'}) {
[1317]	1500	my $thissection = $doc_obj->get_top_section();
	1501	while (defined $thissection) {
[10218]	1502
[1317]	1503	my $text = $doc_obj->get_text($thissection);
[1846]	1504	&DateExtract::get_date_metadata($text, $doc_obj,
	1505	$thissection,
[10218]	1506	$self->{'no_bibliography'},
	1507	$self->{'maximum_year'},
	1508	$self->{'maximum_century'});
[1317]	1509	$thissection = $doc_obj->get_next_section ($thissection);
	1510	}
	1511	}
[1242]	1512	}
	1513
[11069]	1514
	1515	#adding kea keyphrases
	1516	sub extract_keyphrases
	1517	{
	1518	my $self = shift(@_);
	1519	my $doc_obj = shift(@_);
	1520
	1521	# Use Kea 3.0 unless 4.0 has been specified
	1522	my $kea_version = "3.0";
	1523	if ($self->{'extract_keyphrases_kea4'}) {
	1524	$kea_version = "4.0";
	1525	}
	1526
	1527	# Check that Kea exists, and tell the user where to get it if not
	1528	my $keahome = &Kea::get_Kea_directory($kea_version);
	1529	if (!-e $keahome) {
	1530	gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);
	1531	return;
	1532	}
	1533
	1534	my $thissection = $doc_obj->get_top_section();
	1535	my $text = "";
	1536	my $list;
	1537
	1538	#loop through sections to gather whole doc
	1539	while (defined $thissection) {
	1540	my $sectiontext = $doc_obj->get_text($thissection);
	1541	$text = $text.$sectiontext;
	1542	$thissection = $doc_obj->get_next_section ($thissection);
	1543	}
	1544
	1545	if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
	1546	$list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
	1547	} else { #otherwise call Kea with no options
	1548	$list = &Kea::extract_KeyPhrases ($kea_version, $text);
	1549	}
	1550
	1551	if ($list){
	1552	# if a list of kea keyphrases was returned (ie not empty)
	1553	if ($self->{'verbosity'}) {
	1554	gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
	1555	}
	1556
	1557	#add metadata to top section
	1558	$thissection = $doc_obj->get_top_section();
	1559
	1560	# add all key phrases as one metadata
	1561	$doc_obj->add_metadata($thissection, "Keyphrases", $list);
	1562
	1563	# add individual key phrases as multiple metadata
	1564	foreach my $keyphrase (split(',', $list)) {
	1565	$keyphrase =~ s/^\s+\|\s+$//g;
	1566	$doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
	1567	}
	1568	}
	1569	}
	1570
	1571
[1335]	1572	# extract acronyms from a section in a document. progress is
[1424]	1573	# reported to outhandle based on the verbosity. both the Acronym
[1335]	1574	# and the AcronymKWIC metadata items are created.
	1575
[1242]	1576	sub extract_acronyms {
	1577	my $self = shift (@_);
	1578	my ($textref, $doc_obj, $thissection) = @_;
[1424]	1579	my $outhandle = $self->{'outhandle'};
[1242]	1580
[5681]	1581	# print $outhandle " extracting acronyms ...\n"
[9413]	1582	gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
[1844]	1583	if ($self->{'verbosity'} > 2);
[1335]	1584
[1242]	1585	my $acro_array = &acronym::acronyms($textref);
[1360]	1586
[1242]	1587	foreach my $acro (@$acro_array) {
	1588
[1335]	1589	#check that this is the first time ...
	1590	my $seen_before = "false";
	1591	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
	1592	foreach my $thisAcro (@$previous_data) {
[1602]	1593	if ($thisAcro eq $acro->to_string()) {
[1335]	1594	$seen_before = "true";
[9413]	1595	if ($self->{'verbosity'} >= 4) {
	1596	gsprintf($outhandle, " {BasPlug.already_seen} " .
	1597	$acro->to_string() . "\n");
	1598	}
[5681]	1599	}
[1242]	1600	}
[1335]	1601
[1602]	1602	if ($seen_before eq "false") {
[1393]	1603	#write it to the file ...
	1604	$acro->write_to_file();
	1605
[1335]	1606	#do the normal acronym
	1607	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
[9413]	1608	gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
[1844]	1609	if ($self->{'verbosity'} > 3);
[1335]	1610	}
[1242]	1611	}
[5681]	1612
[9413]	1613	gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
[1844]	1614	if ($self->{'verbosity'} > 2);
[1242]	1615	}
	1616
[1393]	1617	sub markup_acronyms {
	1618	my $self = shift (@_);
	1619	my ($text, $doc_obj, $thissection) = @_;
[1424]	1620	my $outhandle = $self->{'outhandle'};
[1393]	1621
[9413]	1622	gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
[1844]	1623	if ($self->{'verbosity'} > 2);
[1393]	1624
	1625	#self is passed in to check for verbosity ...
	1626	$text = &acronym::markup_acronyms($text, $self);
	1627
[9413]	1628	gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
[1844]	1629	if ($self->{'verbosity'} > 2);
[1393]	1630
	1631	return $text;
	1632	}
	1633
[2785]	1634	sub compile_stats {
	1635	my $self = shift(@_);
	1636	my ($stats) = @_;
	1637
	1638	$stats->{'num_processed'} += $self->{'num_processed'};
	1639	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
[2796]	1640	$stats->{'num_archives'} += $self->{'num_archives'};
[2785]	1641
	1642	}
	1643
[2816]	1644	sub associate_cover_image {
[10833]	1645	my $self = shift;
[2816]	1646	my ($doc_obj, $filename) = @_;
	1647
[10833]	1648	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
	1649	if (exists $self->{'covers_missing_cache'}->{$filename}) {
	1650	# don't stat() for existence eg for multiple document input files
	1651	# (eg SplitPlug)
	1652	return;
	1653	}
	1654
[9413]	1655	my $top_section=$doc_obj->get_top_section();
	1656
[2816]	1657	if (-e $filename) {
[13968]	1658	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
[9413]	1659	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[3086]	1660	} else {
[10833]	1661	my $upper_filename = $filename;
	1662	$upper_filename =~ s/jpg$/JPG/;
	1663	if (-e $upper_filename) {
	1664	$doc_obj->associate_file($upper_filename, "cover.jpg",
	1665	"image/jpeg");
[9413]	1666	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[10833]	1667	} else {
	1668	# file doesn't exist, so record the fact that it's missing so
	1669	# we don't stat() again (stat is slow)
	1670	$self->{'covers_missing_cache'}->{$filename} = 1;
[3086]	1671	}
[2816]	1672	}
[10833]	1673
[2816]	1674	}
	1675
[11332]	1676
	1677	# Overridden by exploding plugins (eg. ISISPlug)
	1678	sub clean_up_after_exploding
	1679	{
	1680	my $self = shift(@_);
	1681	}
	1682
	1683
[4]	1684	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: