Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 11089

Last change on this file since 11089 was 11089, checked in by kjdon, 18 years ago
removed a couple of unnecessary bits of code like repeated arguments, changed cover image stuff - these are now blocked even if smart block is off (when cover image is on)
Property svn:keywords set to `Author Date Id Revision`
File size: 43.4 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# BasPlug.pm -- base class for all the import plugins
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
[9413]	8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
[537]	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
[4]	25
	26	package BasPlug;
[2219]	27
[9398]	28	BEGIN {
	29	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	30	}
	31
[3834]	32	eval {require bytes};
[3767]	33
[2219]	34	# suppress the annoying "subroutine redefined" warning that various
	35	# plugins cause under perl 5.6
	36	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
	37
[10254]	38	use strict;
	39	no strict 'subs';
	40	no strict 'refs'; # allow filehandles to be variables and viceversa
[9413]	41
[8892]	42	use File::Basename;
	43
[1954]	44	use Kea;
[1219]	45	use multiread;
[1870]	46	use encodings;
[1219]	47	use cnseg;
[1242]	48	use acronym;
[1317]	49	use textcat;
[1242]	50	use doc;
[7645]	51	eval "require diagnostics"; # some perl distros (eg mac) don't have this
[1411]	52	use DateExtract;
[2751]	53	use ghtml;
[9413]	54	use gsprintf 'gsprintf';
[4778]	55	use printusage;
[10218]	56	use parse2;
[4]	57
[10218]	58
[9398]	59	use GISBasPlug;
[5681]	60
[9413]	61	@BasPlug::ISA = ( GISBasPlug );
[9398]	62
# Encodings selectable for -default_encoding: three fixed entries first,
# followed by every encoding known to the encodings module.
my $unicode_list = [
    { 'name' => "ascii",   'desc' => "{BasPlug.input_encoding.ascii}" },
    { 'name' => "utf8",    'desc' => "{BasPlug.input_encoding.utf8}" },
    { 'name' => "unicode", 'desc' => "{BasPlug.input_encoding.unicode}" },
];

# Encodings selectable for -input_encoding: "auto" plus everything above.
my $auto_unicode_list = [
    { 'name' => "auto", 'desc' => "{BasPlug.input_encoding.auto}" },
];

# Append the encodings module's table, ordered by display name.
my $e = $encodings::encodings;
push(@{$unicode_list},
     map  { +{ 'name' => $_, 'desc' => $e->{$_}->{'name'} } }
     sort { $e->{$a}->{'name'} cmp $e->{$b}->{'name'} }
     keys(%$e));

push(@{$auto_unicode_list}, @{$unicode_list});
	86
# Command-line options accepted by every plugin.  Each entry is a hash
# describing one option; 'desc' values in braces are resource-bundle keys.
my $arguments = [
    { 'name' => "process_exp",  'desc' => "{BasPlug.process_exp}",
      'type' => "regexp", 'deft' => "", 'reqd' => "no" },
    { 'name' => "block_exp",    'desc' => "{BasPlug.block_exp}",
      'type' => "regexp", 'deft' => "", 'reqd' => "no" },
    { 'name' => "smart_block",  'desc' => "{BasPlug.smart_block}",
      'type' => "flag", 'reqd' => "no", 'hiddengli' => "yes" },
    { 'name' => "associate_ext", 'desc' => "{BasPlug.associate_ext}",
      'type' => "string", 'reqd' => "no" },
    { 'name' => "input_encoding", 'desc' => "{BasPlug.input_encoding}",
      'type' => "enum", 'list' => $auto_unicode_list,
      'reqd' => "no", 'deft' => "auto" },
    { 'name' => "default_encoding", 'desc' => "{BasPlug.default_encoding}",
      'type' => "enum", 'list' => $unicode_list,
      'reqd' => "no", 'deft' => "utf8" },
    { 'name' => "extract_language", 'desc' => "{BasPlug.extract_language}",
      'type' => "flag", 'reqd' => "no" },
    { 'name' => "default_language", 'desc' => "{BasPlug.default_language}",
      'type' => "string", 'deft' => "en", 'reqd' => "no" },
    { 'name' => "extract_acronyms", 'desc' => "{BasPlug.extract_acronyms}",
      'type' => "flag", 'reqd' => "no" },
    { 'name' => "markup_acronyms", 'desc' => "{BasPlug.markup_acronyms}",
      'type' => "flag", 'reqd' => "no" },
    { 'name' => "extract_keyphrases", 'desc' => "{BasPlug.extract_keyphrases}",
      'type' => "flag", 'reqd' => "no" },
    { 'name' => "extract_keyphrases_kea4", 'desc' => "{BasPlug.extract_keyphrases_kea4}",
      'type' => "flag", 'reqd' => "no" },
    { 'name' => "extract_keyphrase_options", 'desc' => "{BasPlug.extract_keyphrase_options}",
      'type' => "string", 'deft' => "", 'reqd' => "no" },
    { 'name' => "first", 'desc' => "{BasPlug.first}",
      'type' => "string", 'reqd' => "no" },
    { 'name' => "extract_email", 'desc' => "{BasPlug.extract_email}",
      'type' => "flag", 'reqd' => "no" },
    { 'name' => "extract_historical_years", 'desc' => "{BasPlug.extract_historical_years}",
      'type' => "flag", 'reqd' => "no" },
    # default for maximum_year is the current calendar year
    { 'name' => "maximum_year", 'desc' => "{BasPlug.maximum_year}",
      'type' => "int", 'deft' => (localtime)[5]+1900,
      'char_length' => "4",
      #'range' => "2,100",
      'reqd' => "no" },
    { 'name' => "maximum_century", 'desc' => "{BasPlug.maximum_century}",
      'type' => "string", 'deft' => "-1", 'reqd' => "no" },
    { 'name' => "no_bibliography", 'desc' => "{BasPlug.no_bibliography}",
      'type' => "flag", 'reqd' => "no" },
    { 'name' => "no_cover_image", 'desc' => "{BasPlug.no_cover_image}",
      'type' => "flag", 'reqd' => "no" },
    { 'name' => "separate_cjk", 'desc' => "{BasPlug.separate_cjk}",
      'type' => "flag", 'reqd' => "no", 'hiddengli' => "yes" },
    { 'name' => "new_extract_email", 'desc' => "",
      'type' => "flag", 'reqd' => "no", 'hiddengli' => "yes" },
];

# Extra options that new() adds to $arguments when GISBasPlug reports
# that map data is available.
my $gis_arguments = [
    { 'name' => "extract_placenames", 'desc' => "{GISBasPlug.extract_placenames}",
      'type' => "flag", 'reqd' => "no" },
    { 'name' => "gazetteer", 'desc' => "{GISBasPlug.gazetteer}",
      'type' => "string", 'reqd' => "no" },
    { 'name' => "place_list", 'desc' => "{GISBasPlug.place_list}",
      'type' => "flag", 'reqd' => "no" },
];


# Description of this plugin class as a whole; 'args' shares the
# $arguments array above (same reference, not a copy).
my $options = {
    'name'     => "BasPlug",
    'desc'     => "{BasPlug.desc}",
    'abstract' => "yes",
    'inherits' => "no",
    'args'     => $arguments,
};
[3540]	212
[4778]	213
# Store the caller-supplied keepold value on the plugin object.
sub set_keepold {
    my ($self, $keepold) = @_;

    $self->{'keepold'} = $keepold;
}
	220
# Return the 'args' array reference of the last entry in this plugin's
# option list, or undef when there is no such entry.
#
# The entry is only inspected, never removed: the option list is shared
# with the usage-printing routines, so popping it here (as this routine
# used to do) silently lost one plugin description per call.
sub get_arguments
{
    my ($self) = @_;

    my $option_list = $self->{'option_list'};
    my $plugin_args;    # stays undef if nothing is found

    if (ref($option_list) eq 'ARRAY' && @$option_list) {
        my $plugin_options = $option_list->[-1];
        $plugin_args = $plugin_options->{'args'};
    }

    return $plugin_args;
}
	230
	231
# Print this plugin's usage information as XML: switch gsprintf to
# UTF-8 output, emit the XML header, then the plugin description.
sub print_xml_usage
{
    my ($self) = @_;

    # XML output is always in UTF-8
    gsprintf::output_strings_in_UTF8;

    PrintUsage::print_xml_header();

    $self->print_xml();
}
	242
[4778]	243
	244	sub print_xml
	245	{
[8716]	246	my $self = shift(@_);
[4778]	247
[8716]	248	my $optionlistref = $self->{'option_list'};
	249	my @optionlist = @$optionlistref;
[10229]	250	my $pluginoptions = shift(@$optionlistref);
[4778]	251	return if (!defined($pluginoptions));
	252
[9413]	253	gsprintf(STDERR, "<PlugInfo>\n");
	254	gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
	255	my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
[7023]	256	$desc =~ s/</&lt;/g; # doubly escaped
	257	$desc =~ s/>/&gt;/g;
	258
[9413]	259	gsprintf(STDERR, " <Desc>$desc</Desc>\n");
	260	gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
	261	gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
	262	gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} \|\| "no") . "</Explodes>\n");
	263	gsprintf(STDERR, " <Arguments>\n");
[4778]	264	if (defined($pluginoptions->{'args'})) {
[6925]	265	&PrintUsage::print_options_xml($pluginoptions->{'args'});
[3540]	266	}
[4778]	267
	268	# Recurse up the plugin hierarchy
[6925]	269	$self->print_xml();
[4778]	270
[9413]	271	gsprintf(STDERR, " </Arguments>\n");
	272	gsprintf(STDERR, "</PlugInfo>\n");
[3540]	273	}
	274
[4744]	275
# Print the plain-text usage message for a plugin (recursively): first
# work out the description column, then print starting at the leaf class.
sub print_txt_usage
{
    my ($self) = @_;

    my $description_column = $self->determine_description_offset(0);
    $self->print_plugin_usage($description_column, 1);
}
[4744]	283
	284
# Walk the option list (one entry per class in the plugin hierarchy) and
# return the larger of $maxoffset and the longest option string found.
# The option list is consumed while recursing and put back before
# returning.
sub determine_description_offset
{
    my ($self, $maxoffset) = @_;

    my $option_list = $self->{'option_list'};
    my @saved_options = @$option_list;
    my $current = shift(@$option_list);
    defined($current) or return $maxoffset;

    # Widen the column if this plugin has a longer option string
    my $current_args = $current->{'args'};
    if (defined($current_args)) {
        my $width = &PrintUsage::find_longest_option_string($current_args);
        $maxoffset = $width if ($width > $maxoffset);
    }

    # Recurse up the plugin hierarchy
    $maxoffset = $self->determine_description_offset($maxoffset);
    $self->{'option_list'} = \@saved_options;
    return $maxoffset;
}
	309
	310
	311	sub print_plugin_usage
	312	{
[8716]	313	my $self = shift(@_);
	314	my $descoffset = shift(@_);
	315	my $isleafclass = shift(@_);
[4750]	316
[8716]	317	my $optionlistref = $self->{'option_list'};
	318	my @optionlist = @$optionlistref;
[10229]	319	my $pluginoptions = shift(@$optionlistref);
[4750]	320	return if (!defined($pluginoptions));
	321
[8716]	322	my $pluginname = $pluginoptions->{'name'};
	323	my $pluginargs = $pluginoptions->{'args'};
	324	my $plugindesc = $pluginoptions->{'desc'};
[4750]	325
	326	# Produce the usage information using the data structure above
	327	if ($isleafclass) {
[6932]	328	if (defined($plugindesc)) {
[9413]	329	gsprintf(STDERR, "$plugindesc\n\n");
[6932]	330	}
[9413]	331	gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
[4750]	332	}
	333
	334	# Display the plugin options, if there are some
	335	if (defined($pluginargs)) {
[4744]	336	# Calculate the column offset of the option descriptions
[8716]	337	my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
[4744]	338
[4750]	339	if ($isleafclass) {
[9413]	340	gsprintf(STDERR, " {common.specific_options}:\n");
[4750]	341	}
	342	else {
[9413]	343	gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
[4750]	344	}
	345
[4744]	346	# Display the plugin options
[6925]	347	&PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
[4744]	348	}
	349
[4750]	350	# Recurse up the plugin hierarchy
[6925]	351	$self->print_plugin_usage($descoffset, 0);
[4750]	352	$self->{'option_list'} = \@optionlist;
[4744]	353	}
	354
	355
# BasPlug constructor.
#   $pluginlist      - array ref of class names; this class is appended and
#                      the first element is used as the plugin name
#   $args            - array ref of command-line arguments for the plugin
#   $hashArgOptLists - hash ref holding "ArgList" (argument descriptions
#                      handed to parse2::parse) and "OptList" (one options
#                      hash per class, used for usage output)
# Returns the blessed plugin object.  If "-gsdlinfo" is among the
# arguments no parsing is done and a minimal object is returned.  If the
# arguments fail to parse, usage is printed and the constructor dies.
sub new {
    my $class = shift (@_);
    my ($pluginlist,$args,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);
    my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;

    # Add the GIS options to the shared argument table exactly once, and
    # do it BEFORE the table is copied into ArgList.  Previously they were
    # appended after the copy and on every construction, so the first
    # plugin could not parse them and later plugins saw duplicates.
    if (GISBasPlug::has_mapdata()) {
        my %already_listed = map { $_->{'name'} => 1 } @$arguments;
        push(@$arguments, grep { !$already_listed{$_->{'name'}} } @$gis_arguments);
    }

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    my $self = {};
    $self->{'outhandle'} = STDERR;
    $self->{'option_list'} = $hashArgOptLists->{"OptList"};
    $self->{"info_only"} = 0;

    # Check if gsdlinfo is in the argument list or not - if it is, don't parse
    # the args, just return the object.
    foreach my $strArg (@{$args})
    {
        if ($strArg eq "-gsdlinfo")
        {
            $self->{"info_only"} = 1;
            return bless $self, $class;
        }
    }

    if (!parse2::parse($args,$hashArgOptLists->{"ArgList"},$self))
    {
        my $classTempClass = bless $self, $class;
        print STDERR "<BadPlugin p=$plugin_name>\n";
        &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
        $classTempClass->print_txt_usage(""); # Use default resource bundle
        die "\n";
    }

    # parsing was successful
    delete $self->{"info_only"};

    $self->{'plugin_type'} = $plugin_name;
    # explicit method call: the indirect form "new textcat()" is ambiguous
    # inside a package that defines its own new()
    $self->{'textcat'} = textcat->new();
    $self->{'num_processed'} = 0;
    $self->{'num_not_processed'} = 0;
    $self->{'num_blocked'} = 0;
    $self->{'num_archives'} = 0;
    $self->{'cover_image'} = 1; # cover image is on by default
    $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});

    # -associate_ext is a comma separated list; keep it as a lookup table
    my $associate_ext = $self->{'associate_ext'};
    if ((defined $associate_ext) && ($associate_ext ne "")) {
        my %associate_ext_lookup = map { $_ => 1 } split(/,/, $associate_ext);
        $self->{'associate_ext_lookup'} = \%associate_ext_lookup;
    }

    $self->{'shared_fileroot'} = {};
    $self->{'file_blocks'} = {};

    if ($self->{'extract_placenames'}) {

        my $outhandle = $self->{'outhandle'};

        my $places_ref
            = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});

        if (!defined $places_ref) {
            print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
            print $outhandle " No placename extraction will take place.\n";
            $self->{'extract_placenames'} = undef;
        }
        else {
            $self->{'places'} = $places_ref;
        }
    }

    return bless $self, $class;
}
	448
[1242]	449	# initialize BasPlug options
	450	# if init() is overridden in a sub-class, remember to call BasPlug::init()
	451	sub init {
	452	my $self = shift (@_);
[2785]	453	my ($verbosity, $outhandle, $failhandle) = @_;
[1242]	454
	455	# verbosity is passed through from the processor
	456	$self->{'verbosity'} = $verbosity;
	457
[2785]	458	# as are the outhandle and failhandle
[1424]	459	$self->{'outhandle'} = $outhandle if defined $outhandle;
[2785]	460	$self->{'failhandle'} = $failhandle;
[1424]	461
[1242]	462	# set process_exp and block_exp to defaults unless they were
	463	# explicitly set
[1244]	464
	465	if ((!$self->is_recursive()) and
[1242]	466	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
[1244]	467
[1242]	468	$self->{'process_exp'} = $self->get_default_process_exp ();
	469	if ($self->{'process_exp'} eq "") {
[1244]	470	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
[1242]	471	}
	472	}
	473
	474	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
	475	$self->{'block_exp'} = $self->get_default_block_exp ();
	476	}
[11089]	477
[1242]	478	}
	479
# Start of a plugin pass: sets up the extractors.  The remaining
# arguments are accepted but not used here.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    return $self->initialise_extractors();
}
	485
	486	sub end {
[10155]	487	# potentially called at the end of each plugin pass
	488	# import.pl only has one plugin pass, but buildcol.pl has multiple ones
	489
[839]	490	my ($self) = @_;
[1396]	491	$self->finalise_extractors();
[839]	492	}
	493
# Called only once, after all plugin passes have been done.
# The base class has nothing to release; this is a hook for sub-classes.
sub deinit
{
    my ($self) = @_;
}
	499
[1242]	500	# this function should be overridden to return 1
	501	# in recursive plugins
sub is_recursive {
    my ($self) = @_;

    # the base plugin is not recursive
    return 0;
}
	507
# Default block expression: empty in the base class.
sub get_default_block_exp {
    my ($self) = @_;

    return "";
}
	513
	514	sub get_default_process_exp {
	515	my $self = shift (@_);
	516
	517	return "";
	518	}
	519
[9067]	520	# default implementation is to do nothing.
	521	sub store_block_files
	522	{
	523	my $self =shift (@_);
	524	my ($filename) = @_;
	525	return;
	526	}
	527
	528	#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
	529	sub block_cover_image
	530	{
[10833]	531	my $self =shift;
	532	my $filename = shift;
	533
[9067]	534	if ($self->{'cover_image'}) {
	535	my $coverfile = $filename;
	536	$coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
	537	if (!-e $coverfile) {
	538	$coverfile =~ s/jpg$/JPG/;
	539	}
	540	if (-e $coverfile) {
	541	$self->{'file_blocks'}->{$coverfile} = 1;
[11089]	542	}
[9067]	543	}
	544
	545	return;
	546	}
	547
# First look at a file, before any document is read.
#  - When -associate_ext is in use, remembers for each file prefix which
#    extension is the main document ('tie_to') and which listed
#    extensions exist alongside it ('exts').
#  - Returns undef if this plugin does not process the file (no
#    process_exp, no match, or not a plain file).
#  - Otherwise triggers smart blocking / cover image blocking and
#    returns 1.
sub metadata_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    # An empty pattern must never be handed to the regex engine: Perl
    # would silently reuse the last successful pattern instead.
    my $process_exp = $self->{'process_exp'};
    my $has_process_exp = (defined $process_exp) && ($process_exp ne "");

    # Keep track of filenames with same root but different extensions
    # Used to support -associate_ext
    my $associate_ext = $self->{'associate_ext'};
    if ((defined $associate_ext) && ($associate_ext ne "")) {

        # split at the last dot: everything before it / everything after it
        my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
        if ((defined $file_prefix) && (defined $file_ext)) {

            my $shared_fileroot = $self->{'shared_fileroot'};
            if (!defined $shared_fileroot->{$file_prefix}) {
                $shared_fileroot->{$file_prefix} = { 'tie_to' => undef, 'exts' => {} };
            }
            my $file_prefix_rec = $shared_fileroot->{$file_prefix};

            if ($has_process_exp && $file =~ m/$process_exp/) {
                # This is the document the others should be tied to
                $file_prefix_rec->{'tie_to'} = $file_ext;
            }
            elsif (defined $self->{'associate_ext_lookup'}->{$file_ext}) {
                $file_prefix_rec->{'exts'}->{$file_ext} = 1;
            }
        }
    }

    # now check whether we are actually processing this
    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
    if (!$has_process_exp || $filename !~ /$process_exp/ || !-f $filename) {
        return undef; # can't recognise
    }

    # do smart blocking if appropriate
    if ($self->{'smart_block'}) {
        $self->store_block_files($filename);
    }
    # block the cover image if there is one
    if ($self->{'cover_image'}) {
        $self->block_cover_image($filename);
    }

    return 1;
}
	600
# Returns 1 when $file_ext is the extension recorded as 'tie_to' in
# $file_prefix_rec (i.e. this file is the main document for its prefix),
# 0 otherwise - including when there is no record or no 'tie_to' yet.
sub tie_to_filename
{
    my ($self, $file_ext, $file_prefix_rec) = @_;

    return 0 unless defined $file_prefix_rec;

    my $main_ext = $file_prefix_rec->{'tie_to'};
    return 0 unless defined $main_ext;

    return ($main_ext eq $file_ext) ? 1 : 0;
}
	619
	620	sub tie_to_assoc_file
	621	{
	622	my $self = shift (@_);
	623	my ($file_ext,$file_prefix_rec) = @_;
	624
	625	if (defined $file_prefix_rec) {
	626	my $tie_to = $file_prefix_rec->{'tie_to'};
	627	if (defined $tie_to) {
	628
	629	my $exts = $file_prefix_rec->{'exts'};
	630
	631	my $has_file_ext = $exts->{$file_ext};
	632
	633	if ($has_file_ext) {
	634	return 1;
	635	}
	636	}
	637	}
	638
	639	return 0;
	640	}
	641
	642
	643	sub associate_with
	644	{
	645	my $self = shift (@_);
	646	my ($file, $filename, $metadata) = @_;
	647
	648	my $associate_ext = $self->{'associate_ext'};
	649
[9351]	650
[8892]	651	return 0 if (!$associate_ext);
	652
	653	# If file, see if matches with "tie_to" doc or is one of the
	654	# associated filename extensions.
	655
	656	my ($file_prefix,$file_ext) = ($file =~ m/^(.)\.(.?)$/);
	657	if ((defined $file_prefix) && (defined $file_ext)) {
	658
	659	my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
[9351]	660
[8892]	661	if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
	662
	663	# Set up gsdlassocfile_tobe
	664
	665	my $exts = $file_prefix_rec->{'exts'};
	666
	667	if (!defined $metadata->{'gsdlassocfile_tobe'}) {
	668	$metadata->{'gsdlassocfile_tobe'} = [];
	669	}
	670
	671	my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
	672
	673	my ($full_prefix) = ($filename =~ m/^(.)\..?$/);
	674	foreach my $e (keys %$exts) {
	675	my $assoc_file = "$full_prefix.$e";
	676	my $mime_type = ""; # let system auto detect this
	677	push(@$assoc_tobe,"$assoc_file:$mime_type:");
	678	}
	679	}
	680	elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
	681	# a form of smart block
	682
	683	return 1;
	684	}
	685	}
	686
	687	return 0;
	688	}
	689
	690
# Decide whether this plugin should handle $file.
# Returns a two element list:
#   (0, undef)         the file is blocked (num_blocked is incremented)
#   (undef, undef)     the file is not recognised by this plugin
#   (1, $filename)     go ahead; $filename is the full path
sub read_block {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $filename = ($base_dir =~ /\w/)
        ? &util::filename_cat ($base_dir, $file)
        : $file;

    # true when $filename has been entered into the file_blocks table
    my $in_file_blocks = sub {
        return (defined $self->{'file_blocks'}->{$filename}
                && $self->{'file_blocks'}->{$filename} == 1);
    };

    my $blocked = 0;
    if ($self->associate_with($file,$filename,$metadata)) {
        # a form of smart block
        $blocked = 1;
    }
    elsif ($self->{'smart_block'} || $self->{'smart_block_BN'}) {
        # smart blocking: only the file_blocks table counts
        $blocked = 1 if $in_file_blocks->();
    }
    elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $blocked = 1;
    }
    elsif ($self->{'cover_image'}) {
        # cover images are blocked even without smart blocking
        $blocked = 1 if $in_file_blocks->();
    }

    if ($blocked) {
        $self->{'num_blocked'} ++;
        return (0,undef); # blocked
    }

    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return (undef,undef); # can't recognise
    }

    return (1,$filename);
}
	732
	733	sub read_tidy_file {
	734
	735	my $self = shift (@_);
	736
	737	my ($file) = @_;
	738
[1242]	739	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[1844]	740
[10280]	741	return $file;
	742	}
	743
	744
	745
	746	# The BasPlug read_into_doc_obj() function. This function does all the
	747	# right things to make general options work for a given plugin. It reads in
	748	# a file and sets up a slew of metadata all saved in doc_obj, which
	749	# it then returns as part of a tuple (process_status,doc_obj)
	750	#
	751	# Much of this functionality used to reside in read, but it was broken
	752	# down into a supporting routine to make the code more flexible.
	753	#
	754	# recursive plugins (e.g. RecPlug) and specialized plugins like those
	755	# capable of processing many documents within a single file (e.g.
	756	# GMLPlug) will normally want to implement their own version of
	757	# read_into_doc_obj()
	758	#
	759	# Note that $base_dir might be "" and that $file might
	760	# include directories
	761	sub read_into_doc_obj {
	762	my $self = shift (@_);
	763	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
	764
	765	if ($self->is_recursive()) {
	766	gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
	767	}
	768
	769	my $outhandle = $self->{'outhandle'};
	770
	771	my ($block_status,$filename) = $self->read_block(@_);
	772	return $block_status if ((!defined $block_status) \|\| ($block_status==0));
	773	$file = $self->read_tidy_file($file);
	774
[2811]	775	# Do encoding stuff
	776	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
[1844]	777
[1242]	778	# create a new document
[1379]	779	my $doc_obj = new doc ($filename, "indexed_doc");
[2327]	780	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
[1844]	781	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
[1868]	782	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
[7508]	783	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[8166]	784	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
	785
[2796]	786	my ($filemeta) = $file =~ /([^\\\/]+)$/;
[4845]	787	# how do we know what encoding the filename is in?
	788	$doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
[2816]	789	if ($self->{'cover_image'}) {
	790	$self->associate_cover_image($doc_obj, $filename);
	791	}
[1242]	792
	793	# read in file ($text will be in utf8)
	794	my $text = "";
[2734]	795	$self->read_file ($filename, $encoding, $language, \$text);
[1242]	796
[1844]	797	if (!length ($text)) {
[2811]	798	my $plugin_name = ref ($self);
[9586]	799	if ($gli) {
	800	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
	801	}
[9703]	802	gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
[2785]	803
	804	my $failhandle = $self->{'failhandle'};
[9413]	805	gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
[5681]	806	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
[2785]	807	$self->{'num_not_processed'} ++;
	808
[10280]	809	return (0,undef); # what should we return here?? error but don't want to pass it on
[1242]	810	}
[1954]	811
[1242]	812	# include any metadata passed in from previous plugins
	813	# note that this metadata is associated with the top level section
[8510]	814
[1242]	815	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
[9398]	816
[8716]	817	# do plugin specific processing of doc_obj
	818	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
	819	$text = '';
	820	undef $text;
[9584]	821	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
[10280]	822	return (-1,undef);
[8716]	823	}
	824	$text='';
	825	undef $text;
[9398]	826
[1242]	827	# do any automatic metadata extraction
	828	$self->auto_extract_metadata ($doc_obj);
[1954]	829
[1242]	830	# add an OID
[3515]	831	# see if there is a plugin-specific set_OID function...
[8716]	832	if (defined ($self->can('set_OID'))) {
[3515]	833	# it will need $doc_obj to set the Identifier metadata...
	834	$self->set_OID($doc_obj);
	835	} else {
	836	# use the default set_OID() in doc.pm
	837	$doc_obj->set_OID();
	838	}
[10280]	839
	840	return (1,$doc_obj);
	841	}
[1242]	842
	843
[10280]	844	# The BasPlug read() function. This function calls read_into_doc_obj()
	845	# to ensure all the right things to make general options work for a
	846	# given plugin are done. It then calls the process() function which
	847	# does all the work specific to a plugin (like the old read functions
	848	# used to do). Most plugins should define their own process() function
	849	# and let this read() function keep control.
	850	#
	851	# recursive plugins (e.g. RecPlug) and specialized plugins like those
	852	# capable of processing many documents within a single file (e.g.
	853	# GMLPlug) might want to implement their own version of read(), but
	854	# more likely need to implement their own version of read_into_doc_obj()
	855	#
	856	# Return number of files processed, undef if can't recognise, -1 if can't
	857	# process
	858
	859	sub read {
	860	my $self = shift (@_);
	861	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
	862
	863	my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
	864
	865	if ((defined $process_status) && ($process_status == 1)) {
	866	# process the document
	867	$processor->process($doc_obj);
	868
	869	if(defined($self->{'places_filename'})){
	870	&util::rm($self->{'places_filename'});
	871	$self->{'places_filename'} = undef;
	872	}
	873
	874	$self->{'num_processed'} ++;
	875	undef $doc_obj;
[9398]	876	}
	877
[10280]	878	# if process_status == 1, then the file has been processed.
	879	return $process_status;
	880
[4]	881	}
	882
[1244]	883	# returns undef if file is rejected by the plugin
# Placeholder for the plugin specific work; every concrete plugin must
# provide its own process().  Sub-class versions return undef if the
# file is rejected by the plugin.  This base version reports the missing
# implementation and always dies.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # die unconditionally; previously the die was chained with && and so
    # depended on the return value of gsprintf
    gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n");
    die "\n";
}
	893
[1219]	894	# uses the multiread package to read in the entire file pointed to
	895	# by filename and loads the resulting text into $$textref. Input text
	896	# may be in any of the encodings handled by multiread, output text
	897	# will be in utf8
	898	sub read_file {
	899	my $self = shift (@_);
[2734]	900	my ($filename, $encoding, $language, $textref) = @_;
[4]	901
[1756]	902	if (!-r $filename)
	903	{
[1844]	904	my $outhandle = $self->{'outhandle'};
[9413]	905	gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
[5681]	906	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
[1756]	907	return;
	908	}
[1219]	909	$$textref = "";
[9413]	910	if (!open (FILE, $filename)) {
	911	gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
[10280]	912	die "\n";
	913	}
	914
[1844]	915	if ($encoding eq "ascii") {
[1219]	916	undef $/;
	917	$$textref = <FILE>;
	918	$/ = "\n";
	919	} else {
	920	my $reader = new multiread();
	921	$reader->set_handle ('BasPlug::FILE');
[1844]	922	$reader->set_encoding ($encoding);
[1219]	923	$reader->read_file ($textref);
[10280]	924	#Now segments chinese if the separate_cjk option is set
[6584]	925	if ($self->{'separate_cjk'}) {
[1219]	926	# segment the Chinese words
	927	$$textref = &cnseg::segment($$textref);
	928	}
	929	}
[10280]	930	close FILE;
	931	}
[1219]	932
[10280]	933	# write_file -- used by ConvertToPlug, for example in post processing
	934	#
	935	sub utf8_write_file {
	936	my $self = shift (@_);
	937	my ($textref, $filename) = @_;
	938
	939	if (!open (FILE, ">$filename")) {
	940	gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
	941	die "\n";
	942	}
	943	print FILE $$textref;
	944
[1219]	945	close FILE;
	946	}
	947
[10280]	948
# Derive a title from a file name: underscores become spaces and
# everything from the first dot onwards is removed.
sub filename_based_title
{
    my ($self, $file) = @_;

    my $title = $file;
    for ($title) {
        s/_/ /g;
        s/\..*?$//;
    }

    return $title;
}
	960
[9398]	961
# title_fallback -- if $section of $doc_obj has no "Title" metadata yet,
# add one derived from $file via filename_based_title.
sub title_fallback
{
    my ($self, $doc_obj, $section, $file) = @_;

    # an existing Title always wins
    return if defined $doc_obj->get_metadata_element ($section, "Title");

    my $derived_title = $self->filename_based_title($file);
    $doc_obj->add_metadata ($section, "Title", $derived_title);
}
	973
# textcat_get_language_encoding -- decide which language and encoding to
# use for $filename, according to the plugin options:
#
#   input_encoding "auto" - both values come from get_language_encoding
#   extract_language set  - language comes from get_language_encoding,
#                           encoding is the configured input_encoding
#   otherwise             - default_language and input_encoding
#
# Returns the list ($language, $encoding).
sub textcat_get_language_encoding {
    my ($self, $filename) = @_;

    # use textcat to automatically work out the input encoding and language
    if ($self->{'input_encoding'} eq "auto") {
        my ($language, $encoding) = $self->get_language_encoding ($filename);
        return ($language, $encoding);
    }

    # nothing to detect: fall back on the configured values
    if (!$self->{'extract_language'}) {
        return ($self->{'default_language'}, $self->{'input_encoding'});
    }

    # use textcat to get language metadata
    my ($language, $extracted_encoding) = $self->get_language_encoding ($filename);

    # read only after the call above, which may update input_encoding
    my $encoding = $self->{'input_encoding'};

    # don't print this message for english... english in utf8 is identical
    # to english in iso-8859-1 (except for some punctuation). We don't have
    # a language model for en_utf8, so textcat always says iso-8859-1!
    my $looks_mismatched = ($extracted_encoding ne $encoding) && ($language ne "en");
    if ($looks_mismatched && $self->{'verbosity'}) {
        my $plugin_name = ref ($self);
        my $outhandle = $self->{'outhandle'};
        gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
    }

    return ($language, $encoding);
}
	1005
# get_language_encoding -- use textcat to work out the encoding and
# language of the text in $filename.
#
# The <title> element is removed before classification, and all HTML
# tags are removed when the plugin is HTMLPlug or has converted the
# document to HTML.
#
# Returns the list ($language, $encoding). Values that textcat cannot
# supply are replaced by $self->{'default_language'} and
# $self->{'default_encoding'}. Dies with "\n" (after reporting to STDERR)
# if the file cannot be opened.
sub get_language_encoding {
    my $self = shift (@_);
    my ($filename) = @_;
    my $outhandle = $self->{'outhandle'};

    # read in file
    my $fh;
    if (!open ($fh, '<', $filename)) {
        # report and die unconditionally, whatever gsprintf returns
        gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
        die "\n";
    }
    my $text = do { local $/ = undef; <$fh> };
    close $fh;

    # check if first few bytes have a Byte Order Marker
    my $unicode_format = "";
    my $bom = substr ($text, 0, 2);
    if ($bom eq "\xff\xfe" || $bom eq "\xfe\xff") {
        # little or big endian 16bit unicode
        $unicode_format = "unicode";
    } elsif (substr ($text, 0, 3) eq "\xef\xbb\xbf") {
        # utf-8 coded FEFF bom
        $unicode_format = "utf8";
    }

    # VB scripting generated Word to HTML file
    # The capture stops at the first closing quote so that further
    # attributes on the same line are not swallowed.
    # NOTE(review): this overwrites the plugin-wide input_encoding option,
    # so it presumably also applies to files processed later -- confirm
    # that this is intended.
    if ($text =~ /charset=(windows[^\"\n]*)\"/i) {
        my $vbhtml_encoding = $1;
        $vbhtml_encoding =~ s/-+/_/g;
        $self->{'input_encoding'} = $vbhtml_encoding;
    }

    # remove <title>stuff</title> -- as titles tend often to be in English
    # for foreign language documents
    $text =~ s/<title>(.|\n)*?<\/title>//i;

    # remove all HTML tags
    # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
    if (ref($self) eq 'HTMLPlug' ||
        (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')) {
        $text =~ s/<[^>]*>//sg;
    }

    # get the language/encoding
    my $results = $self->{'textcat'}->classify(\$text);

    # if textcat returns 3 or less possibilities we'll use the
    # first one in the list - otherwise use the defaults
    if (scalar @$results > 3) {
        my $best_encoding = "";
        if ($unicode_format) { # in case the first had a BOM
            $best_encoding = $unicode_format;
        } else {
            # count how often each encoding (the part after the last "-")
            # occurs among the candidates
            my %guessed_encodings = ();
            foreach my $result (@$results) {
                next unless $result =~ /([^\-]+)$/;
                $guessed_encodings{$1}++;
            }

            $guessed_encodings{""} = -1; # for default best_encoding of ""
            # sorted so that ties resolve the same way on every run
            foreach my $enc (sort keys %guessed_encodings) {
                if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}) {
                    $best_encoding = $enc;
                }
            }
        }

        if ($self->{'input_encoding'} ne 'auto') {
            if ($self->{'extract_language'} && ($self->{'verbosity'} > 2)) {
                gsprintf($outhandle,
                         "BasPlug: {BasPlug.could_not_extract_language}\n",
                         $filename, $self->{'default_language'});
            }
            return ($self->{'default_language'}, $self->{'input_encoding'});
        } else {
            if ($self->{'verbosity'} > 2) {
                gsprintf($outhandle,
                         "BasPlug: {BasPlug.could_not_extract_language}\n",
                         $filename, $self->{'default_language'});
            }
            return ($self->{'default_language'}, $best_encoding);
        }
    }

    # format language/encoding: the best candidate has the form
    # "language" or "language-encoding"
    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
    if (!defined $language) {
        if ($self->{'verbosity'} > 2) {
            gsprintf($outhandle,
                     "BasPlug: {BasPlug.could_not_extract_language}\n",
                     $filename, $self->{'default_language'});
        }
        $language = $self->{'default_language'};
    }
    if (!defined $encoding) {
        if ($self->{'verbosity'} > 2) {
            gsprintf($outhandle,
                     "BasPlug: {BasPlug.could_not_extract_encoding}\n",
                     $filename, $self->{'default_encoding'});
        }
        $encoding = $self->{'default_encoding'};
    }

    # check for equivalents where textcat doesn't have some encodings...
    # eg MS versions of standard encodings
    my %windows_codepage_for = (
        1  => 'windows_1252',   # Western Europe
        15 => 'windows_1252',   # Western Europe
        2  => 'windows_1250',   # Central Europe
        5  => 'windows_1251',   # Cyrillic
        6  => 'windows_1256',   # Arabic
        7  => 'windows_1253',   # Greek
        8  => 'windows_1255',   # Hebrew
        9  => 'windows_1254',   # Turkish
    );
    if ($encoding =~ /^iso_8859_(\d+)/) {
        my $iso = $1 + 0; # which variant of the iso standard?
        # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
        if ($text =~ /[\x80-\x9f]/ && exists $windows_codepage_for{$iso}) {
            $encoding = $windows_codepage_for{$iso};
        }
    }

    if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
        !defined $encodings::encodings->{$encoding}) {
        if ($self->{'verbosity'}) {
            gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
                     $filename, $encoding, $self->{'default_encoding'});
        }
        $encoding = $self->{'default_encoding'};
    }

    return ($language, $encoding);
}
	1149
# extra_metadata -- add any extra metadata that's been passed around from
# one plugin to another.
#
# extra_metadata uses add_utf8_metadata so it expects metadata values
# to already be in utf8.
#
# Arguments:
#   $doc_obj    - document object that receives the metadata
#   $cursection - section of the document to attach it to
#   $metadata   - hash reference of field => value, where a value may be
#                 a plain scalar or an array reference of scalars
sub extra_metadata {
    my $self = shift (@_);
    my ($doc_obj, $cursection, $metadata) = @_;

    foreach my $field (keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        my $value = $metadata->{$field};

        if ($field eq "gsdlassocfile_tobe") {
            # 'gsdlassocfile_tobe' is artificially introduced metadata
            # that is used to signal that certain additional files should
            # be tied to this document. Useful in situations where a
            # metadata pass in the plugin pipeline works out some files
            # need to be associated with a document, but the document hasn't
            # been formed yet.

            my $equiv_form = "";
            foreach my $gaf (@$value) {
                # each entry has the form "full_filename:mimetype:"
                my ($full_filename, $mimetype) = ($gaf =~ m/^(.*):(.*):$/);
                my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
                # a name without any directory part is its own tail
                $tail_filename = $full_filename unless defined $tail_filename;

                $doc_obj->associate_file($full_filename, $tail_filename, $mimetype);

                # the extension selects the icon macro used for the link
                my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
                my $start_doclink = "<a href=\"_httpcollection_/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
                my $srcicon = "_icon".$doc_ext."_";
                my $end_doclink = "</a>";

                $equiv_form .= " $start_doclink\{If\}\{$srcicon,$srcicon,$doc_ext\}$end_doclink";
            }
            $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
        }
        elsif (ref ($value) eq "ARRAY") {
            foreach my $item (@$value) {
                $doc_obj->add_utf8_metadata ($cursection, $field, $item);
            }
        } else {
            $doc_obj->add_utf8_metadata ($cursection, $field, $value);
        }
    }
}
	1194
# initialise metadata extractors: the acronym module is set up when
# either acronym option (extract_acronyms, markup_acronyms) is on
sub initialise_extractors {
    my ($self) = @_;

    &acronym::initialise_acronyms()
        if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'});
}
	1203
	1204	# finalise metadata extractors
	1205	sub finalise_extractors {
	1206	my $self = shift (@_);
	1207
	1208	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
	1209	&acronym::finalise_acronyms();
	1210	}
	1211	}
	1212
# FIRSTNNN: extract the first NNN characters as metadata
#
# $self->{'first'} is a comma separated list of sizes. For each size the
# whitespace-normalised text of the section is cut to that length and
# stored as "First<size>" metadata. When the text had to be cut, the
# (possibly partial) last word is replaced by an ellipsis; text that
# already fits is stored complete.
#
# Arguments:
#   $textref     - reference to the section text
#   $doc_obj     - document object that receives the metadata
#   $thissection - section the metadata is attached to
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    # normalise once: trim both ends and collapse runs of whitespace
    my $flattened = $$textref;
    $flattened =~ s/^\s+//;
    $flattened =~ s/\s+$//;
    $flattened =~ s/\s+/ /gs;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = substr ($flattened, 0, $size);
        # only mark the text as cut when something really was cut off
        if (length ($tmptext) < length ($flattened)) {
            $tmptext =~ s/\s\S*$/…/;
        }
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
	1228
	1229	sub extract_email {
	1230	my $self = shift (@_);
	1231	my ($textref, $doc_obj, $thissection) = @_;
	1232	my $outhandle = $self->{'outhandle'};
	1233
[9413]	1234	gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
[1844]	1235	if ($self->{'verbosity'} > 2);
[1602]	1236
[2604]	1237	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
[1602]	1238	@email = sort @email;
	1239
[10218]	1240	# if($self->{"new_extract_email"} == 0)
	1241	# {
	1242	# my @email2 = ();
	1243	# foreach my $address (@email)
	1244	# {
	1245	# if (!(join(" ",@email2) =~ m/(^\| )$address( \|$)/ ))
	1246	# {
	1247	# push @email2, $address;
	1248	# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
	1249	# # print $outhandle " extracting $address\n"
	1250	# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
	1251	# if ($self->{'verbosity'} > 3);
	1252	# }
	1253	# }
	1254	# }
	1255	# else
	1256	# {
	1257	my $hashExistMail = {};
[1602]	1258	foreach my $address (@email) {
[10218]	1259	if (!(defined $hashExistMail->{$address}))
	1260	{
	1261	$hashExistMail->{$address} = 1;
[1602]	1262	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
[9413]	1263	gsprintf($outhandle, " {BasPlug.extracting} $address\n")
[1844]	1264	if ($self->{'verbosity'} > 3);
[1602]	1265	}
	1266	}
[9413]	1267	gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
[1844]	1268	if ($self->{'verbosity'} > 2);
[1602]	1269	}
	1270
	1271	# extract metadata
[5681]	1272	sub auto_extract_metadata {
[1954]	1273
[1242]	1274	my $self = shift (@_);
	1275	my ($doc_obj) = @_;
[1602]	1276
	1277	if ($self->{'extract_email'}) {
	1278	my $thissection = $doc_obj->get_top_section();
	1279	while (defined $thissection) {
	1280	my $text = $doc_obj->get_text($thissection);
	1281	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
	1282	$thissection = $doc_obj->get_next_section ($thissection);
	1283	}
[1954]	1284	}
[9398]	1285	if ($self->{'extract_placenames'}) {
	1286	my $thissection = $doc_obj->get_top_section();
	1287	while (defined $thissection) {
	1288	my $text = $doc_obj->get_text($thissection);
	1289	$self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
	1290	$thissection = $doc_obj->get_next_section ($thissection);
	1291	}
	1292	}
[1954]	1293
[11069]	1294	if ($self->{'extract_keyphrases'} \|\| $self->{'extract_keyphrases_kea4'}) {
	1295	$self->extract_keyphrases($doc_obj);
	1296	}
[1954]	1297
[1602]	1298	if ($self->{'first'}) {
	1299	my $thissection = $doc_obj->get_top_section();
	1300	while (defined $thissection) {
	1301	my $text = $doc_obj->get_text($thissection);
	1302	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
	1303	$thissection = $doc_obj->get_next_section ($thissection);
	1304	}
	1305	}
	1306
[1242]	1307	if ($self->{'extract_acronyms'}) {
	1308	my $thissection = $doc_obj->get_top_section();
	1309	while (defined $thissection) {
	1310	my $text = $doc_obj->get_text($thissection);
	1311	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
	1312	$thissection = $doc_obj->get_next_section ($thissection);
	1313	}
	1314	}
[1602]	1315
[1393]	1316	if ($self->{'markup_acronyms'}) {
	1317	my $thissection = $doc_obj->get_top_section();
	1318	while (defined $thissection) {
	1319	my $text = $doc_obj->get_text($thissection);
	1320	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
	1321	$doc_obj->delete_text($thissection);
	1322	$doc_obj->add_text($thissection, $text);
	1323	$thissection = $doc_obj->get_next_section ($thissection);
	1324	}
	1325	}
	1326
[10218]	1327	if($self->{'extract_historical_years'}) {
[1317]	1328	my $thissection = $doc_obj->get_top_section();
	1329	while (defined $thissection) {
[10218]	1330
[1317]	1331	my $text = $doc_obj->get_text($thissection);
[1846]	1332	&DateExtract::get_date_metadata($text, $doc_obj,
	1333	$thissection,
[10218]	1334	$self->{'no_bibliography'},
	1335	$self->{'maximum_year'},
	1336	$self->{'maximum_century'});
[1317]	1337	$thissection = $doc_obj->get_next_section ($thissection);
	1338	}
	1339	}
[1242]	1340	}
	1341
[11069]	1342
	1343	#adding kea keyphrases
	1344	sub extract_keyphrases
	1345	{
	1346	my $self = shift(@_);
	1347	my $doc_obj = shift(@_);
	1348
	1349	# Use Kea 3.0 unless 4.0 has been specified
	1350	my $kea_version = "3.0";
	1351	if ($self->{'extract_keyphrases_kea4'}) {
	1352	$kea_version = "4.0";
	1353	}
	1354
	1355	# Check that Kea exists, and tell the user where to get it if not
	1356	my $keahome = &Kea::get_Kea_directory($kea_version);
	1357	if (!-e $keahome) {
	1358	gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);
	1359	return;
	1360	}
	1361
	1362	my $thissection = $doc_obj->get_top_section();
	1363	my $text = "";
	1364	my $list;
	1365
	1366	#loop through sections to gather whole doc
	1367	while (defined $thissection) {
	1368	my $sectiontext = $doc_obj->get_text($thissection);
	1369	$text = $text.$sectiontext;
	1370	$thissection = $doc_obj->get_next_section ($thissection);
	1371	}
	1372
	1373	if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
	1374	$list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
	1375	} else { #otherwise call Kea with no options
	1376	$list = &Kea::extract_KeyPhrases ($kea_version, $text);
	1377	}
	1378
	1379	if ($list){
	1380	# if a list of kea keyphrases was returned (ie not empty)
	1381	if ($self->{'verbosity'}) {
	1382	gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
	1383	}
	1384
	1385	#add metadata to top section
	1386	$thissection = $doc_obj->get_top_section();
	1387
	1388	# add all key phrases as one metadata
	1389	$doc_obj->add_metadata($thissection, "Keyphrases", $list);
	1390
	1391	# add individual key phrases as multiple metadata
	1392	foreach my $keyphrase (split(',', $list)) {
	1393	$keyphrase =~ s/^\s+\|\s+$//g;
	1394	$doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
	1395	}
	1396	}
	1397	}
	1398
	1399
# extract acronyms from a section in a document. progress is
# reported to outhandle based on the verbosity.
#
# Every acronym returned by acronym::acronyms whose string form is not
# yet among the section's "Acronym" metadata is written out through its
# write_to_file method and added as "Acronym" metadata.
sub extract_acronyms {
    my ($self, $textref, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n");
    }

    my $acro_array = &acronym::acronyms($textref);

    foreach my $acro (@$acro_array) {

        # has this acronym been recorded for the section already?
        my $already_recorded = 0;
        my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
        foreach my $thisAcro (@$previous_data) {
            next unless $thisAcro eq $acro->to_string();

            $already_recorded = 1;
            gsprintf($outhandle, " {BasPlug.already_seen} " .
                     $acro->to_string() . "\n")
                if ($self->{'verbosity'} >= 4);
        }
        next if $already_recorded;

        #write it to the file ...
        $acro->write_to_file();

        #do the normal acronym
        $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
        if ($self->{'verbosity'} > 3) {
            gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n");
        }
    }

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n");
    }
}
	1444
# markup_acronyms -- pass $text through acronym::markup_acronyms and
# return the result. progress is reported to outhandle based on the
# verbosity.
sub markup_acronyms {
    my ($self, $text, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n");
    }

    #self is passed in to check for verbosity ...
    $text = &acronym::markup_acronyms($text, $self);

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n");
    }

    return $text;
}
	1461
# compile_stats -- add this plugin's num_processed, num_not_processed and
# num_archives counters onto the same fields of the $stats hash.
sub compile_stats {
    my ($self, $stats) = @_;

    foreach my $counter ('num_processed', 'num_not_processed') {
        $stats->{$counter} += $self->{$counter};
    }
    return $stats->{'num_archives'} += $self->{'num_archives'};
}
	1471
# associate_cover_image -- look for a cover image next to $filename and
# attach it to $doc_obj.
#
# The image name is $filename with its extension replaced by ".jpg";
# ".JPG" is tried as well. When one exists it is associated with the
# document as "cover.jpg" and "hascover" metadata is set on the top
# section. Names that were found missing are remembered in
# $self->{'covers_missing_cache'} so they are not looked up again.
#
# A $filename without an extension has no cover image name, so nothing
# is done for it (otherwise the document file itself would be attached
# as its own cover).
sub associate_cover_image {
    my $self = shift;
    my ($doc_obj, $filename) = @_;

    my $has_extension = ($filename =~ s/\.[^\\\/\.]+$/.jpg/);
    return unless $has_extension;

    if (exists $self->{'covers_missing_cache'}->{$filename}) {
        # don't stat() for existence eg for multiple document input files
        # (eg SplitPlug)
        return;
    }

    my $top_section = $doc_obj->get_top_section();

    (my $upper_filename = $filename) =~ s/jpg$/JPG/;
    foreach my $candidate ($filename, $upper_filename) {
        next unless -e $candidate;

        $doc_obj->associate_file($candidate, "cover.jpg", "image/jpeg");
        $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
        return;
    }

    # file doesn't exist, so record the fact that it's missing so
    # we don't stat() again (stat is slow)
    $self->{'covers_missing_cache'}->{$filename} = 1;
}
	1503
[4]	1504	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: