Context Navigation

source: gsdl/trunk/perllib/plugins/BasePlugin.pm@ 16557

Last change on this file since 16557 was 16557, checked in by ak19, 16 years ago
Auto filename encoding has several additional settings now, these are handled by subroutine filepath_to_utf8 which has changed accordingly. Some additional helper subroutines added. This file BasePlugin.pm is an intermediate but working version (still has many debug output statements even when most are commented out, but as I want to test the changes out on Windows first, I want to retain the debug statements).
Property svn:keywords set to `Author Date Id Revision`
File size: 32.4 KB

Rev	Line
[537]	1	###########################################################################
	2	#
[15868]	3	# BasePlugin.pm -- base class for all the import plugins
[537]	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
[9413]	8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
[537]	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
[4]	25
[15868]	26	package BasePlugin;
[2219]	27
[10254]	28	use strict;
	29	no strict 'subs';
	30	no strict 'refs'; # allow filehandles to be variables and viceversa
[9413]	31
[8892]	32	use File::Basename;
	33
[1219]	34	use multiread;
[1870]	35	use encodings;
[11389]	36	use unicode;
[16557]	37	use textcat;
[1242]	38	use doc;
[7645]	39	eval "require diagnostics"; # some perl distros (eg mac) don't have this
[2751]	40	use ghtml;
[9413]	41	use gsprintf 'gsprintf';
[4]	42
[15868]	43	use PrintInfo;
[10218]	44
[15868]	45	BEGIN {
	46	@BasePlugin::ISA = ( 'PrintInfo' );
	47	}
[5681]	48
# Encodings offered by every plugin.  The three built-in names come first;
# one entry per key of $encodings::encodings is appended below.
our $encoding_list = [
    { 'name' => "ascii",   'desc' => "{BasePlugin.encoding.ascii}" },
    { 'name' => "utf8",    'desc' => "{BasePlugin.encoding.utf8}" },
    { 'name' => "unicode", 'desc' => "{BasePlugin.encoding.unicode}" },
];

# Append the entries of $encodings::encodings, ordered by their 'name'
# field; that same 'name' field is used as the entry's description.
my $e = $encodings::encodings;
push @{$encoding_list},
    map  { +{ 'name' => $_, 'desc' => $e->{$_}->{'name'} } }
    sort { $e->{$a}->{'name'} cmp $e->{$b}->{'name'} }
    keys %{$e};

# Choices for -filename_encoding: the "auto" strategies first, then every
# explicit encoding from $encoding_list.
our $encoding_plus_auto_list = [
    { 'name' => "auto",
      'desc' => "{BasePlugin.filename_encoding.auto}" },
    { 'name' => "auto-language-analysis",       # textcat
      'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" },
    { 'name' => "auto-filesystem-encoding",     # locale
      'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" },
    { 'name' => "auto-fl",                      # locale followed by textcat
      'desc' => "{BasePlugin.filename_encoding.auto_fl}" },
    { 'name' => "auto-lf",                      # textcat followed by locale
      'desc' => "{BasePlugin.filename_encoding.auto_lf}" },
];

push @{$encoding_plus_auto_list}, @{$encoding_list};
	81
# Command-line arguments understood by every plugin derived from BasePlugin.
# Each 'desc' is a resource-string key in braces, not literal text.
my $arguments = [
    {   'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => '',
        'reqd' => 'no',
    },
    {   'name' => 'no_blocking',
        'desc' => '{BasePlugin.no_blocking}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name' => 'block_exp',
        'desc' => '{BasePlugin.block_exp}',
        'type' => 'regexp',
        'deft' => '',
        'reqd' => 'no',
    },
    {   'name' => 'associate_ext',
        'desc' => '{BasePlugin.associate_ext}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {   'name' => 'associate_tail_re',
        'desc' => '{BasePlugin.associate_tail_re}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {   'name' => 'use_as_doc_identifier',
        'desc' => '{BasePlugin.use_as_doc_identifier}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '',
    },
    {   'name' => 'no_cover_image',
        'desc' => '{BasePlugin.no_cover_image}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name' => 'filename_encoding',
        'desc' => '{BasePlugin.filename_encoding}',
        'type' => 'enum',
        'deft' => 'auto',
        'list' => $encoding_plus_auto_list,
        'reqd' => 'no',
    },
    # deprecated, but leave in for old collections
    {   'name'      => 'smart_block',
        'desc'      => '{common.deprecated}. {BasePlugin.smart_block}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
];


# Plugin description record; new() pushes it onto the caller's "OptList".
my $options = {
    'name'     => 'BasePlugin',
    'desc'     => '{BasePlugin.desc}',
    'abstract' => 'yes',
    'inherits' => 'no',
    'args'     => $arguments,
};
[3540]	135
[4778]	136
sub new {
    # Constructor.
    #   $pluginlist      - array ref; this class name is appended to it
    #   $inputargs       - passed through unchanged to PrintInfo's constructor
    #   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive
    #                      this class's $arguments and $options
    # Returns the PrintInfo-built object re-blessed into $class.

    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call: the indirect-object form "new PrintInfo(...)"
    # is fragile inside a package that defines its own new().
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($self->{'smart_block'}) {
        print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
    }
    $self->{'smart_block'} = undef;

    my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
    $self->{'plugin_type'} = $plugin_name;

    $self->{'num_processed'}     = 0;
    $self->{'num_not_processed'} = 0;
    $self->{'num_blocked'}       = 0;
    $self->{'num_archives'}      = 0;

    # cover image is on by default
    $self->{'cover_image'} = 1;
    $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});

    # -associate_ext is shorthand for -associate_tail_re: "jpg,png" becomes
    # "(?:\.jpg)|(?:\.png)".  If both options were given, associate_tail_re
    # is left as supplied.
    my $associate_ext = $self->{'associate_ext'};
    if ((defined $associate_ext) && ($associate_ext ne "")) {

        my $associate_tail_re = $self->{'associate_tail_re'};
        if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
            print $outhandle " defaulting to 'associate_tail_re'\n";
        }
        else {
            # Build the alternation without modifying the split list in place.
            my @exts_bracketed = ();
            foreach my $ext (split(/,/, $associate_ext)) {
                push(@exts_bracketed, "(?:\\.$ext)");
            }
            $self->{'associate_tail_re'} = join("|", @exts_bracketed);
        }

        delete $self->{'associate_ext'};
    }

    return bless $self, $class;

}
	192
[15868]	193	# initialize BasePlugin options
	194	# if init() is overridden in a sub-class, remember to call BasePlugin::init()
sub init {
    # Stores the verbosity and output handles handed over by the processor,
    # then fills in process_exp / block_exp from the plugin's defaults when
    # they were not explicitly set.
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->{'verbosity'} = $verbosity;

    # an undefined outhandle leaves any existing one in place;
    # the failhandle is always overwritten
    $self->{'outhandle'}  = $outhandle if defined $outhandle;
    $self->{'failhandle'} = $failhandle;

    # Only non-recursive plugins get a default process_exp.
    if (!$self->is_recursive()
        && ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq ""))) {

        $self->{'process_exp'} = $self->get_default_process_exp ();
        if ($self->{'process_exp'} eq "") {
            warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
        }
    }

    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
        $self->{'block_exp'} = $self->get_default_block_exp ();
    }

}
	224
sub begin {
    # Hook with an empty default implementation: the arguments are unpacked
    # for reference only and nothing is done with them.
    my $self = shift;
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
}
	229
	230	sub end {
[10155]	231	# potentially called at the end of each plugin pass
	232	# import.pl only has one plugin pass, but buildcol.pl has multiple ones
	233
[15868]	234	my ($self) = shift (@_);
[839]	235	}
	236
sub deinit {
    # Hook with an empty default implementation.
    # called only once, after all plugin passes have been done
    my ($self) = @_;
}
	242
sub set_incremental {
    # Records the caller-supplied value under the 'incremental' key.
    my ($self, $incremental) = @_;

    $self->{'incremental'} = $incremental;
}
	249
[1242]	250	# this function should be overridden to return 1
	251	# in recursive plugins
sub is_recursive {
    # Base plugins are not recursive; recursive plugins override this
    # to return 1.
    my ($self) = @_;

    return 0;
}
	257
sub get_default_block_exp {
    # Default block expression: the empty string (init() uses this when
    # block_exp was not set explicitly).
    my ($self) = @_;

    return "";
}
	263
	264	sub get_default_process_exp {
	265	my $self = shift (@_);
	266
	267	return "";
	268	}
	269
[16390]	270	# default implementation is to do nothing
	271	sub store_block_files {
	272
[9067]	273	my $self =shift (@_);
[16390]	274	my ($filename_full_path, $block_hash) = @_;
	275
[9067]	276	}
	277
[16390]	278	# put files to block into hash
	279	sub use_block_expressions {
	280
	281	my $self =shift (@_);
	282	my ($filename_full_path, $block_hash) = @_;
	283
	284	if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
	285	$block_hash->{'file_blocks'}->{$filename_full_path} = 1;
	286	}
	287
	288	}
	289
[9067]	290	#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
	291	sub block_cover_image
	292	{
[10833]	293	my $self =shift;
[16390]	294	my ($filename, $block_hash) = @_;
[10833]	295
[9067]	296	if ($self->{'cover_image'}) {
	297	my $coverfile = $filename;
	298	$coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
	299	if (!-e $coverfile) {
	300	$coverfile =~ s/jpg$/JPG/;
	301	}
	302	if (-e $coverfile) {
[16390]	303	$block_hash->{'file_blocks'}->{$coverfile} = 1;
[11089]	304	}
[9067]	305	}
	306
	307	return;
	308	}
[11122]	309
	310
[16390]	311	# discover all the files that should be blocked by this plugin
	312	# check the args ...
	313	sub file_block_read {
[11122]	314
[8510]	315	my $self = shift (@_);
[16390]	316	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
[8892]	317	# Keep track of filenames with same root but different extensions
[11122]	318	# Used to support -associate_ext and the more generalised
	319	# -associate_tail_re
[16390]	320	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[8892]	321
[11122]	322	my $associate_tail_re = $self->{'associate_tail_re'};
	323	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
[16390]	324
[11122]	325	my ($file_prefix,$file_ext)
[16390]	326	= &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
	327
[8892]	328	if ((defined $file_prefix) && (defined $file_ext)) {
[16390]	329	my $shared_fileroot = $block_hash->{'shared_fileroot'};
[8892]	330	if (!defined $shared_fileroot->{$file_prefix}) {
[11122]	331	my $file_prefix_rec = { 'tie_to' => undef,
	332	'exts' => {} };
[8892]	333	$shared_fileroot->{$file_prefix} = $file_prefix_rec;
	334	}
	335
	336	my $file_prefix_rec = $shared_fileroot->{$file_prefix};
	337
[16390]	338	if ($self->can_process_this_file($filename_full_path)) {
[8892]	339	# This is the document the others should be tied to
	340	$file_prefix_rec->{'tie_to'} = $file_ext;
	341	}
	342	else {
[11122]	343	if ($file_ext =~ m/$associate_tail_re$/) {
[16390]	344	# this file should be associated to the main one
[9351]	345	$file_prefix_rec->{'exts'}->{$file_ext} = 1;
	346	}
[8892]	347	}
[11122]	348
[8892]	349	}
	350	}
[11122]	351
[16390]	352	# check block expressions
	353	$self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
	354
[9067]	355	# now check whether we are actually processing this
[16390]	356	if (!-f $filename_full_path \|\| !$self->can_process_this_file($filename_full_path)) {
[9067]	357	return undef; # can't recognise
	358	}
[16390]	359
	360	$self->store_block_files($filename_full_path, $block_hash) unless $self->{'no_blocking'};
[8892]	361
[11089]	362	# block the cover image if there is one
	363	if ($self->{'cover_image'}) {
[16390]	364	$self->block_cover_image($filename_full_path, $block_hash) unless $self->{'no_blocking'};
[11089]	365	}
[9067]	366
	367	return 1;
[8510]	368	}
	369
[16390]	370	# plugins that rely on more than process_exp (eg XML plugins) can override this method
	371	sub can_process_this_file {
	372	my $self = shift(@_);
	373	my ($filename) = @_;
[8892]	374
[16390]	375	if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
	376	return 1;
[8892]	377	}
	378	return 0;
[10280]	379
	380	}
	381
[16390]	382	# just converts path as is to utf8.
	383	sub filepath_to_utf8 {
[10280]	384	my $self = shift (@_);
[15868]	385	my ($file, $file_encoding) = @_;
[16390]	386	my $filemeta = $file;
[10280]	387
[16557]	388	my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
	389
	390	## print STDERR "**** User chose filename encoding setting: $filename_encoding\n";
	391
	392	# Whenever filename-encoding is set to any of the auto settings, we
	393	# check if the filename is already in UTF8. If it is, then we're done.
	394	if($filename_encoding =~ m/auto/) {
	395	if(&unicode::check_is_utf8($filemeta))
	396	{
	397	## print STDERR "**** It is already UTF8\n";
	398	$filename_encoding = "utf8";
	399	return $filemeta;
	400	}
	401	}
	402
	403	# Auto setting, but filename is not utf8
	404	if ($filename_encoding eq "auto")
	405	{
	406	# try textcat
	407	$filename_encoding = $self->textcat_encoding($filemeta);
	408
	409	# check the locale next
	410	$filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
	411
	412
	413	# now try the encoding of the document, if available
	414	if ($filename_encoding eq "undefined" && defined $file_encoding) {
	415	$filename_encoding = $file_encoding;
[15868]	416	}
[16557]	417
	418	}
	419
	420	elsif ($filename_encoding eq "auto-language-analysis")
	421	{
	422	$filename_encoding = $self->textcat_encoding($filemeta);
	423
	424	# now try the encoding of the document, if available
	425	if ($filename_encoding eq "undefined" && defined $file_encoding) {
	426	$filename_encoding = $file_encoding;
	427	}
	428	}
	429
	430	elsif ($filename_encoding eq "auto-filesystem-encoding")
	431	{
	432	# try locale
	433	$filename_encoding = $self->locale_encoding();
	434	}
	435
	436	elsif ($filename_encoding eq "auto-fl")
	437	{
	438	# filesystem-encoding (locale) then language-analysis (textcat)
	439	$filename_encoding = $self->locale_encoding();
	440
	441	# try textcat
	442	$filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
	443
	444	# else assume filename encoding is encoding of file content, if that's available
	445	if ($filename_encoding eq "undefined" && defined $file_encoding) {
	446	$filename_encoding = $file_encoding;
[15868]	447	}
[16557]	448	}
[15868]	449
[16557]	450	elsif ($filename_encoding eq "auto-lf")
	451	{
	452	# language-analysis (textcat) then filesystem-encoding (locale)
	453	$filename_encoding = $self->textcat_encoding($filemeta);
	454
	455	# guess filename encoding from encoding of file content, if available
	456	if ($filename_encoding eq "undefined" && defined $file_encoding) {
	457	$filename_encoding = $file_encoding;
	458	}
	459
	460	# try locale
	461	$filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
[15868]	462	}
	463
[16557]	464	## print STDERR "**** filename_encoding selected: $filename_encoding \n";
	465
	466	# if still undefined, use utf8 as fallback
	467	if ($filename_encoding eq "undefined") {
	468	$filename_encoding = "utf8";
	469	}
	470
	471	# if the filename encoding is set to utf8 but it isn't utf8 already--such as when
	472	# 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
	473	# be always utf8 (in which case the filename's encoding is also set as utf8 even
	474	# though the filename need not be if it originates from another system)--in such
	475	# cases attempt to make the filename utf8 to match.
	476	if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
	477	## print STDERR "**** BEFORE utf8 conversion: $filemeta\n";
	478	&unicode::ensure_utf8(\$filemeta);
	479	## print STDERR "**** AFTER utf8 conversion: $filemeta\n";
	480	}
	481
	482
	483	# convert non-unicode encodings to utf8
	484	if ($filename_encoding !~ m/(?:ascii\|utf8\|unicode)/) {
	485	$filemeta = &unicode::unicode2utf8(
	486	&unicode::convert2unicode($filename_encoding, \$filemeta)
[15868]	487	);
	488	}
[16390]	489
[16557]	490	print "*** filename encoding found: $filename_encoding\n";
	491	print "*** utf8 encoded filename: $filemeta\n";
	492
[16390]	493	return $filemeta;
	494	}
	495
	496	# gets the filename with no path, converts to utf8, and then dm safes it.
	497	#filename_encoding set by user
	498	sub filename_to_utf8_metadata
	499	{
	500	my $self = shift (@_);
	501	my ($file, $file_encoding) = @_;
	502
	503	my $outhandle = $self->{'outhandle'};
	504
	505	my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
	506	$filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
	507
[15868]	508	my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
	509
	510	return $dmsafe_filemeta;
	511
[10280]	512	}
	513
sub locale_encoding {
    # Returns the encoding reported by get_filesystem_encoding(), or the
    # string "undefined" when that call yields undef.  The answer is cached
    # in $self->{'filesystem_encoding'}, so the lookup runs at most once
    # per plugin object.
    my $self = shift(@_);

    if (!defined $self->{'filesystem_encoding'}) {
        my $detected = $self->get_filesystem_encoding();
        $self->{'filesystem_encoding'} = defined $detected ? $detected : "undefined";
    }

    # Debug trace: only at high verbosity, and on STDERR rather than STDOUT
    # (this sub runs once per filename).
    if (defined $self->{'verbosity'} && $self->{'verbosity'} > 3) {
        print STDERR "filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
    }

    return $self->{'filesystem_encoding'}; # can be the string "undefined"
}
	525
	526	sub textcat_encoding {
	527	my $self = shift(@_);
	528	my ($filemeta) = @_;
	529
	530	# analyse filenames without extensions and digits (and trimmed of surrounding
	531	# whitespace), so that irrelevant chars don't confuse textcat
	532	my $strictfilemeta = $filemeta;
	533	$strictfilemeta =~ s/\.[^\.]+$//g;
	534	$strictfilemeta =~ s/\d//g;
	535	$strictfilemeta =~ s/^\s*//g;
	536	$strictfilemeta =~ s/\s*$//g;
	537
	538	## print STDERR "**** strict filename is \|$strictfilemeta\|\n";
	539	my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
	540	if(!defined $filename_encoding) {
	541	$filename_encoding = "undefined";
	542	}
	543
	544	## print STDERR "**** textcat found filename encoding: " . $file_textcat_encoding_map{$strictfilemeta} . "\n";
	545	return $filename_encoding; # can be the string "undefined"
	546	}
	547
	548	# performs textcat
	549	sub encoding_from_language_analysis {
	550	my $self = shift(@_);
	551	my ($text) = @_;
	552
	553	my $outhandle = $self->{'outhandle'};
	554	my $best_encoding = undef;
	555
	556	# get the language/encoding of the file using textcat
	557	$self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
	558	#my $results = $self->{'textcat'}->classify(\$text);
	559	my $results = $self->{'textcat'}->classify_cached(\$text);
	560
	561
	562	if (scalar @$results < 0) {
	563	print STDERR "**** Textcat returned 0 results\n";
	564	return undef;
	565	}
	566
	567	print STDERR "**** TEXTCAT RESULTS for $text: ";
	568	print STDERR join(",", @$results);
	569	print STDERR "\n";
	570
	571	# We have some results, we choose the first
	572	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
	573
	574	$best_encoding = $encoding;
	575	if (!defined $best_encoding) {
	576	## print STDERR "**** Textcat cannot determine encoding of filename: it's undefined.\n";
	577	return undef;
	578	}
	579
	580	if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
	581	# the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
	582	## print STDERR "*** Filename turns out to be UTF8\n";
	583	$best_encoding = 'utf8';
	584	}
	585
	586
	587	# check for equivalents where textcat doesn't have some encodings...
	588	# eg MS versions of standard encodings
	589	if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
	590	## print STDERR "**** best_encoding is ISO_8859: $best_encoding\n";
	591
	592	my $iso = $1; # which variant of the iso standard?
	593	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
	594	if ($text =~ /[\x80-\x9f]/) {
	595	## print STDERR "**** best_encoding is some windows value: $best_encoding\n";
	596	# Western Europe
	597	if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
	598	elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
	599	elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
	600	elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
	601	elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
	602	elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
	603	elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
	604	## print STDERR "**** best_encoding windows value: $best_encoding\n";
	605	}
	606	}
	607
	608	if (defined $best_encoding && $best_encoding !~ /^(ascii\|utf8\|unicode)$/ &&
	609	!defined $encodings::encodings->{$best_encoding})
	610	{
	611	if ($self->{'verbosity'}) {
	612	gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
	613	}
	614	## print STDERR "***** unsupported encoding: $best_encoding. Setting it to undefined.\n";
	615	$best_encoding = undef;
	616	}
	617	## print STDERR "**** language: $language\n" if defined $language;
	618	## print STDERR "**** encoding: $best_encoding\n" if defined $encoding;
	619
	620	return $best_encoding;
	621	}
	622
	623	# uses locale
sub get_filesystem_encoding {
    # Derives an encoding name from the codeset part of the current
    # LC_CTYPE locale ("language_TERRITORY.codeset[@modifier]").
    # Returns the normalised name when it is ascii/utf8/unicode or a key of
    # $encodings::encodings, and undef otherwise (no codeset, unsupported
    # codeset, or the locale could not be queried).

    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $filesystem_encoding = undef;

    eval {
        use POSIX qw(locale_h);

        # With only one parameter, setlocale retrieves the
        # current value
        my $current_locale = setlocale(LC_CTYPE);

        # Codeset = text after the last dot, without any "@modifier" suffix.
        if (defined $current_locale && $current_locale =~ m/^.*\.(.*?)(?:\@.*)?$/) {
            my $char_encoding = lc($1);

            # "iso88591" -> "iso_8859_1"
            if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
                $char_encoding = "${1}_${2}_${3}";
            }

            $char_encoding =~ s/-/_/g;
            $char_encoding =~ s/^utf_8$/utf8/;

            # A purely numeric codeset is a code page number: try the
            # windows_ and then the dos_ entry of that number.
            if ($char_encoding =~ m/^\d+$/) {
                if (defined $encodings::encodings->{"windows_$char_encoding"}) {
                    $char_encoding = "windows_$char_encoding";
                }
                elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
                    $char_encoding = "dos_$char_encoding";
                }
            }

            if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
                || (defined $encodings::encodings->{$char_encoding})) {
                $filesystem_encoding = $char_encoding;
            }
            else {
                print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
            }
        }

    };
    if ($@) {
        print $outhandle "$@\n";
        print $outhandle "Warning: Unable to establish locale. Will assume filesytem is UTF-8\n";

    }
    return $filesystem_encoding;
}
[14961]	674
[15868]	675	# is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
	676	sub set_Source_metadata {
	677	my $self = shift (@_);
	678	my ($doc_obj, $filename_no_path, $file_encoding) = @_;
	679
	680	my $top_section = $doc_obj->get_top_section();
[14961]	681
[15868]	682	# UTF-8 version of filename
	683	my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
[16022]	684	$doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta);
[14961]	685
	686	}
[15868]	687
	688	sub add_OID {
[14961]	689
[15018]	690	my $self = shift (@_);
	691	my ($doc_obj) = @_;
	692
	693	# See if a metadata field is specified as the field
	694	if ((defined $self->{'use_as_doc_identifier'}) && ($self->{'use_as_doc_identifier'} ne "")) {
	695	my $metadata_doc_id = $self->{'use_as_doc_identifier'};
	696
	697	# Consider "tidying" up metadata_doc_id to be something
	698	# suitable in a URL
	699	# Could even support a user specified plugin RE for this.
	700
	701	my $top_section = $doc_obj->get_top_section();
	702	my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id);
	703	## print STDERR "**** oid = $oid\n";
	704	$doc_obj->set_OID($oid);
	705	}
	706	# See if there is a plugin-specific set_OID function...
	707	elsif (defined ($self->can('set_OID'))) {
	708	# it will need $doc_obj to set the Identifier metadata...
	709	$self->set_OID(@_); # pass through any extra arguments supplied
	710	} else {
	711	# use the default set_OID() in doc.pm
	712	$doc_obj->set_OID();
	713	}
	714	}
	715
[15868]	716
	717
	718	# The BasePlugin read_into_doc_obj() function. This function does all the
	719	# right things to make general options work for a given plugin. It doesn't do anything with the file other than read it in
[10280]	720	# and set up a slew of metadata, all saved in doc_obj, which
	721	# it then returns as part of a tuple (process_status,doc_obj)
	722	#
	723	# Much of this functionality used to reside in read, but it was broken
	724	# down into a supporting routine to make the code more flexible.
	725	#
	726	# recursive plugins (e.g. RecPlug) and specialized plugins like those
	727	# capable of processing many documents within a single file (e.g.
	728	# GMLPlug) will normally want to implement their own version of
	729	# read_into_doc_obj()
	730	#
	731	# Note that $base_dir might be "" and that $file might
	732	# include directories
[15868]	733
	734	# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
sub read_into_doc_obj {
    # Creates a doc object for $file, sets the Plugin, FileSize and Source
    # metadata, lets process() do the plugin-specific work, and then adds
    # associated files, extra metadata, a fallback Title and the OID.
    # Returns (1, $doc_obj) on success, or -1 when process() returns undef.
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    # should we move this to read? What about secondary plugins?
    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $self->{'verbosity'} > 1;

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    # create a new document (explicit class-method call rather than the
    # indirect-object form "new doc (...)")
    my $doc_obj = doc->new($filename_full_path, "indexed_doc");
    my $top_section = $doc_obj->get_top_section();

    # this should look at the plugin option too...
    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));

    $self->set_Source_metadata($doc_obj, $filename_no_path);

    # plugin specific stuff - what args do we need here??
    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    # can we merge these two methods??
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata ($doc_obj, $top_section, $metadata);
    $self->auto_extract_metadata($doc_obj);

    # if we haven't found any Title so far, assign one
    # this was shifted to here from inside read()
    $self->title_fallback($doc_obj, $top_section, $filename_no_path);

    $self->add_OID($doc_obj);

    return (1,$doc_obj);
}
[2785]	780
sub add_dummy_text {
    # Gives $section placeholder text (the {BasePlugin.dummy_text} resource
    # string) and flags it with NoText metadata.
    my ($self, $doc_obj, $section) = @_;

    # add NoText metadata so we can hide this dummy text in format statements
    $doc_obj->add_metadata($section, "NoText", "1");

    my $dummy_text = &gsprintf::lookup_string("{BasePlugin.dummy_text}",1);
    $doc_obj->add_text($section, $dummy_text);

}
[8510]	790
[15868]	791	# does nothing. Can be overridden by subclass
	792	sub auto_extract_metadata {
	793	my $self = shift(@_);
	794	my ($doc_obj) = @_;
	795	}
[11122]	796
[15868]	797	# adds cover image, associate_file options stuff. Should be called by sub class
	798	# read_into_doc_obj
	799	sub add_associated_files {
	800	my $self = shift(@_);
	801	# whatis filename??
	802	my ($doc_obj, $filename) = @_;
	803
	804	# add in the cover image
	805	if ($self->{'cover_image'}) {
	806	$self->associate_cover_image($doc_obj, $filename);
[8716]	807	}
[9398]	808
[15018]	809
[10280]	810	}
[1242]	811
[16390]	812	# implement this if you are extracting metadata for other documents
	813	sub metadata_read {
	814	my $self = shift (@_);
	815	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
	816
	817	# can we process this file??
	818	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
	819	return undef unless $self->can_process_this_file($filename_full_path);
	820
	821	return 1; # we recognise the file, but don't actually do anything with it
	822	}
	823
	824
# The generic BasePlugin read() driver.
#
# It calls read_into_doc_obj() so that everything needed to make the
# general plugin options work is done, and then hands the resulting
# document object to the processor.  Most plugins should define their own
# process() function and let this read() function keep control.
#
# Recursive plugins (e.g. RecPlug) and specialized plugins that handle many
# documents within a single file (e.g. GMLPlug) might want to implement
# their own version of read(), but more likely need to implement their own
# version of read_into_doc_obj().
#
# Returns undef if the file is not recognised by this plugin; otherwise the
# status reported by read_into_doc_obj() (1 means the document was
# processed; by this file's convention -1 means it could not be processed).
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($full_path) = &util::get_full_filenames($base_dir, $file);
    if (!$self->can_process_this_file($full_path)) {
        return undef;
    }

    # @_ still holds the caller's arguments (without $self), so they are
    # forwarded unchanged.
    my ($process_status, $doc_obj) = $self->read_into_doc_obj(@_);

    my $doc_is_ready = (defined $process_status) && ($process_status == 1);
    if ($doc_is_ready) {
        # process the document, count it, then release it
        $processor->process($doc_obj);
        $self->{'num_processed'}++;
        undef $doc_obj;
    }

    # delete any temp files that we may have created
    $self->clean_up_after_doc_obj_processing();

    # if process_status == 1, then the file has been processed.
    return $process_status;
}
	865
# Plugin-specific processing of a document.  Every plugin that relies on
# BasePlugin::read() must override this method; an implementation returns
# undef if the file is rejected by the plugin.
#
# Arguments (after $self): $textref, $pluginfo, $base_dir, $file,
# $metadata, $doc_obj, $gli.
#
# This base version only reports that the method has not been implemented
# and then dies.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");

    # Abort unconditionally.  Chaining the die after gsprintf() with && made
    # the abort depend on gsprintf's return value, so a failed print would
    # have let an unimplemented process() return quietly.
    die "\n";
}
	875
# Hook invoked by read() after a document has been handled.  Override this
# method to delete any temp files that the plugin has created; the base
# version has nothing to clean up.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    # no-op in the base class; the invocant is the (incidental) return value
    return $self;
}
[16390]	881
# utf8_write_file -- write the text referenced by $textref to $filename,
# replacing any existing content.  Used by ConvertToPlug, for example in
# post processing.
#
# The text is written exactly as given; no encoding layer is applied here.
# Dies if the file cannot be opened, or if closing it reports an error.
#
# where should this go, is here the best place??
sub utf8_write_file {
    my $self = shift (@_);
    my ($textref, $filename) = @_;

    # Three-argument open with a lexical handle: the file name is used
    # literally (the two-argument form strips surrounding whitespace and
    # interprets leading mode characters), and no global FILE handle is
    # clobbered.
    my $out_fh;
    if (!open ($out_fh, '>', $filename)) {
        gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
        die "\n";
    }
    print $out_fh $$textref;

    # Buffered write errors (e.g. a full disk) only surface at close.
    close ($out_fh)
        or die "BasePlugin::utf8_write_file: error closing $filename ($!)\n";
}
	897
[10280]	898
# Derive a human-readable title from a file name: underscores become
# spaces and the final extension is removed.
#
# Returns the derived title; $file itself is not modified.
sub filename_based_title
{
    my $self = shift (@_);
    my ($file) = @_;

    my $file_derived_title = $file;
    $file_derived_title =~ s/_/ /g;

    # Strip only the last extension.  The previous pattern, /\..*?$/, is
    # anchored at the leftmost dot (the non-greedy quantifier does not
    # change that), so "report.v2.pdf" was cut down to "report".  The
    # character class matches the one used by associate_cover_image, so a
    # dot inside a directory name is never treated as the extension.
    $file_derived_title =~ s/\.[^\\\/\.]*$//;

    return $file_derived_title;
}
	910
[9398]	911
# Give a section a Title derived from its file name when it has no usable
# one.
#
# If the section has no Title metadata at all, the derived title is added;
# if it has a Title that is the empty string, that value is overwritten.
# A non-empty Title is left untouched.
sub title_fallback
{
    my $self = shift (@_);
    my ($doc_obj, $section, $file) = @_;

    my $current_title = $doc_obj->get_metadata_element ($section, "Title");
    my $has_title = (defined $current_title) && ($current_title ne "");

    unless ($has_title) {
        my $file_derived_title = $self->filename_to_utf8_metadata($self->filename_based_title($file));

        if (defined $current_title) {
            # a Title exists but is empty: replace it
            $doc_obj->set_utf8_metadata ($section, "Title", $file_derived_title);
        }
        else {
            $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
        }
    }
}
[15868]	929
# Add any extra metadata that's been passed around from one plugin to
# another.
#
# extra_metadata uses add_utf8_metadata, so it expects metadata values to
# already be in utf8.
#
# $metadata is a hash reference whose values are either plain scalars or
# array references (one metadata entry is added per element).  The special
# key 'gsdlassocfile_tobe' is handled separately; see below.
sub extra_metadata {
    my $self = shift (@_);
    my ($doc_obj, $cursection, $metadata) = @_;

    my $associate_tail_re = $self->{'associate_tail_re'};

    foreach my $field (keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        if ($field eq "gsdlassocfile_tobe") {
            # 'gsdlassocfile_tobe' is artificially introduced metadata
            # that is used to signal that certain additional files should
            # be tied to this document.  Useful in situations where a
            # metadata pass in the plugin pipeline works out some files
            # need to be associated with a document, but the document hasn't
            # been formed yet.
            my $equiv_form = "";
            foreach my $gaf (@{$metadata->{$field}}) {
                # Each entry has the form "<full filename>:<mimetype>:".
                # The first group is greedy, so colons earlier in the
                # file name (e.g. a Windows drive letter) stay with it.
                my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);

                # skip entries that do not have the expected form
                next unless defined $full_filename;

                my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
                $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);

                # work out extended tail extension (i.e. matching tail re)
                my ($file_prefix,$file_extended_ext)
                    = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);

                # the part of the extended extension before its last dot,
                # and the plain extension after the file name's last dot
                my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
                my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);

                my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
                my $srcicon = "_icon".$doc_ext."_";
                my $end_doclink = "</a>";

                my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";

                if (defined $pre_doc_ext) {
                    # for metadata such as [mp3._edited] [mp3._full] ...
                    $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
                }

                # for multiple metadata such as [mp3.assoclink]
                $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);

                $equiv_form .= " $assoc_form";
            }
            $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
        }
        elsif (ref ($metadata->{$field}) eq "ARRAY") {
            # one metadata entry per array element
            foreach my $value (@{$metadata->{$field}}) {
                $doc_obj->add_utf8_metadata ($cursection, $field, $value);
            }
        }
        else {
            $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
        }
    }
}
	990
[1396]	991
# Add this plugin's counters (num_processed, num_not_processed and
# num_archives) to the matching entries of the $stats hash reference.
sub compile_stats {
    my ($self, $stats) = @_;

    foreach my $counter ('num_processed', 'num_not_processed', 'num_archives') {
        $stats->{$counter} += $self->{$counter};
    }
}
	1001
# Attach a cover image to $doc_obj if one sits next to the source file.
#
# The cover is looked for under the source file's name with its extension
# replaced by ".jpg" (or ".JPG").  When found it is associated with the
# document as "cover.jpg" and the top section gets hascover=1.  When not
# found, the name is remembered in $self->{'covers_missing_cache'} so the
# file system is not consulted again for the same name.
sub associate_cover_image {
    my $self = shift;
    my ($doc_obj, $filename) = @_;

    # Without an extension to replace, $filename would still name the
    # source document itself, which would then pass the existence test
    # below and be attached as its own cover image.
    my $has_extension = ($filename =~ s/\.[^\\\/\.]+$/\.jpg/);
    return unless $has_extension;

    if (exists $self->{'covers_missing_cache'}->{$filename}) {
        # don't stat() for existence eg for multiple document input files
        # (eg SplitPlug)
        return;
    }

    my $top_section=$doc_obj->get_top_section();

    if (-e $filename) {
        $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
        $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
    } else {
        # try the upper-case extension before giving up
        my $upper_filename = $filename;
        $upper_filename =~ s/jpg$/JPG/;
        if (-e $upper_filename) {
            $doc_obj->associate_file($upper_filename, "cover.jpg",
                                     "image/jpeg");
            $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
        } else {
            # file doesn't exist, so record the fact that it's missing so
            # we don't stat() again (stat is slow)
            $self->{'covers_missing_cache'}->{$filename} = 1;
        }
    }

}
	1033
# Hook overridden by exploding plugins (eg. ISISPlug); the base version
# does nothing.
sub clean_up_after_exploding
{
    my ($self) = @_;

    # no-op in the base class; the invocant is the (incidental) return value
    return $self;
}
	1040
	1041
[16390]	1042
[4]	1043	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: