Context Navigation

source: gsdl/trunk/perllib/plugins/BasePlugin.pm@ 18320

Last change on this file since 18320 was 18320, checked in by ak19, 15 years ago
Now plugins provide the option of base64 encoding or url encoding filenames that are to be renamed (when copied into the archives dir). Previously renamed files would always be url-encoded. URL-encoding is the default now for most plugins except MP3Plugin and OggVorbisPlugin, where the default is base64 encoding. Base64 encoding filenames upon renaming them was introduced so that more files that browsers try to open in external applications can open them, since url encoding does not seem to be implemented the same everywhere (for instance, windows media player is unable to handle url-encoded wmv filenames when such files are launched in it through the browser).
Property svn:keywords set to `Author Date Id Revision`
File size: 34.8 KB

Line
1	###########################################################################
2	#
3	# BasePlugin.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasePlugin;
27
28	use strict;
29	no strict 'subs';
30	no strict 'refs'; # allow filehandles to be variables and viceversa
31
32	use File::Basename;
33
34	use encodings;
35	use unicode;
36	use textcat;
37	use doc;
38	use ghtml;
39	use gsprintf 'gsprintf';
40
41	use PrintInfo;
42
43	BEGIN {
44	@BasePlugin::ISA = ( 'PrintInfo' );
45	}
46
47	# the different methods that can be applied when renaming
48	# imported documents and their associated files
49	our $file_rename_method_list =
50	[ { 'name' => "url",
51	'desc' => "{BasePlugin.rename_method.url}" },
52	{ 'name' => "base64",
53	'desc' => "{BasePlugin.rename_method.base64}" } ];
54
55	our $encoding_list =
56	[ { 'name' => "ascii",
57	'desc' => "{BasePlugin.encoding.ascii}" },
58	{ 'name' => "utf8",
59	'desc' => "{BasePlugin.encoding.utf8}" },
60	{ 'name' => "unicode",
61	'desc' => "{BasePlugin.encoding.unicode}" } ];
62
63
64	my $e = $encodings::encodings;
65	foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
66	{
67	my $hashEncode =
68	{'name' => $enc,
69	'desc' => $e->{$enc}->{'name'}};
70
71	push(@{$encoding_list},$hashEncode);
72	}
73
74	our $encoding_plus_auto_list =
75	[ { 'name' => "auto",
76	'desc' => "{BasePlugin.filename_encoding.auto}" },
77	{ 'name' => "auto-language-analysis",
78	'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
79	{ 'name' => "auto-filesystem-encoding",
80	'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
81	{ 'name' => "auto-fl",
82	'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
83	{ 'name' => "auto-lf",
84	'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
85
86	push(@{$encoding_plus_auto_list},@{$encoding_list});
87
88	our $oidtype_list =
89	[ { 'name' => "auto",
90	'desc' => "{BasePlugin.OIDtype.auto}" },
91	{ 'name' => "hash",
92	'desc' => "{import.OIDtype.hash}" },
93	{ 'name' => "assigned",
94	'desc' => "{import.OIDtype.assigned}" },
95	{ 'name' => "incremental",
96	'desc' => "{import.OIDtype.incremental}" },
97	{ 'name' => "dirname",
98	'desc' => "{import.OIDtype.dirname}" } ];
99
100	my $arguments =
101	[ { 'name' => "process_exp",
102	'desc' => "{BasePlugin.process_exp}",
103	'type' => "regexp",
104	'deft' => "",
105	'reqd' => "no" },
106	{ 'name' => "no_blocking",
107	'desc' => "{BasePlugin.no_blocking}",
108	'type' => "flag",
109	'reqd' => "no"},
110	{ 'name' => "block_exp",
111	'desc' => "{BasePlugin.block_exp}",
112	'type' => "regexp",
113	'deft' => "",
114	'reqd' => "no" },
115	{ 'name' => "associate_ext",
116	'desc' => "{BasePlugin.associate_ext}",
117	'type' => "string",
118	'reqd' => "no" },
119	{ 'name' => "associate_tail_re",
120	'desc' => "{BasePlugin.associate_tail_re}",
121	'type' => "string",
122	'reqd' => "no" },
123	{ 'name' => "OIDtype",
124	'desc' => "{import.OIDtype}",
125	'type' => "enum",
126	'list' => $oidtype_list,
127	# leave default empty so we can tell if its been set or not - if not set will use option from import.pl
128	'deft' => "auto",
129	'reqd' => "no",
130	'modegli' => "2" },
131	{ 'name' => "OIDmetadata",
132	'desc' => "{import.OIDmetadata}",
133	'type' => "metadata",
134	'deft' => "dc.Identifier",
135	'reqd' => "no",
136	'modegli' => "2" },
137	# { 'name' => "use_as_doc_identifier",
138	# 'desc' => "{BasePlugin.use_as_doc_identifier}",
139	# 'type' => "string",
140	# 'reqd' => "no" ,
141	# 'deft' => "" } ,
142	{ 'name' => "no_cover_image",
143	'desc' => "{BasePlugin.no_cover_image}",
144	'type' => "flag",
145	'reqd' => "no" },
146	{ 'name' => "filename_encoding",
147	'desc' => "{BasePlugin.filename_encoding}",
148	'type' => "enum",
149	'deft' => "auto",
150	'list' => $encoding_plus_auto_list,
151	'reqd' => "no" },
152	{ 'name' => "smart_block",
153	'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
154	'type' => "flag",
155	'reqd' => "no",
156	'hiddengli' => "yes" }, # deprecated, but leave in for old collections
157	{ 'name' => "file_rename_method",
158	'desc' => "{BasePlugin.file_rename_method}",
159	'type' => "enum",
160	'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
161	'list' => $file_rename_method_list,
162	'reqd' => "no"
163	}
164
165	];
166
167
168	my $options = { 'name' => "BasePlugin",
169	'desc' => "{BasePlugin.desc}",
170	'abstract' => "yes",
171	'inherits' => "no",
172	'args' => $arguments };
173
174	sub new {
175
176	my ($class) = shift (@_);
177	my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
178	push(@$pluginlist, $class);
179
180	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
181	push(@{$hashArgOptLists->{"OptList"}},$options);
182
183	my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
184
185	if ($self->{'info_only'}) {
186	# don't worry about any options etc
187	return bless $self, $class;
188	}
189
190	if ($self->{'smart_block'}) {
191	print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
192	}
193	$self->{'smart_block'} = undef;
194
195	my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
196	$self->{'plugin_type'} = $plugin_name;
197
198	$self->{'num_processed'} = 0;
199	$self->{'num_not_processed'} = 0;
200	$self->{'num_blocked'} = 0;
201	$self->{'num_archives'} = 0;
202	$self->{'cover_image'} = 1; # cover image is on by default
203	$self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
204	#$self->{'option_list'} = $hashArgOptLists->{"OptList"};
205
206	my $associate_ext = $self->{'associate_ext'};
207	if ((defined $associate_ext) && ($associate_ext ne "")) {
208
209	my $associate_tail_re = $self->{'associate_tail_re'};
210	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
211	my $outhandle = $self->{'outhandle'};
212	print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
213	print $outhandle " defaulting to 'associate_tail_re'\n";
214	}
215	else {
216	my @exts = split(/,/,$associate_ext);
217
218	my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
219	my $associate_tail_re = join("\|",@exts_bracketed);
220	$self->{'associate_tail_re'} = $associate_tail_re;
221	}
222
223	delete $self->{'associate_ext'};
224	}
225
226	return bless $self, $class;
227
228	}
229
230	sub merge_inheritance
231	{
232	my $self = {};
233	my @child_selfs = @_;
234
235	foreach my $child_self (@child_selfs) {
236	foreach my $key (keys %$child_self) {
237	if (defined $self->{$key}) {
238	if ($self->{$key} ne $child_self->{$key}) {
239	# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
240	# print STDERR "Existing stored value = $self->{$key}\n";
241	# print STDERR "New (child) value = $child_self->{$key}\n";
242	# print STDERR "Keeping existing value\n";
243	# Existing value seems to be option specified in collect.cfg
244
245	### $self->{$key} = $child_self->{$key};
246
247	}
248	else {
249	## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
250	}
251
252	}
253	else {
254	$self->{$key} = $child_self->{$key};
255	}
256	}
257	}
258
259	return $self;
260	}
261
262	# initialize BasePlugin options
263	# if init() is overridden in a sub-class, remember to call BasePlugin::init()
264	sub init {
265	my $self = shift (@_);
266	my ($verbosity, $outhandle, $failhandle) = @_;
267
268	# verbosity is passed through from the processor
269	$self->{'verbosity'} = $verbosity;
270
271	# as are the outhandle and failhandle
272	$self->{'outhandle'} = $outhandle if defined $outhandle;
273	$self->{'failhandle'} = $failhandle;
274	# $self->SUPER::init(@_);
275
276	# set process_exp and block_exp to defaults unless they were
277	# explicitly set
278
279	if ((!$self->is_recursive()) and
280	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
281
282	$self->{'process_exp'} = $self->get_default_process_exp ();
283	if ($self->{'process_exp'} eq "") {
284	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
285	}
286	}
287
288	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
289	$self->{'block_exp'} = $self->get_default_block_exp ();
290	}
291
292	}
293
294	sub begin {
295	my $self = shift (@_);
296	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
297
298	if ($self->{'OIDtype'} eq "auto") {
299	# hasn't been set in the plugin, use the processor values
300	$self->{'OIDtype'} = $processor->{'OIDtype'};
301	$self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
302	}
303	if ($self->{'OIDtype'} eq "hash") {
304	# should we hash on the file or on the doc xml??
305	$self->{'OIDtype'} = $self->get_oid_hash_type();
306	if ($self->{'OIDtype'} !~ /^(hash_on_file\|hash_on_ga_xml)$/) {
307	$self->{'OIDtype'} = "hash_on_file";
308	}
309	}
310	}
311
312	sub end {
313	# potentially called at the end of each plugin pass
314	# import.pl only has one plugin pass, but buildcol.pl has multiple ones
315
316	my ($self) = shift (@_);
317	}
318
319	sub deinit {
320	# called only once, after all plugin passes have been done
321
322	my ($self) = @_;
323	}
324
325	# default hashing type is to hash on the original file (or converted file)
326	# override this to return hash_on_ga_xml for filetypes where hashing on the
327	# file is no good eg video
328	sub get_oid_hash_type {
329
330	my $self = shift (@_);
331
332	return "hash_on_file";
333	}
334
335	sub set_incremental {
336	my $self = shift(@_);
337	my ($incremental) = @_;
338
339	$self->{'incremental'} = $incremental;
340	}
341
342	# this function should be overridden to return 1
343	# in recursive plugins
344	sub is_recursive {
345	my $self = shift (@_);
346
347	return 0;
348	}
349
350	sub get_default_block_exp {
351	my $self = shift (@_);
352
353	return "";
354	}
355
356	sub get_default_process_exp {
357	my $self = shift (@_);
358
359	return "";
360	}
361
362	# rename imported files and assoc files using URL encoding by default
363	# as this will work for most plugins and give more legible filenames
364	sub get_default_file_rename_method() {
365	my $self = shift (@_);
366	return "url";
367	}
368
369	# default implementation is to do nothing
370	sub store_block_files {
371
372	my $self =shift (@_);
373	my ($filename_full_path, $block_hash) = @_;
374
375	}
376
377	# put files to block into hash
378	sub use_block_expressions {
379
380	my $self =shift (@_);
381	my ($filename_full_path, $block_hash) = @_;
382
383	if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
384	$block_hash->{'file_blocks'}->{$filename_full_path} = 1;
385	}
386
387	}
388
389	#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
390	sub block_cover_image
391	{
392	my $self =shift;
393	my ($filename, $block_hash) = @_;
394
395	if ($self->{'cover_image'}) {
396	my $coverfile = $filename;
397	$coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
398	if (!-e $coverfile) {
399	$coverfile =~ s/jpg$/JPG/;
400	}
401	if (-e $coverfile) {
402	$block_hash->{'file_blocks'}->{$coverfile} = 1;
403	}
404	}
405
406	return;
407	}
408
409
410	# discover all the files that should be blocked by this plugin
411	# check the args ...
412	sub file_block_read {
413
414	my $self = shift (@_);
415	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
416	# Keep track of filenames with same root but different extensions
417	# Used to support -associate_ext and the more generalised
418	# -associate_tail_re
419	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
420
421	my $associate_tail_re = $self->{'associate_tail_re'};
422	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
423	my ($file_prefix,$file_ext)
424	= &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
425	if ((defined $file_prefix) && (defined $file_ext)) {
426	my $shared_fileroot = $block_hash->{'shared_fileroot'};
427	if (!defined $shared_fileroot->{$file_prefix}) {
428	my $file_prefix_rec = { 'tie_to' => undef,
429	'exts' => {} };
430	$shared_fileroot->{$file_prefix} = $file_prefix_rec;
431	}
432
433	my $file_prefix_rec = $shared_fileroot->{$file_prefix};
434
435	if ($self->can_process_this_file($filename_full_path)) {
436	# This is the document the others should be tied to
437	$file_prefix_rec->{'tie_to'} = $file_ext;
438	}
439	else {
440	if ($file_ext =~ m/$associate_tail_re$/) {
441	# this file should be associated to the main one
442	$file_prefix_rec->{'exts'}->{$file_ext} = 1;
443	}
444	}
445
446	}
447	}
448
449	# check block expressions
450	$self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
451
452	# now check whether we are actually processing this
453	if (!-f $filename_full_path \|\| !$self->can_process_this_file($filename_full_path)) {
454	return undef; # can't recognise
455	}
456
457	# if we have a block_exp, then this overrides the normal 'smart' blocking
458	$self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} \|\| $self->{'block_exp'} ne "");
459
460	# block the cover image if there is one
461	if ($self->{'cover_image'}) {
462	$self->block_cover_image($filename_full_path, $block_hash);
463	}
464
465	return 1;
466	}
467
468	# plugins that rely on more than process_exp (eg XML plugins) can override this method
469	sub can_process_this_file {
470	my $self = shift(@_);
471	my ($filename) = @_;
472
473	if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
474	return 1;
475	}
476	return 0;
477
478	}
479
480	# just converts path as is to utf8.
481	sub filepath_to_utf8 {
482	my $self = shift (@_);
483	my ($file, $file_encoding) = @_;
484	my $filemeta = $file;
485
486	my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
487
488	# Whenever filename-encoding is set to any of the auto settings, we
489	# check if the filename is already in UTF8. If it is, then we're done.
490	if($filename_encoding =~ m/auto/) {
491	if(&unicode::check_is_utf8($filemeta))
492	{
493	$filename_encoding = "utf8";
494	return $filemeta;
495	}
496	}
497
498	# Auto setting, but filename is not utf8
499	if ($filename_encoding eq "auto")
500	{
501	# try textcat
502	$filename_encoding = $self->textcat_encoding($filemeta);
503
504	# check the locale next
505	$filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
506
507
508	# now try the encoding of the document, if available
509	if ($filename_encoding eq "undefined" && defined $file_encoding) {
510	$filename_encoding = $file_encoding;
511	}
512
513	}
514
515	elsif ($filename_encoding eq "auto-language-analysis")
516	{
517	$filename_encoding = $self->textcat_encoding($filemeta);
518
519	# now try the encoding of the document, if available
520	if ($filename_encoding eq "undefined" && defined $file_encoding) {
521	$filename_encoding = $file_encoding;
522	}
523	}
524
525	elsif ($filename_encoding eq "auto-filesystem-encoding")
526	{
527	# try locale
528	$filename_encoding = $self->locale_encoding();
529	}
530
531	elsif ($filename_encoding eq "auto-fl")
532	{
533	# filesystem-encoding (locale) then language-analysis (textcat)
534	$filename_encoding = $self->locale_encoding();
535
536	# try textcat
537	$filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
538
539	# else assume filename encoding is encoding of file content, if that's available
540	if ($filename_encoding eq "undefined" && defined $file_encoding) {
541	$filename_encoding = $file_encoding;
542	}
543	}
544
545	elsif ($filename_encoding eq "auto-lf")
546	{
547	# language-analysis (textcat) then filesystem-encoding (locale)
548	$filename_encoding = $self->textcat_encoding($filemeta);
549
550	# guess filename encoding from encoding of file content, if available
551	if ($filename_encoding eq "undefined" && defined $file_encoding) {
552	$filename_encoding = $file_encoding;
553	}
554
555	# try locale
556	$filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
557	}
558
559	# if still undefined, use utf8 as fallback
560	if ($filename_encoding eq "undefined") {
561	$filename_encoding = "utf8";
562	}
563
564	#print STDERR "**** UTF8 encoding the filename $filemeta ";
565
566	# if the filename encoding is set to utf8 but it isn't utf8 already--such as when
567	# 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
568	# be always utf8 (in which case the filename's encoding is also set as utf8 even
569	# though the filename need not be if it originates from another system)--in such
570	# cases attempt to make the filename utf8 to match.
571	if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
572	&unicode::ensure_utf8(\$filemeta);
573	}
574
575	# convert non-unicode encodings to utf8
576	if ($filename_encoding !~ m/(?:ascii\|utf8\|unicode)/) {
577	$filemeta = &unicode::unicode2utf8(
578	&unicode::convert2unicode($filename_encoding, \$filemeta)
579	);
580	}
581
582	#print STDERR " from encoding $filename_encoding -> $filemeta\n";
583	return $filemeta;
584	}
585
586	# gets the filename with no path, converts to utf8, and then dm safes it.
587	# filename_encoding set by user
588	sub filename_to_utf8_metadata
589	{
590	my $self = shift (@_);
591	my ($file, $file_encoding) = @_;
592
593	my $outhandle = $self->{'outhandle'};
594
595	my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
596	$filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
597
598	my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
599
600	return $dmsafe_filemeta;
601
602	}
603
604	sub locale_encoding {
605	my $self = shift(@_);
606
607	if (!defined $self->{'filesystem_encoding'}) {
608	$self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
609	$self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'};
610	}
611
612	#print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
613	return $self->{'filesystem_encoding'}; # can be the string "undefined"
614	}
615
616	sub textcat_encoding {
617	my $self = shift(@_);
618	my ($filemeta) = @_;
619
620	# analyse filenames without extensions and digits (and trimmed of surrounding
621	# whitespace), so that irrelevant chars don't confuse textcat
622	my $strictfilemeta = $filemeta;
623	$strictfilemeta =~ s/\.[^\.]+$//g;
624	$strictfilemeta =~ s/\d//g;
625	$strictfilemeta =~ s/^\s*//g;
626	$strictfilemeta =~ s/\s*$//g;
627
628	my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
629	if(!defined $filename_encoding) {
630	$filename_encoding = "undefined";
631	}
632
633	return $filename_encoding; # can be the string "undefined"
634	}
635
636	# performs textcat
637	sub encoding_from_language_analysis {
638	my $self = shift(@_);
639	my ($text) = @_;
640
641	my $outhandle = $self->{'outhandle'};
642	my $best_encoding = undef;
643
644	# get the language/encoding of the textstring using textcat
645	$self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
646	my $results = $self->{'textcat'}->classify_cached_filename(\$text);
647
648
649	if (scalar @$results < 0) {
650	return undef;
651	}
652
653	# We have some results, we choose the first
654	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
655
656	$best_encoding = $encoding;
657	if (!defined $best_encoding) {
658	return undef;
659	}
660
661	if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
662	# the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
663	$best_encoding = 'utf8';
664	}
665
666
667	# check for equivalents where textcat doesn't have some encodings...
668	# eg MS versions of standard encodings
669	if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
670	my $iso = $1; # which variant of the iso standard?
671	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
672	if ($text =~ /[\x80-\x9f]/) {
673	# Western Europe
674	if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
675	elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
676	elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
677	elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
678	elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
679	elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
680	elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
681	}
682	}
683
684	if (defined $best_encoding && $best_encoding !~ /^(ascii\|utf8\|unicode)$/ &&
685	!defined $encodings::encodings->{$best_encoding})
686	{
687	if ($self->{'verbosity'}) {
688	gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
689	}
690	$best_encoding = undef;
691	}
692
693	return $best_encoding;
694	}
695
696	# uses locale
697	sub get_filesystem_encoding {
698
699	my $self = shift(@_);
700
701	my $outhandle = $self->{'outhandle'};
702	my $filesystem_encoding = undef;
703
704	eval {
705	use POSIX qw(locale_h);
706
707	# With only one parameter, setlocale retrieves the
708	# current value
709	my $current_locale = setlocale(LC_CTYPE);
710
711	if ($current_locale =~ m/^.\.(.?)$/) {
712	my $char_encoding = lc($1);
713	if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
714	$char_encoding = "$1\_$2\_$3";
715	}
716
717	$char_encoding =~ s/-/_/g;
718	$char_encoding =~ s/^utf_8$/utf8/;
719
720	if ($char_encoding =~ m/^\d+$/) {
721	if (defined $encodings::encodings->{"windows_$char_encoding"}) {
722	$char_encoding = "windows_$char_encoding";
723	}
724	elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
725	$char_encoding = "dos_$char_encoding";
726	}
727	}
728
729	if (($char_encoding =~ m/(?:ascii\|utf8\|unicode)/)
730	\|\| (defined $encodings::encodings->{$char_encoding})) {
731	$filesystem_encoding = $char_encoding;
732	}
733	else {
734	print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
735	}
736	}
737
738
739	};
740	if ($@) {
741	print $outhandle "$@\n";
742	print $outhandle "Warning: Unable to establish locale. Will assume filesytem is UTF-8\n";
743
744	}
745	return $filesystem_encoding;
746	}
747
748	# is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
749	sub set_Source_metadata {
750	my $self = shift (@_);
751	my ($doc_obj, $filename_no_path, $file_encoding) = @_;
752
753	my $top_section = $doc_obj->get_top_section();
754
755	# UTF-8 version of filename
756	my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
757
758	# Source is the UTF8 display name - not necessarily the name of the file on the system
759	$doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta);
760
761	$filemeta = &util::rename_file($filemeta, $self->{'file_rename_method'});
762	# If using URL encoding, then SourceFile is the url-reference to url-encoded
763	# filemeta: it's a url that refers to the actual file on the system
764	$filemeta = &unicode::filename_to_url($filemeta, $self->{'file_rename_method'});
765
766	$doc_obj->set_utf8_metadata_element($top_section, "SourceFile", $filemeta);
767	}
768
769	# this should be called by all plugins to set the oid of the doc obj, rather
770	# than calling doc_obj->set_OID directly
771	sub add_OID {
772	my $self = shift (@_);
773	my ($doc_obj) = @_;
774
775	$doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
776
777	# see if there is a plugin specific set_OID function
778	if (defined ($self->can('set_OID'))) {
779	$self->set_OID(@_); # pass through doc_obj and any extra arguments
780	}
781	else {
782	# use the default set_OID() in doc.pm
783	$doc_obj->set_OID();
784	}
785
786	}
787
788	# The BasePlugin read_into_doc_obj() function. This function does all the
789	# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
790	# a file and sets up a slew of metadata all saved in doc_obj, which
791	# it then returns as part of a tuple (process_status,doc_obj)
792	#
793	# Much of this functionality used to reside in read, but it was broken
794	# down into a supporting routine to make the code more flexible.
795	#
796	# recursive plugins (e.g. RecPlug) and specialized plugins like those
797	# capable of processing many documents within a single file (e.g.
798	# GMLPlug) will normally want to implement their own version of
799	# read_into_doc_obj()
800	#
801	# Note that $base_dir might be "" and that $file might
802	# include directories
803
804	# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
805	sub read_into_doc_obj {
806	my $self = shift (@_);
807	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
808
809	my $outhandle = $self->{'outhandle'};
810
811	# should we move this to read? What about secondary plugins?
812	print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
813	print $outhandle "$self->{'plugin_type'} processing $file\n"
814	if $self->{'verbosity'} > 1;
815
816	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
817	# create a new document
818	my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
819	my $top_section = $doc_obj->get_top_section();
820
821	$doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
822	$doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
823
824	# sets the UTF8 filename (Source) for display and sets the url ref to URL encoded version
825	# of the UTF8 filename (SourceFile) for generated files
826	$self->set_Source_metadata($doc_obj, $filename_no_path);
827
828	# plugin specific stuff - what args do we need here??
829	unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
830	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
831	return -1;
832	}
833
834	# include any metadata passed in from previous plugins
835	# note that this metadata is associated with the top level section
836	my $section = $doc_obj->get_top_section();
837	# can we merge these two methods??
838	$self->add_associated_files($doc_obj, $filename_full_path);
839	$self->extra_metadata ($doc_obj, $section, $metadata);
840	$self->auto_extract_metadata($doc_obj);
841
842	# if we haven't found any Title so far, assign one
843	# this was shifted to here from inside read()
844	$self->title_fallback($doc_obj,$section,$filename_no_path);
845
846	$self->add_OID($doc_obj);
847
848	return (1,$doc_obj);
849	}
850
851	sub add_dummy_text {
852	my $self = shift(@_);
853	my ($doc_obj, $section) = @_;
854
855	# add NoText metadata so we can hide this dummy text in format statements
856	$doc_obj->add_metadata($section, "NoText", "1");
857	$doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
858
859	}
860
861	# does nothing. Can be overridden by subclass
862	sub auto_extract_metadata {
863	my $self = shift(@_);
864	my ($doc_obj) = @_;
865	}
866
867	# adds cover image, associate_file options stuff. Should be called by sub class
868	# read_into_doc_obj
869	sub add_associated_files {
870	my $self = shift(@_);
871	# whatis filename??
872	my ($doc_obj, $filename) = @_;
873
874	# add in the cover image
875	if ($self->{'cover_image'}) {
876	$self->associate_cover_image($doc_obj, $filename);
877	}
878
879
880	}
881
882	# implement this if you are extracting metadata for other documents
883	sub metadata_read {
884	my $self = shift (@_);
885	my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
886
887	# can we process this file??
888	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
889	return undef unless $self->can_process_this_file($filename_full_path);
890
891	return 1; # we recognise the file, but don't actually do anything with it
892	}
893
894
895	# The BasePlugin read() function. This function calls read_into_doc_obj()
896	# to ensure all the right things to make general options work for a
897	# given plugin are done. It then calls the process() function which
898	# does all the work specific to a plugin (like the old read functions
899	# used to do). Most plugins should define their own process() function
900	# and let this read() function keep control.
901	#
902	# recursive plugins (e.g. RecPlug) and specialized plugins like those
903	# capable of processing many documents within a single file (e.g.
904	# GMLPlug) might want to implement their own version of read(), but
905	# more likely need to implement their own version of read_into_doc_obj()
906	#
907	# Return number of files processed, undef if can't recognise, -1 if can't
908	# process
909
910	sub read {
911	my $self = shift (@_);
912	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
913
914	# can we process this file??
915	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
916	return undef unless $self->can_process_this_file($filename_full_path);
917
918	my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
919
920	if ((defined $process_status) && ($process_status == 1)) {
921
922	# process the document
923	$processor->process($doc_obj);
924
925	$self->{'num_processed'} ++;
926	undef $doc_obj;
927	}
928	# delete any temp files that we may have created
929	$self->clean_up_after_doc_obj_processing();
930
931	# if process_status == 1, then the file has been processed.
932	return $process_status;
933
934	}
935
936	# returns undef if file is rejected by the plugin
937	sub process {
938	my $self = shift (@_);
939	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
940
941	gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n";
942
943	return undef; # never gets here
944	}
945
946	# overwrite this method to delete any temp files that we have created
947	sub clean_up_after_doc_obj_processing {
948	my $self = shift(@_);
949
950	}
951
952	# write_file -- used by ConvertToPlug, for example in post processing
953	#
954	# where should this go, is here the best place??
955	sub utf8_write_file {
956	my $self = shift (@_);
957	my ($textref, $filename) = @_;
958
959	if (!open (FILE, ">$filename")) {
960	gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
961	die "\n";
962	}
963	print FILE $$textref;
964
965	close FILE;
966	}
967
968
969	sub filename_based_title
970	{
971	my $self = shift (@_);
972	my ($file) = @_;
973
974	my $file_derived_title = $file;
975	$file_derived_title =~ s/_/ /g;
976	$file_derived_title =~ s/\..*?$//;
977
978	return $file_derived_title;
979	}
980
981
982	sub title_fallback
983	{
984	my $self = shift (@_);
985	my ($doc_obj,$section,$file) = @_;
986
987	if (!defined $doc_obj->get_metadata_element ($section, "Title") or $doc_obj->get_metadata_element($section, "Title") eq "") {
988
989	my $file_derived_title = $self->filename_to_utf8_metadata($self->filename_based_title($file));
990	if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
991	$doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
992	}
993	else {
994	$doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
995	}
996	}
997
998	}
999
1000	# add any extra metadata that's been passed around from one
1001	# plugin to another.
1002	# extra_metadata uses add_utf8_metadata so it expects metadata values
1003	# to already be in utf8
1004	sub extra_metadata {
1005	my $self = shift (@_);
1006	my ($doc_obj, $cursection, $metadata) = @_;
1007
1008	my $associate_tail_re = $self->{'associate_tail_re'};
1009
1010	foreach my $field (keys(%$metadata)) {
1011	# $metadata->{$field} may be an array reference
1012	if ($field eq "gsdlassocfile_tobe") {
1013	# 'gsdlassocfile_tobe' is artificially introduced metadata
1014	# that is used to signal that certain additional files should
1015	# be tied to this document. Useful in situations where a
1016	# metadata pass in the plugin pipeline works out some files
1017	# need to be associated with a document, but the document hasn't
1018	# been formed yet.
1019	my $equiv_form = "";
1020	foreach my $gaf (@{$metadata->{$field}}) {
1021	my ($full_filename,$mimetype) = ($gaf =~ m/^(.):(.):$/);
1022	my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1023
1024	# we need to make sure the filename is valid utf-8 - we do
1025	# this by url or base64 encoding it
1026	# $tail_filename is the name that we store the file as
1027	$tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1028	$doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1029	# If the filename is url_encoded, we need to encode the % signs
1030	# in the filename, so that it works in a url
1031	my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1032	# work out extended tail extension (i.e. matching tail re)
1033
1034	my ($file_prefix,$file_extended_ext)
1035	= &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1036	my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.)\..$/);
1037	my ($doc_ext) = ($tail_filename =~ m/^.\.(.)$/);
1038	my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$url_tail_filename\">";
1039	my $srcicon = "_icon".$doc_ext."_";
1040	my $end_doclink = "</a>";
1041
1042	my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1043
1044	if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1045	# for metadata such as [mp3._edited] [mp3._full] ...
1046	$doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1047	}
1048
1049	# for multiple metadata such as [mp3.assoclink]
1050	$doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1051
1052	$equiv_form .= " $assoc_form";
1053	}
1054	$doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1055	}
1056	elsif (ref ($metadata->{$field}) eq "ARRAY") {
1057	map {
1058	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
1059	} @{$metadata->{$field}};
1060	} else {
1061	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1062	}
1063	}
1064	}
1065
1066
1067	sub compile_stats {
1068	my $self = shift(@_);
1069	my ($stats) = @_;
1070
1071	$stats->{'num_processed'} += $self->{'num_processed'};
1072	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
1073	$stats->{'num_archives'} += $self->{'num_archives'};
1074
1075	}
1076
1077	sub associate_cover_image {
1078	my $self = shift;
1079	my ($doc_obj, $filename) = @_;
1080
1081	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1082	if (exists $self->{'covers_missing_cache'}->{$filename}) {
1083	# don't stat() for existence eg for multiple document input files
1084	# (eg SplitPlug)
1085	return;
1086	}
1087
1088	my $top_section=$doc_obj->get_top_section();
1089
1090	if (-e $filename) {
1091	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1092	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1093	} else {
1094	my $upper_filename = $filename;
1095	$upper_filename =~ s/jpg$/JPG/;
1096	if (-e $upper_filename) {
1097	$doc_obj->associate_file($upper_filename, "cover.jpg",
1098	"image/jpeg");
1099	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1100	} else {
1101	# file doesn't exist, so record the fact that it's missing so
1102	# we don't stat() again (stat is slow)
1103	$self->{'covers_missing_cache'}->{$filename} = 1;
1104	}
1105	}
1106
1107	}
1108
1109
1110	# Overridden by exploding plugins (eg. ISISPlug)
1111	sub clean_up_after_exploding
1112	{
1113	my $self = shift(@_);
1114	}
1115
1116
1117
1118	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: