source: gsdl/trunk/perllib/plugins/BasePlugin.pm@ 18320

Last change on this file since 18320 was 18320, checked in by ak19, 15 years ago

Now plugins provide the option of base64 encoding or url encoding filenames that are to be renamed (when copied into the archives dir). Previously renamed files would always be url-encoded. URL-encoding is the default now for most plugins except MP3Plugin and OggVorbisPlugin, where the default is base64 encoding. Base64 encoding filenames upon renaming them was introduced so that more files that browsers try to open in external applications can open them, since url encoding does not seem to be implemented the same everywhere (for instance, windows media player is unable to handle url-encoded wmv filenames when such files are launched in it through the browser).

  • Property svn:keywords set to Author Date Id Revision
File size: 34.8 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33
34use encodings;
35use unicode;
36use textcat;
37use doc;
38use ghtml;
39use gsprintf 'gsprintf';
40
41use PrintInfo;
42
43BEGIN {
44 @BasePlugin::ISA = ( 'PrintInfo' );
45}
46
47# the different methods that can be applied when renaming
48# imported documents and their associated files
49our $file_rename_method_list =
50 [ { 'name' => "url",
51 'desc' => "{BasePlugin.rename_method.url}" },
52 { 'name' => "base64",
53 'desc' => "{BasePlugin.rename_method.base64}" } ];
54
55our $encoding_list =
56 [ { 'name' => "ascii",
57 'desc' => "{BasePlugin.encoding.ascii}" },
58 { 'name' => "utf8",
59 'desc' => "{BasePlugin.encoding.utf8}" },
60 { 'name' => "unicode",
61 'desc' => "{BasePlugin.encoding.unicode}" } ];
62
63
64my $e = $encodings::encodings;
65foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
66{
67 my $hashEncode =
68 {'name' => $enc,
69 'desc' => $e->{$enc}->{'name'}};
70
71 push(@{$encoding_list},$hashEncode);
72}
73
74our $encoding_plus_auto_list =
75 [ { 'name' => "auto",
76 'desc' => "{BasePlugin.filename_encoding.auto}" },
77 { 'name' => "auto-language-analysis",
78 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
79 { 'name' => "auto-filesystem-encoding",
80 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
81 { 'name' => "auto-fl",
82 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
83 { 'name' => "auto-lf",
84 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
85
86push(@{$encoding_plus_auto_list},@{$encoding_list});
87
88our $oidtype_list =
89 [ { 'name' => "auto",
90 'desc' => "{BasePlugin.OIDtype.auto}" },
91 { 'name' => "hash",
92 'desc' => "{import.OIDtype.hash}" },
93 { 'name' => "assigned",
94 'desc' => "{import.OIDtype.assigned}" },
95 { 'name' => "incremental",
96 'desc' => "{import.OIDtype.incremental}" },
97 { 'name' => "dirname",
98 'desc' => "{import.OIDtype.dirname}" } ];
99
100my $arguments =
101 [ { 'name' => "process_exp",
102 'desc' => "{BasePlugin.process_exp}",
103 'type' => "regexp",
104 'deft' => "",
105 'reqd' => "no" },
106 { 'name' => "no_blocking",
107 'desc' => "{BasePlugin.no_blocking}",
108 'type' => "flag",
109 'reqd' => "no"},
110 { 'name' => "block_exp",
111 'desc' => "{BasePlugin.block_exp}",
112 'type' => "regexp",
113 'deft' => "",
114 'reqd' => "no" },
115 { 'name' => "associate_ext",
116 'desc' => "{BasePlugin.associate_ext}",
117 'type' => "string",
118 'reqd' => "no" },
119 { 'name' => "associate_tail_re",
120 'desc' => "{BasePlugin.associate_tail_re}",
121 'type' => "string",
122 'reqd' => "no" },
123 { 'name' => "OIDtype",
124 'desc' => "{import.OIDtype}",
125 'type' => "enum",
126 'list' => $oidtype_list,
127 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
128 'deft' => "auto",
129 'reqd' => "no",
130 'modegli' => "2" },
131 { 'name' => "OIDmetadata",
132 'desc' => "{import.OIDmetadata}",
133 'type' => "metadata",
134 'deft' => "dc.Identifier",
135 'reqd' => "no",
136 'modegli' => "2" },
137# { 'name' => "use_as_doc_identifier",
138# 'desc' => "{BasePlugin.use_as_doc_identifier}",
139# 'type' => "string",
140# 'reqd' => "no" ,
141# 'deft' => "" } ,
142 { 'name' => "no_cover_image",
143 'desc' => "{BasePlugin.no_cover_image}",
144 'type' => "flag",
145 'reqd' => "no" },
146 { 'name' => "filename_encoding",
147 'desc' => "{BasePlugin.filename_encoding}",
148 'type' => "enum",
149 'deft' => "auto",
150 'list' => $encoding_plus_auto_list,
151 'reqd' => "no" },
152 { 'name' => "smart_block",
153 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
154 'type' => "flag",
155 'reqd' => "no",
156 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
157 { 'name' => "file_rename_method",
158 'desc' => "{BasePlugin.file_rename_method}",
159 'type' => "enum",
160 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
161 'list' => $file_rename_method_list,
162 'reqd' => "no"
163 }
164
165 ];
166
167
168my $options = { 'name' => "BasePlugin",
169 'desc' => "{BasePlugin.desc}",
170 'abstract' => "yes",
171 'inherits' => "no",
172 'args' => $arguments };
173
174sub new {
175
176 my ($class) = shift (@_);
177 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
178 push(@$pluginlist, $class);
179
180 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
181 push(@{$hashArgOptLists->{"OptList"}},$options);
182
183 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
184
185 if ($self->{'info_only'}) {
186 # don't worry about any options etc
187 return bless $self, $class;
188 }
189
190 if ($self->{'smart_block'}) {
191 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
192 }
193 $self->{'smart_block'} = undef;
194
195 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
196 $self->{'plugin_type'} = $plugin_name;
197
198 $self->{'num_processed'} = 0;
199 $self->{'num_not_processed'} = 0;
200 $self->{'num_blocked'} = 0;
201 $self->{'num_archives'} = 0;
202 $self->{'cover_image'} = 1; # cover image is on by default
203 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
204 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
205
206 my $associate_ext = $self->{'associate_ext'};
207 if ((defined $associate_ext) && ($associate_ext ne "")) {
208
209 my $associate_tail_re = $self->{'associate_tail_re'};
210 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
211 my $outhandle = $self->{'outhandle'};
212 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
213 print $outhandle " defaulting to 'associate_tail_re'\n";
214 }
215 else {
216 my @exts = split(/,/,$associate_ext);
217
218 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
219 my $associate_tail_re = join("|",@exts_bracketed);
220 $self->{'associate_tail_re'} = $associate_tail_re;
221 }
222
223 delete $self->{'associate_ext'};
224 }
225
226 return bless $self, $class;
227
228}
229
230sub merge_inheritance
231{
232 my $self = {};
233 my @child_selfs = @_;
234
235 foreach my $child_self (@child_selfs) {
236 foreach my $key (keys %$child_self) {
237 if (defined $self->{$key}) {
238 if ($self->{$key} ne $child_self->{$key}) {
239# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
240# print STDERR "Existing stored value = $self->{$key}\n";
241# print STDERR "New (child) value = $child_self->{$key}\n";
242# print STDERR "Keeping existing value\n";
243 # Existing value seems to be option specified in collect.cfg
244
245 ### $self->{$key} = $child_self->{$key};
246
247 }
248 else {
249## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
250 }
251
252 }
253 else {
254 $self->{$key} = $child_self->{$key};
255 }
256 }
257 }
258
259 return $self;
260}
261
262# initialize BasePlugin options
263# if init() is overridden in a sub-class, remember to call BasePlugin::init()
264sub init {
265 my $self = shift (@_);
266 my ($verbosity, $outhandle, $failhandle) = @_;
267
268 # verbosity is passed through from the processor
269 $self->{'verbosity'} = $verbosity;
270
271 # as are the outhandle and failhandle
272 $self->{'outhandle'} = $outhandle if defined $outhandle;
273 $self->{'failhandle'} = $failhandle;
274# $self->SUPER::init(@_);
275
276 # set process_exp and block_exp to defaults unless they were
277 # explicitly set
278
279 if ((!$self->is_recursive()) and
280 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
281
282 $self->{'process_exp'} = $self->get_default_process_exp ();
283 if ($self->{'process_exp'} eq "") {
284 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
285 }
286 }
287
288 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
289 $self->{'block_exp'} = $self->get_default_block_exp ();
290 }
291
292}
293
294sub begin {
295 my $self = shift (@_);
296 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
297
298 if ($self->{'OIDtype'} eq "auto") {
299 # hasn't been set in the plugin, use the processor values
300 $self->{'OIDtype'} = $processor->{'OIDtype'};
301 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
302 }
303 if ($self->{'OIDtype'} eq "hash") {
304 # should we hash on the file or on the doc xml??
305 $self->{'OIDtype'} = $self->get_oid_hash_type();
306 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
307 $self->{'OIDtype'} = "hash_on_file";
308 }
309 }
310}
311
312sub end {
313 # potentially called at the end of each plugin pass
314 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
315
316 my ($self) = shift (@_);
317}
318
319sub deinit {
320 # called only once, after all plugin passes have been done
321
322 my ($self) = @_;
323}
324
325# default hashing type is to hash on the original file (or converted file)
326# override this to return hash_on_ga_xml for filetypes where hashing on the
327# file is no good eg video
328sub get_oid_hash_type {
329
330 my $self = shift (@_);
331
332 return "hash_on_file";
333}
334
335sub set_incremental {
336 my $self = shift(@_);
337 my ($incremental) = @_;
338
339 $self->{'incremental'} = $incremental;
340}
341
342# this function should be overridden to return 1
343# in recursive plugins
344sub is_recursive {
345 my $self = shift (@_);
346
347 return 0;
348}
349
350sub get_default_block_exp {
351 my $self = shift (@_);
352
353 return "";
354}
355
356sub get_default_process_exp {
357 my $self = shift (@_);
358
359 return "";
360}
361
362# rename imported files and assoc files using URL encoding by default
363# as this will work for most plugins and give more legible filenames
364sub get_default_file_rename_method() {
365 my $self = shift (@_);
366 return "url";
367}
368
369# default implementation is to do nothing
370sub store_block_files {
371
372 my $self =shift (@_);
373 my ($filename_full_path, $block_hash) = @_;
374
375}
376
377# put files to block into hash
378sub use_block_expressions {
379
380 my $self =shift (@_);
381 my ($filename_full_path, $block_hash) = @_;
382
383 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
384 $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
385 }
386
387}
388
389#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
390sub block_cover_image
391{
392 my $self =shift;
393 my ($filename, $block_hash) = @_;
394
395 if ($self->{'cover_image'}) {
396 my $coverfile = $filename;
397 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
398 if (!-e $coverfile) {
399 $coverfile =~ s/jpg$/JPG/;
400 }
401 if (-e $coverfile) {
402 $block_hash->{'file_blocks'}->{$coverfile} = 1;
403 }
404 }
405
406 return;
407}
408
409
410# discover all the files that should be blocked by this plugin
411# check the args ...
412sub file_block_read {
413
414 my $self = shift (@_);
415 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
416 # Keep track of filenames with same root but different extensions
417 # Used to support -associate_ext and the more generalised
418 # -associate_tail_re
419 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
420
421 my $associate_tail_re = $self->{'associate_tail_re'};
422 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
423 my ($file_prefix,$file_ext)
424 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
425 if ((defined $file_prefix) && (defined $file_ext)) {
426 my $shared_fileroot = $block_hash->{'shared_fileroot'};
427 if (!defined $shared_fileroot->{$file_prefix}) {
428 my $file_prefix_rec = { 'tie_to' => undef,
429 'exts' => {} };
430 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
431 }
432
433 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
434
435 if ($self->can_process_this_file($filename_full_path)) {
436 # This is the document the others should be tied to
437 $file_prefix_rec->{'tie_to'} = $file_ext;
438 }
439 else {
440 if ($file_ext =~ m/$associate_tail_re$/) {
441 # this file should be associated to the main one
442 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
443 }
444 }
445
446 }
447 }
448
449 # check block expressions
450 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
451
452 # now check whether we are actually processing this
453 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
454 return undef; # can't recognise
455 }
456
457 # if we have a block_exp, then this overrides the normal 'smart' blocking
458 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
459
460 # block the cover image if there is one
461 if ($self->{'cover_image'}) {
462 $self->block_cover_image($filename_full_path, $block_hash);
463 }
464
465 return 1;
466}
467
468# plugins that rely on more than process_exp (eg XML plugins) can override this method
469sub can_process_this_file {
470 my $self = shift(@_);
471 my ($filename) = @_;
472
473 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
474 return 1;
475 }
476 return 0;
477
478}
479
480# just converts path as is to utf8.
481sub filepath_to_utf8 {
482 my $self = shift (@_);
483 my ($file, $file_encoding) = @_;
484 my $filemeta = $file;
485
486 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
487
488 # Whenever filename-encoding is set to any of the auto settings, we
489 # check if the filename is already in UTF8. If it is, then we're done.
490 if($filename_encoding =~ m/auto/) {
491 if(&unicode::check_is_utf8($filemeta))
492 {
493 $filename_encoding = "utf8";
494 return $filemeta;
495 }
496 }
497
498 # Auto setting, but filename is not utf8
499 if ($filename_encoding eq "auto")
500 {
501 # try textcat
502 $filename_encoding = $self->textcat_encoding($filemeta);
503
504 # check the locale next
505 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
506
507
508 # now try the encoding of the document, if available
509 if ($filename_encoding eq "undefined" && defined $file_encoding) {
510 $filename_encoding = $file_encoding;
511 }
512
513 }
514
515 elsif ($filename_encoding eq "auto-language-analysis")
516 {
517 $filename_encoding = $self->textcat_encoding($filemeta);
518
519 # now try the encoding of the document, if available
520 if ($filename_encoding eq "undefined" && defined $file_encoding) {
521 $filename_encoding = $file_encoding;
522 }
523 }
524
525 elsif ($filename_encoding eq "auto-filesystem-encoding")
526 {
527 # try locale
528 $filename_encoding = $self->locale_encoding();
529 }
530
531 elsif ($filename_encoding eq "auto-fl")
532 {
533 # filesystem-encoding (locale) then language-analysis (textcat)
534 $filename_encoding = $self->locale_encoding();
535
536 # try textcat
537 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
538
539 # else assume filename encoding is encoding of file content, if that's available
540 if ($filename_encoding eq "undefined" && defined $file_encoding) {
541 $filename_encoding = $file_encoding;
542 }
543 }
544
545 elsif ($filename_encoding eq "auto-lf")
546 {
547 # language-analysis (textcat) then filesystem-encoding (locale)
548 $filename_encoding = $self->textcat_encoding($filemeta);
549
550 # guess filename encoding from encoding of file content, if available
551 if ($filename_encoding eq "undefined" && defined $file_encoding) {
552 $filename_encoding = $file_encoding;
553 }
554
555 # try locale
556 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
557 }
558
559 # if still undefined, use utf8 as fallback
560 if ($filename_encoding eq "undefined") {
561 $filename_encoding = "utf8";
562 }
563
564 #print STDERR "**** UTF8 encoding the filename $filemeta ";
565
566 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
567 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
568 # be always utf8 (in which case the filename's encoding is also set as utf8 even
569 # though the filename need not be if it originates from another system)--in such
570 # cases attempt to make the filename utf8 to match.
571 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
572 &unicode::ensure_utf8(\$filemeta);
573 }
574
575 # convert non-unicode encodings to utf8
576 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
577 $filemeta = &unicode::unicode2utf8(
578 &unicode::convert2unicode($filename_encoding, \$filemeta)
579 );
580 }
581
582 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
583 return $filemeta;
584}
585
586# gets the filename with no path, converts to utf8, and then dm safes it.
587# filename_encoding set by user
588sub filename_to_utf8_metadata
589{
590 my $self = shift (@_);
591 my ($file, $file_encoding) = @_;
592
593 my $outhandle = $self->{'outhandle'};
594
595 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
596 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
597
598 my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
599
600 return $dmsafe_filemeta;
601
602}
603
604sub locale_encoding {
605 my $self = shift(@_);
606
607 if (!defined $self->{'filesystem_encoding'}) {
608 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
609 $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'};
610 }
611
612 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
613 return $self->{'filesystem_encoding'}; # can be the string "undefined"
614}
615
616sub textcat_encoding {
617 my $self = shift(@_);
618 my ($filemeta) = @_;
619
620 # analyse filenames without extensions and digits (and trimmed of surrounding
621 # whitespace), so that irrelevant chars don't confuse textcat
622 my $strictfilemeta = $filemeta;
623 $strictfilemeta =~ s/\.[^\.]+$//g;
624 $strictfilemeta =~ s/\d//g;
625 $strictfilemeta =~ s/^\s*//g;
626 $strictfilemeta =~ s/\s*$//g;
627
628 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
629 if(!defined $filename_encoding) {
630 $filename_encoding = "undefined";
631 }
632
633 return $filename_encoding; # can be the string "undefined"
634}
635
636# performs textcat
637sub encoding_from_language_analysis {
638 my $self = shift(@_);
639 my ($text) = @_;
640
641 my $outhandle = $self->{'outhandle'};
642 my $best_encoding = undef;
643
644 # get the language/encoding of the textstring using textcat
645 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
646 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
647
648
649 if (scalar @$results < 0) {
650 return undef;
651 }
652
653 # We have some results, we choose the first
654 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
655
656 $best_encoding = $encoding;
657 if (!defined $best_encoding) {
658 return undef;
659 }
660
661 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
662 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
663 $best_encoding = 'utf8';
664 }
665
666
667 # check for equivalents where textcat doesn't have some encodings...
668 # eg MS versions of standard encodings
669 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
670 my $iso = $1; # which variant of the iso standard?
671 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
672 if ($text =~ /[\x80-\x9f]/) {
673 # Western Europe
674 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
675 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
676 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
677 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
678 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
679 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
680 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
681 }
682 }
683
684 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
685 !defined $encodings::encodings->{$best_encoding})
686 {
687 if ($self->{'verbosity'}) {
688 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
689 }
690 $best_encoding = undef;
691 }
692
693 return $best_encoding;
694}
695
696# uses locale
697sub get_filesystem_encoding {
698
699 my $self = shift(@_);
700
701 my $outhandle = $self->{'outhandle'};
702 my $filesystem_encoding = undef;
703
704 eval {
705 use POSIX qw(locale_h);
706
707 # With only one parameter, setlocale retrieves the
708 # current value
709 my $current_locale = setlocale(LC_CTYPE);
710
711 if ($current_locale =~ m/^.*\.(.*?)$/) {
712 my $char_encoding = lc($1);
713 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
714 $char_encoding = "$1\_$2\_$3";
715 }
716
717 $char_encoding =~ s/-/_/g;
718 $char_encoding =~ s/^utf_8$/utf8/;
719
720 if ($char_encoding =~ m/^\d+$/) {
721 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
722 $char_encoding = "windows_$char_encoding";
723 }
724 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
725 $char_encoding = "dos_$char_encoding";
726 }
727 }
728
729 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
730 || (defined $encodings::encodings->{$char_encoding})) {
731 $filesystem_encoding = $char_encoding;
732 }
733 else {
734 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
735 }
736 }
737
738
739 };
740 if ($@) {
741 print $outhandle "$@\n";
742 print $outhandle "Warning: Unable to establish locale. Will assume filesytem is UTF-8\n";
743
744 }
745 return $filesystem_encoding;
746}
747
748# is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
749sub set_Source_metadata {
750 my $self = shift (@_);
751 my ($doc_obj, $filename_no_path, $file_encoding) = @_;
752
753 my $top_section = $doc_obj->get_top_section();
754
755 # UTF-8 version of filename
756 my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
757
758 # Source is the UTF8 display name - not necessarily the name of the file on the system
759 $doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta);
760
761 $filemeta = &util::rename_file($filemeta, $self->{'file_rename_method'});
762 # If using URL encoding, then SourceFile is the url-reference to url-encoded
763 # filemeta: it's a url that refers to the actual file on the system
764 $filemeta = &unicode::filename_to_url($filemeta, $self->{'file_rename_method'});
765
766 $doc_obj->set_utf8_metadata_element($top_section, "SourceFile", $filemeta);
767}
768
769# this should be called by all plugins to set the oid of the doc obj, rather
770# than calling doc_obj->set_OID directly
771sub add_OID {
772 my $self = shift (@_);
773 my ($doc_obj) = @_;
774
775 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
776
777 # see if there is a plugin specific set_OID function
778 if (defined ($self->can('set_OID'))) {
779 $self->set_OID(@_); # pass through doc_obj and any extra arguments
780 }
781 else {
782 # use the default set_OID() in doc.pm
783 $doc_obj->set_OID();
784 }
785
786}
787
788# The BasePlugin read_into_doc_obj() function. This function does all the
789# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
790# a file and sets up a slew of metadata all saved in doc_obj, which
791# it then returns as part of a tuple (process_status,doc_obj)
792#
793# Much of this functionality used to reside in read, but it was broken
794# down into a supporting routine to make the code more flexible.
795#
796# recursive plugins (e.g. RecPlug) and specialized plugins like those
797# capable of processing many documents within a single file (e.g.
798# GMLPlug) will normally want to implement their own version of
799# read_into_doc_obj()
800#
801# Note that $base_dir might be "" and that $file might
802# include directories
803
804# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
805sub read_into_doc_obj {
806 my $self = shift (@_);
807 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
808
809 my $outhandle = $self->{'outhandle'};
810
811 # should we move this to read? What about secondary plugins?
812 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
813 print $outhandle "$self->{'plugin_type'} processing $file\n"
814 if $self->{'verbosity'} > 1;
815
816 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
817 # create a new document
818 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
819 my $top_section = $doc_obj->get_top_section();
820
821 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
822 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
823
824 # sets the UTF8 filename (Source) for display and sets the url ref to URL encoded version
825 # of the UTF8 filename (SourceFile) for generated files
826 $self->set_Source_metadata($doc_obj, $filename_no_path);
827
828 # plugin specific stuff - what args do we need here??
829 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
830 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
831 return -1;
832 }
833
834 # include any metadata passed in from previous plugins
835 # note that this metadata is associated with the top level section
836 my $section = $doc_obj->get_top_section();
837 # can we merge these two methods??
838 $self->add_associated_files($doc_obj, $filename_full_path);
839 $self->extra_metadata ($doc_obj, $section, $metadata);
840 $self->auto_extract_metadata($doc_obj);
841
842 # if we haven't found any Title so far, assign one
843 # this was shifted to here from inside read()
844 $self->title_fallback($doc_obj,$section,$filename_no_path);
845
846 $self->add_OID($doc_obj);
847
848 return (1,$doc_obj);
849}
850
851sub add_dummy_text {
852 my $self = shift(@_);
853 my ($doc_obj, $section) = @_;
854
855 # add NoText metadata so we can hide this dummy text in format statements
856 $doc_obj->add_metadata($section, "NoText", "1");
857 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
858
859}
860
861# does nothing. Can be overridden by subclass
862sub auto_extract_metadata {
863 my $self = shift(@_);
864 my ($doc_obj) = @_;
865}
866
867# adds cover image, associate_file options stuff. Should be called by sub class
868# read_into_doc_obj
869sub add_associated_files {
870 my $self = shift(@_);
871 # whatis filename??
872 my ($doc_obj, $filename) = @_;
873
874 # add in the cover image
875 if ($self->{'cover_image'}) {
876 $self->associate_cover_image($doc_obj, $filename);
877 }
878
879
880}
881
882# implement this if you are extracting metadata for other documents
883sub metadata_read {
884 my $self = shift (@_);
885 my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
886
887 # can we process this file??
888 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
889 return undef unless $self->can_process_this_file($filename_full_path);
890
891 return 1; # we recognise the file, but don't actually do anything with it
892}
893
894
895# The BasePlugin read() function. This function calls read_into_doc_obj()
896# to ensure all the right things to make general options work for a
897# given plugin are done. It then calls the process() function which
898# does all the work specific to a plugin (like the old read functions
899# used to do). Most plugins should define their own process() function
900# and let this read() function keep control.
901#
902# recursive plugins (e.g. RecPlug) and specialized plugins like those
903# capable of processing many documents within a single file (e.g.
904# GMLPlug) might want to implement their own version of read(), but
905# more likely need to implement their own version of read_into_doc_obj()
906#
907# Return number of files processed, undef if can't recognise, -1 if can't
908# process
909
910sub read {
911 my $self = shift (@_);
912 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
913
914 # can we process this file??
915 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
916 return undef unless $self->can_process_this_file($filename_full_path);
917
918 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
919
920 if ((defined $process_status) && ($process_status == 1)) {
921
922 # process the document
923 $processor->process($doc_obj);
924
925 $self->{'num_processed'} ++;
926 undef $doc_obj;
927 }
928 # delete any temp files that we may have created
929 $self->clean_up_after_doc_obj_processing();
930
931 # if process_status == 1, then the file has been processed.
932 return $process_status;
933
934}
935
936# returns undef if file is rejected by the plugin
937sub process {
938 my $self = shift (@_);
939 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
940
941 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n";
942
943 return undef; # never gets here
944}
945
946# overwrite this method to delete any temp files that we have created
947sub clean_up_after_doc_obj_processing {
948 my $self = shift(@_);
949
950}
951
952# write_file -- used by ConvertToPlug, for example in post processing
953#
954# where should this go, is here the best place??
955sub utf8_write_file {
956 my $self = shift (@_);
957 my ($textref, $filename) = @_;
958
959 if (!open (FILE, ">$filename")) {
960 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
961 die "\n";
962 }
963 print FILE $$textref;
964
965 close FILE;
966}
967
968
969sub filename_based_title
970{
971 my $self = shift (@_);
972 my ($file) = @_;
973
974 my $file_derived_title = $file;
975 $file_derived_title =~ s/_/ /g;
976 $file_derived_title =~ s/\..*?$//;
977
978 return $file_derived_title;
979}
980
981
982sub title_fallback
983{
984 my $self = shift (@_);
985 my ($doc_obj,$section,$file) = @_;
986
987 if (!defined $doc_obj->get_metadata_element ($section, "Title") or $doc_obj->get_metadata_element($section, "Title") eq "") {
988
989 my $file_derived_title = $self->filename_to_utf8_metadata($self->filename_based_title($file));
990 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
991 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
992 }
993 else {
994 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
995 }
996 }
997
998}
999
1000# add any extra metadata that's been passed around from one
1001# plugin to another.
1002# extra_metadata uses add_utf8_metadata so it expects metadata values
1003# to already be in utf8
1004sub extra_metadata {
1005 my $self = shift (@_);
1006 my ($doc_obj, $cursection, $metadata) = @_;
1007
1008 my $associate_tail_re = $self->{'associate_tail_re'};
1009
1010 foreach my $field (keys(%$metadata)) {
1011 # $metadata->{$field} may be an array reference
1012 if ($field eq "gsdlassocfile_tobe") {
1013 # 'gsdlassocfile_tobe' is artificially introduced metadata
1014 # that is used to signal that certain additional files should
1015 # be tied to this document. Useful in situations where a
1016 # metadata pass in the plugin pipeline works out some files
1017 # need to be associated with a document, but the document hasn't
1018 # been formed yet.
1019 my $equiv_form = "";
1020 foreach my $gaf (@{$metadata->{$field}}) {
1021 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1022 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1023
1024 # we need to make sure the filename is valid utf-8 - we do
1025 # this by url or base64 encoding it
1026 # $tail_filename is the name that we store the file as
1027 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1028 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1029 # If the filename is url_encoded, we need to encode the % signs
1030 # in the filename, so that it works in a url
1031 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1032 # work out extended tail extension (i.e. matching tail re)
1033
1034 my ($file_prefix,$file_extended_ext)
1035 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1036 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1037 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1038 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$url_tail_filename\">";
1039 my $srcicon = "_icon".$doc_ext."_";
1040 my $end_doclink = "</a>";
1041
1042 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1043
1044 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1045 # for metadata such as [mp3._edited] [mp3._full] ...
1046 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1047 }
1048
1049 # for multiple metadata such as [mp3.assoclink]
1050 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1051
1052 $equiv_form .= " $assoc_form";
1053 }
1054 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1055 }
1056 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1057 map {
1058 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1059 } @{$metadata->{$field}};
1060 } else {
1061 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1062 }
1063 }
1064}
1065
1066
1067sub compile_stats {
1068 my $self = shift(@_);
1069 my ($stats) = @_;
1070
1071 $stats->{'num_processed'} += $self->{'num_processed'};
1072 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1073 $stats->{'num_archives'} += $self->{'num_archives'};
1074
1075}
1076
1077sub associate_cover_image {
1078 my $self = shift;
1079 my ($doc_obj, $filename) = @_;
1080
1081 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1082 if (exists $self->{'covers_missing_cache'}->{$filename}) {
1083 # don't stat() for existence eg for multiple document input files
1084 # (eg SplitPlug)
1085 return;
1086 }
1087
1088 my $top_section=$doc_obj->get_top_section();
1089
1090 if (-e $filename) {
1091 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1092 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1093 } else {
1094 my $upper_filename = $filename;
1095 $upper_filename =~ s/jpg$/JPG/;
1096 if (-e $upper_filename) {
1097 $doc_obj->associate_file($upper_filename, "cover.jpg",
1098 "image/jpeg");
1099 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1100 } else {
1101 # file doesn't exist, so record the fact that it's missing so
1102 # we don't stat() again (stat is slow)
1103 $self->{'covers_missing_cache'}->{$filename} = 1;
1104 }
1105 }
1106
1107}
1108
1109
1110# Overridden by exploding plugins (eg. ISISPlug)
1111sub clean_up_after_exploding
1112{
1113 my $self = shift(@_);
1114}
1115
1116
1117
11181;
Note: See TracBrowser for help on using the repository browser.