source: gsdl/trunk/perllib/plugins/BasePlugin.pm@ 16557

Last change on this file since 16557 was 16557, checked in by ak19, 16 years ago

Auto filename encoding has several additional settings now, these are handled by subroutine filepath_to_utf8 which has changed accordingly. Some additional helper subroutines added. This file BasePlugin.pm is an intermediate but working version (still has many debug output statements even when most are commented out, but as I want to test the changes out on Windows first, I want to retain the debug statements).

  • Property svn:keywords set to Author Date Id Revision
File size: 32.4 KB
RevLine 
[537]1###########################################################################
2#
[15868]3# BasePlugin.pm -- base class for all the import plugins
[537]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
[9413]8# Copyright (C) 1999-2005 New Zealand Digital Library Project
[537]9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[4]25
[15868]26package BasePlugin;
[2219]27
[10254]28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
[9413]31
[8892]32use File::Basename;
33
[1219]34use multiread;
[1870]35use encodings;
[11389]36use unicode;
[16557]37use textcat;
[1242]38use doc;
[7645]39eval "require diagnostics"; # some perl distros (eg mac) don't have this
[2751]40use ghtml;
[9413]41use gsprintf 'gsprintf';
[4]42
[15868]43use PrintInfo;
[10218]44
[15868]45BEGIN {
46 @BasePlugin::ISA = ( 'PrintInfo' );
47}
[5681]48
[15868]49our $encoding_list =
[10218]50 [ { 'name' => "ascii",
[16014]51 'desc' => "{BasePlugin.encoding.ascii}" },
[4744]52 { 'name' => "utf8",
[16014]53 'desc' => "{BasePlugin.encoding.utf8}" },
[4744]54 { 'name' => "unicode",
[16014]55 'desc' => "{BasePlugin.encoding.unicode}" } ];
[16557]56
[3540]57
[10620]58my $e = $encodings::encodings;
59foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
60{
61 my $hashEncode =
62 {'name' => $enc,
63 'desc' => $e->{$enc}->{'name'}};
64
[15868]65 push(@{$encoding_list},$hashEncode);
[10620]66}
67
[15868]68our $encoding_plus_auto_list =
69 [ { 'name' => "auto",
[16557]70 'desc' => "{BasePlugin.filename_encoding.auto}" },
71 { 'name' => "auto-language-analysis",
72 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
73 { 'name' => "auto-filesystem-encoding",
74 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
75 { 'name' => "auto-fl",
76 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
77 { 'name' => "auto-lf",
78 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
[10620]79
[15868]80push(@{$encoding_plus_auto_list},@{$encoding_list});
81
[4873]82my $arguments =
[3540]83 [ { 'name' => "process_exp",
[15868]84 'desc' => "{BasePlugin.process_exp}",
[6408]85 'type' => "regexp",
[3540]86 'deft' => "",
87 'reqd' => "no" },
[16390]88 { 'name' => "no_blocking",
89 'desc' => "{BasePlugin.no_blocking}",
90 'type' => "flag",
91 'reqd' => "no"},
[3540]92 { 'name' => "block_exp",
[15868]93 'desc' => "{BasePlugin.block_exp}",
[6408]94 'type' => "regexp",
[3540]95 'deft' => "",
96 'reqd' => "no" },
[8892]97 { 'name' => "associate_ext",
[15868]98 'desc' => "{BasePlugin.associate_ext}",
[8892]99 'type' => "string",
100 'reqd' => "no" },
[11122]101 { 'name' => "associate_tail_re",
[15868]102 'desc' => "{BasePlugin.associate_tail_re}",
[11122]103 'type' => "string",
104 'reqd' => "no" },
[15018]105 { 'name' => "use_as_doc_identifier",
[15868]106 'desc' => "{BasePlugin.use_as_doc_identifier}",
[15018]107 'type' => "string",
108 'reqd' => "no" ,
109 'deft' => "" } ,
[15868]110 { 'name' => "no_cover_image",
111 'desc' => "{BasePlugin.no_cover_image}",
[3540]112 'type' => "flag",
113 'reqd' => "no" },
[15868]114 { 'name' => "filename_encoding",
115 'desc' => "{BasePlugin.filename_encoding}",
116 'type' => "enum",
117 'deft' => "auto",
118 'list' => $encoding_plus_auto_list,
[16390]119 'reqd' => "no" },
120 { 'name' => "smart_block",
[16520]121 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
[16390]122 'type' => "flag",
123 'reqd' => "no",
124 'hiddengli' => "yes" } # deprecated, but leave in for old collections
125
[15868]126
127 ];
[3540]128
[9398]129
[15868]130my $options = { 'name' => "BasePlugin",
131 'desc' => "{BasePlugin.desc}",
[6408]132 'abstract' => "yes",
133 'inherits' => "no",
[4750]134 'args' => $arguments };
[3540]135
[4778]136
[4]137sub new {
[10218]138
[15868]139 my ($class) = shift (@_);
140 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
[10218]141 push(@$pluginlist, $class);
[9398]142
[15868]143 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
144 push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]145
[15868]146 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists);
[16390]147
148 if ($self->{'info_only'}) {
149 # don't worry about any options etc
150 return bless $self, $class;
151 }
[10579]152
[16390]153 if ($self->{'smart_block'}) {
154 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
155 }
156 $self->{'smart_block'} = undef;
157
[15868]158 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
159 $self->{'plugin_type'} = $plugin_name;
[10579]160
[2785]161 $self->{'num_processed'} = 0;
162 $self->{'num_not_processed'} = 0;
163 $self->{'num_blocked'} = 0;
164 $self->{'num_archives'} = 0;
[8678]165 $self->{'cover_image'} = 1; # cover image is on by default
[10218]166 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
[10579]167 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
[3540]168
[8892]169 my $associate_ext = $self->{'associate_ext'};
170 if ((defined $associate_ext) && ($associate_ext ne "")) {
[9351]171
[11122]172 my $associate_tail_re = $self->{'associate_tail_re'};
173 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
174 my $outhandle = $self->{'outhandle'};
175 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
176 print $outhandle " defaulting to 'associate_tail_re'\n";
[8892]177 }
[11122]178 else {
179 my @exts = split(/,/,$associate_ext);
[8892]180
[11122]181 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
182 my $associate_tail_re = join("|",@exts_bracketed);
183 $self->{'associate_tail_re'} = $associate_tail_re;
184 }
185
186 delete $self->{'associate_ext'};
[8892]187 }
188
[15868]189 return bless $self, $class;
[11089]190
[4]191}
192
[15868]193# initialize BasePlugin options
194# if init() is overridden in a sub-class, remember to call BasePlugin::init()
[1242]195sub init {
196 my $self = shift (@_);
[2785]197 my ($verbosity, $outhandle, $failhandle) = @_;
[1242]198
199 # verbosity is passed through from the processor
200 $self->{'verbosity'} = $verbosity;
201
[2785]202 # as are the outhandle and failhandle
[1424]203 $self->{'outhandle'} = $outhandle if defined $outhandle;
[2785]204 $self->{'failhandle'} = $failhandle;
[16390]205# $self->SUPER::init(@_);
206
[1242]207 # set process_exp and block_exp to defaults unless they were
208 # explicitly set
[1244]209
210 if ((!$self->is_recursive()) and
[1242]211 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
[1244]212
[1242]213 $self->{'process_exp'} = $self->get_default_process_exp ();
214 if ($self->{'process_exp'} eq "") {
[1244]215 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
[1242]216 }
217 }
218
219 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
220 $self->{'block_exp'} = $self->get_default_block_exp ();
221 }
[11089]222
[1242]223}
224
[839]225sub begin {
226 my $self = shift (@_);
227 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
228}
229
230sub end {
[10155]231 # potentially called at the end of each plugin pass
232 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
233
[15868]234 my ($self) = shift (@_);
[839]235}
236
[10155]237sub deinit {
238 # called only once, after all plugin passes have been done
239
240 my ($self) = @_;
241}
242
[15868]243sub set_incremental {
244 my $self = shift(@_);
245 my ($incremental) = @_;
246
247 $self->{'incremental'} = $incremental;
248}
249
[1242]250# this function should be overridden to return 1
251# in recursive plugins
[4]252sub is_recursive {
253 my $self = shift (@_);
254
[1242]255 return 0;
[4]256}
257
[1242]258sub get_default_block_exp {
259 my $self = shift (@_);
260
261 return "";
262}
263
264sub get_default_process_exp {
265 my $self = shift (@_);
266
267 return "";
268}
269
[16390]270# default implementation is to do nothing
271sub store_block_files {
272
[9067]273 my $self =shift (@_);
[16390]274 my ($filename_full_path, $block_hash) = @_;
275
[9067]276}
277
[16390]278# put files to block into hash
279sub use_block_expressions {
280
281 my $self =shift (@_);
282 my ($filename_full_path, $block_hash) = @_;
283
284 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
285 $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
286 }
287
288}
289
[9067]290#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
291sub block_cover_image
292{
[10833]293 my $self =shift;
[16390]294 my ($filename, $block_hash) = @_;
[10833]295
[9067]296 if ($self->{'cover_image'}) {
297 my $coverfile = $filename;
298 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
299 if (!-e $coverfile) {
300 $coverfile =~ s/jpg$/JPG/;
301 }
302 if (-e $coverfile) {
[16390]303 $block_hash->{'file_blocks'}->{$coverfile} = 1;
[11089]304 }
[9067]305 }
306
307 return;
308}
[11122]309
310
[16390]311# discover all the files that should be blocked by this plugin
312# check the args ...
313sub file_block_read {
[11122]314
[8510]315 my $self = shift (@_);
[16390]316 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
[8892]317 # Keep track of filenames with same root but different extensions
[11122]318 # Used to support -associate_ext and the more generalised
319 # -associate_tail_re
[16390]320 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[8892]321
[11122]322 my $associate_tail_re = $self->{'associate_tail_re'};
323 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
[16390]324
[11122]325 my ($file_prefix,$file_ext)
[16390]326 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
327
[8892]328 if ((defined $file_prefix) && (defined $file_ext)) {
[16390]329 my $shared_fileroot = $block_hash->{'shared_fileroot'};
[8892]330 if (!defined $shared_fileroot->{$file_prefix}) {
[11122]331 my $file_prefix_rec = { 'tie_to' => undef,
332 'exts' => {} };
[8892]333 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
334 }
335
336 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
337
[16390]338 if ($self->can_process_this_file($filename_full_path)) {
[8892]339 # This is the document the others should be tied to
340 $file_prefix_rec->{'tie_to'} = $file_ext;
341 }
342 else {
[11122]343 if ($file_ext =~ m/$associate_tail_re$/) {
[16390]344 # this file should be associated to the main one
[9351]345 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
346 }
[8892]347 }
[11122]348
[8892]349 }
350 }
[11122]351
[16390]352 # check block expressions
353 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
354
[9067]355 # now check whether we are actually processing this
[16390]356 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
[9067]357 return undef; # can't recognise
358 }
[16390]359
360 $self->store_block_files($filename_full_path, $block_hash) unless $self->{'no_blocking'};
[8892]361
[11089]362 # block the cover image if there is one
363 if ($self->{'cover_image'}) {
[16390]364 $self->block_cover_image($filename_full_path, $block_hash) unless $self->{'no_blocking'};
[11089]365 }
[9067]366
367 return 1;
[8510]368}
369
[16390]370# plugins that rely on more than process_exp (eg XML plugins) can override this method
371sub can_process_this_file {
372 my $self = shift(@_);
373 my ($filename) = @_;
[8892]374
[16390]375 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
376 return 1;
[8892]377 }
378 return 0;
[10280]379
380}
381
[16390]382# just converts path as is to utf8.
383sub filepath_to_utf8 {
[10280]384 my $self = shift (@_);
[15868]385 my ($file, $file_encoding) = @_;
[16390]386 my $filemeta = $file;
[10280]387
[16557]388 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
389
390## print STDERR "**** User chose filename encoding setting: $filename_encoding\n";
391
392 # Whenever filename-encoding is set to any of the auto settings, we
393 # check if the filename is already in UTF8. If it is, then we're done.
394 if($filename_encoding =~ m/auto/) {
395 if(&unicode::check_is_utf8($filemeta))
396 {
397## print STDERR "**** It is already UTF8\n";
398 $filename_encoding = "utf8";
399 return $filemeta;
400 }
401 }
402
403 # Auto setting, but filename is not utf8
404 if ($filename_encoding eq "auto")
405 {
406 # try textcat
407 $filename_encoding = $self->textcat_encoding($filemeta);
408
409 # check the locale next
410 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
411
412
413 # now try the encoding of the document, if available
414 if ($filename_encoding eq "undefined" && defined $file_encoding) {
415 $filename_encoding = $file_encoding;
[15868]416 }
[16557]417
418 }
419
420 elsif ($filename_encoding eq "auto-language-analysis")
421 {
422 $filename_encoding = $self->textcat_encoding($filemeta);
423
424 # now try the encoding of the document, if available
425 if ($filename_encoding eq "undefined" && defined $file_encoding) {
426 $filename_encoding = $file_encoding;
427 }
428 }
429
430 elsif ($filename_encoding eq "auto-filesystem-encoding")
431 {
432 # try locale
433 $filename_encoding = $self->locale_encoding();
434 }
435
436 elsif ($filename_encoding eq "auto-fl")
437 {
438 # filesystem-encoding (locale) then language-analysis (textcat)
439 $filename_encoding = $self->locale_encoding();
440
441 # try textcat
442 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
443
444 # else assume filename encoding is encoding of file content, if that's available
445 if ($filename_encoding eq "undefined" && defined $file_encoding) {
446 $filename_encoding = $file_encoding;
[15868]447 }
[16557]448 }
[15868]449
[16557]450 elsif ($filename_encoding eq "auto-lf")
451 {
452 # language-analysis (textcat) then filesystem-encoding (locale)
453 $filename_encoding = $self->textcat_encoding($filemeta);
454
455 # guess filename encoding from encoding of file content, if available
456 if ($filename_encoding eq "undefined" && defined $file_encoding) {
457 $filename_encoding = $file_encoding;
458 }
459
460 # try locale
461 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
[15868]462 }
463
[16557]464## print STDERR "**** filename_encoding selected: $filename_encoding \n";
465
466 # if still undefined, use utf8 as fallback
467 if ($filename_encoding eq "undefined") {
468 $filename_encoding = "utf8";
469 }
470
471 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
472 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
473 # be always utf8 (in which case the filename's encoding is also set as utf8 even
474 # though the filename need not be if it originates from another system)--in such
475 # cases attempt to make the filename utf8 to match.
476 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
477## print STDERR "**** BEFORE utf8 conversion: $filemeta\n";
478 &unicode::ensure_utf8(\$filemeta);
479## print STDERR "**** AFTER utf8 conversion: $filemeta\n";
480 }
481
482
483 # convert non-unicode encodings to utf8
484 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
485 $filemeta = &unicode::unicode2utf8(
486 &unicode::convert2unicode($filename_encoding, \$filemeta)
[15868]487 );
488 }
[16390]489
[16557]490 print "*** filename encoding found: $filename_encoding\n";
491 print "*** utf8 encoded filename: $filemeta\n";
492
[16390]493 return $filemeta;
494}
495
496# gets the filename with no path, converts to utf8, and then dm safes it.
497#filename_encoding set by user
498sub filename_to_utf8_metadata
499{
500 my $self = shift (@_);
501 my ($file, $file_encoding) = @_;
502
503 my $outhandle = $self->{'outhandle'};
504
505 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
506 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
507
[15868]508 my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
509
510 return $dmsafe_filemeta;
511
[10280]512}
513
[16557]514sub locale_encoding {
515 my $self = shift(@_);
516
517 if (!defined $self->{'filesystem_encoding'}) {
518 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
519 $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'};
520 }
[10280]521
[16557]522 print "filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
523 return $self->{'filesystem_encoding'}; # can be the string "undefined"
524}
525
526sub textcat_encoding {
527 my $self = shift(@_);
528 my ($filemeta) = @_;
529
530 # analyse filenames without extensions and digits (and trimmed of surrounding
531 # whitespace), so that irrelevant chars don't confuse textcat
532 my $strictfilemeta = $filemeta;
533 $strictfilemeta =~ s/\.[^\.]+$//g;
534 $strictfilemeta =~ s/\d//g;
535 $strictfilemeta =~ s/^\s*//g;
536 $strictfilemeta =~ s/\s*$//g;
537
538## print STDERR "**** strict filename is |$strictfilemeta|\n";
539 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
540 if(!defined $filename_encoding) {
541 $filename_encoding = "undefined";
542 }
543
544## print STDERR "**** textcat found filename encoding: " . $file_textcat_encoding_map{$strictfilemeta} . "\n";
545 return $filename_encoding; # can be the string "undefined"
546}
547
548# performs textcat
549sub encoding_from_language_analysis {
550 my $self = shift(@_);
551 my ($text) = @_;
552
553 my $outhandle = $self->{'outhandle'};
554 my $best_encoding = undef;
555
556 # get the language/encoding of the file using textcat
557 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
558 #my $results = $self->{'textcat'}->classify(\$text);
559 my $results = $self->{'textcat'}->classify_cached(\$text);
560
561
562 if (scalar @$results < 0) {
563 print STDERR "**** Textcat returned 0 results\n";
564 return undef;
565 }
566
567 print STDERR "**** TEXTCAT RESULTS for $text: ";
568 print STDERR join(",", @$results);
569 print STDERR "\n";
570
571 # We have some results, we choose the first
572 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
573
574 $best_encoding = $encoding;
575 if (!defined $best_encoding) {
576## print STDERR "**** Textcat cannot determine encoding of filename: it's undefined.\n";
577 return undef;
578 }
579
580 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
581 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
582## print STDERR "*** Filename turns out to be UTF8\n";
583 $best_encoding = 'utf8';
584 }
585
586
587 # check for equivalents where textcat doesn't have some encodings...
588 # eg MS versions of standard encodings
589 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
590## print STDERR "**** best_encoding is ISO_8859: $best_encoding\n";
591
592 my $iso = $1; # which variant of the iso standard?
593 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
594 if ($text =~ /[\x80-\x9f]/) {
595## print STDERR "**** best_encoding is some windows value: $best_encoding\n";
596 # Western Europe
597 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
598 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
599 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
600 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
601 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
602 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
603 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
604## print STDERR "**** best_encoding windows value: $best_encoding\n";
605 }
606 }
607
608 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
609 !defined $encodings::encodings->{$best_encoding})
610 {
611 if ($self->{'verbosity'}) {
612 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
613 }
614## print STDERR "***** unsupported encoding: $best_encoding. Setting it to undefined.\n";
615 $best_encoding = undef;
616 }
617## print STDERR "**** language: $language\n" if defined $language;
618## print STDERR "**** encoding: $best_encoding\n" if defined $encoding;
619
620 return $best_encoding;
621}
622
623# uses locale
[15868]624sub get_filesystem_encoding {
[10280]625
[15868]626 my $self = shift(@_);
627
[14961]628 my $outhandle = $self->{'outhandle'};
629 my $filesystem_encoding = undef;
630
631 eval {
632 use POSIX qw(locale_h);
[15868]633
634 # With only one parameter, setlocale retrieves the
635 # current value
[14961]636 my $current_locale = setlocale(LC_CTYPE);
[15868]637
[14961]638 if ($current_locale =~ m/^.*\.(.*?)$/) {
639 my $char_encoding = lc($1);
[15446]640 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
641 $char_encoding = "$1\_$2\_$3";
642 }
643
[14961]644 $char_encoding =~ s/-/_/g;
645 $char_encoding =~ s/^utf_8$/utf8/;
[15868]646
[14961]647 if ($char_encoding =~ m/^\d+$/) {
[15607]648 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
[14961]649 $char_encoding = "windows_$char_encoding";
650 }
[15607]651 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
[14961]652 $char_encoding = "dos_$char_encoding";
653 }
654 }
[15868]655
[14961]656 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
[15446]657 || (defined $encodings::encodings->{$char_encoding})) {
[14961]658 $filesystem_encoding = $char_encoding;
659 }
660 else {
661 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
662 }
663 }
[15868]664
[14961]665
666 };
667 if ($@) {
668 print $outhandle "$@\n";
669 print $outhandle "Warning: Unable to establish locale. Will assume filesytem is UTF-8\n";
670
671 }
[15868]672 return $filesystem_encoding;
673}
[14961]674
[15868]675# is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
676sub set_Source_metadata {
677 my $self = shift (@_);
678 my ($doc_obj, $filename_no_path, $file_encoding) = @_;
679
680 my $top_section = $doc_obj->get_top_section();
[14961]681
[15868]682 # UTF-8 version of filename
683 my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
[16022]684 $doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta);
[14961]685
686}
[15868]687
688sub add_OID {
[14961]689
[15018]690 my $self = shift (@_);
691 my ($doc_obj) = @_;
692
693 # See if a metadata field is specified as the field
694 if ((defined $self->{'use_as_doc_identifier'}) && ($self->{'use_as_doc_identifier'} ne "")) {
695 my $metadata_doc_id = $self->{'use_as_doc_identifier'};
696
697 # Consider "tidying" up metadata_doc_id to be something
698 # suitable in a URL
699 # Could even support a user specified plugin RE for this.
700
701 my $top_section = $doc_obj->get_top_section();
702 my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id);
703## print STDERR "**** oid = $oid\n";
704 $doc_obj->set_OID($oid);
705 }
706 # See if there is a plugin-specific set_OID function...
707 elsif (defined ($self->can('set_OID'))) {
708 # it will need $doc_obj to set the Identifier metadata...
709 $self->set_OID(@_); # pass through any extra arguments supplied
710 } else {
711 # use the default set_OID() in doc.pm
712 $doc_obj->set_OID();
713 }
714}
715
[15868]716
717
718# The BasePlugin read_into_doc_obj() function. This function does all the
719# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
[10280]720# a file and sets up a slew of metadata all saved in doc_obj, which
721# it then returns as part of a tuple (process_status,doc_obj)
722#
723# Much of this functionality used to reside in read, but it was broken
724# down into a supporting routine to make the code more flexible.
725#
726# recursive plugins (e.g. RecPlug) and specialized plugins like those
727# capable of processing many documents within a single file (e.g.
728# GMLPlug) will normally want to implement their own version of
729# read_into_doc_obj()
730#
731# Note that $base_dir might be "" and that $file might
732# include directories
[15868]733
734# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
[10280]735sub read_into_doc_obj {
736 my $self = shift (@_);
[16390]737 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[10280]738
[15868]739 my $outhandle = $self->{'outhandle'};
[10280]740
[15868]741 # should we move this to read? What about secondary plugins?
742 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
743 print $outhandle "$self->{'plugin_type'} processing $file\n"
744 if $self->{'verbosity'} > 1;
[10280]745
[16390]746 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[1242]747 # create a new document
[15868]748 my $doc_obj = new doc ($filename_full_path, "indexed_doc");
[14961]749 my $top_section = $doc_obj->get_top_section();
750
[15868]751 # this should look at the plugin option too...
752 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
[14961]753 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
[15868]754 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
755
[15877]756 $self->set_Source_metadata($doc_obj, $filename_no_path);
[8166]757
[15868]758 # plugin specific stuff - what args do we need here??
759 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
760 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
761 return -1;
[2816]762 }
[1242]763
[15868]764 # include any metadata passed in from previous plugins
765 # note that this metadata is associated with the top level section
766 my $section = $doc_obj->get_top_section();
767 # can we merge these two methods??
768 $self->add_associated_files($doc_obj, $filename_full_path);
769 $self->extra_metadata ($doc_obj, $section, $metadata);
770 $self->auto_extract_metadata($doc_obj);
[1242]771
[15868]772 # if we haven't found any Title so far, assign one
773 # this was shifted to here from inside read()
774 $self->title_fallback($doc_obj,$section,$filename_no_path);
775
776 $self->add_OID($doc_obj);
777
778 return (1,$doc_obj);
779}
[2785]780
[15868]781sub add_dummy_text {
782 my $self = shift(@_);
783 my ($doc_obj, $section) = @_;
[2785]784
[15868]785 # add NoText metadata so we can hide this dummy text in format statements
786 $doc_obj->add_metadata($section, "NoText", "1");
787 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
788
789}
[8510]790
[15868]791# does nothing. Can be overridden by subclass
792sub auto_extract_metadata {
793 my $self = shift(@_);
794 my ($doc_obj) = @_;
795}
[11122]796
[15868]797# adds cover image, associate_file options stuff. Should be called by sub class
798# read_into_doc_obj
799sub add_associated_files {
800 my $self = shift(@_);
801 # whatis filename??
802 my ($doc_obj, $filename) = @_;
803
804 # add in the cover image
805 if ($self->{'cover_image'}) {
806 $self->associate_cover_image($doc_obj, $filename);
[8716]807 }
[9398]808
[15018]809
[10280]810}
[1242]811
[16390]812# implement this if you are extracting metadata for other documents
813sub metadata_read {
814 my $self = shift (@_);
815 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
816
817 # can we process this file??
818 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
819 return undef unless $self->can_process_this_file($filename_full_path);
820
821 return 1; # we recognise the file, but don't actually do anything with it
822}
823
824
[15868]825# The BasePlugin read() function. This function calls read_into_doc_obj()
[10280]826# to ensure all the right things to make general options work for a
827# given plugin are done. It then calls the process() function which
828# does all the work specific to a plugin (like the old read functions
829# used to do). Most plugins should define their own process() function
830# and let this read() function keep control.
831#
832# recursive plugins (e.g. RecPlug) and specialized plugins like those
833# capable of processing many documents within a single file (e.g.
834# GMLPlug) might want to implement their own version of read(), but
835# more likely need to implement their own version of read_into_doc_obj()
836#
837# Return number of files processed, undef if can't recognise, -1 if can't
838# process
839
840sub read {
841 my $self = shift (@_);
[16390]842 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[10280]843
[16390]844 # can we process this file??
845 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
846 return undef unless $self->can_process_this_file($filename_full_path);
847
[10280]848 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
[15868]849
[10280]850 if ((defined $process_status) && ($process_status == 1)) {
[15868]851
[10280]852 # process the document
853 $processor->process($doc_obj);
[15868]854
[10280]855 $self->{'num_processed'} ++;
856 undef $doc_obj;
[9398]857 }
[15868]858 # delete any temp files that we may have created
859 $self->clean_up_after_doc_obj_processing();
[9398]860
[10280]861 # if process_status == 1, then the file has been processed.
862 return $process_status;
863
[4]864}
865
[1244]866# returns undef if file is rejected by the plugin
[1242]867sub process {
868 my $self = shift (@_);
[11089]869 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1242]870
[15868]871 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n";
[1244]872
873 return undef; # never gets here
[1242]874}
875
[15868]876# overwrite this method to delete any temp files that we have created
877sub clean_up_after_doc_obj_processing {
878 my $self = shift(@_);
[4]879
[10280]880}
[16390]881
[10280]882# write_file -- used by ConvertToPlug, for example in post processing
883#
[15868]884# where should this go, is here the best place??
[10280]885sub utf8_write_file {
886 my $self = shift (@_);
887 my ($textref, $filename) = @_;
888
889 if (!open (FILE, ">$filename")) {
890 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
891 die "\n";
892 }
893 print FILE $$textref;
894
[1219]895 close FILE;
896}
897
[10280]898
[7504]899sub filename_based_title
900{
901 my $self = shift (@_);
902 my ($file) = @_;
903
904 my $file_derived_title = $file;
905 $file_derived_title =~ s/_/ /g;
906 $file_derived_title =~ s/\..*?$//;
907
908 return $file_derived_title;
909}
910
[9398]911
[7504]912sub title_fallback
913{
914 my $self = shift (@_);
915 my ($doc_obj,$section,$file) = @_;
916
[15868]917 if (!defined $doc_obj->get_metadata_element ($section, "Title") or $doc_obj->get_metadata_element($section, "Title") eq "") {
[7504]918
[15877]919 my $file_derived_title = $self->filename_to_utf8_metadata($self->filename_based_title($file));
[15868]920 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
921 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
[9413]922 }
[15868]923 else {
924 $doc_obj->set_utf8_metadata ($section, "Title", $file_derived_title);
925 }
[9413]926 }
[15868]927
[1844]928}
[15868]929
[1219]930# add any extra metadata that's been passed around from one
931# plugin to another.
932# extra_metadata uses add_utf8_metadata so it expects metadata values
933# to already be in utf8
934sub extra_metadata {
935 my $self = shift (@_);
936 my ($doc_obj, $cursection, $metadata) = @_;
937
[11122]938 my $associate_tail_re = $self->{'associate_tail_re'};
939
[1219]940 foreach my $field (keys(%$metadata)) {
[839]941 # $metadata->{$field} may be an array reference
[8510]942 if ($field eq "gsdlassocfile_tobe") {
943 # 'gsdlassocfile_tobe' is artificially introduced metadata
944 # that is used to signal that certain additional files should
945 # be tied to this document. Useful in situations where a
946 # metadata pass in the plugin pipeline works out some files
947 # need to be associated with a document, but the document hasn't
948 # been formed yet.
949 my $equiv_form = "";
950 foreach my $gaf (@{$metadata->{$field}}) {
951 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
952 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
953 my $filename = $full_filename;
954 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
[11122]955
956 # work out extended tail extension (i.e. matching tail re)
957
958 my ($file_prefix,$file_extended_ext)
[16390]959 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
[11122]960 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
961
[8510]962 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
[11834]963 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
[8510]964 my $srcicon = "_icon".$doc_ext."_";
965 my $end_doclink = "</a>";
966
[11122]967 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
968
969 if (defined $pre_doc_ext) {
970 # for metadata such as [mp3._edited] [mp3._full] ...
971 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
972 }
973
974 # for multiple metadata such as [mp3.assoclink]
975 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
976
977 $equiv_form .= " $assoc_form";
[8510]978 }
979 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
980 }
981 elsif (ref ($metadata->{$field}) eq "ARRAY") {
[839]982 map {
[1219]983 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
[839]984 } @{$metadata->{$field}};
985 } else {
[1219]986 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
[839]987 }
988 }
989}
990
[1396]991
[2785]992sub compile_stats {
993 my $self = shift(@_);
994 my ($stats) = @_;
995
996 $stats->{'num_processed'} += $self->{'num_processed'};
997 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
[2796]998 $stats->{'num_archives'} += $self->{'num_archives'};
[2785]999
1000}
1001
[2816]1002sub associate_cover_image {
[10833]1003 my $self = shift;
[2816]1004 my ($doc_obj, $filename) = @_;
1005
[10833]1006 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1007 if (exists $self->{'covers_missing_cache'}->{$filename}) {
1008 # don't stat() for existence eg for multiple document input files
1009 # (eg SplitPlug)
1010 return;
1011 }
1012
[9413]1013 my $top_section=$doc_obj->get_top_section();
1014
[2816]1015 if (-e $filename) {
[13968]1016 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
[9413]1017 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[3086]1018 } else {
[10833]1019 my $upper_filename = $filename;
1020 $upper_filename =~ s/jpg$/JPG/;
1021 if (-e $upper_filename) {
1022 $doc_obj->associate_file($upper_filename, "cover.jpg",
1023 "image/jpeg");
[9413]1024 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[10833]1025 } else {
1026 # file doesn't exist, so record the fact that it's missing so
1027 # we don't stat() again (stat is slow)
1028 $self->{'covers_missing_cache'}->{$filename} = 1;
[3086]1029 }
[2816]1030 }
[10833]1031
[2816]1032}
1033
[11332]1034
1035# Overridden by exploding plugins (eg. ISISPlug)
1036sub clean_up_after_exploding
1037{
1038 my $self = shift(@_);
1039}
1040
1041
[16390]1042
[4]10431;
Note: See TracBrowser for help on using the repository browser.