source: gsdl/trunk/perllib/plugins/BasePlugin.pm@ 20879

Last change on this file since 20879 was 20778, checked in by kjdon, 15 years ago

plugins now need to add any auxiliary source files as source assoc files, so we know when to reimport for incremental import. Have started this, but not finished and not tested :-)

  • Property svn:keywords set to Author Date Id Revision
File size: 35.5 KB
RevLine 
[537]1###########################################################################
2#
[15868]3# BasePlugin.pm -- base class for all the import plugins
[537]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
[9413]8# Copyright (C) 1999-2005 New Zealand Digital Library Project
[537]9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[4]25
[15868]26package BasePlugin;
[2219]27
[10254]28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
[9413]31
[8892]32use File::Basename;
33
[1870]34use encodings;
[11389]35use unicode;
[16557]36use textcat;
[1242]37use doc;
[2751]38use ghtml;
[9413]39use gsprintf 'gsprintf';
[4]40
[15868]41use PrintInfo;
[10218]42
[15868]43BEGIN {
44 @BasePlugin::ISA = ( 'PrintInfo' );
45}
[5681]46
[18320]47# the different methods that can be applied when renaming
48# imported documents and their associated files
49our $file_rename_method_list =
50 [ { 'name' => "url",
51 'desc' => "{BasePlugin.rename_method.url}" },
52 { 'name' => "base64",
[18398]53 'desc' => "{BasePlugin.rename_method.base64}" },
[18404]54 { 'name' => "none",
55 'desc' => "{BasePlugin.rename_method.none}",
[18398]56 'hiddengli' => "yes" } ];
[18320]57
[15868]58our $encoding_list =
[10218]59 [ { 'name' => "ascii",
[16014]60 'desc' => "{BasePlugin.encoding.ascii}" },
[4744]61 { 'name' => "utf8",
[16014]62 'desc' => "{BasePlugin.encoding.utf8}" },
[4744]63 { 'name' => "unicode",
[16014]64 'desc' => "{BasePlugin.encoding.unicode}" } ];
[16557]65
[3540]66
[10620]67my $e = $encodings::encodings;
68foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
69{
70 my $hashEncode =
71 {'name' => $enc,
72 'desc' => $e->{$enc}->{'name'}};
73
[15868]74 push(@{$encoding_list},$hashEncode);
[10620]75}
76
[15868]77our $encoding_plus_auto_list =
78 [ { 'name' => "auto",
[16557]79 'desc' => "{BasePlugin.filename_encoding.auto}" },
80 { 'name' => "auto-language-analysis",
81 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
82 { 'name' => "auto-filesystem-encoding",
83 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
84 { 'name' => "auto-fl",
85 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
86 { 'name' => "auto-lf",
87 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
[10620]88
[15868]89push(@{$encoding_plus_auto_list},@{$encoding_list});
90
[16698]91our $oidtype_list =
[17026]92 [ { 'name' => "auto",
93 'desc' => "{BasePlugin.OIDtype.auto}" },
94 { 'name' => "hash",
[16698]95 'desc' => "{import.OIDtype.hash}" },
96 { 'name' => "assigned",
97 'desc' => "{import.OIDtype.assigned}" },
98 { 'name' => "incremental",
99 'desc' => "{import.OIDtype.incremental}" },
100 { 'name' => "dirname",
101 'desc' => "{import.OIDtype.dirname}" } ];
102
[4873]103my $arguments =
[3540]104 [ { 'name' => "process_exp",
[15868]105 'desc' => "{BasePlugin.process_exp}",
[6408]106 'type' => "regexp",
[3540]107 'deft' => "",
108 'reqd' => "no" },
[16390]109 { 'name' => "no_blocking",
110 'desc' => "{BasePlugin.no_blocking}",
111 'type' => "flag",
112 'reqd' => "no"},
[3540]113 { 'name' => "block_exp",
[15868]114 'desc' => "{BasePlugin.block_exp}",
[6408]115 'type' => "regexp",
[3540]116 'deft' => "",
117 'reqd' => "no" },
[8892]118 { 'name' => "associate_ext",
[15868]119 'desc' => "{BasePlugin.associate_ext}",
[8892]120 'type' => "string",
121 'reqd' => "no" },
[11122]122 { 'name' => "associate_tail_re",
[15868]123 'desc' => "{BasePlugin.associate_tail_re}",
[11122]124 'type' => "string",
125 'reqd' => "no" },
[16698]126 { 'name' => "OIDtype",
127 'desc' => "{import.OIDtype}",
128 'type' => "enum",
129 'list' => $oidtype_list,
[16847]130 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
[17026]131 'deft' => "auto",
[18591]132 'reqd' => "no" },
[16698]133 { 'name' => "OIDmetadata",
134 'desc' => "{import.OIDmetadata}",
135 'type' => "metadata",
136 'deft' => "dc.Identifier",
[18591]137 'reqd' => "no" },
[16698]138# { 'name' => "use_as_doc_identifier",
139# 'desc' => "{BasePlugin.use_as_doc_identifier}",
140# 'type' => "string",
141# 'reqd' => "no" ,
142# 'deft' => "" } ,
[18320]143 { 'name' => "no_cover_image",
[15868]144 'desc' => "{BasePlugin.no_cover_image}",
[3540]145 'type' => "flag",
146 'reqd' => "no" },
[15868]147 { 'name' => "filename_encoding",
148 'desc' => "{BasePlugin.filename_encoding}",
149 'type' => "enum",
150 'deft' => "auto",
151 'list' => $encoding_plus_auto_list,
[16390]152 'reqd' => "no" },
153 { 'name' => "smart_block",
[16520]154 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
[16390]155 'type' => "flag",
156 'reqd' => "no",
[18320]157 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
158 { 'name' => "file_rename_method",
159 'desc' => "{BasePlugin.file_rename_method}",
160 'type' => "enum",
161 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
162 'list' => $file_rename_method_list,
163 'reqd' => "no"
164 }
[15868]165
166 ];
[3540]167
[9398]168
[15868]169my $options = { 'name' => "BasePlugin",
170 'desc' => "{BasePlugin.desc}",
[6408]171 'abstract' => "yes",
172 'inherits' => "no",
[4750]173 'args' => $arguments };
[3540]174
[4]175sub new {
[10218]176
[15868]177 my ($class) = shift (@_);
[16698]178 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
[10218]179 push(@$pluginlist, $class);
[9398]180
[15868]181 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
182 push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]183
[16698]184 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
[16390]185
186 if ($self->{'info_only'}) {
187 # don't worry about any options etc
188 return bless $self, $class;
189 }
[10579]190
[16390]191 if ($self->{'smart_block'}) {
192 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
193 }
194 $self->{'smart_block'} = undef;
195
[15868]196 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
197 $self->{'plugin_type'} = $plugin_name;
[10579]198
[20451]199 # remove ex. from OIDmetadata
[20605]200 $self->{'OIDmetadata'} =~ s/^ex\.// if defined $self->{'OIDmetadata'};
[2785]201 $self->{'num_processed'} = 0;
202 $self->{'num_not_processed'} = 0;
203 $self->{'num_blocked'} = 0;
204 $self->{'num_archives'} = 0;
[8678]205 $self->{'cover_image'} = 1; # cover image is on by default
[10218]206 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
[19222]207 $self->{'can_process_directories'} = 0;
[10579]208 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
[3540]209
[8892]210 my $associate_ext = $self->{'associate_ext'};
211 if ((defined $associate_ext) && ($associate_ext ne "")) {
[9351]212
[11122]213 my $associate_tail_re = $self->{'associate_tail_re'};
214 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
215 my $outhandle = $self->{'outhandle'};
216 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
217 print $outhandle " defaulting to 'associate_tail_re'\n";
[8892]218 }
[11122]219 else {
220 my @exts = split(/,/,$associate_ext);
[8892]221
[11122]222 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
223 my $associate_tail_re = join("|",@exts_bracketed);
224 $self->{'associate_tail_re'} = $associate_tail_re;
225 }
226
227 delete $self->{'associate_ext'};
[8892]228 }
229
[15868]230 return bless $self, $class;
[11089]231
[4]232}
233
[16821]234sub merge_inheritance
235{
236 my $self = {};
237 my @child_selfs = @_;
238
239 foreach my $child_self (@child_selfs) {
240 foreach my $key (keys %$child_self) {
241 if (defined $self->{$key}) {
242 if ($self->{$key} ne $child_self->{$key}) {
243# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
244# print STDERR "Existing stored value = $self->{$key}\n";
245# print STDERR "New (child) value = $child_self->{$key}\n";
246# print STDERR "Keeping existing value\n";
247 # Existing value seems to be option specified in collect.cfg
248
249 ### $self->{$key} = $child_self->{$key};
250
251 }
252 else {
253## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
254 }
255
256 }
257 else {
258 $self->{$key} = $child_self->{$key};
259 }
260 }
261 }
262
263 return $self;
264}
265
[15868]266# initialize BasePlugin options
267# if init() is overridden in a sub-class, remember to call BasePlugin::init()
[1242]268sub init {
269 my $self = shift (@_);
[2785]270 my ($verbosity, $outhandle, $failhandle) = @_;
[1242]271
272 # verbosity is passed through from the processor
273 $self->{'verbosity'} = $verbosity;
274
[2785]275 # as are the outhandle and failhandle
[1424]276 $self->{'outhandle'} = $outhandle if defined $outhandle;
[2785]277 $self->{'failhandle'} = $failhandle;
[16390]278# $self->SUPER::init(@_);
279
[1242]280 # set process_exp and block_exp to defaults unless they were
281 # explicitly set
[1244]282
283 if ((!$self->is_recursive()) and
[1242]284 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
[1244]285
[1242]286 $self->{'process_exp'} = $self->get_default_process_exp ();
287 if ($self->{'process_exp'} eq "") {
[1244]288 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
[1242]289 }
290 }
291
292 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
293 $self->{'block_exp'} = $self->get_default_block_exp ();
294 }
[11089]295
[1242]296}
297
[839]298sub begin {
299 my $self = shift (@_);
300 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
[16821]301
[17026]302 if ($self->{'OIDtype'} eq "auto") {
303 # hasn't been set in the plugin, use the processor values
304 $self->{'OIDtype'} = $processor->{'OIDtype'};
305 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
306 }
307 if ($self->{'OIDtype'} eq "hash") {
308 # should we hash on the file or on the doc xml??
309 $self->{'OIDtype'} = $self->get_oid_hash_type();
310 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
311 $self->{'OIDtype'} = "hash_on_file";
312 }
313 }
[839]314}
315
316sub end {
[10155]317 # potentially called at the end of each plugin pass
318 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
319
[15868]320 my ($self) = shift (@_);
[839]321}
322
[10155]323sub deinit {
324 # called only once, after all plugin passes have been done
325
326 my ($self) = @_;
327}
328
[17026]329# default hashing type is to hash on the original file (or converted file)
330# override this to return hash_on_ga_xml for filetypes where hashing on the
331# file is no good eg video
332sub get_oid_hash_type {
333
334 my $self = shift (@_);
335
336 return "hash_on_file";
337}
338
[15868]339
[1242]340# this function should be overridden to return 1
341# in recursive plugins
[4]342sub is_recursive {
343 my $self = shift (@_);
344
[1242]345 return 0;
[4]346}
347
[1242]348sub get_default_block_exp {
349 my $self = shift (@_);
350
351 return "";
352}
353
354sub get_default_process_exp {
355 my $self = shift (@_);
356
357 return "";
358}
359
[18320]360# rename imported files and assoc files using URL encoding by default
361# as this will work for most plugins and give more legible filenames
362sub get_default_file_rename_method() {
363 my $self = shift (@_);
364 return "url";
365}
366
[18398]367# returns this plugin's active (possibly user-selected) file_rename_method
368sub get_file_rename_method() {
369 my $self = shift (@_);
370 my $rename_method = $self->{'file_rename_method'};
371 if($rename_method) {
372 return $rename_method;
373 } else {
374 return $self->get_default_file_rename_method();
375 }
376}
377
[16390]378# default implementation is to do nothing
379sub store_block_files {
380
[9067]381 my $self =shift (@_);
[16390]382 my ($filename_full_path, $block_hash) = @_;
383
[9067]384}
385
[16390]386# put files to block into hash
387sub use_block_expressions {
388
389 my $self =shift (@_);
390 my ($filename_full_path, $block_hash) = @_;
391
392 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
393 $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
394 }
395
396}
397
[9067]398#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
399sub block_cover_image
400{
[10833]401 my $self =shift;
[16390]402 my ($filename, $block_hash) = @_;
[10833]403
[9067]404 if ($self->{'cover_image'}) {
405 my $coverfile = $filename;
406 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
407 if (!-e $coverfile) {
408 $coverfile =~ s/jpg$/JPG/;
409 }
410 if (-e $coverfile) {
[16390]411 $block_hash->{'file_blocks'}->{$coverfile} = 1;
[11089]412 }
[9067]413 }
414
415 return;
416}
[11122]417
418
[16390]419# discover all the files that should be blocked by this plugin
420# check the args ...
421sub file_block_read {
[11122]422
[8510]423 my $self = shift (@_);
[16390]424 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
[8892]425 # Keep track of filenames with same root but different extensions
[11122]426 # Used to support -associate_ext and the more generalised
427 # -associate_tail_re
[16390]428 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[8892]429
[18441]430 if (!-d $filename_full_path) {
431 $block_hash->{'all_files'}->{$file} = 1;
432 }
433
[11122]434 my $associate_tail_re = $self->{'associate_tail_re'};
435 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
436 my ($file_prefix,$file_ext)
[16390]437 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
[8892]438 if ((defined $file_prefix) && (defined $file_ext)) {
[16390]439 my $shared_fileroot = $block_hash->{'shared_fileroot'};
[8892]440 if (!defined $shared_fileroot->{$file_prefix}) {
[11122]441 my $file_prefix_rec = { 'tie_to' => undef,
442 'exts' => {} };
[8892]443 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
444 }
445
446 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
447
[16390]448 if ($self->can_process_this_file($filename_full_path)) {
[8892]449 # This is the document the others should be tied to
450 $file_prefix_rec->{'tie_to'} = $file_ext;
451 }
452 else {
[11122]453 if ($file_ext =~ m/$associate_tail_re$/) {
[16390]454 # this file should be associated to the main one
[9351]455 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
456 }
[8892]457 }
[11122]458
[8892]459 }
460 }
[11122]461
[16390]462 # check block expressions
463 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
464
[9067]465 # now check whether we are actually processing this
[16390]466 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
[9067]467 return undef; # can't recognise
468 }
[16390]469
[16852]470 # if we have a block_exp, then this overrides the normal 'smart' blocking
471 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
[8892]472
[11089]473 # block the cover image if there is one
474 if ($self->{'cover_image'}) {
[16852]475 $self->block_cover_image($filename_full_path, $block_hash);
[11089]476 }
[9067]477
478 return 1;
[8510]479}
480
[16390]481# plugins that rely on more than process_exp (eg XML plugins) can override this method
482sub can_process_this_file {
483 my $self = shift(@_);
484 my ($filename) = @_;
[8892]485
[19222]486 if (-d $filename && !$self->{'can_process_directories'}) {
487 return 0;
488 }
[16390]489 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
490 return 1;
[8892]491 }
492 return 0;
[10280]493
494}
495
[16390]496# just converts path as is to utf8.
497sub filepath_to_utf8 {
[10280]498 my $self = shift (@_);
[15868]499 my ($file, $file_encoding) = @_;
[16390]500 my $filemeta = $file;
[10280]501
[16557]502 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
503
504 # Whenever filename-encoding is set to any of the auto settings, we
505 # check if the filename is already in UTF8. If it is, then we're done.
506 if($filename_encoding =~ m/auto/) {
507 if(&unicode::check_is_utf8($filemeta))
508 {
509 $filename_encoding = "utf8";
510 return $filemeta;
511 }
512 }
513
514 # Auto setting, but filename is not utf8
515 if ($filename_encoding eq "auto")
516 {
517 # try textcat
518 $filename_encoding = $self->textcat_encoding($filemeta);
519
520 # check the locale next
521 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
522
523
524 # now try the encoding of the document, if available
525 if ($filename_encoding eq "undefined" && defined $file_encoding) {
526 $filename_encoding = $file_encoding;
[15868]527 }
[16557]528
529 }
530
531 elsif ($filename_encoding eq "auto-language-analysis")
532 {
533 $filename_encoding = $self->textcat_encoding($filemeta);
534
535 # now try the encoding of the document, if available
536 if ($filename_encoding eq "undefined" && defined $file_encoding) {
537 $filename_encoding = $file_encoding;
538 }
539 }
540
541 elsif ($filename_encoding eq "auto-filesystem-encoding")
542 {
543 # try locale
544 $filename_encoding = $self->locale_encoding();
545 }
546
547 elsif ($filename_encoding eq "auto-fl")
548 {
549 # filesystem-encoding (locale) then language-analysis (textcat)
550 $filename_encoding = $self->locale_encoding();
551
552 # try textcat
553 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
554
555 # else assume filename encoding is encoding of file content, if that's available
556 if ($filename_encoding eq "undefined" && defined $file_encoding) {
557 $filename_encoding = $file_encoding;
[15868]558 }
[16557]559 }
[15868]560
[16557]561 elsif ($filename_encoding eq "auto-lf")
562 {
563 # language-analysis (textcat) then filesystem-encoding (locale)
564 $filename_encoding = $self->textcat_encoding($filemeta);
565
566 # guess filename encoding from encoding of file content, if available
567 if ($filename_encoding eq "undefined" && defined $file_encoding) {
568 $filename_encoding = $file_encoding;
569 }
570
571 # try locale
572 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
[15868]573 }
[16557]574
575 # if still undefined, use utf8 as fallback
576 if ($filename_encoding eq "undefined") {
577 $filename_encoding = "utf8";
578 }
579
[18171]580 #print STDERR "**** UTF8 encoding the filename $filemeta ";
[16767]581
[16557]582 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
583 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
584 # be always utf8 (in which case the filename's encoding is also set as utf8 even
585 # though the filename need not be if it originates from another system)--in such
586 # cases attempt to make the filename utf8 to match.
587 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
588 &unicode::ensure_utf8(\$filemeta);
589 }
590
591 # convert non-unicode encodings to utf8
592 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
593 $filemeta = &unicode::unicode2utf8(
594 &unicode::convert2unicode($filename_encoding, \$filemeta)
[15868]595 );
596 }
[16390]597
[18171]598 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
[16390]599 return $filemeta;
600}
601
602# gets the filename with no path, converts to utf8, and then dm safes it.
[18320]603# filename_encoding set by user
[16390]604sub filename_to_utf8_metadata
605{
606 my $self = shift (@_);
607 my ($file, $file_encoding) = @_;
608
609 my $outhandle = $self->{'outhandle'};
610
611 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
612 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
[16767]613
[15868]614 my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
615
616 return $dmsafe_filemeta;
617
[10280]618}
619
[16557]620sub locale_encoding {
621 my $self = shift(@_);
622
623 if (!defined $self->{'filesystem_encoding'}) {
624 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
625 $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'};
626 }
[10280]627
[18171]628 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
[16557]629 return $self->{'filesystem_encoding'}; # can be the string "undefined"
630}
631
632sub textcat_encoding {
633 my $self = shift(@_);
634 my ($filemeta) = @_;
635
636 # analyse filenames without extensions and digits (and trimmed of surrounding
637 # whitespace), so that irrelevant chars don't confuse textcat
638 my $strictfilemeta = $filemeta;
639 $strictfilemeta =~ s/\.[^\.]+$//g;
640 $strictfilemeta =~ s/\d//g;
641 $strictfilemeta =~ s/^\s*//g;
642 $strictfilemeta =~ s/\s*$//g;
643
644 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
645 if(!defined $filename_encoding) {
646 $filename_encoding = "undefined";
647 }
648
649 return $filename_encoding; # can be the string "undefined"
650}
651
652# performs textcat
653sub encoding_from_language_analysis {
654 my $self = shift(@_);
655 my ($text) = @_;
656
657 my $outhandle = $self->{'outhandle'};
658 my $best_encoding = undef;
659
[17212]660 # get the language/encoding of the textstring using textcat
[16557]661 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
[16767]662 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
[16557]663
664
665 if (scalar @$results < 0) {
666 return undef;
667 }
668
669 # We have some results, we choose the first
670 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
671
672 $best_encoding = $encoding;
673 if (!defined $best_encoding) {
674 return undef;
675 }
676
677 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
678 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
679 $best_encoding = 'utf8';
680 }
681
682
683 # check for equivalents where textcat doesn't have some encodings...
684 # eg MS versions of standard encodings
685 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
686 my $iso = $1; # which variant of the iso standard?
687 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
688 if ($text =~ /[\x80-\x9f]/) {
689 # Western Europe
690 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
691 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
692 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
693 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
694 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
695 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
696 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
697 }
698 }
699
700 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
701 !defined $encodings::encodings->{$best_encoding})
702 {
703 if ($self->{'verbosity'}) {
704 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
705 }
706 $best_encoding = undef;
707 }
708
709 return $best_encoding;
710}
711
712# uses locale
[15868]713sub get_filesystem_encoding {
[10280]714
[15868]715 my $self = shift(@_);
716
[14961]717 my $outhandle = $self->{'outhandle'};
718 my $filesystem_encoding = undef;
719
720 eval {
721 use POSIX qw(locale_h);
[15868]722
723 # With only one parameter, setlocale retrieves the
724 # current value
[14961]725 my $current_locale = setlocale(LC_CTYPE);
[15868]726
[14961]727 if ($current_locale =~ m/^.*\.(.*?)$/) {
728 my $char_encoding = lc($1);
[15446]729 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
730 $char_encoding = "$1\_$2\_$3";
731 }
732
[14961]733 $char_encoding =~ s/-/_/g;
734 $char_encoding =~ s/^utf_8$/utf8/;
[15868]735
[14961]736 if ($char_encoding =~ m/^\d+$/) {
[15607]737 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
[14961]738 $char_encoding = "windows_$char_encoding";
739 }
[15607]740 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
[14961]741 $char_encoding = "dos_$char_encoding";
742 }
743 }
[15868]744
[14961]745 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
[15446]746 || (defined $encodings::encodings->{$char_encoding})) {
[14961]747 $filesystem_encoding = $char_encoding;
748 }
749 else {
750 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
751 }
752 }
[15868]753
[14961]754
755 };
756 if ($@) {
757 print $outhandle "$@\n";
[18398]758 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
[14961]759
760 }
[15868]761 return $filesystem_encoding;
762}
[14961]763
[15868]764# is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
765sub set_Source_metadata {
766 my $self = shift (@_);
767 my ($doc_obj, $filename_no_path, $file_encoding) = @_;
768
769 my $top_section = $doc_obj->get_top_section();
[14961]770
[15868]771 # UTF-8 version of filename
772 my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
[18320]773
[16919]774 # Source is the UTF8 display name - not necessarily the name of the file on the system
775 $doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta);
776
[18320]777 $filemeta = &util::rename_file($filemeta, $self->{'file_rename_method'});
778 # If using URL encoding, then SourceFile is the url-reference to url-encoded
779 # filemeta: it's a url that refers to the actual file on the system
[18404]780 $filemeta = &unicode::filename_to_url($filemeta);
[18320]781
[16919]782 $doc_obj->set_utf8_metadata_element($top_section, "SourceFile", $filemeta);
[14961]783}
[17026]784
785# this should be called by all plugins to set the oid of the doc obj, rather
786# than calling doc_obj->set_OID directly
[15868]787sub add_OID {
[15018]788 my $self = shift (@_);
789 my ($doc_obj) = @_;
790
[17026]791 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
[15018]792
[17026]793 # see if there is a plugin specific set_OID function
794 if (defined ($self->can('set_OID'))) {
795 $self->set_OID(@_); # pass through doc_obj and any extra arguments
[15018]796 }
[17026]797 else {
[15018]798 # use the default set_OID() in doc.pm
799 $doc_obj->set_OID();
800 }
[17026]801
[15018]802}
[17026]803
[15868]804# The BasePlugin read_into_doc_obj() function. This function does all the
805# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
[10280]806# a file and sets up a slew of metadata all saved in doc_obj, which
807# it then returns as part of a tuple (process_status,doc_obj)
808#
809# Much of this functionality used to reside in read, but it was broken
810# down into a supporting routine to make the code more flexible.
811#
812# recursive plugins (e.g. RecPlug) and specialized plugins like those
813# capable of processing many documents within a single file (e.g.
814# GMLPlug) will normally want to implement their own version of
815# read_into_doc_obj()
816#
817# Note that $base_dir might be "" and that $file might
818# include directories
[15868]819
820# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
[10280]821sub read_into_doc_obj {
822 my $self = shift (@_);
[16390]823 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[10280]824
[15868]825 my $outhandle = $self->{'outhandle'};
[10280]826
[15868]827 # should we move this to read? What about secondary plugins?
828 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
829 print $outhandle "$self->{'plugin_type'} processing $file\n"
830 if $self->{'verbosity'} > 1;
[10280]831
[16390]832 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[18469]833
[1242]834 # create a new document
[18320]835 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
[14961]836 my $top_section = $doc_obj->get_top_section();
837
838 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
[15868]839 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
840
[18469]841
[16997]842 # sets the UTF8 filename (Source) for display and sets the url ref to URL encoded version
843 # of the UTF8 filename (SourceFile) for generated files
[15877]844 $self->set_Source_metadata($doc_obj, $filename_no_path);
[8166]845
[18469]846
[15868]847 # plugin specific stuff - what args do we need here??
848 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
849 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
850 return -1;
[2816]851 }
[1242]852
[15868]853 # include any metadata passed in from previous plugins
854 # note that this metadata is associated with the top level section
855 my $section = $doc_obj->get_top_section();
856 # can we merge these two methods??
857 $self->add_associated_files($doc_obj, $filename_full_path);
858 $self->extra_metadata ($doc_obj, $section, $metadata);
859 $self->auto_extract_metadata($doc_obj);
[1242]860
[15868]861 # if we haven't found any Title so far, assign one
862 # this was shifted to here from inside read()
863 $self->title_fallback($doc_obj,$section,$filename_no_path);
864
865 $self->add_OID($doc_obj);
866
867 return (1,$doc_obj);
868}
[2785]869
[15868]870sub add_dummy_text {
871 my $self = shift(@_);
872 my ($doc_obj, $section) = @_;
[2785]873
[15868]874 # add NoText metadata so we can hide this dummy text in format statements
875 $doc_obj->add_metadata($section, "NoText", "1");
876 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
877
878}
[8510]879
[15868]880# does nothing. Can be overridden by subclass
881sub auto_extract_metadata {
882 my $self = shift(@_);
883 my ($doc_obj) = @_;
884}
[11122]885
[15868]886# adds cover image, associate_file options stuff. Should be called by sub class
887# read_into_doc_obj
888sub add_associated_files {
889 my $self = shift(@_);
890 # whatis filename??
891 my ($doc_obj, $filename) = @_;
892
893 # add in the cover image
894 if ($self->{'cover_image'}) {
895 $self->associate_cover_image($doc_obj, $filename);
[8716]896 }
[9398]897
[15018]898
[10280]899}
[1242]900
[16390]901# implement this if you are extracting metadata for other documents
902sub metadata_read {
903 my $self = shift (@_);
[19493]904 my ($pluginfo, $base_dir, $file, $block_hash,
905 $extrametakeys, $extrametadata, $extrametafile,
906 $processor, $maxdocs, $gli) = @_;
[16390]907
908 # can we process this file??
909 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
910 return undef unless $self->can_process_this_file($filename_full_path);
911
912 return 1; # we recognise the file, but don't actually do anything with it
913}
914
915
[15868]916# The BasePlugin read() function. This function calls read_into_doc_obj()
[10280]917# to ensure all the right things to make general options work for a
918# given plugin are done. It then calls the process() function which
919# does all the work specific to a plugin (like the old read functions
920# used to do). Most plugins should define their own process() function
921# and let this read() function keep control.
922#
923# recursive plugins (e.g. RecPlug) and specialized plugins like those
924# capable of processing many documents within a single file (e.g.
925# GMLPlug) might want to implement their own version of read(), but
926# more likely need to implement their own version of read_into_doc_obj()
927#
928# Return number of files processed, undef if can't recognise, -1 if can't
929# process
930
931sub read {
932 my $self = shift (@_);
[16390]933 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[10280]934
[16390]935 # can we process this file??
936 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[20577]937
[16390]938 return undef unless $self->can_process_this_file($filename_full_path);
939
[10280]940 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
[16821]941
[10280]942 if ((defined $process_status) && ($process_status == 1)) {
[15868]943
[10280]944 # process the document
945 $processor->process($doc_obj);
[15868]946
[10280]947 $self->{'num_processed'} ++;
948 undef $doc_obj;
[9398]949 }
[15868]950 # delete any temp files that we may have created
951 $self->clean_up_after_doc_obj_processing();
[9398]952
[18469]953
[10280]954 # if process_status == 1, then the file has been processed.
955 return $process_status;
956
[4]957}
958
[1244]959# returns undef if file is rejected by the plugin
[1242]960sub process {
961 my $self = shift (@_);
[11089]962 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1242]963
[15868]964 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n";
[1244]965
966 return undef; # never gets here
[1242]967}
968
[15868]969# overwrite this method to delete any temp files that we have created
970sub clean_up_after_doc_obj_processing {
971 my $self = shift(@_);
[4]972
[10280]973}
[16390]974
[10280]975# write_file -- used by ConvertToPlug, for example in post processing
976#
[15868]977# where should this go, is here the best place??
[10280]978sub utf8_write_file {
979 my $self = shift (@_);
980 my ($textref, $filename) = @_;
981
982 if (!open (FILE, ">$filename")) {
983 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
984 die "\n";
985 }
986 print FILE $$textref;
987
[1219]988 close FILE;
989}
990
[10280]991
[7504]992sub filename_based_title
993{
994 my $self = shift (@_);
995 my ($file) = @_;
996
997 my $file_derived_title = $file;
998 $file_derived_title =~ s/_/ /g;
999 $file_derived_title =~ s/\..*?$//;
1000
1001 return $file_derived_title;
1002}
1003
[9398]1004
[7504]1005sub title_fallback
1006{
1007 my $self = shift (@_);
1008 my ($doc_obj,$section,$file) = @_;
1009
[15868]1010 if (!defined $doc_obj->get_metadata_element ($section, "Title") or $doc_obj->get_metadata_element($section, "Title") eq "") {
[7504]1011
[15877]1012 my $file_derived_title = $self->filename_to_utf8_metadata($self->filename_based_title($file));
[15868]1013 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1014 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
[9413]1015 }
[15868]1016 else {
[16995]1017 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
[15868]1018 }
[9413]1019 }
[15868]1020
[1844]1021}
[15868]1022
[1219]1023# add any extra metadata that's been passed around from one
1024# plugin to another.
1025# extra_metadata uses add_utf8_metadata so it expects metadata values
1026# to already be in utf8
1027sub extra_metadata {
1028 my $self = shift (@_);
1029 my ($doc_obj, $cursection, $metadata) = @_;
1030
[11122]1031 my $associate_tail_re = $self->{'associate_tail_re'};
1032
[1219]1033 foreach my $field (keys(%$metadata)) {
[839]1034 # $metadata->{$field} may be an array reference
[8510]1035 if ($field eq "gsdlassocfile_tobe") {
1036 # 'gsdlassocfile_tobe' is artificially introduced metadata
1037 # that is used to signal that certain additional files should
1038 # be tied to this document. Useful in situations where a
1039 # metadata pass in the plugin pipeline works out some files
1040 # need to be associated with a document, but the document hasn't
1041 # been formed yet.
1042 my $equiv_form = "";
1043 foreach my $gaf (@{$metadata->{$field}}) {
1044 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
[18171]1045 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1046
1047 # we need to make sure the filename is valid utf-8 - we do
[18320]1048 # this by url or base64 encoding it
[18171]1049 # $tail_filename is the name that we store the file as
[18320]1050 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
[8510]1051 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
[20778]1052 $doc_obj->associate_source_file($full_filename);
[18320]1053 # If the filename is url_encoded, we need to encode the % signs
1054 # in the filename, so that it works in a url
[18404]1055 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
[11122]1056 # work out extended tail extension (i.e. matching tail re)
1057
1058 my ($file_prefix,$file_extended_ext)
[16390]1059 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
[11122]1060 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
[8510]1061 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
[18171]1062 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$url_tail_filename\">";
[8510]1063 my $srcicon = "_icon".$doc_ext."_";
1064 my $end_doclink = "</a>";
1065
[11122]1066 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1067
[18171]1068 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
[11122]1069 # for metadata such as [mp3._edited] [mp3._full] ...
1070 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1071 }
1072
1073 # for multiple metadata such as [mp3.assoclink]
1074 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1075
1076 $equiv_form .= " $assoc_form";
[8510]1077 }
1078 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1079 }
1080 elsif (ref ($metadata->{$field}) eq "ARRAY") {
[839]1081 map {
[1219]1082 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
[839]1083 } @{$metadata->{$field}};
1084 } else {
[1219]1085 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
[839]1086 }
1087 }
1088}
1089
[1396]1090
[2785]1091sub compile_stats {
1092 my $self = shift(@_);
1093 my ($stats) = @_;
1094
1095 $stats->{'num_processed'} += $self->{'num_processed'};
1096 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
[2796]1097 $stats->{'num_archives'} += $self->{'num_archives'};
[2785]1098
1099}
1100
[2816]1101sub associate_cover_image {
[10833]1102 my $self = shift;
[2816]1103 my ($doc_obj, $filename) = @_;
1104
[10833]1105 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1106 if (exists $self->{'covers_missing_cache'}->{$filename}) {
1107 # don't stat() for existence eg for multiple document input files
1108 # (eg SplitPlug)
1109 return;
1110 }
1111
[9413]1112 my $top_section=$doc_obj->get_top_section();
1113
[2816]1114 if (-e $filename) {
[20778]1115 $doc_obj->associate_source_file($filename);
[13968]1116 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
[9413]1117 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[3086]1118 } else {
[10833]1119 my $upper_filename = $filename;
1120 $upper_filename =~ s/jpg$/JPG/;
1121 if (-e $upper_filename) {
[20778]1122 $doc_obj->associate_source_file($upper_filename);
[10833]1123 $doc_obj->associate_file($upper_filename, "cover.jpg",
1124 "image/jpeg");
[9413]1125 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[10833]1126 } else {
1127 # file doesn't exist, so record the fact that it's missing so
1128 # we don't stat() again (stat is slow)
1129 $self->{'covers_missing_cache'}->{$filename} = 1;
[3086]1130 }
[2816]1131 }
[10833]1132
[2816]1133}
1134
[11332]1135
1136# Overridden by exploding plugins (eg. ISISPlug)
1137sub clean_up_after_exploding
1138{
1139 my $self = shift(@_);
1140}
1141
1142
[16390]1143
[4]11441;
Note: See TracBrowser for help on using the repository browser.