source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 24403

Last change on this file since 24403 was 24403, checked in by ak19, 13 years ago

Dr Bainbridge has fixed the conflict between OAIPlugin and EmbeddedMetadataPlugin which resulted in the oai tutorial (with the JCDL pictures) going wrong: meta was not attached to the images. Dr Bainbridge solved the problem by introducing a new method in BasePlugin: can_process_this_file_for_metadata, which by default returns undef so that things should work by default mostly. This method has been overridden in OAIPlugin and EmbeddedMetadataPlugin now to do the right thing there.

  • Property svn:keywords set to Author Date Id Revision
File size: 46.0 KB
RevLine 
[537]1###########################################################################
2#
[15868]3# BasePlugin.pm -- base class for all the import plugins
[537]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
[9413]8# Copyright (C) 1999-2005 New Zealand Digital Library Project
[537]9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[4]25
[15868]26package BasePlugin;
[2219]27
[10254]28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
[9413]31
[8892]32use File::Basename;
[23335]33use Encode;
[23832]34use Unicode::Normalize 'normalize';
[8892]35
[1870]36use encodings;
[11389]37use unicode;
[1242]38use doc;
[2751]39use ghtml;
[9413]40use gsprintf 'gsprintf';
[4]41
[15868]42use PrintInfo;
[10218]43
[15868]44BEGIN {
45 @BasePlugin::ISA = ( 'PrintInfo' );
46}
[5681]47
[18320]48# the different methods that can be applied when renaming
49# imported documents and their associated files
50our $file_rename_method_list =
51 [ { 'name' => "url",
52 'desc' => "{BasePlugin.rename_method.url}" },
53 { 'name' => "base64",
[18398]54 'desc' => "{BasePlugin.rename_method.base64}" },
[18404]55 { 'name' => "none",
56 'desc' => "{BasePlugin.rename_method.none}",
[18398]57 'hiddengli' => "yes" } ];
[18320]58
[15868]59our $encoding_list =
[10218]60 [ { 'name' => "ascii",
[16014]61 'desc' => "{BasePlugin.encoding.ascii}" },
[4744]62 { 'name' => "utf8",
[16014]63 'desc' => "{BasePlugin.encoding.utf8}" },
[4744]64 { 'name' => "unicode",
[16014]65 'desc' => "{BasePlugin.encoding.unicode}" } ];
[3540]66
[23457]67
[10620]68my $e = $encodings::encodings;
69foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
70{
71 my $hashEncode =
72 {'name' => $enc,
73 'desc' => $e->{$enc}->{'name'}};
74
[15868]75 push(@{$encoding_list},$hashEncode);
[10620]76}
77
[15868]78our $encoding_plus_auto_list =
79 [ { 'name' => "auto",
[16557]80 'desc' => "{BasePlugin.filename_encoding.auto}" },
[23457]81 { 'name' => "auto-language-analysis",
[16557]82 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
83 { 'name' => "auto-filesystem-encoding",
84 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
85 { 'name' => "auto-fl",
86 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
87 { 'name' => "auto-lf",
88 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
[10620]89
[15868]90push(@{$encoding_plus_auto_list},@{$encoding_list});
91
[16698]92our $oidtype_list =
[17026]93 [ { 'name' => "auto",
94 'desc' => "{BasePlugin.OIDtype.auto}" },
95 { 'name' => "hash",
[16698]96 'desc' => "{import.OIDtype.hash}" },
[24290]97 { 'name' => "hash_on_ga_xml",
98 'desc' => "{import.OIDtype.hash_on_ga_xml}" },
[16698]99 { 'name' => "assigned",
100 'desc' => "{import.OIDtype.assigned}" },
101 { 'name' => "incremental",
102 'desc' => "{import.OIDtype.incremental}" },
103 { 'name' => "dirname",
104 'desc' => "{import.OIDtype.dirname}" } ];
105
[4873]106my $arguments =
[3540]107 [ { 'name' => "process_exp",
[15868]108 'desc' => "{BasePlugin.process_exp}",
[6408]109 'type' => "regexp",
[3540]110 'deft' => "",
111 'reqd' => "no" },
[16390]112 { 'name' => "no_blocking",
113 'desc' => "{BasePlugin.no_blocking}",
114 'type' => "flag",
115 'reqd' => "no"},
[3540]116 { 'name' => "block_exp",
[15868]117 'desc' => "{BasePlugin.block_exp}",
[6408]118 'type' => "regexp",
[3540]119 'deft' => "",
120 'reqd' => "no" },
[22215]121 { 'name' => "store_original_file",
122 'desc' => "{BasePlugin.store_original_file}",
123 'type' => "flag",
124 'reqd' => "no" },
[8892]125 { 'name' => "associate_ext",
[15868]126 'desc' => "{BasePlugin.associate_ext}",
[8892]127 'type' => "string",
128 'reqd' => "no" },
[11122]129 { 'name' => "associate_tail_re",
[15868]130 'desc' => "{BasePlugin.associate_tail_re}",
[11122]131 'type' => "string",
132 'reqd' => "no" },
[16698]133 { 'name' => "OIDtype",
134 'desc' => "{import.OIDtype}",
135 'type' => "enum",
136 'list' => $oidtype_list,
[16847]137 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
[17026]138 'deft' => "auto",
[18591]139 'reqd' => "no" },
[16698]140 { 'name' => "OIDmetadata",
141 'desc' => "{import.OIDmetadata}",
142 'type' => "metadata",
143 'deft' => "dc.Identifier",
[18591]144 'reqd' => "no" },
[16698]145# { 'name' => "use_as_doc_identifier",
146# 'desc' => "{BasePlugin.use_as_doc_identifier}",
147# 'type' => "string",
148# 'reqd' => "no" ,
149# 'deft' => "" } ,
[18320]150 { 'name' => "no_cover_image",
[15868]151 'desc' => "{BasePlugin.no_cover_image}",
[3540]152 'type' => "flag",
153 'reqd' => "no" },
[15868]154 { 'name' => "filename_encoding",
155 'desc' => "{BasePlugin.filename_encoding}",
156 'type' => "enum",
157 'deft' => "auto",
158 'list' => $encoding_plus_auto_list,
[16390]159 'reqd' => "no" },
160 { 'name' => "smart_block",
[16520]161 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
[16390]162 'type' => "flag",
163 'reqd' => "no",
[18320]164 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
165 { 'name' => "file_rename_method",
166 'desc' => "{BasePlugin.file_rename_method}",
167 'type' => "enum",
168 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
169 'list' => $file_rename_method_list,
170 'reqd' => "no"
[23457]171 }
[15868]172
173 ];
[3540]174
[9398]175
[15868]176my $options = { 'name' => "BasePlugin",
177 'desc' => "{BasePlugin.desc}",
[6408]178 'abstract' => "yes",
179 'inherits' => "no",
[4750]180 'args' => $arguments };
[3540]181
[4]182sub new {
[10218]183
[15868]184 my ($class) = shift (@_);
[16698]185 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
[10218]186 push(@$pluginlist, $class);
[9398]187
[15868]188 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
189 push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]190
[16698]191 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
[16390]192
193 if ($self->{'info_only'}) {
194 # don't worry about any options etc
195 return bless $self, $class;
196 }
[10579]197
[16390]198 if ($self->{'smart_block'}) {
199 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
200 }
201 $self->{'smart_block'} = undef;
202
[15868]203 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
204 $self->{'plugin_type'} = $plugin_name;
[10579]205
[20451]206 # remove ex. from OIDmetadata
[20605]207 $self->{'OIDmetadata'} =~ s/^ex\.// if defined $self->{'OIDmetadata'};
[2785]208 $self->{'num_processed'} = 0;
209 $self->{'num_not_processed'} = 0;
210 $self->{'num_blocked'} = 0;
211 $self->{'num_archives'} = 0;
[8678]212 $self->{'cover_image'} = 1; # cover image is on by default
[10218]213 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
[19222]214 $self->{'can_process_directories'} = 0;
[10579]215 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
[3540]216
[8892]217 my $associate_ext = $self->{'associate_ext'};
218 if ((defined $associate_ext) && ($associate_ext ne "")) {
[9351]219
[11122]220 my $associate_tail_re = $self->{'associate_tail_re'};
221 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
222 my $outhandle = $self->{'outhandle'};
223 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
224 print $outhandle " defaulting to 'associate_tail_re'\n";
[8892]225 }
[11122]226 else {
227 my @exts = split(/,/,$associate_ext);
[8892]228
[11122]229 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
230 my $associate_tail_re = join("|",@exts_bracketed);
231 $self->{'associate_tail_re'} = $associate_tail_re;
232 }
233
234 delete $self->{'associate_ext'};
[8892]235 }
236
[15868]237 return bless $self, $class;
[11089]238
[4]239}
240
[16821]241sub merge_inheritance
242{
243 my $self = {};
244 my @child_selfs = @_;
245
246 foreach my $child_self (@child_selfs) {
247 foreach my $key (keys %$child_self) {
248 if (defined $self->{$key}) {
249 if ($self->{$key} ne $child_self->{$key}) {
250# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
251# print STDERR "Existing stored value = $self->{$key}\n";
252# print STDERR "New (child) value = $child_self->{$key}\n";
253# print STDERR "Keeping existing value\n";
254 # Existing value seems to be option specified in collect.cfg
255
256 ### $self->{$key} = $child_self->{$key};
257
258 }
259 else {
260## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
261 }
262
263 }
264 else {
265 $self->{$key} = $child_self->{$key};
266 }
267 }
268 }
269
270 return $self;
271}
272
[15868]273# initialize BasePlugin options
274# if init() is overridden in a sub-class, remember to call BasePlugin::init()
[1242]275sub init {
276 my $self = shift (@_);
[2785]277 my ($verbosity, $outhandle, $failhandle) = @_;
[1242]278
279 # verbosity is passed through from the processor
280 $self->{'verbosity'} = $verbosity;
281
[2785]282 # as are the outhandle and failhandle
[1424]283 $self->{'outhandle'} = $outhandle if defined $outhandle;
[2785]284 $self->{'failhandle'} = $failhandle;
[16390]285# $self->SUPER::init(@_);
286
[1242]287 # set process_exp and block_exp to defaults unless they were
288 # explicitly set
[1244]289
290 if ((!$self->is_recursive()) and
[1242]291 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
[1244]292
[1242]293 $self->{'process_exp'} = $self->get_default_process_exp ();
294 if ($self->{'process_exp'} eq "") {
[1244]295 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
[1242]296 }
297 }
298
299 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
300 $self->{'block_exp'} = $self->get_default_block_exp ();
301 }
[11089]302
[1242]303}
304
[839]305sub begin {
306 my $self = shift (@_);
307 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
[16821]308
[17026]309 if ($self->{'OIDtype'} eq "auto") {
310 # hasn't been set in the plugin, use the processor values
311 $self->{'OIDtype'} = $processor->{'OIDtype'};
312 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
313 }
314 if ($self->{'OIDtype'} eq "hash") {
315 # should we hash on the file or on the doc xml??
316 $self->{'OIDtype'} = $self->get_oid_hash_type();
317 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
318 $self->{'OIDtype'} = "hash_on_file";
319 }
320 }
[839]321}
322
[21308]323# This is called once if removeold is set with import.pl. Most plugins will do
324# nothing but if a plugin does any stuff outside of creating doc obj, then
325# it may need to clear something.
326sub remove_all {
[21286]327 my $self = shift (@_);
328 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
329}
330
[21308]331# This is called per document for docs that have been deleted from the
332# collection. Most plugins will do nothing
333# but if a plugin does any stuff outside of creating doc obj, then it may need
334# to clear something.
335sub remove_one {
336 my $self = shift (@_);
337
[21315]338 my ($file, $oids, $archivedir) = @_;
339 return 0 if $self->can_process_this_file($file);
340 return undef;
[21308]341}
342
[839]343sub end {
[10155]344 # potentially called at the end of each plugin pass
345 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
346
[15868]347 my ($self) = shift (@_);
[839]348}
349
[10155]350sub deinit {
351 # called only once, after all plugin passes have been done
352
353 my ($self) = @_;
354}
355
[17026]356# default hashing type is to hash on the original file (or converted file)
357# override this to return hash_on_ga_xml for filetypes where hashing on the
358# file is no good eg video
359sub get_oid_hash_type {
360
361 my $self = shift (@_);
362
363 return "hash_on_file";
364}
365
[15868]366
[1242]367# this function should be overridden to return 1
368# in recursive plugins
[4]369sub is_recursive {
370 my $self = shift (@_);
371
[1242]372 return 0;
[4]373}
374
[1242]375sub get_default_block_exp {
376 my $self = shift (@_);
377
378 return "";
379}
380
381sub get_default_process_exp {
382 my $self = shift (@_);
383
384 return "";
385}
386
[23419]387
[18320]388# rename imported files and assoc files using URL encoding by default
389# as this will work for most plugins and give more legible filenames
390sub get_default_file_rename_method() {
391 my $self = shift (@_);
392 return "url";
393}
394
[18398]395# returns this plugin's active (possibly user-selected) file_rename_method
396sub get_file_rename_method() {
397 my $self = shift (@_);
398 my $rename_method = $self->{'file_rename_method'};
399 if($rename_method) {
400 return $rename_method;
401 } else {
402 return $self->get_default_file_rename_method();
403 }
404}
405
[16390]406# default implementation is to do nothing
407sub store_block_files {
408
[9067]409 my $self =shift (@_);
[16390]410 my ($filename_full_path, $block_hash) = @_;
411
[9067]412}
413
[16390]414# put files to block into hash
415sub use_block_expressions {
416
417 my $self =shift (@_);
418 my ($filename_full_path, $block_hash) = @_;
419
[23363]420 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
421
[16390]422 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
[23561]423 &util::block_filename($block_hash,$filename_full_path);
[16390]424 }
425
426}
427
[9067]428#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
429sub block_cover_image
430{
[10833]431 my $self =shift;
[16390]432 my ($filename, $block_hash) = @_;
[10833]433
[23363]434 $filename = &util::upgrade_if_dos_filename($filename);
435
[9067]436 if ($self->{'cover_image'}) {
437 my $coverfile = $filename;
438 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
[23759]439
[23363]440 if (!&util::fd_exists($coverfile)) {
[9067]441 $coverfile =~ s/jpg$/JPG/;
442 }
[23363]443 if (&util::fd_exists($coverfile)) {
[23561]444 &util::block_filename($block_hash,$coverfile);
[11089]445 }
[9067]446 }
447
448 return;
449}
[11122]450
451
[16390]452# discover all the files that should be blocked by this plugin
453# check the args ...
454sub file_block_read {
[11122]455
[8510]456 my $self = shift (@_);
[16390]457 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
[8892]458 # Keep track of filenames with same root but different extensions
[11122]459 # Used to support -associate_ext and the more generalised
460 # -associate_tail_re
[16390]461 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[8892]462
[18441]463 if (!-d $filename_full_path) {
464 $block_hash->{'all_files'}->{$file} = 1;
465 }
466
[11122]467 my $associate_tail_re = $self->{'associate_tail_re'};
468 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
469 my ($file_prefix,$file_ext)
[16390]470 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
[8892]471 if ((defined $file_prefix) && (defined $file_ext)) {
[16390]472 my $shared_fileroot = $block_hash->{'shared_fileroot'};
[8892]473 if (!defined $shared_fileroot->{$file_prefix}) {
[11122]474 my $file_prefix_rec = { 'tie_to' => undef,
475 'exts' => {} };
[8892]476 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
477 }
478
479 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
480
[16390]481 if ($self->can_process_this_file($filename_full_path)) {
[8892]482 # This is the document the others should be tied to
483 $file_prefix_rec->{'tie_to'} = $file_ext;
484 }
485 else {
[11122]486 if ($file_ext =~ m/$associate_tail_re$/) {
[16390]487 # this file should be associated to the main one
[9351]488 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
489 }
[8892]490 }
[11122]491
[8892]492 }
493 }
[11122]494
[16390]495 # check block expressions
496 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
497
[9067]498 # now check whether we are actually processing this
[16390]499 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
[9067]500 return undef; # can't recognise
501 }
[23457]502
[16852]503 # if we have a block_exp, then this overrides the normal 'smart' blocking
504 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
[8892]505
[11089]506 # block the cover image if there is one
507 if ($self->{'cover_image'}) {
[16852]508 $self->block_cover_image($filename_full_path, $block_hash);
[11089]509 }
[23457]510
[9067]511 return 1;
[8510]512}
513
[16390]514# plugins that rely on more than process_exp (eg XML plugins) can override this method
515sub can_process_this_file {
516 my $self = shift(@_);
517 my ($filename) = @_;
[8892]518
[19222]519 if (-d $filename && !$self->{'can_process_directories'}) {
520 return 0;
521 }
[22427]522
[16390]523 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
524 return 1;
[8892]525 }
526 return 0;
[10280]527
528}
529
[24403]530# Even if a plugin can extract metadata in its metadata_read pass,
531# make the default return 'undef' so processing of the file continues
532# down the pipeline, so other plugins can also have the opportunity to
533# locate metadata and set it up in the extrametakeys variables that
534# are passed around.
535
536sub can_process_this_file_for_metadata {
537 my $self = shift(@_);
538
539 return undef;
540}
541
542
[16390]543# just converts path as is to utf8.
544sub filepath_to_utf8 {
[10280]545 my $self = shift (@_);
[15868]546 my ($file, $file_encoding) = @_;
[16390]547 my $filemeta = $file;
[10280]548
[16557]549 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
550
551 # Whenever filename-encoding is set to any of the auto settings, we
552 # check if the filename is already in UTF8. If it is, then we're done.
553 if($filename_encoding =~ m/auto/) {
[23457]554 if(&unicode::check_is_utf8($filemeta))
555 {
556 $filename_encoding = "utf8";
557 return $filemeta;
558 }
[16557]559 }
560
561 # Auto setting, but filename is not utf8
562 if ($filename_encoding eq "auto")
563 {
[23457]564 # try textcat
565 $filename_encoding = $self->textcat_encoding($filemeta);
[16557]566
[23457]567 # check the locale next
568 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
[16557]569
[23457]570
571 # now try the encoding of the document, if available
572 if ($filename_encoding eq "undefined" && defined $file_encoding) {
573 $filename_encoding = $file_encoding;
574 }
[16557]575
576 }
577
578 elsif ($filename_encoding eq "auto-language-analysis")
579 {
580 $filename_encoding = $self->textcat_encoding($filemeta);
581
582 # now try the encoding of the document, if available
583 if ($filename_encoding eq "undefined" && defined $file_encoding) {
584 $filename_encoding = $file_encoding;
585 }
586 }
587
588 elsif ($filename_encoding eq "auto-filesystem-encoding")
[23352]589 {
[16557]590 # try locale
591 $filename_encoding = $self->locale_encoding();
592 }
593
594 elsif ($filename_encoding eq "auto-fl")
595 {
596 # filesystem-encoding (locale) then language-analysis (textcat)
597 $filename_encoding = $self->locale_encoding();
598
599 # try textcat
600 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
[23457]601
[16557]602 # else assume filename encoding is encoding of file content, if that's available
603 if ($filename_encoding eq "undefined" && defined $file_encoding) {
604 $filename_encoding = $file_encoding;
[15868]605 }
[16557]606 }
[23457]607
[16557]608 elsif ($filename_encoding eq "auto-lf")
609 {
610 # language-analysis (textcat) then filesystem-encoding (locale)
611 $filename_encoding = $self->textcat_encoding($filemeta);
612
613 # guess filename encoding from encoding of file content, if available
614 if ($filename_encoding eq "undefined" && defined $file_encoding) {
615 $filename_encoding = $file_encoding;
616 }
617
618 # try locale
619 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
[15868]620 }
[23457]621
[16557]622 # if still undefined, use utf8 as fallback
623 if ($filename_encoding eq "undefined") {
624 $filename_encoding = "utf8";
625 }
626
[18171]627 #print STDERR "**** UTF8 encoding the filename $filemeta ";
[16767]628
[16557]629 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
630 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
631 # be always utf8 (in which case the filename's encoding is also set as utf8 even
632 # though the filename need not be if it originates from another system)--in such
633 # cases attempt to make the filename utf8 to match.
634 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
635 &unicode::ensure_utf8(\$filemeta);
636 }
637
638 # convert non-unicode encodings to utf8
639 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
640 $filemeta = &unicode::unicode2utf8(
[23457]641 &unicode::convert2unicode($filename_encoding, \$filemeta)
642 );
[15868]643 }
[16390]644
[18171]645 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
[16390]646 return $filemeta;
647}
648
649# gets the filename with no path, converts to utf8, and then dm safes it.
[18320]650# filename_encoding set by user
[16390]651sub filename_to_utf8_metadata
652{
653 my $self = shift (@_);
654 my ($file, $file_encoding) = @_;
655
656 my $outhandle = $self->{'outhandle'};
657
[23457]658 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
659 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
660 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
[23347]661
662
[16390]663 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
664 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
[15868]665
[22652]666 return $filemeta;
[10280]667}
668
[16557]669sub locale_encoding {
670 my $self = shift(@_);
671
672 if (!defined $self->{'filesystem_encoding'}) {
673 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
674 }
[10280]675
[18171]676 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
[16557]677 return $self->{'filesystem_encoding'}; # can be the string "undefined"
678}
679
680sub textcat_encoding {
681 my $self = shift(@_);
682 my ($filemeta) = @_;
683
[22705]684 # analyse filenames without extensions and digits (and trimmed of
685 # surrounding whitespace), so that irrelevant chars don't confuse
686 # textcat
[16557]687 my $strictfilemeta = $filemeta;
688 $strictfilemeta =~ s/\.[^\.]+$//g;
689 $strictfilemeta =~ s/\d//g;
690 $strictfilemeta =~ s/^\s*//g;
691 $strictfilemeta =~ s/\s*$//g;
692
693 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
694 if(!defined $filename_encoding) {
695 $filename_encoding = "undefined";
696 }
697
698 return $filename_encoding; # can be the string "undefined"
699}
700
701# performs textcat
702sub encoding_from_language_analysis {
703 my $self = shift(@_);
704 my ($text) = @_;
705
706 my $outhandle = $self->{'outhandle'};
707 my $best_encoding = undef;
[23457]708
[17212]709 # get the language/encoding of the textstring using textcat
[22632]710 require textcat; # Only load the textcat module if it is required
[16557]711 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
[16767]712 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
[16557]713
714
715 if (scalar @$results < 0) {
716 return undef;
717 }
718
719 # We have some results, we choose the first
720 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
721
722 $best_encoding = $encoding;
723 if (!defined $best_encoding) {
724 return undef;
725 }
[23457]726
[16557]727 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
728 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
729 $best_encoding = 'utf8';
730 }
731
732
733 # check for equivalents where textcat doesn't have some encodings...
734 # eg MS versions of standard encodings
735 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
736 my $iso = $1; # which variant of the iso standard?
737 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
738 if ($text =~ /[\x80-\x9f]/) {
739 # Western Europe
740 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
741 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
742 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
743 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
744 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
745 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
746 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
747 }
748 }
749
750 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
751 !defined $encodings::encodings->{$best_encoding})
752 {
753 if ($self->{'verbosity'}) {
754 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
755 }
756 $best_encoding = undef;
757 }
758
759 return $best_encoding;
760}
761
762# uses locale
[23352]763sub get_filesystem_encoding
764{
[10280]765
[15868]766 my $self = shift(@_);
767
[14961]768 my $outhandle = $self->{'outhandle'};
769 my $filesystem_encoding = undef;
770
771 eval {
[23352]772 # Works for Windows as well, returning the DOS code page in use
[14961]773 use POSIX qw(locale_h);
[15868]774
775 # With only one parameter, setlocale retrieves the
776 # current value
[14961]777 my $current_locale = setlocale(LC_CTYPE);
[15868]778
[23364]779 my $char_encoding = undef;
780 if ($current_locale =~ m/\./) {
781 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
782 $char_encoding = lc($char_encoding);
783 }
784 else {
785 if ($current_locale =~ m/^(posix|c)$/i) {
786 $char_encoding = "ascii";
787 }
788 }
789
790 if (defined $char_encoding) {
[15446]791 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
792 $char_encoding = "$1\_$2\_$3";
793 }
794
[14961]795 $char_encoding =~ s/-/_/g;
796 $char_encoding =~ s/^utf_8$/utf8/;
[15868]797
[14961]798 if ($char_encoding =~ m/^\d+$/) {
[15607]799 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
[14961]800 $char_encoding = "windows_$char_encoding";
801 }
[15607]802 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
[14961]803 $char_encoding = "dos_$char_encoding";
804 }
805 }
[15868]806
[14961]807 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
[15446]808 || (defined $encodings::encodings->{$char_encoding})) {
[14961]809 $filesystem_encoding = $char_encoding;
810 }
811 else {
812 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
813 }
814 }
[15868]815
[14961]816
817 };
818 if ($@) {
819 print $outhandle "$@\n";
[18398]820 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
[14961]821
822 }
[23352]823
[15868]824 return $filesystem_encoding;
825}
[14961]826
[23335]827
828
829sub deduce_filename_encoding
830{
831 my $self = shift (@_);
[23347]832 my ($file,$metadata,$plugin_filename_encoding) = @_;
[23335]833
[23465]834 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
[23335]835 my $deduced_filename_encoding = undef;
836
837 # Start by looking for manually assigned metadata
838 if (defined $gs_filename_encoding) {
[23457]839 if (ref ($gs_filename_encoding) eq "ARRAY") {
840 my $outhandle = $self->{'outhandle'};
841
842 $deduced_filename_encoding = $gs_filename_encoding->[0];
843
844 my $num_vals = scalar(@$gs_filename_encoding);
845 if ($num_vals>1) {
[23465]846 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
[23457]847 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
848 }
849 }
850 else {
851 $deduced_filename_encoding = $gs_filename_encoding;
852 }
[23335]853 }
[23457]854
[23335]855 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
[23457]856 # Look to see if plugin specifies this value
[23347]857
[23457]858 if (defined $plugin_filename_encoding) {
859 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
860 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
861 my $outhandle = $self->{'outhandle'};
862 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
863 print $outhandle " default to 'auto'\n";
864 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
865 }
866
867 if ($plugin_filename_encoding ne "auto") {
868 # We've been given a specific filenamne encoding
869 # => so use it!
870 $deduced_filename_encoding = $plugin_filename_encoding;
871 }
872 }
[23335]873 }
874
875 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
[23347]876
[23352]877 # Look to file system to provide a character encoding
[23347]878
[23352]879 # If Windows NTFS, then -- assuming we work with long file names got through
880 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
881
882 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
883 # Can do better than working with the DOS character encoding returned by locale
884 $deduced_filename_encoding = "unicode";
885 }
886 else {
887 # Unix of some form or other
888
889 # See if we can determine the file system encoding through locale
890 $deduced_filename_encoding = $self->locale_encoding();
[23457]891
[23352]892 # if locale shows us filesystem is utf8, check to see filename is consistent
893 # => if not, then we have an "alien" filename on our hands
[23457]894
[23458]895 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
[23352]896 if (!&unicode::check_is_utf8($file)) {
897 # "alien" filename, so revert
898 $deduced_filename_encoding = undef;
[23347]899 }
[23352]900 }
901 }
[23335]902 }
903
[23347]904# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
905# # Last chance, apply textcat to deduce filename encoding
906# $deduced_filename_encoding = $self->textcat_encoding($file);
907# }
908
[23457]909 if ($self->{'verbosity'}>3) {
910 my $outhandle = $self->{'outhandle'};
[23347]911
[23457]912 if (defined $deduced_filename_encoding) {
913 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
[23347]914 }
[23457]915 else {
916 print $outhandle " No filename encoding deduced\n";
917 }
918 }
919
[23335]920 return $deduced_filename_encoding;
921}
922
923
924
925
926# Notionally written to be called once for each document, it is however safe to
927# call multiple times (as in the case of ImagePlugin) which calls this later on
928# after the original image has potentially been converted to a *new* source image
929# format (e.g. TIFF to PNG)
930
[15868]931sub set_Source_metadata {
932 my $self = shift (@_);
[23461]933 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
934
[23335]935 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
936 # and (as a fallback) using %xx if not for non-ascii chars
937 # 2. Sets the url ref (SourceFile) to the URL encoded version
938 # of filename for generated files
[23352]939
940 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
[23335]941
[23461]942 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
[23352]943
944 my $octet_file = $raw_file;
945
946 # UTF-8 version of filename
[23387]947# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
948# print STDERR "**** Setting Source Metadata given: $octet_file\n";
949# }
[14961]950
[23352]951 # Deal with (on Windows) raw filenames that are in their
952 # abbreviated DOS form
953
954 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
955 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
956 if (-e $raw_filename) {
957 my $unicode_filename = Win32::GetLongPathName($raw_filename);
958
959 my $unused_full_uf;
960 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
961 }
[23347]962 }
[23352]963 }
[23335]964
965 my $url_encoded_filename;
[23364]966 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
[23352]967 # => Generate a pretty print version of filename that is mapped to Unicode
968
969 # Use filename_encoding to map raw filename to a Perl unicode-aware string
970 $url_encoded_filename = decode($filename_encoding,$octet_file);
[23335]971 }
972 else {
[23352]973 # otherwise generate %xx encoded version of filename for char > 127
974 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
[23335]975 }
[18320]976
[23387]977# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
978# print STDERR "****** saving Source as: $url_encoded_filename\n";
979# }
[23759]980
981 # In the case of converted files and (generalized) exploded documents, there
982 # will already be a source filename => store as OrigSource before overriding
983 my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
984 if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
985 $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
986 }
987
[16919]988 # Source is the UTF8 display name - not necessarily the name of the file on the system
[23829]989 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
990 # on Darwin want all display strings to be in composed form, then can search on that
991 $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
992 }
[23461]993 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
[23759]994
[23352]995
[23335]996 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
[18320]997 # If using URL encoding, then SourceFile is the url-reference to url-encoded
[23335]998 # renamed_raw_url: it's a url that refers to the actual file on the system
999 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
[18320]1000
[23461]1001 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
[23335]1002 $renamed_raw_url);
1003
[23387]1004# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1005# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
1006# }
[14961]1007}
[23457]1008
[17026]1009# this should be called by all plugins to set the oid of the doc obj, rather
1010# than calling doc_obj->set_OID directly
[15868]1011sub add_OID {
[15018]1012 my $self = shift (@_);
1013 my ($doc_obj) = @_;
1014
[17026]1015 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
[15018]1016
[17026]1017 # see if there is a plugin specific set_OID function
1018 if (defined ($self->can('set_OID'))) {
1019 $self->set_OID(@_); # pass through doc_obj and any extra arguments
[15018]1020 }
[17026]1021 else {
[15018]1022 # use the default set_OID() in doc.pm
1023 $doc_obj->set_OID();
1024 }
[17026]1025
[15018]1026}
[23457]1027
[15868]1028# The BasePlugin read_into_doc_obj() function. This function does all the
1029# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
[10280]1030# a file and sets up a slew of metadata all saved in doc_obj, which
1031# it then returns as part of a tuple (process_status,doc_obj)
1032#
1033# Much of this functionality used to reside in read, but it was broken
1034# down into a supporting routine to make the code more flexible.
1035#
1036# recursive plugins (e.g. RecPlug) and specialized plugins like those
1037# capable of processing many documents within a single file (e.g.
1038# GMLPlug) will normally want to implement their own version of
1039# read_into_doc_obj()
1040#
1041# Note that $base_dir might be "" and that $file might
1042# include directories
[15868]1043
1044# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
[10280]1045sub read_into_doc_obj {
1046 my $self = shift (@_);
[16390]1047 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[10280]1048
[15868]1049 my $outhandle = $self->{'outhandle'};
[10280]1050
[15868]1051 # should we move this to read? What about secondary plugins?
[23759]1052 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
[15868]1053 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
[23363]1054 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
[23457]1055 if $self->{'verbosity'} > 1;
[10280]1056
[16390]1057 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[23457]1058
[1242]1059 # create a new document
[18320]1060 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
[14961]1061 my $top_section = $doc_obj->get_top_section();
1062
1063 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
[15868]1064 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
[23457]1065
[18469]1066
[23352]1067 my $plugin_filename_encoding = $self->{'filename_encoding'};
[23347]1068 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
[23472]1069 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
[8166]1070
[15868]1071 # plugin specific stuff - what args do we need here??
1072 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1073 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1074 return -1;
[2816]1075 }
[1242]1076
[15868]1077 # include any metadata passed in from previous plugins
1078 # note that this metadata is associated with the top level section
1079 my $section = $doc_obj->get_top_section();
1080 # can we merge these two methods??
1081 $self->add_associated_files($doc_obj, $filename_full_path);
1082 $self->extra_metadata ($doc_obj, $section, $metadata);
1083 $self->auto_extract_metadata($doc_obj);
[1242]1084
[15868]1085 # if we haven't found any Title so far, assign one
1086 # this was shifted to here from inside read()
1087 $self->title_fallback($doc_obj,$section,$filename_no_path);
1088
1089 $self->add_OID($doc_obj);
1090
[21219]1091 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
[15868]1092 return (1,$doc_obj);
1093}
[2785]1094
[21219]1095sub post_process_doc_obj {
1096 my $self = shift (@_);
1097 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1098
1099 return 1;
1100}
1101
[15868]1102sub add_dummy_text {
1103 my $self = shift(@_);
1104 my ($doc_obj, $section) = @_;
[2785]1105
[15868]1106 # add NoText metadata so we can hide this dummy text in format statements
1107 $doc_obj->add_metadata($section, "NoText", "1");
1108 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1109
1110}
[8510]1111
[15868]1112# does nothing. Can be overridden by subclass
1113sub auto_extract_metadata {
1114 my $self = shift(@_);
1115 my ($doc_obj) = @_;
1116}
[11122]1117
[15868]1118# adds cover image, associate_file options stuff. Should be called by sub class
1119# read_into_doc_obj
1120sub add_associated_files {
1121 my $self = shift(@_);
1122 # whatis filename??
1123 my ($doc_obj, $filename) = @_;
1124
1125 # add in the cover image
1126 if ($self->{'cover_image'}) {
1127 $self->associate_cover_image($doc_obj, $filename);
[8716]1128 }
[22215]1129 # store the original (used for eg TextPlugin to store the original for OAI)
1130 if ($self->{'store_original_file'}) {
1131 $self->associate_source_file($doc_obj, $filename);
1132 }
[23457]1133
[15018]1134
[10280]1135}
[1242]1136
[16390]1137# implement this if you are extracting metadata for other documents
1138sub metadata_read {
1139 my $self = shift (@_);
[19493]1140 my ($pluginfo, $base_dir, $file, $block_hash,
1141 $extrametakeys, $extrametadata, $extrametafile,
[23212]1142 $processor, $gli, $aux) = @_;
[16390]1143
1144 # can we process this file??
1145 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[24403]1146 return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
[16390]1147
1148 return 1; # we recognise the file, but don't actually do anything with it
1149}
1150
1151
[15868]1152# The BasePlugin read() function. This function calls read_into_doc_obj()
[10280]1153# to ensure all the right things to make general options work for a
1154# given plugin are done. It then calls the process() function which
1155# does all the work specific to a plugin (like the old read functions
1156# used to do). Most plugins should define their own process() function
1157# and let this read() function keep control.
1158#
1159# recursive plugins (e.g. RecPlug) and specialized plugins like those
1160# capable of processing many documents within a single file (e.g.
1161# GMLPlug) might want to implement their own version of read(), but
1162# more likely need to implement their own version of read_into_doc_obj()
1163#
1164# Return number of files processed, undef if can't recognise, -1 if can't
1165# process
1166
1167sub read {
1168 my $self = shift (@_);
[16390]1169 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[10280]1170
[16390]1171 # can we process this file??
1172 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[20577]1173
[16390]1174 return undef unless $self->can_process_this_file($filename_full_path);
1175
[10280]1176 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
[21219]1177
[10280]1178 if ((defined $process_status) && ($process_status == 1)) {
[21219]1179
[10280]1180 # process the document
1181 $processor->process($doc_obj);
[15868]1182
[10280]1183 $self->{'num_processed'} ++;
1184 undef $doc_obj;
[9398]1185 }
[15868]1186 # delete any temp files that we may have created
1187 $self->clean_up_after_doc_obj_processing();
[9398]1188
[18469]1189
[10280]1190 # if process_status == 1, then the file has been processed.
1191 return $process_status;
1192
[4]1193}
1194
[1244]1195# returns undef if file is rejected by the plugin
[1242]1196sub process {
1197 my $self = shift (@_);
[11089]1198 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1242]1199
[23419]1200 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
[1244]1201
[23457]1202 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
[23419]1203 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1204
[23457]1205 die "\n";
[23419]1206
[1244]1207 return undef; # never gets here
[1242]1208}
1209
[15868]1210# overwrite this method to delete any temp files that we have created
1211sub clean_up_after_doc_obj_processing {
1212 my $self = shift(@_);
[4]1213
[10280]1214}
[16390]1215
[10280]1216# write_file -- used by ConvertToPlug, for example in post processing
1217#
[15868]1218# where should this go, is here the best place??
[10280]1219sub utf8_write_file {
1220 my $self = shift (@_);
1221 my ($textref, $filename) = @_;
1222
[22953]1223 if (!open (FILE, ">:utf8", $filename)) {
[10280]1224 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
[23457]1225 die "\n";
1226 }
[10280]1227 print FILE $$textref;
1228
[1219]1229 close FILE;
1230}
1231
[10280]1232
[7504]1233sub filename_based_title
1234{
1235 my $self = shift (@_);
1236 my ($file) = @_;
1237
1238 my $file_derived_title = $file;
1239 $file_derived_title =~ s/_/ /g;
[23335]1240 $file_derived_title =~ s/\.[^.]+$//;
[7504]1241
1242 return $file_derived_title;
1243}
1244
[9398]1245
[7504]1246sub title_fallback
1247{
1248 my $self = shift (@_);
1249 my ($doc_obj,$section,$file) = @_;
1250
[22705]1251 if (!defined $doc_obj->get_metadata_element ($section, "Title")
[23335]1252 || $doc_obj->get_metadata_element($section, "Title") eq "") {
[7504]1253
[23335]1254 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1255 my $file_derived_title;
1256 if (defined $source_file) {
1257 $file_derived_title = $self->filename_based_title($source_file);
1258 }
1259 else {
1260 # pp = pretty print
1261 my $pp_file = (defined $source_file) ? $source_file : $file;
1262
1263 my $raw_title = $self->filename_based_title($file);
1264 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1265 }
1266
1267
[15868]1268 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1269 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
[9413]1270 }
[15868]1271 else {
[16995]1272 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
[15868]1273 }
[9413]1274 }
[23457]1275
[1844]1276}
[23457]1277
[1219]1278# add any extra metadata that's been passed around from one
1279# plugin to another.
1280# extra_metadata uses add_utf8_metadata so it expects metadata values
1281# to already be in utf8
1282sub extra_metadata {
1283 my $self = shift (@_);
1284 my ($doc_obj, $cursection, $metadata) = @_;
1285
[11122]1286 my $associate_tail_re = $self->{'associate_tail_re'};
1287
[1219]1288 foreach my $field (keys(%$metadata)) {
[839]1289 # $metadata->{$field} may be an array reference
[8510]1290 if ($field eq "gsdlassocfile_tobe") {
1291 # 'gsdlassocfile_tobe' is artificially introduced metadata
1292 # that is used to signal that certain additional files should
1293 # be tied to this document. Useful in situations where a
1294 # metadata pass in the plugin pipeline works out some files
1295 # need to be associated with a document, but the document hasn't
1296 # been formed yet.
1297 my $equiv_form = "";
1298 foreach my $gaf (@{$metadata->{$field}}) {
1299 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
[18171]1300 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1301
1302 # we need to make sure the filename is valid utf-8 - we do
[18320]1303 # this by url or base64 encoding it
[18171]1304 # $tail_filename is the name that we store the file as
[18320]1305 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
[8510]1306 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
[20778]1307 $doc_obj->associate_source_file($full_filename);
[18320]1308 # If the filename is url_encoded, we need to encode the % signs
1309 # in the filename, so that it works in a url
[18404]1310 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
[11122]1311 # work out extended tail extension (i.e. matching tail re)
1312
1313 my ($file_prefix,$file_extended_ext)
[16390]1314 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
[11122]1315 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
[8510]1316 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
[24219]1317
1318## my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1319 my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1320 my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1321
[8510]1322 my $srcicon = "_icon".$doc_ext."_";
1323 my $end_doclink = "</a>";
[24219]1324
[11122]1325 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1326
[24219]1327 $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink);
1328 $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
1329 $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
1330
[18171]1331 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
[11122]1332 # for metadata such as [mp3._edited] [mp3._full] ...
1333 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1334 }
1335
1336 # for multiple metadata such as [mp3.assoclink]
1337 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
[24219]1338
[11122]1339 $equiv_form .= " $assoc_form";
[8510]1340 }
1341 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1342 }
[23279]1343 elsif ($field eq "gsdlzipfilename") {
1344 # special case for when files have come out of a zip. source_path
1345 # (used for archives dbs and keeping track for incremental import)
1346 # must be set to the zip file name
1347 my $zip_filename = $metadata->{$field};
1348 # overwrite the source_path
[23363]1349 $doc_obj->set_source_path($zip_filename);
[23279]1350 # and set the metadata
1351 $zip_filename = &util::filename_within_collection($zip_filename);
1352 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1353 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1354 }
[8510]1355 elsif (ref ($metadata->{$field}) eq "ARRAY") {
[839]1356 map {
[1219]1357 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
[839]1358 } @{$metadata->{$field}};
1359 } else {
[1219]1360 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
[839]1361 }
1362 }
1363}
1364
[1396]1365
[2785]1366sub compile_stats {
1367 my $self = shift(@_);
1368 my ($stats) = @_;
1369
1370 $stats->{'num_processed'} += $self->{'num_processed'};
1371 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
[2796]1372 $stats->{'num_archives'} += $self->{'num_archives'};
[2785]1373
1374}
[22215]1375sub associate_source_file {
1376 my $self = shift(@_);
1377
1378 my ($doc_obj, $filename) = @_;
1379 my $cursection = $doc_obj->get_top_section();
1380 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1381
1382 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
[24225]1383 # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
[22663]1384 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
[24225]1385 $doc_obj->add_utf8_metadata ($cursection, "srclinkFile", $doc_obj->get_sourcefile());
[22215]1386}
[2785]1387
[2816]1388sub associate_cover_image {
[22215]1389 my $self = shift(@_);
[2816]1390 my ($doc_obj, $filename) = @_;
1391
[23363]1392 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1393
[10833]1394 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
[23363]1395 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1396
1397 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1398 # don't stat() for existence e.g. for multiple document input files
[10833]1399 # (eg SplitPlug)
1400 return;
1401 }
1402
[9413]1403 my $top_section=$doc_obj->get_top_section();
1404
[23363]1405 if (&util::fd_exists($upgraded_filename)) {
[20778]1406 $doc_obj->associate_source_file($filename);
[13968]1407 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
[9413]1408 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[3086]1409 } else {
[10833]1410 my $upper_filename = $filename;
[23363]1411 my $upgraded_upper_filename = $upgraded_filename;
1412
[10833]1413 $upper_filename =~ s/jpg$/JPG/;
[23363]1414 $upgraded_upper_filename =~ s/jpg$/JPG/;
1415
1416 if (&util::fd_exists($upgraded_upper_filename)) {
[20778]1417 $doc_obj->associate_source_file($upper_filename);
[10833]1418 $doc_obj->associate_file($upper_filename, "cover.jpg",
1419 "image/jpeg");
[9413]1420 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[10833]1421 } else {
1422 # file doesn't exist, so record the fact that it's missing so
1423 # we don't stat() again (stat is slow)
[23363]1424 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
[3086]1425 }
[2816]1426 }
[10833]1427
[2816]1428}
1429
[11332]1430
1431# Overridden by exploding plugins (eg. ISISPlug)
1432sub clean_up_after_exploding
1433{
1434 my $self = shift(@_);
1435}
1436
1437
[16390]1438
[4]14391;
Note: See TracBrowser for help on using the repository browser.