source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 23832

Last change on this file since 23832 was 23832, checked in by ak19, 13 years ago

Forgot to copy the import statement for Normalize into the svn version for committing.

  • Property svn:keywords set to Author Date Id Revision
File size: 44.9 KB
RevLine 
[537]1###########################################################################
2#
[15868]3# BasePlugin.pm -- base class for all the import plugins
[537]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
[9413]8# Copyright (C) 1999-2005 New Zealand Digital Library Project
[537]9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[4]25
[15868]26package BasePlugin;
[2219]27
[10254]28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
[9413]31
[8892]32use File::Basename;
[23335]33use Encode;
[23832]34use Unicode::Normalize 'normalize';
[8892]35
[1870]36use encodings;
[11389]37use unicode;
[1242]38use doc;
[2751]39use ghtml;
[9413]40use gsprintf 'gsprintf';
[4]41
[15868]42use PrintInfo;
[10218]43
[15868]44BEGIN {
45 @BasePlugin::ISA = ( 'PrintInfo' );
46}
[5681]47
[18320]48# the different methods that can be applied when renaming
49# imported documents and their associated files
50our $file_rename_method_list =
51 [ { 'name' => "url",
52 'desc' => "{BasePlugin.rename_method.url}" },
53 { 'name' => "base64",
[18398]54 'desc' => "{BasePlugin.rename_method.base64}" },
[18404]55 { 'name' => "none",
56 'desc' => "{BasePlugin.rename_method.none}",
[18398]57 'hiddengli' => "yes" } ];
[18320]58
[15868]59our $encoding_list =
[10218]60 [ { 'name' => "ascii",
[16014]61 'desc' => "{BasePlugin.encoding.ascii}" },
[4744]62 { 'name' => "utf8",
[16014]63 'desc' => "{BasePlugin.encoding.utf8}" },
[4744]64 { 'name' => "unicode",
[16014]65 'desc' => "{BasePlugin.encoding.unicode}" } ];
[3540]66
[23457]67
[10620]68my $e = $encodings::encodings;
69foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
70{
71 my $hashEncode =
72 {'name' => $enc,
73 'desc' => $e->{$enc}->{'name'}};
74
[15868]75 push(@{$encoding_list},$hashEncode);
[10620]76}
77
[15868]78our $encoding_plus_auto_list =
79 [ { 'name' => "auto",
[16557]80 'desc' => "{BasePlugin.filename_encoding.auto}" },
[23457]81 { 'name' => "auto-language-analysis",
[16557]82 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
83 { 'name' => "auto-filesystem-encoding",
84 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
85 { 'name' => "auto-fl",
86 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
87 { 'name' => "auto-lf",
88 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
[10620]89
[15868]90push(@{$encoding_plus_auto_list},@{$encoding_list});
91
[16698]92our $oidtype_list =
[17026]93 [ { 'name' => "auto",
94 'desc' => "{BasePlugin.OIDtype.auto}" },
95 { 'name' => "hash",
[16698]96 'desc' => "{import.OIDtype.hash}" },
97 { 'name' => "assigned",
98 'desc' => "{import.OIDtype.assigned}" },
99 { 'name' => "incremental",
100 'desc' => "{import.OIDtype.incremental}" },
101 { 'name' => "dirname",
102 'desc' => "{import.OIDtype.dirname}" } ];
103
[4873]104my $arguments =
[3540]105 [ { 'name' => "process_exp",
[15868]106 'desc' => "{BasePlugin.process_exp}",
[6408]107 'type' => "regexp",
[3540]108 'deft' => "",
109 'reqd' => "no" },
[16390]110 { 'name' => "no_blocking",
111 'desc' => "{BasePlugin.no_blocking}",
112 'type' => "flag",
113 'reqd' => "no"},
[3540]114 { 'name' => "block_exp",
[15868]115 'desc' => "{BasePlugin.block_exp}",
[6408]116 'type' => "regexp",
[3540]117 'deft' => "",
118 'reqd' => "no" },
[22215]119 { 'name' => "store_original_file",
120 'desc' => "{BasePlugin.store_original_file}",
121 'type' => "flag",
122 'reqd' => "no" },
[8892]123 { 'name' => "associate_ext",
[15868]124 'desc' => "{BasePlugin.associate_ext}",
[8892]125 'type' => "string",
126 'reqd' => "no" },
[11122]127 { 'name' => "associate_tail_re",
[15868]128 'desc' => "{BasePlugin.associate_tail_re}",
[11122]129 'type' => "string",
130 'reqd' => "no" },
[16698]131 { 'name' => "OIDtype",
132 'desc' => "{import.OIDtype}",
133 'type' => "enum",
134 'list' => $oidtype_list,
[16847]135 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
[17026]136 'deft' => "auto",
[18591]137 'reqd' => "no" },
[16698]138 { 'name' => "OIDmetadata",
139 'desc' => "{import.OIDmetadata}",
140 'type' => "metadata",
141 'deft' => "dc.Identifier",
[18591]142 'reqd' => "no" },
[16698]143# { 'name' => "use_as_doc_identifier",
144# 'desc' => "{BasePlugin.use_as_doc_identifier}",
145# 'type' => "string",
146# 'reqd' => "no" ,
147# 'deft' => "" } ,
[18320]148 { 'name' => "no_cover_image",
[15868]149 'desc' => "{BasePlugin.no_cover_image}",
[3540]150 'type' => "flag",
151 'reqd' => "no" },
[15868]152 { 'name' => "filename_encoding",
153 'desc' => "{BasePlugin.filename_encoding}",
154 'type' => "enum",
155 'deft' => "auto",
156 'list' => $encoding_plus_auto_list,
[16390]157 'reqd' => "no" },
158 { 'name' => "smart_block",
[16520]159 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
[16390]160 'type' => "flag",
161 'reqd' => "no",
[18320]162 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
163 { 'name' => "file_rename_method",
164 'desc' => "{BasePlugin.file_rename_method}",
165 'type' => "enum",
166 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
167 'list' => $file_rename_method_list,
168 'reqd' => "no"
[23457]169 }
[15868]170
171 ];
[3540]172
[9398]173
[15868]174my $options = { 'name' => "BasePlugin",
175 'desc' => "{BasePlugin.desc}",
[6408]176 'abstract' => "yes",
177 'inherits' => "no",
[4750]178 'args' => $arguments };
[3540]179
[4]180sub new {
[10218]181
[15868]182 my ($class) = shift (@_);
[16698]183 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
[10218]184 push(@$pluginlist, $class);
[9398]185
[15868]186 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
187 push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]188
[16698]189 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
[16390]190
191 if ($self->{'info_only'}) {
192 # don't worry about any options etc
193 return bless $self, $class;
194 }
[10579]195
[16390]196 if ($self->{'smart_block'}) {
197 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
198 }
199 $self->{'smart_block'} = undef;
200
[15868]201 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
202 $self->{'plugin_type'} = $plugin_name;
[10579]203
[20451]204 # remove ex. from OIDmetadata
[20605]205 $self->{'OIDmetadata'} =~ s/^ex\.// if defined $self->{'OIDmetadata'};
[2785]206 $self->{'num_processed'} = 0;
207 $self->{'num_not_processed'} = 0;
208 $self->{'num_blocked'} = 0;
209 $self->{'num_archives'} = 0;
[8678]210 $self->{'cover_image'} = 1; # cover image is on by default
[10218]211 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
[19222]212 $self->{'can_process_directories'} = 0;
[10579]213 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
[3540]214
[8892]215 my $associate_ext = $self->{'associate_ext'};
216 if ((defined $associate_ext) && ($associate_ext ne "")) {
[9351]217
[11122]218 my $associate_tail_re = $self->{'associate_tail_re'};
219 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
220 my $outhandle = $self->{'outhandle'};
221 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
222 print $outhandle " defaulting to 'associate_tail_re'\n";
[8892]223 }
[11122]224 else {
225 my @exts = split(/,/,$associate_ext);
[8892]226
[11122]227 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
228 my $associate_tail_re = join("|",@exts_bracketed);
229 $self->{'associate_tail_re'} = $associate_tail_re;
230 }
231
232 delete $self->{'associate_ext'};
[8892]233 }
234
[15868]235 return bless $self, $class;
[11089]236
[4]237}
238
[16821]239sub merge_inheritance
240{
241 my $self = {};
242 my @child_selfs = @_;
243
244 foreach my $child_self (@child_selfs) {
245 foreach my $key (keys %$child_self) {
246 if (defined $self->{$key}) {
247 if ($self->{$key} ne $child_self->{$key}) {
248# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
249# print STDERR "Existing stored value = $self->{$key}\n";
250# print STDERR "New (child) value = $child_self->{$key}\n";
251# print STDERR "Keeping existing value\n";
252 # Existing value seems to be option specified in collect.cfg
253
254 ### $self->{$key} = $child_self->{$key};
255
256 }
257 else {
258## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
259 }
260
261 }
262 else {
263 $self->{$key} = $child_self->{$key};
264 }
265 }
266 }
267
268 return $self;
269}
270
[15868]271# initialize BasePlugin options
272# if init() is overridden in a sub-class, remember to call BasePlugin::init()
[1242]273sub init {
274 my $self = shift (@_);
[2785]275 my ($verbosity, $outhandle, $failhandle) = @_;
[1242]276
277 # verbosity is passed through from the processor
278 $self->{'verbosity'} = $verbosity;
279
[2785]280 # as are the outhandle and failhandle
[1424]281 $self->{'outhandle'} = $outhandle if defined $outhandle;
[2785]282 $self->{'failhandle'} = $failhandle;
[16390]283# $self->SUPER::init(@_);
284
[1242]285 # set process_exp and block_exp to defaults unless they were
286 # explicitly set
[1244]287
288 if ((!$self->is_recursive()) and
[1242]289 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
[1244]290
[1242]291 $self->{'process_exp'} = $self->get_default_process_exp ();
292 if ($self->{'process_exp'} eq "") {
[1244]293 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
[1242]294 }
295 }
296
297 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
298 $self->{'block_exp'} = $self->get_default_block_exp ();
299 }
[11089]300
[1242]301}
302
[839]303sub begin {
304 my $self = shift (@_);
305 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
[16821]306
[17026]307 if ($self->{'OIDtype'} eq "auto") {
308 # hasn't been set in the plugin, use the processor values
309 $self->{'OIDtype'} = $processor->{'OIDtype'};
310 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
311 }
312 if ($self->{'OIDtype'} eq "hash") {
313 # should we hash on the file or on the doc xml??
314 $self->{'OIDtype'} = $self->get_oid_hash_type();
315 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
316 $self->{'OIDtype'} = "hash_on_file";
317 }
318 }
[839]319}
320
[21308]321# This is called once if removeold is set with import.pl. Most plugins will do
322# nothing but if a plugin does any stuff outside of creating doc obj, then
323# it may need to clear something.
324sub remove_all {
[21286]325 my $self = shift (@_);
326 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
327}
328
[21308]329# This is called per document for docs that have been deleted from the
330# collection. Most plugins will do nothing
331# but if a plugin does any stuff outside of creating doc obj, then it may need
332# to clear something.
333sub remove_one {
334 my $self = shift (@_);
335
[21315]336 my ($file, $oids, $archivedir) = @_;
337 return 0 if $self->can_process_this_file($file);
338 return undef;
[21308]339}
340
[839]341sub end {
[10155]342 # potentially called at the end of each plugin pass
343 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
344
[15868]345 my ($self) = shift (@_);
[839]346}
347
[10155]348sub deinit {
349 # called only once, after all plugin passes have been done
350
351 my ($self) = @_;
352}
353
[17026]354# default hashing type is to hash on the original file (or converted file)
355# override this to return hash_on_ga_xml for filetypes where hashing on the
356# file is no good eg video
357sub get_oid_hash_type {
358
359 my $self = shift (@_);
360
361 return "hash_on_file";
362}
363
[15868]364
[1242]365# this function should be overridden to return 1
366# in recursive plugins
[4]367sub is_recursive {
368 my $self = shift (@_);
369
[1242]370 return 0;
[4]371}
372
[1242]373sub get_default_block_exp {
374 my $self = shift (@_);
375
376 return "";
377}
378
379sub get_default_process_exp {
380 my $self = shift (@_);
381
382 return "";
383}
384
[23419]385
[18320]386# rename imported files and assoc files using URL encoding by default
387# as this will work for most plugins and give more legible filenames
388sub get_default_file_rename_method() {
389 my $self = shift (@_);
390 return "url";
391}
392
[18398]393# returns this plugin's active (possibly user-selected) file_rename_method
394sub get_file_rename_method() {
395 my $self = shift (@_);
396 my $rename_method = $self->{'file_rename_method'};
397 if($rename_method) {
398 return $rename_method;
399 } else {
400 return $self->get_default_file_rename_method();
401 }
402}
403
[16390]404# default implementation is to do nothing
405sub store_block_files {
406
[9067]407 my $self =shift (@_);
[16390]408 my ($filename_full_path, $block_hash) = @_;
409
[9067]410}
411
[16390]412# put files to block into hash
413sub use_block_expressions {
414
415 my $self =shift (@_);
416 my ($filename_full_path, $block_hash) = @_;
417
[23363]418 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
419
[16390]420 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
[23561]421 &util::block_filename($block_hash,$filename_full_path);
[16390]422 }
423
424}
425
[9067]426#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
427sub block_cover_image
428{
[10833]429 my $self =shift;
[16390]430 my ($filename, $block_hash) = @_;
[10833]431
[23363]432 $filename = &util::upgrade_if_dos_filename($filename);
433
[9067]434 if ($self->{'cover_image'}) {
435 my $coverfile = $filename;
436 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
[23759]437
[23363]438 if (!&util::fd_exists($coverfile)) {
[9067]439 $coverfile =~ s/jpg$/JPG/;
440 }
[23363]441 if (&util::fd_exists($coverfile)) {
[23561]442 &util::block_filename($block_hash,$coverfile);
[11089]443 }
[9067]444 }
445
446 return;
447}
[11122]448
449
[16390]450# discover all the files that should be blocked by this plugin
451# check the args ...
452sub file_block_read {
[11122]453
[8510]454 my $self = shift (@_);
[16390]455 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
[8892]456 # Keep track of filenames with same root but different extensions
[11122]457 # Used to support -associate_ext and the more generalised
458 # -associate_tail_re
[16390]459 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[8892]460
[18441]461 if (!-d $filename_full_path) {
462 $block_hash->{'all_files'}->{$file} = 1;
463 }
464
[11122]465 my $associate_tail_re = $self->{'associate_tail_re'};
466 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
467 my ($file_prefix,$file_ext)
[16390]468 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
[8892]469 if ((defined $file_prefix) && (defined $file_ext)) {
[16390]470 my $shared_fileroot = $block_hash->{'shared_fileroot'};
[8892]471 if (!defined $shared_fileroot->{$file_prefix}) {
[11122]472 my $file_prefix_rec = { 'tie_to' => undef,
473 'exts' => {} };
[8892]474 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
475 }
476
477 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
478
[16390]479 if ($self->can_process_this_file($filename_full_path)) {
[8892]480 # This is the document the others should be tied to
481 $file_prefix_rec->{'tie_to'} = $file_ext;
482 }
483 else {
[11122]484 if ($file_ext =~ m/$associate_tail_re$/) {
[16390]485 # this file should be associated to the main one
[9351]486 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
487 }
[8892]488 }
[11122]489
[8892]490 }
491 }
[11122]492
[16390]493 # check block expressions
494 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
495
[9067]496 # now check whether we are actually processing this
[16390]497 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
[9067]498 return undef; # can't recognise
499 }
[23457]500
[16852]501 # if we have a block_exp, then this overrides the normal 'smart' blocking
502 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
[8892]503
[11089]504 # block the cover image if there is one
505 if ($self->{'cover_image'}) {
[16852]506 $self->block_cover_image($filename_full_path, $block_hash);
[11089]507 }
[23457]508
[9067]509 return 1;
[8510]510}
511
[16390]512# plugins that rely on more than process_exp (eg XML plugins) can override this method
513sub can_process_this_file {
514 my $self = shift(@_);
515 my ($filename) = @_;
[8892]516
[19222]517 if (-d $filename && !$self->{'can_process_directories'}) {
518 return 0;
519 }
[22427]520
[16390]521 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
522 return 1;
[8892]523 }
524 return 0;
[10280]525
526}
527
[16390]528# just converts path as is to utf8.
529sub filepath_to_utf8 {
[10280]530 my $self = shift (@_);
[15868]531 my ($file, $file_encoding) = @_;
[16390]532 my $filemeta = $file;
[10280]533
[16557]534 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
535
536 # Whenever filename-encoding is set to any of the auto settings, we
537 # check if the filename is already in UTF8. If it is, then we're done.
538 if($filename_encoding =~ m/auto/) {
[23457]539 if(&unicode::check_is_utf8($filemeta))
540 {
541 $filename_encoding = "utf8";
542 return $filemeta;
543 }
[16557]544 }
545
546 # Auto setting, but filename is not utf8
547 if ($filename_encoding eq "auto")
548 {
[23457]549 # try textcat
550 $filename_encoding = $self->textcat_encoding($filemeta);
[16557]551
[23457]552 # check the locale next
553 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
[16557]554
[23457]555
556 # now try the encoding of the document, if available
557 if ($filename_encoding eq "undefined" && defined $file_encoding) {
558 $filename_encoding = $file_encoding;
559 }
[16557]560
561 }
562
563 elsif ($filename_encoding eq "auto-language-analysis")
564 {
565 $filename_encoding = $self->textcat_encoding($filemeta);
566
567 # now try the encoding of the document, if available
568 if ($filename_encoding eq "undefined" && defined $file_encoding) {
569 $filename_encoding = $file_encoding;
570 }
571 }
572
573 elsif ($filename_encoding eq "auto-filesystem-encoding")
[23352]574 {
[16557]575 # try locale
576 $filename_encoding = $self->locale_encoding();
577 }
578
579 elsif ($filename_encoding eq "auto-fl")
580 {
581 # filesystem-encoding (locale) then language-analysis (textcat)
582 $filename_encoding = $self->locale_encoding();
583
584 # try textcat
585 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
[23457]586
[16557]587 # else assume filename encoding is encoding of file content, if that's available
588 if ($filename_encoding eq "undefined" && defined $file_encoding) {
589 $filename_encoding = $file_encoding;
[15868]590 }
[16557]591 }
[23457]592
[16557]593 elsif ($filename_encoding eq "auto-lf")
594 {
595 # language-analysis (textcat) then filesystem-encoding (locale)
596 $filename_encoding = $self->textcat_encoding($filemeta);
597
598 # guess filename encoding from encoding of file content, if available
599 if ($filename_encoding eq "undefined" && defined $file_encoding) {
600 $filename_encoding = $file_encoding;
601 }
602
603 # try locale
604 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
[15868]605 }
[23457]606
[16557]607 # if still undefined, use utf8 as fallback
608 if ($filename_encoding eq "undefined") {
609 $filename_encoding = "utf8";
610 }
611
[18171]612 #print STDERR "**** UTF8 encoding the filename $filemeta ";
[16767]613
[16557]614 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
615 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
616 # be always utf8 (in which case the filename's encoding is also set as utf8 even
617 # though the filename need not be if it originates from another system)--in such
618 # cases attempt to make the filename utf8 to match.
619 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
620 &unicode::ensure_utf8(\$filemeta);
621 }
622
623 # convert non-unicode encodings to utf8
624 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
625 $filemeta = &unicode::unicode2utf8(
[23457]626 &unicode::convert2unicode($filename_encoding, \$filemeta)
627 );
[15868]628 }
[16390]629
[18171]630 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
[16390]631 return $filemeta;
632}
633
634# gets the filename with no path, converts to utf8, and then dm safes it.
[18320]635# filename_encoding set by user
[16390]636sub filename_to_utf8_metadata
637{
638 my $self = shift (@_);
639 my ($file, $file_encoding) = @_;
640
641 my $outhandle = $self->{'outhandle'};
642
[23457]643 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
644 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
645 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
[23347]646
647
[16390]648 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
649 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
[15868]650
[22652]651 return $filemeta;
[10280]652}
653
[16557]654sub locale_encoding {
655 my $self = shift(@_);
656
657 if (!defined $self->{'filesystem_encoding'}) {
658 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
659 }
[10280]660
[18171]661 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
[16557]662 return $self->{'filesystem_encoding'}; # can be the string "undefined"
663}
664
665sub textcat_encoding {
666 my $self = shift(@_);
667 my ($filemeta) = @_;
668
[22705]669 # analyse filenames without extensions and digits (and trimmed of
670 # surrounding whitespace), so that irrelevant chars don't confuse
671 # textcat
[16557]672 my $strictfilemeta = $filemeta;
673 $strictfilemeta =~ s/\.[^\.]+$//g;
674 $strictfilemeta =~ s/\d//g;
675 $strictfilemeta =~ s/^\s*//g;
676 $strictfilemeta =~ s/\s*$//g;
677
678 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
679 if(!defined $filename_encoding) {
680 $filename_encoding = "undefined";
681 }
682
683 return $filename_encoding; # can be the string "undefined"
684}
685
686# performs textcat
687sub encoding_from_language_analysis {
688 my $self = shift(@_);
689 my ($text) = @_;
690
691 my $outhandle = $self->{'outhandle'};
692 my $best_encoding = undef;
[23457]693
[17212]694 # get the language/encoding of the textstring using textcat
[22632]695 require textcat; # Only load the textcat module if it is required
[16557]696 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
[16767]697 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
[16557]698
699
700 if (scalar @$results < 0) {
701 return undef;
702 }
703
704 # We have some results, we choose the first
705 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
706
707 $best_encoding = $encoding;
708 if (!defined $best_encoding) {
709 return undef;
710 }
[23457]711
[16557]712 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
713 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
714 $best_encoding = 'utf8';
715 }
716
717
718 # check for equivalents where textcat doesn't have some encodings...
719 # eg MS versions of standard encodings
720 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
721 my $iso = $1; # which variant of the iso standard?
722 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
723 if ($text =~ /[\x80-\x9f]/) {
724 # Western Europe
725 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
726 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
727 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
728 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
729 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
730 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
731 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
732 }
733 }
734
735 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
736 !defined $encodings::encodings->{$best_encoding})
737 {
738 if ($self->{'verbosity'}) {
739 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
740 }
741 $best_encoding = undef;
742 }
743
744 return $best_encoding;
745}
746
747# uses locale
[23352]748sub get_filesystem_encoding
749{
[10280]750
[15868]751 my $self = shift(@_);
752
[14961]753 my $outhandle = $self->{'outhandle'};
754 my $filesystem_encoding = undef;
755
756 eval {
[23352]757 # Works for Windows as well, returning the DOS code page in use
[14961]758 use POSIX qw(locale_h);
[15868]759
760 # With only one parameter, setlocale retrieves the
761 # current value
[14961]762 my $current_locale = setlocale(LC_CTYPE);
[15868]763
[23364]764 my $char_encoding = undef;
765 if ($current_locale =~ m/\./) {
766 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
767 $char_encoding = lc($char_encoding);
768 }
769 else {
770 if ($current_locale =~ m/^(posix|c)$/i) {
771 $char_encoding = "ascii";
772 }
773 }
774
775 if (defined $char_encoding) {
[15446]776 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
777 $char_encoding = "$1\_$2\_$3";
778 }
779
[14961]780 $char_encoding =~ s/-/_/g;
781 $char_encoding =~ s/^utf_8$/utf8/;
[15868]782
[14961]783 if ($char_encoding =~ m/^\d+$/) {
[15607]784 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
[14961]785 $char_encoding = "windows_$char_encoding";
786 }
[15607]787 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
[14961]788 $char_encoding = "dos_$char_encoding";
789 }
790 }
[15868]791
[14961]792 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
[15446]793 || (defined $encodings::encodings->{$char_encoding})) {
[14961]794 $filesystem_encoding = $char_encoding;
795 }
796 else {
797 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
798 }
799 }
[15868]800
[14961]801
802 };
803 if ($@) {
804 print $outhandle "$@\n";
[18398]805 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
[14961]806
807 }
[23352]808
[15868]809 return $filesystem_encoding;
810}
[14961]811
[23335]812
813
814sub deduce_filename_encoding
815{
816 my $self = shift (@_);
[23347]817 my ($file,$metadata,$plugin_filename_encoding) = @_;
[23335]818
[23465]819 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
[23335]820 my $deduced_filename_encoding = undef;
821
822 # Start by looking for manually assigned metadata
823 if (defined $gs_filename_encoding) {
[23457]824 if (ref ($gs_filename_encoding) eq "ARRAY") {
825 my $outhandle = $self->{'outhandle'};
826
827 $deduced_filename_encoding = $gs_filename_encoding->[0];
828
829 my $num_vals = scalar(@$gs_filename_encoding);
830 if ($num_vals>1) {
[23465]831 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
[23457]832 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
833 }
834 }
835 else {
836 $deduced_filename_encoding = $gs_filename_encoding;
837 }
[23335]838 }
[23457]839
[23335]840 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
[23457]841 # Look to see if plugin specifies this value
[23347]842
[23457]843 if (defined $plugin_filename_encoding) {
844 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
845 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
846 my $outhandle = $self->{'outhandle'};
847 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
848 print $outhandle " default to 'auto'\n";
849 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
850 }
851
852 if ($plugin_filename_encoding ne "auto") {
853 # We've been given a specific filenamne encoding
854 # => so use it!
855 $deduced_filename_encoding = $plugin_filename_encoding;
856 }
857 }
[23335]858 }
859
860 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
[23347]861
[23352]862 # Look to file system to provide a character encoding
[23347]863
[23352]864 # If Windows NTFS, then -- assuming we work with long file names got through
865 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
866
867 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
868 # Can do better than working with the DOS character encoding returned by locale
869 $deduced_filename_encoding = "unicode";
870 }
871 else {
872 # Unix of some form or other
873
874 # See if we can determine the file system encoding through locale
875 $deduced_filename_encoding = $self->locale_encoding();
[23457]876
[23352]877 # if locale shows us filesystem is utf8, check to see filename is consistent
878 # => if not, then we have an "alien" filename on our hands
[23457]879
[23458]880 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
[23352]881 if (!&unicode::check_is_utf8($file)) {
882 # "alien" filename, so revert
883 $deduced_filename_encoding = undef;
[23347]884 }
[23352]885 }
886 }
[23335]887 }
888
[23347]889# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
890# # Last chance, apply textcat to deduce filename encoding
891# $deduced_filename_encoding = $self->textcat_encoding($file);
892# }
893
[23457]894 if ($self->{'verbosity'}>3) {
895 my $outhandle = $self->{'outhandle'};
[23347]896
[23457]897 if (defined $deduced_filename_encoding) {
898 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
[23347]899 }
[23457]900 else {
901 print $outhandle " No filename encoding deduced\n";
902 }
903 }
904
[23335]905 return $deduced_filename_encoding;
906}
907
908
909
910
911# Notionally written to be called once for each document, it is however safe to
912# call multiple times (as in the case of ImagePlugin) which calls this later on
913# after the original image has potentially been converted to a *new* source image
914# format (e.g. TIFF to PNG)
915
[15868]916sub set_Source_metadata {
917 my $self = shift (@_);
[23461]918 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
919
[23335]920 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
921 # and (as a fallback) using %xx if not for non-ascii chars
922 # 2. Sets the url ref (SourceFile) to the URL encoded version
923 # of filename for generated files
[23352]924
925 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
[23335]926
[23461]927 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
[23352]928
929 my $octet_file = $raw_file;
930
931 # UTF-8 version of filename
[23387]932# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
933# print STDERR "**** Setting Source Metadata given: $octet_file\n";
934# }
[14961]935
[23352]936 # Deal with (on Windows) raw filenames that are in their
937 # abbreviated DOS form
938
939 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
940 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
941 if (-e $raw_filename) {
942 my $unicode_filename = Win32::GetLongPathName($raw_filename);
943
944 my $unused_full_uf;
945 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
946 }
[23347]947 }
[23352]948 }
[23335]949
950 my $url_encoded_filename;
[23364]951 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
[23352]952 # => Generate a pretty print version of filename that is mapped to Unicode
953
954 # Use filename_encoding to map raw filename to a Perl unicode-aware string
955 $url_encoded_filename = decode($filename_encoding,$octet_file);
[23335]956 }
957 else {
[23352]958 # otherwise generate %xx encoded version of filename for char > 127
959 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
[23335]960 }
[18320]961
[23387]962# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
963# print STDERR "****** saving Source as: $url_encoded_filename\n";
964# }
[23759]965
966 # In the case of converted files and (generalized) exploded documents, there
967 # will already be a source filename => store as OrigSource before overriding
968 my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
969 if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
970 $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
971 }
972
[16919]973 # Source is the UTF8 display name - not necessarily the name of the file on the system
[23829]974 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
975 # on Darwin want all display strings to be in composed form, then can search on that
976 $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
977 }
[23461]978 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
[23759]979
[23352]980
[23335]981 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
[18320]982 # If using URL encoding, then SourceFile is the url-reference to url-encoded
[23335]983 # renamed_raw_url: it's a url that refers to the actual file on the system
984 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
[18320]985
[23461]986 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
[23335]987 $renamed_raw_url);
988
[23387]989# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
990# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
991# }
[14961]992}
[23457]993
[17026]994# this should be called by all plugins to set the oid of the doc obj, rather
995# than calling doc_obj->set_OID directly
[15868]996sub add_OID {
[15018]997 my $self = shift (@_);
998 my ($doc_obj) = @_;
999
[17026]1000 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
[15018]1001
[17026]1002 # see if there is a plugin specific set_OID function
1003 if (defined ($self->can('set_OID'))) {
1004 $self->set_OID(@_); # pass through doc_obj and any extra arguments
[15018]1005 }
[17026]1006 else {
[15018]1007 # use the default set_OID() in doc.pm
1008 $doc_obj->set_OID();
1009 }
[17026]1010
[15018]1011}
[23457]1012
[15868]1013# The BasePlugin read_into_doc_obj() function. This function does all the
1014# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
[10280]1015# a file and sets up a slew of metadata all saved in doc_obj, which
1016# it then returns as part of a tuple (process_status,doc_obj)
1017#
1018# Much of this functionality used to reside in read, but it was broken
1019# down into a supporting routine to make the code more flexible.
1020#
1021# recursive plugins (e.g. RecPlug) and specialized plugins like those
1022# capable of processing many documents within a single file (e.g.
1023# GMLPlug) will normally want to implement their own version of
1024# read_into_doc_obj()
1025#
1026# Note that $base_dir might be "" and that $file might
1027# include directories
[15868]1028
1029# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
[10280]1030sub read_into_doc_obj {
1031 my $self = shift (@_);
[16390]1032 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[10280]1033
[15868]1034 my $outhandle = $self->{'outhandle'};
[10280]1035
[15868]1036 # should we move this to read? What about secondary plugins?
[23759]1037 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
[15868]1038 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
[23363]1039 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
[23457]1040 if $self->{'verbosity'} > 1;
[10280]1041
[16390]1042 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[23457]1043
[1242]1044 # create a new document
[18320]1045 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
[14961]1046 my $top_section = $doc_obj->get_top_section();
1047
1048 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
[15868]1049 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
[23457]1050
[18469]1051
[23352]1052 my $plugin_filename_encoding = $self->{'filename_encoding'};
[23347]1053 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
[23472]1054 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
[8166]1055
[15868]1056 # plugin specific stuff - what args do we need here??
1057 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1058 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1059 return -1;
[2816]1060 }
[1242]1061
[15868]1062 # include any metadata passed in from previous plugins
1063 # note that this metadata is associated with the top level section
1064 my $section = $doc_obj->get_top_section();
1065 # can we merge these two methods??
1066 $self->add_associated_files($doc_obj, $filename_full_path);
1067 $self->extra_metadata ($doc_obj, $section, $metadata);
1068 $self->auto_extract_metadata($doc_obj);
[1242]1069
[15868]1070 # if we haven't found any Title so far, assign one
1071 # this was shifted to here from inside read()
1072 $self->title_fallback($doc_obj,$section,$filename_no_path);
1073
1074 $self->add_OID($doc_obj);
1075
[21219]1076 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
[15868]1077 return (1,$doc_obj);
1078}
[2785]1079
[21219]1080sub post_process_doc_obj {
1081 my $self = shift (@_);
1082 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1083
1084 return 1;
1085}
1086
[15868]1087sub add_dummy_text {
1088 my $self = shift(@_);
1089 my ($doc_obj, $section) = @_;
[2785]1090
[15868]1091 # add NoText metadata so we can hide this dummy text in format statements
1092 $doc_obj->add_metadata($section, "NoText", "1");
1093 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1094
1095}
[8510]1096
[15868]1097# does nothing. Can be overridden by subclass
1098sub auto_extract_metadata {
1099 my $self = shift(@_);
1100 my ($doc_obj) = @_;
1101}
[11122]1102
[15868]1103# adds cover image, associate_file options stuff. Should be called by sub class
1104# read_into_doc_obj
1105sub add_associated_files {
1106 my $self = shift(@_);
1107 # whatis filename??
1108 my ($doc_obj, $filename) = @_;
1109
1110 # add in the cover image
1111 if ($self->{'cover_image'}) {
1112 $self->associate_cover_image($doc_obj, $filename);
[8716]1113 }
[22215]1114 # store the original (used for eg TextPlugin to store the original for OAI)
1115 if ($self->{'store_original_file'}) {
1116 $self->associate_source_file($doc_obj, $filename);
1117 }
[23457]1118
[15018]1119
[10280]1120}
[1242]1121
[16390]1122# implement this if you are extracting metadata for other documents
1123sub metadata_read {
1124 my $self = shift (@_);
[19493]1125 my ($pluginfo, $base_dir, $file, $block_hash,
1126 $extrametakeys, $extrametadata, $extrametafile,
[23212]1127 $processor, $gli, $aux) = @_;
[16390]1128
1129 # can we process this file??
1130 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1131 return undef unless $self->can_process_this_file($filename_full_path);
1132
1133 return 1; # we recognise the file, but don't actually do anything with it
1134}
1135
1136
[15868]1137# The BasePlugin read() function. This function calls read_into_doc_obj()
[10280]1138# to ensure all the right things to make general options work for a
1139# given plugin are done. It then calls the process() function which
1140# does all the work specific to a plugin (like the old read functions
1141# used to do). Most plugins should define their own process() function
1142# and let this read() function keep control.
1143#
1144# recursive plugins (e.g. RecPlug) and specialized plugins like those
1145# capable of processing many documents within a single file (e.g.
1146# GMLPlug) might want to implement their own version of read(), but
1147# more likely need to implement their own version of read_into_doc_obj()
1148#
1149# Return number of files processed, undef if can't recognise, -1 if can't
1150# process
1151
1152sub read {
1153 my $self = shift (@_);
[16390]1154 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[10280]1155
[16390]1156 # can we process this file??
1157 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[20577]1158
[16390]1159 return undef unless $self->can_process_this_file($filename_full_path);
1160
[10280]1161 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
[21219]1162
[10280]1163 if ((defined $process_status) && ($process_status == 1)) {
[21219]1164
[10280]1165 # process the document
1166 $processor->process($doc_obj);
[15868]1167
[10280]1168 $self->{'num_processed'} ++;
1169 undef $doc_obj;
[9398]1170 }
[15868]1171 # delete any temp files that we may have created
1172 $self->clean_up_after_doc_obj_processing();
[9398]1173
[18469]1174
[10280]1175 # if process_status == 1, then the file has been processed.
1176 return $process_status;
1177
[4]1178}
1179
[1244]1180# returns undef if file is rejected by the plugin
[1242]1181sub process {
1182 my $self = shift (@_);
[11089]1183 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1242]1184
[23419]1185 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
[1244]1186
[23457]1187 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
[23419]1188 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1189
[23457]1190 die "\n";
[23419]1191
[1244]1192 return undef; # never gets here
[1242]1193}
1194
[15868]1195# overwrite this method to delete any temp files that we have created
1196sub clean_up_after_doc_obj_processing {
1197 my $self = shift(@_);
[4]1198
[10280]1199}
[16390]1200
[10280]1201# write_file -- used by ConvertToPlug, for example in post processing
1202#
[15868]1203# where should this go, is here the best place??
[10280]1204sub utf8_write_file {
1205 my $self = shift (@_);
1206 my ($textref, $filename) = @_;
1207
[22953]1208 if (!open (FILE, ">:utf8", $filename)) {
[10280]1209 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
[23457]1210 die "\n";
1211 }
[10280]1212 print FILE $$textref;
1213
[1219]1214 close FILE;
1215}
1216
[10280]1217
[7504]1218sub filename_based_title
1219{
1220 my $self = shift (@_);
1221 my ($file) = @_;
1222
1223 my $file_derived_title = $file;
1224 $file_derived_title =~ s/_/ /g;
[23335]1225 $file_derived_title =~ s/\.[^.]+$//;
[7504]1226
1227 return $file_derived_title;
1228}
1229
[9398]1230
[7504]1231sub title_fallback
1232{
1233 my $self = shift (@_);
1234 my ($doc_obj,$section,$file) = @_;
1235
[22705]1236 if (!defined $doc_obj->get_metadata_element ($section, "Title")
[23335]1237 || $doc_obj->get_metadata_element($section, "Title") eq "") {
[7504]1238
[23335]1239 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1240 my $file_derived_title;
1241 if (defined $source_file) {
1242 $file_derived_title = $self->filename_based_title($source_file);
1243 }
1244 else {
1245 # pp = pretty print
1246 my $pp_file = (defined $source_file) ? $source_file : $file;
1247
1248 my $raw_title = $self->filename_based_title($file);
1249 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1250 }
1251
1252
[15868]1253 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1254 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
[9413]1255 }
[15868]1256 else {
[16995]1257 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
[15868]1258 }
[9413]1259 }
[23457]1260
[1844]1261}
[23457]1262
[1219]1263# add any extra metadata that's been passed around from one
1264# plugin to another.
1265# extra_metadata uses add_utf8_metadata so it expects metadata values
1266# to already be in utf8
1267sub extra_metadata {
1268 my $self = shift (@_);
1269 my ($doc_obj, $cursection, $metadata) = @_;
1270
[11122]1271 my $associate_tail_re = $self->{'associate_tail_re'};
1272
[1219]1273 foreach my $field (keys(%$metadata)) {
[839]1274 # $metadata->{$field} may be an array reference
[8510]1275 if ($field eq "gsdlassocfile_tobe") {
1276 # 'gsdlassocfile_tobe' is artificially introduced metadata
1277 # that is used to signal that certain additional files should
1278 # be tied to this document. Useful in situations where a
1279 # metadata pass in the plugin pipeline works out some files
1280 # need to be associated with a document, but the document hasn't
1281 # been formed yet.
1282 my $equiv_form = "";
1283 foreach my $gaf (@{$metadata->{$field}}) {
1284 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
[18171]1285 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1286
1287 # we need to make sure the filename is valid utf-8 - we do
[18320]1288 # this by url or base64 encoding it
[18171]1289 # $tail_filename is the name that we store the file as
[18320]1290 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
[8510]1291 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
[20778]1292 $doc_obj->associate_source_file($full_filename);
[18320]1293 # If the filename is url_encoded, we need to encode the % signs
1294 # in the filename, so that it works in a url
[18404]1295 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
[11122]1296 # work out extended tail extension (i.e. matching tail re)
1297
1298 my ($file_prefix,$file_extended_ext)
[16390]1299 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
[11122]1300 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
[8510]1301 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
[21742]1302 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
[8510]1303 my $srcicon = "_icon".$doc_ext."_";
1304 my $end_doclink = "</a>";
1305
[11122]1306 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1307
[18171]1308 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
[11122]1309 # for metadata such as [mp3._edited] [mp3._full] ...
1310 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1311 }
1312
1313 # for multiple metadata such as [mp3.assoclink]
1314 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1315
1316 $equiv_form .= " $assoc_form";
[8510]1317 }
1318 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1319 }
[23279]1320 elsif ($field eq "gsdlzipfilename") {
1321 # special case for when files have come out of a zip. source_path
1322 # (used for archives dbs and keeping track for incremental import)
1323 # must be set to the zip file name
1324 my $zip_filename = $metadata->{$field};
1325 # overwrite the source_path
[23363]1326 $doc_obj->set_source_path($zip_filename);
[23279]1327 # and set the metadata
1328 $zip_filename = &util::filename_within_collection($zip_filename);
1329 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1330 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1331 }
[8510]1332 elsif (ref ($metadata->{$field}) eq "ARRAY") {
[839]1333 map {
[1219]1334 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
[839]1335 } @{$metadata->{$field}};
1336 } else {
[1219]1337 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
[839]1338 }
1339 }
1340}
1341
[1396]1342
[2785]1343sub compile_stats {
1344 my $self = shift(@_);
1345 my ($stats) = @_;
1346
1347 $stats->{'num_processed'} += $self->{'num_processed'};
1348 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
[2796]1349 $stats->{'num_archives'} += $self->{'num_archives'};
[2785]1350
1351}
[22215]1352sub associate_source_file {
1353 my $self = shift(@_);
1354
1355 my ($doc_obj, $filename) = @_;
1356 my $cursection = $doc_obj->get_top_section();
1357 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1358
1359 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
[22663]1360 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
[22215]1361}
[2785]1362
[2816]1363sub associate_cover_image {
[22215]1364 my $self = shift(@_);
[2816]1365 my ($doc_obj, $filename) = @_;
1366
[23363]1367 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1368
[10833]1369 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
[23363]1370 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1371
1372 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1373 # don't stat() for existence e.g. for multiple document input files
[10833]1374 # (eg SplitPlug)
1375 return;
1376 }
1377
[9413]1378 my $top_section=$doc_obj->get_top_section();
1379
[23363]1380 if (&util::fd_exists($upgraded_filename)) {
[20778]1381 $doc_obj->associate_source_file($filename);
[13968]1382 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
[9413]1383 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[3086]1384 } else {
[10833]1385 my $upper_filename = $filename;
[23363]1386 my $upgraded_upper_filename = $upgraded_filename;
1387
[10833]1388 $upper_filename =~ s/jpg$/JPG/;
[23363]1389 $upgraded_upper_filename =~ s/jpg$/JPG/;
1390
1391 if (&util::fd_exists($upgraded_upper_filename)) {
[20778]1392 $doc_obj->associate_source_file($upper_filename);
[10833]1393 $doc_obj->associate_file($upper_filename, "cover.jpg",
1394 "image/jpeg");
[9413]1395 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[10833]1396 } else {
1397 # file doesn't exist, so record the fact that it's missing so
1398 # we don't stat() again (stat is slow)
[23363]1399 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
[3086]1400 }
[2816]1401 }
[10833]1402
[2816]1403}
1404
[11332]1405
1406# Overridden by exploding plugins (eg. ISISPlug)
1407sub clean_up_after_exploding
1408{
1409 my $self = shift(@_);
1410}
1411
1412
[16390]1413
[4]14141;
Note: See TracBrowser for help on using the repository browser.