source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago

Moving the critical file-related functions (copy, rm, etc) out of util.pm into their own proper class FileUtils. Use of the old functions in util.pm will prompt deprecated warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering supporting other filesystems (HTTP, HDFS, WebDav, etc). Updated some key files to use the new functions so now deprecated messages thrown when importing/building demo collection 'out of the box'

  • Property svn:keywords set to Author Date Id Revision
File size: 46.7 KB
RevLine 
[537]1###########################################################################
2#
[15868]3# BasePlugin.pm -- base class for all the import plugins
[537]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
[9413]8# Copyright (C) 1999-2005 New Zealand Digital Library Project
[537]9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[4]25
[15868]26package BasePlugin;
[2219]27
[10254]28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
[9413]31
[8892]32use File::Basename;
[23335]33use Encode;
[23832]34use Unicode::Normalize 'normalize';
[8892]35
[1870]36use encodings;
[11389]37use unicode;
[1242]38use doc;
[2751]39use ghtml;
[9413]40use gsprintf 'gsprintf';
[27306]41use util;
42use FileUtils;
[4]43
[15868]44use PrintInfo;
[10218]45
[15868]46BEGIN {
47 @BasePlugin::ISA = ( 'PrintInfo' );
48}
[5681]49
[18320]50# the different methods that can be applied when renaming
51# imported documents and their associated files
52our $file_rename_method_list =
53 [ { 'name' => "url",
54 'desc' => "{BasePlugin.rename_method.url}" },
55 { 'name' => "base64",
[18398]56 'desc' => "{BasePlugin.rename_method.base64}" },
[18404]57 { 'name' => "none",
58 'desc' => "{BasePlugin.rename_method.none}",
[18398]59 'hiddengli' => "yes" } ];
[18320]60
[15868]61our $encoding_list =
[10218]62 [ { 'name' => "ascii",
[16014]63 'desc' => "{BasePlugin.encoding.ascii}" },
[4744]64 { 'name' => "utf8",
[16014]65 'desc' => "{BasePlugin.encoding.utf8}" },
[4744]66 { 'name' => "unicode",
[16014]67 'desc' => "{BasePlugin.encoding.unicode}" } ];
[3540]68
[23457]69
[10620]70my $e = $encodings::encodings;
71foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
72{
73 my $hashEncode =
74 {'name' => $enc,
75 'desc' => $e->{$enc}->{'name'}};
76
[15868]77 push(@{$encoding_list},$hashEncode);
[10620]78}
79
[15868]80our $encoding_plus_auto_list =
81 [ { 'name' => "auto",
[16557]82 'desc' => "{BasePlugin.filename_encoding.auto}" },
[23457]83 { 'name' => "auto-language-analysis",
[16557]84 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
85 { 'name' => "auto-filesystem-encoding",
86 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
87 { 'name' => "auto-fl",
88 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
89 { 'name' => "auto-lf",
90 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
[10620]91
[15868]92push(@{$encoding_plus_auto_list},@{$encoding_list});
93
[16698]94our $oidtype_list =
[17026]95 [ { 'name' => "auto",
96 'desc' => "{BasePlugin.OIDtype.auto}" },
97 { 'name' => "hash",
[16698]98 'desc' => "{import.OIDtype.hash}" },
[24290]99 { 'name' => "hash_on_ga_xml",
100 'desc' => "{import.OIDtype.hash_on_ga_xml}" },
[26536]101 { 'name' => "hash_on_full_filename",
102 'desc' => "{import.OIDtype.hash_on_full_filename}" },
[16698]103 { 'name' => "assigned",
104 'desc' => "{import.OIDtype.assigned}" },
105 { 'name' => "incremental",
106 'desc' => "{import.OIDtype.incremental}" },
[26221]107 { 'name' => "filename",
108 'desc' => "{import.OIDtype.filename}" },
[16698]109 { 'name' => "dirname",
[26536]110 'desc' => "{import.OIDtype.dirname}" },
111 { 'name' => "full_filename",
112 'desc' => "{import.OIDtype.full_filename}" } ];
[16698]113
[4873]114my $arguments =
[3540]115 [ { 'name' => "process_exp",
[15868]116 'desc' => "{BasePlugin.process_exp}",
[6408]117 'type' => "regexp",
[3540]118 'deft' => "",
119 'reqd' => "no" },
[16390]120 { 'name' => "no_blocking",
121 'desc' => "{BasePlugin.no_blocking}",
122 'type' => "flag",
123 'reqd' => "no"},
[3540]124 { 'name' => "block_exp",
[15868]125 'desc' => "{BasePlugin.block_exp}",
[6408]126 'type' => "regexp",
[3540]127 'deft' => "",
128 'reqd' => "no" },
[22215]129 { 'name' => "store_original_file",
130 'desc' => "{BasePlugin.store_original_file}",
131 'type' => "flag",
132 'reqd' => "no" },
[8892]133 { 'name' => "associate_ext",
[15868]134 'desc' => "{BasePlugin.associate_ext}",
[8892]135 'type' => "string",
136 'reqd' => "no" },
[11122]137 { 'name' => "associate_tail_re",
[15868]138 'desc' => "{BasePlugin.associate_tail_re}",
[11122]139 'type' => "string",
140 'reqd' => "no" },
[16698]141 { 'name' => "OIDtype",
142 'desc' => "{import.OIDtype}",
143 'type' => "enum",
144 'list' => $oidtype_list,
[16847]145 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
[17026]146 'deft' => "auto",
[18591]147 'reqd' => "no" },
[16698]148 { 'name' => "OIDmetadata",
149 'desc' => "{import.OIDmetadata}",
150 'type' => "metadata",
151 'deft' => "dc.Identifier",
[18591]152 'reqd' => "no" },
[16698]153# { 'name' => "use_as_doc_identifier",
154# 'desc' => "{BasePlugin.use_as_doc_identifier}",
155# 'type' => "string",
156# 'reqd' => "no" ,
157# 'deft' => "" } ,
[18320]158 { 'name' => "no_cover_image",
[15868]159 'desc' => "{BasePlugin.no_cover_image}",
[3540]160 'type' => "flag",
161 'reqd' => "no" },
[15868]162 { 'name' => "filename_encoding",
163 'desc' => "{BasePlugin.filename_encoding}",
164 'type' => "enum",
165 'deft' => "auto",
166 'list' => $encoding_plus_auto_list,
[16390]167 'reqd' => "no" },
168 { 'name' => "smart_block",
[16520]169 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
[16390]170 'type' => "flag",
171 'reqd' => "no",
[18320]172 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
173 { 'name' => "file_rename_method",
174 'desc' => "{BasePlugin.file_rename_method}",
175 'type' => "enum",
176 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
177 'list' => $file_rename_method_list,
178 'reqd' => "no"
[23457]179 }
[15868]180
181 ];
[3540]182
[9398]183
[15868]184my $options = { 'name' => "BasePlugin",
185 'desc' => "{BasePlugin.desc}",
[6408]186 'abstract' => "yes",
187 'inherits' => "no",
[4750]188 'args' => $arguments };
[3540]189
[4]190sub new {
[10218]191
[15868]192 my ($class) = shift (@_);
[16698]193 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
[10218]194 push(@$pluginlist, $class);
[9398]195
[15868]196 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
197 push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]198
[16698]199 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
[16390]200
201 if ($self->{'info_only'}) {
202 # don't worry about any options etc
203 return bless $self, $class;
204 }
[10579]205
[16390]206 if ($self->{'smart_block'}) {
207 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
208 }
209 $self->{'smart_block'} = undef;
210
[15868]211 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
212 $self->{'plugin_type'} = $plugin_name;
[10579]213
[24404]214 # remove ex. from OIDmetadata iff it's the only namespace prefix
215 $self->{'OIDmetadata'} =~ s/^ex\.([^.]+)$/$1/ if defined $self->{'OIDmetadata'};
[2785]216 $self->{'num_processed'} = 0;
217 $self->{'num_not_processed'} = 0;
218 $self->{'num_blocked'} = 0;
219 $self->{'num_archives'} = 0;
[8678]220 $self->{'cover_image'} = 1; # cover image is on by default
[10218]221 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
[19222]222 $self->{'can_process_directories'} = 0;
[10579]223 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
[3540]224
[8892]225 my $associate_ext = $self->{'associate_ext'};
226 if ((defined $associate_ext) && ($associate_ext ne "")) {
[9351]227
[11122]228 my $associate_tail_re = $self->{'associate_tail_re'};
229 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
230 my $outhandle = $self->{'outhandle'};
231 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
232 print $outhandle " defaulting to 'associate_tail_re'\n";
[8892]233 }
[11122]234 else {
235 my @exts = split(/,/,$associate_ext);
[8892]236
[11122]237 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
238 my $associate_tail_re = join("|",@exts_bracketed);
239 $self->{'associate_tail_re'} = $associate_tail_re;
240 }
241
242 delete $self->{'associate_ext'};
[8892]243 }
244
[15868]245 return bless $self, $class;
[11089]246
[4]247}
248
[16821]249sub merge_inheritance
250{
251 my $self = {};
252 my @child_selfs = @_;
253
254 foreach my $child_self (@child_selfs) {
255 foreach my $key (keys %$child_self) {
256 if (defined $self->{$key}) {
257 if ($self->{$key} ne $child_self->{$key}) {
258# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
259# print STDERR "Existing stored value = $self->{$key}\n";
260# print STDERR "New (child) value = $child_self->{$key}\n";
261# print STDERR "Keeping existing value\n";
262 # Existing value seems to be option specified in collect.cfg
263
264 ### $self->{$key} = $child_self->{$key};
265
266 }
267 else {
268## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
269 }
270
271 }
272 else {
273 $self->{$key} = $child_self->{$key};
274 }
275 }
276 }
277
278 return $self;
279}
280
[15868]281# initialize BasePlugin options
282# if init() is overridden in a sub-class, remember to call BasePlugin::init()
[1242]283sub init {
284 my $self = shift (@_);
[2785]285 my ($verbosity, $outhandle, $failhandle) = @_;
[1242]286
287 # verbosity is passed through from the processor
288 $self->{'verbosity'} = $verbosity;
289
[2785]290 # as are the outhandle and failhandle
[1424]291 $self->{'outhandle'} = $outhandle if defined $outhandle;
[2785]292 $self->{'failhandle'} = $failhandle;
[16390]293# $self->SUPER::init(@_);
294
[1242]295 # set process_exp and block_exp to defaults unless they were
296 # explicitly set
[1244]297
298 if ((!$self->is_recursive()) and
[1242]299 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
[1244]300
[1242]301 $self->{'process_exp'} = $self->get_default_process_exp ();
302 if ($self->{'process_exp'} eq "") {
[1244]303 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
[1242]304 }
305 }
306
307 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
308 $self->{'block_exp'} = $self->get_default_block_exp ();
309 }
[11089]310
[1242]311}
312
[839]313sub begin {
314 my $self = shift (@_);
315 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
[16821]316
[17026]317 if ($self->{'OIDtype'} eq "auto") {
318 # hasn't been set in the plugin, use the processor values
319 $self->{'OIDtype'} = $processor->{'OIDtype'};
320 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
321 }
322 if ($self->{'OIDtype'} eq "hash") {
323 # should we hash on the file or on the doc xml??
324 $self->{'OIDtype'} = $self->get_oid_hash_type();
325 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
326 $self->{'OIDtype'} = "hash_on_file";
327 }
328 }
[839]329}
330
[21308]331# This is called once if removeold is set with import.pl. Most plugins will do
332# nothing but if a plugin does any stuff outside of creating doc obj, then
333# it may need to clear something.
334sub remove_all {
[21286]335 my $self = shift (@_);
336 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
337}
338
[21308]339# This is called per document for docs that have been deleted from the
340# collection. Most plugins will do nothing
341# but if a plugin does any stuff outside of creating doc obj, then it may need
342# to clear something.
343sub remove_one {
344 my $self = shift (@_);
345
[21315]346 my ($file, $oids, $archivedir) = @_;
347 return 0 if $self->can_process_this_file($file);
348 return undef;
[21308]349}
350
[839]351sub end {
[10155]352 # potentially called at the end of each plugin pass
353 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
354
[15868]355 my ($self) = shift (@_);
[839]356}
357
[10155]358sub deinit {
359 # called only once, after all plugin passes have been done
360
361 my ($self) = @_;
362}
363
[17026]364# default hashing type is to hash on the original file (or converted file)
365# override this to return hash_on_ga_xml for filetypes where hashing on the
366# file is no good eg video
367sub get_oid_hash_type {
368
369 my $self = shift (@_);
370
371 return "hash_on_file";
372}
373
[15868]374
[1242]375# this function should be overridden to return 1
376# in recursive plugins
[4]377sub is_recursive {
378 my $self = shift (@_);
379
[1242]380 return 0;
[4]381}
382
[1242]383sub get_default_block_exp {
384 my $self = shift (@_);
385
386 return "";
387}
388
389sub get_default_process_exp {
390 my $self = shift (@_);
391
392 return "";
393}
394
[23419]395
[18320]396# rename imported files and assoc files using URL encoding by default
397# as this will work for most plugins and give more legible filenames
398sub get_default_file_rename_method() {
399 my $self = shift (@_);
400 return "url";
401}
402
[18398]403# returns this plugin's active (possibly user-selected) file_rename_method
404sub get_file_rename_method() {
405 my $self = shift (@_);
406 my $rename_method = $self->{'file_rename_method'};
407 if($rename_method) {
408 return $rename_method;
409 } else {
410 return $self->get_default_file_rename_method();
411 }
412}
413
[16390]414# default implementation is to do nothing
415sub store_block_files {
416
[9067]417 my $self =shift (@_);
[16390]418 my ($filename_full_path, $block_hash) = @_;
419
[9067]420}
421
[16390]422# put files to block into hash
423sub use_block_expressions {
424
425 my $self =shift (@_);
426 my ($filename_full_path, $block_hash) = @_;
427
[23363]428 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
429
[16390]430 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
[23561]431 &util::block_filename($block_hash,$filename_full_path);
[16390]432 }
433
434}
435
[9067]436#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
437sub block_cover_image
438{
[10833]439 my $self =shift;
[16390]440 my ($filename, $block_hash) = @_;
[10833]441
[23363]442 $filename = &util::upgrade_if_dos_filename($filename);
443
[9067]444 if ($self->{'cover_image'}) {
445 my $coverfile = $filename;
446 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
[23759]447
[25743]448 #if there is no file extension, coverfile will be the same as filename
449 return if $coverfile eq $filename;
450
[27306]451 if (!&FileUtils::fileExists($coverfile)) {
[9067]452 $coverfile =~ s/jpg$/JPG/;
453 }
[27306]454 if (&FileUtils::fileExists($coverfile)) {
[23561]455 &util::block_filename($block_hash,$coverfile);
[11089]456 }
[9067]457 }
458
459 return;
460}
[11122]461
462
[16390]463# discover all the files that should be blocked by this plugin
464# check the args ...
465sub file_block_read {
[11122]466
[8510]467 my $self = shift (@_);
[16390]468 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
[8892]469 # Keep track of filenames with same root but different extensions
[11122]470 # Used to support -associate_ext and the more generalised
471 # -associate_tail_re
[16390]472 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[8892]473
[18441]474 if (!-d $filename_full_path) {
475 $block_hash->{'all_files'}->{$file} = 1;
476 }
477
[11122]478 my $associate_tail_re = $self->{'associate_tail_re'};
479 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
480 my ($file_prefix,$file_ext)
[16390]481 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
[8892]482 if ((defined $file_prefix) && (defined $file_ext)) {
[16390]483 my $shared_fileroot = $block_hash->{'shared_fileroot'};
[8892]484 if (!defined $shared_fileroot->{$file_prefix}) {
[11122]485 my $file_prefix_rec = { 'tie_to' => undef,
486 'exts' => {} };
[8892]487 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
488 }
489
490 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
491
[16390]492 if ($self->can_process_this_file($filename_full_path)) {
[8892]493 # This is the document the others should be tied to
494 $file_prefix_rec->{'tie_to'} = $file_ext;
495 }
496 else {
[11122]497 if ($file_ext =~ m/$associate_tail_re$/) {
[16390]498 # this file should be associated to the main one
[9351]499 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
500 }
[8892]501 }
[11122]502
[8892]503 }
504 }
[11122]505
[16390]506 # check block expressions
507 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
508
[9067]509 # now check whether we are actually processing this
[16390]510 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
[9067]511 return undef; # can't recognise
512 }
[23457]513
[16852]514 # if we have a block_exp, then this overrides the normal 'smart' blocking
515 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
[8892]516
[11089]517 # block the cover image if there is one
518 if ($self->{'cover_image'}) {
[16852]519 $self->block_cover_image($filename_full_path, $block_hash);
[11089]520 }
[23457]521
[9067]522 return 1;
[8510]523}
524
[16390]525# plugins that rely on more than process_exp (eg XML plugins) can override this method
526sub can_process_this_file {
527 my $self = shift(@_);
528 my ($filename) = @_;
[8892]529
[19222]530 if (-d $filename && !$self->{'can_process_directories'}) {
531 return 0;
532 }
[22427]533
[16390]534 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
535 return 1;
[8892]536 }
537 return 0;
[10280]538
539}
540
[24403]541# Even if a plugin can extract metadata in its metadata_read pass,
542# make the default return 'undef' so processing of the file continues
543# down the pipeline, so other plugins can also have the opportunity to
544# locate metadata and set it up in the extrametakeys variables that
545# are passed around.
546
547sub can_process_this_file_for_metadata {
548 my $self = shift(@_);
549
550 return undef;
551}
552
553
[16390]554# just converts path as is to utf8.
555sub filepath_to_utf8 {
[10280]556 my $self = shift (@_);
[15868]557 my ($file, $file_encoding) = @_;
[16390]558 my $filemeta = $file;
[10280]559
[16557]560 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
561
562 # Whenever filename-encoding is set to any of the auto settings, we
563 # check if the filename is already in UTF8. If it is, then we're done.
564 if($filename_encoding =~ m/auto/) {
[23457]565 if(&unicode::check_is_utf8($filemeta))
566 {
567 $filename_encoding = "utf8";
568 return $filemeta;
569 }
[16557]570 }
571
572 # Auto setting, but filename is not utf8
573 if ($filename_encoding eq "auto")
574 {
[23457]575 # try textcat
576 $filename_encoding = $self->textcat_encoding($filemeta);
[16557]577
[23457]578 # check the locale next
579 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
[16557]580
[23457]581
582 # now try the encoding of the document, if available
583 if ($filename_encoding eq "undefined" && defined $file_encoding) {
584 $filename_encoding = $file_encoding;
585 }
[16557]586
587 }
588
589 elsif ($filename_encoding eq "auto-language-analysis")
590 {
591 $filename_encoding = $self->textcat_encoding($filemeta);
592
593 # now try the encoding of the document, if available
594 if ($filename_encoding eq "undefined" && defined $file_encoding) {
595 $filename_encoding = $file_encoding;
596 }
597 }
598
599 elsif ($filename_encoding eq "auto-filesystem-encoding")
[23352]600 {
[16557]601 # try locale
602 $filename_encoding = $self->locale_encoding();
603 }
604
605 elsif ($filename_encoding eq "auto-fl")
606 {
607 # filesystem-encoding (locale) then language-analysis (textcat)
608 $filename_encoding = $self->locale_encoding();
609
610 # try textcat
611 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
[23457]612
[16557]613 # else assume filename encoding is encoding of file content, if that's available
614 if ($filename_encoding eq "undefined" && defined $file_encoding) {
615 $filename_encoding = $file_encoding;
[15868]616 }
[16557]617 }
[23457]618
[16557]619 elsif ($filename_encoding eq "auto-lf")
620 {
621 # language-analysis (textcat) then filesystem-encoding (locale)
622 $filename_encoding = $self->textcat_encoding($filemeta);
623
624 # guess filename encoding from encoding of file content, if available
625 if ($filename_encoding eq "undefined" && defined $file_encoding) {
626 $filename_encoding = $file_encoding;
627 }
628
629 # try locale
630 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
[15868]631 }
[23457]632
[16557]633 # if still undefined, use utf8 as fallback
634 if ($filename_encoding eq "undefined") {
635 $filename_encoding = "utf8";
636 }
637
[18171]638 #print STDERR "**** UTF8 encoding the filename $filemeta ";
[16767]639
[16557]640 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
641 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
642 # be always utf8 (in which case the filename's encoding is also set as utf8 even
643 # though the filename need not be if it originates from another system)--in such
644 # cases attempt to make the filename utf8 to match.
645 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
646 &unicode::ensure_utf8(\$filemeta);
647 }
648
649 # convert non-unicode encodings to utf8
650 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
651 $filemeta = &unicode::unicode2utf8(
[23457]652 &unicode::convert2unicode($filename_encoding, \$filemeta)
653 );
[15868]654 }
[16390]655
[18171]656 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
[16390]657 return $filemeta;
658}
659
660# gets the filename with no path, converts to utf8, and then dm safes it.
[18320]661# filename_encoding set by user
[16390]662sub filename_to_utf8_metadata
663{
664 my $self = shift (@_);
665 my ($file, $file_encoding) = @_;
666
667 my $outhandle = $self->{'outhandle'};
668
[23457]669 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
670 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
671 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
[23347]672
673
[16390]674 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
675 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
[15868]676
[22652]677 return $filemeta;
[10280]678}
679
[16557]680sub locale_encoding {
681 my $self = shift(@_);
682
683 if (!defined $self->{'filesystem_encoding'}) {
684 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
685 }
[10280]686
[18171]687 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
[16557]688 return $self->{'filesystem_encoding'}; # can be the string "undefined"
689}
690
691sub textcat_encoding {
692 my $self = shift(@_);
693 my ($filemeta) = @_;
694
[22705]695 # analyse filenames without extensions and digits (and trimmed of
696 # surrounding whitespace), so that irrelevant chars don't confuse
697 # textcat
[16557]698 my $strictfilemeta = $filemeta;
699 $strictfilemeta =~ s/\.[^\.]+$//g;
700 $strictfilemeta =~ s/\d//g;
701 $strictfilemeta =~ s/^\s*//g;
702 $strictfilemeta =~ s/\s*$//g;
703
704 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
705 if(!defined $filename_encoding) {
706 $filename_encoding = "undefined";
707 }
708
709 return $filename_encoding; # can be the string "undefined"
710}
711
712# performs textcat
713sub encoding_from_language_analysis {
714 my $self = shift(@_);
715 my ($text) = @_;
716
717 my $outhandle = $self->{'outhandle'};
718 my $best_encoding = undef;
[23457]719
[17212]720 # get the language/encoding of the textstring using textcat
[22632]721 require textcat; # Only load the textcat module if it is required
[16557]722 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
[16767]723 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
[16557]724
725
726 if (scalar @$results < 0) {
727 return undef;
728 }
729
730 # We have some results, we choose the first
731 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
732
733 $best_encoding = $encoding;
734 if (!defined $best_encoding) {
735 return undef;
736 }
[23457]737
[16557]738 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
739 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
740 $best_encoding = 'utf8';
741 }
742
743
744 # check for equivalents where textcat doesn't have some encodings...
745 # eg MS versions of standard encodings
746 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
747 my $iso = $1; # which variant of the iso standard?
748 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
749 if ($text =~ /[\x80-\x9f]/) {
750 # Western Europe
751 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
752 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
753 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
754 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
755 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
756 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
757 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
758 }
759 }
760
761 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
762 !defined $encodings::encodings->{$best_encoding})
763 {
764 if ($self->{'verbosity'}) {
765 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
766 }
767 $best_encoding = undef;
768 }
769
770 return $best_encoding;
771}
772
773# uses locale
[23352]774sub get_filesystem_encoding
775{
[10280]776
[15868]777 my $self = shift(@_);
778
[14961]779 my $outhandle = $self->{'outhandle'};
780 my $filesystem_encoding = undef;
781
782 eval {
[23352]783 # Works for Windows as well, returning the DOS code page in use
[14961]784 use POSIX qw(locale_h);
[15868]785
786 # With only one parameter, setlocale retrieves the
787 # current value
[14961]788 my $current_locale = setlocale(LC_CTYPE);
[15868]789
[23364]790 my $char_encoding = undef;
791 if ($current_locale =~ m/\./) {
792 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
793 $char_encoding = lc($char_encoding);
794 }
795 else {
796 if ($current_locale =~ m/^(posix|c)$/i) {
797 $char_encoding = "ascii";
798 }
799 }
800
801 if (defined $char_encoding) {
[24564]802 if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
[15446]803 $char_encoding = "$1\_$2\_$3";
804 }
805
[14961]806 $char_encoding =~ s/-/_/g;
807 $char_encoding =~ s/^utf_8$/utf8/;
[15868]808
[14961]809 if ($char_encoding =~ m/^\d+$/) {
[15607]810 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
[14961]811 $char_encoding = "windows_$char_encoding";
812 }
[15607]813 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
[14961]814 $char_encoding = "dos_$char_encoding";
815 }
816 }
[15868]817
[14961]818 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
[15446]819 || (defined $encodings::encodings->{$char_encoding})) {
[14961]820 $filesystem_encoding = $char_encoding;
821 }
822 else {
823 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
824 }
825 }
[15868]826
[14961]827
828 };
829 if ($@) {
830 print $outhandle "$@\n";
[18398]831 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
[14961]832
833 }
[23352]834
[15868]835 return $filesystem_encoding;
836}
[14961]837
[23335]838
839
840sub deduce_filename_encoding
841{
842 my $self = shift (@_);
[23347]843 my ($file,$metadata,$plugin_filename_encoding) = @_;
[23335]844
[23465]845 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
[23335]846 my $deduced_filename_encoding = undef;
847
848 # Start by looking for manually assigned metadata
849 if (defined $gs_filename_encoding) {
[23457]850 if (ref ($gs_filename_encoding) eq "ARRAY") {
851 my $outhandle = $self->{'outhandle'};
852
853 $deduced_filename_encoding = $gs_filename_encoding->[0];
854
855 my $num_vals = scalar(@$gs_filename_encoding);
856 if ($num_vals>1) {
[23465]857 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
[23457]858 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
859 }
860 }
861 else {
862 $deduced_filename_encoding = $gs_filename_encoding;
863 }
[23335]864 }
[23457]865
[23335]866 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
[23457]867 # Look to see if plugin specifies this value
[23347]868
[23457]869 if (defined $plugin_filename_encoding) {
870 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
871 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
872 my $outhandle = $self->{'outhandle'};
873 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
874 print $outhandle " default to 'auto'\n";
875 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
876 }
877
878 if ($plugin_filename_encoding ne "auto") {
879 # We've been given a specific filenamne encoding
880 # => so use it!
881 $deduced_filename_encoding = $plugin_filename_encoding;
882 }
883 }
[23335]884 }
885
886 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
[23347]887
[23352]888 # Look to file system to provide a character encoding
[23347]889
[23352]890 # If Windows NTFS, then -- assuming we work with long file names got through
891 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
892
893 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
894 # Can do better than working with the DOS character encoding returned by locale
895 $deduced_filename_encoding = "unicode";
896 }
897 else {
898 # Unix of some form or other
899
900 # See if we can determine the file system encoding through locale
901 $deduced_filename_encoding = $self->locale_encoding();
[23457]902
[23352]903 # if locale shows us filesystem is utf8, check to see filename is consistent
904 # => if not, then we have an "alien" filename on our hands
[23457]905
[23458]906 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
[23352]907 if (!&unicode::check_is_utf8($file)) {
908 # "alien" filename, so revert
909 $deduced_filename_encoding = undef;
[23347]910 }
[23352]911 }
912 }
[23335]913 }
914
[23347]915# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
916# # Last chance, apply textcat to deduce filename encoding
917# $deduced_filename_encoding = $self->textcat_encoding($file);
918# }
919
[23457]920 if ($self->{'verbosity'}>3) {
921 my $outhandle = $self->{'outhandle'};
[23347]922
[23457]923 if (defined $deduced_filename_encoding) {
924 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
[23347]925 }
[23457]926 else {
927 print $outhandle " No filename encoding deduced\n";
928 }
929 }
930
[23335]931 return $deduced_filename_encoding;
932}
933
934
935
936
937# Notionally written to be called once for each document, it is however safe to
938# call multiple times (as in the case of ImagePlugin) which calls this later on
939# after the original image has potentially been converted to a *new* source image
940# format (e.g. TIFF to PNG)
941
[15868]942sub set_Source_metadata {
943 my $self = shift (@_);
[23461]944 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
945
[23335]946 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
947 # and (as a fallback) using %xx if not for non-ascii chars
948 # 2. Sets the url ref (SourceFile) to the URL encoded version
949 # of filename for generated files
[23352]950
951 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
[23335]952
[23461]953 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
[23352]954
955 my $octet_file = $raw_file;
956
957 # UTF-8 version of filename
[23387]958# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
959# print STDERR "**** Setting Source Metadata given: $octet_file\n";
960# }
[14961]961
[23352]962 # Deal with (on Windows) raw filenames that are in their
963 # abbreviated DOS form
964
965 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
966 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
967 if (-e $raw_filename) {
968 my $unicode_filename = Win32::GetLongPathName($raw_filename);
969
970 my $unused_full_uf;
971 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
972 }
[23347]973 }
[23352]974 }
[23335]975
976 my $url_encoded_filename;
[23364]977 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
[23352]978 # => Generate a pretty print version of filename that is mapped to Unicode
979
980 # Use filename_encoding to map raw filename to a Perl unicode-aware string
981 $url_encoded_filename = decode($filename_encoding,$octet_file);
[23335]982 }
983 else {
[23352]984 # otherwise generate %xx encoded version of filename for char > 127
985 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
[23335]986 }
[18320]987
[23387]988# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
989# print STDERR "****** saving Source as: $url_encoded_filename\n";
990# }
[23759]991
992 # In the case of converted files and (generalized) exploded documents, there
993 # will already be a source filename => store as OrigSource before overriding
994 my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
995 if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
996 $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
997 }
998
[16919]999 # Source is the UTF8 display name - not necessarily the name of the file on the system
[23829]1000 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
1001 # on Darwin want all display strings to be in composed form, then can search on that
1002 $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
1003 }
[23461]1004 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
[23759]1005
[23352]1006
[23335]1007 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
[18320]1008 # If using URL encoding, then SourceFile is the url-reference to url-encoded
[23335]1009 # renamed_raw_url: it's a url that refers to the actual file on the system
1010 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
[18320]1011
[23461]1012 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
[23335]1013 $renamed_raw_url);
1014
[23387]1015# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1016# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
1017# }
[14961]1018}
[23457]1019
[17026]1020# this should be called by all plugins to set the oid of the doc obj, rather
1021# than calling doc_obj->set_OID directly
[15868]1022sub add_OID {
[15018]1023 my $self = shift (@_);
[26893]1024 my ($doc_obj, $force) = @_;
[15018]1025
[26893]1026 # don't add one if there is one already set, unless we are forced to do so
1027 return unless ($doc_obj->get_OID() =~ /^NULL$/ || $force);
[17026]1028 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
[15018]1029
[17026]1030 # see if there is a plugin specific set_OID function
1031 if (defined ($self->can('set_OID'))) {
1032 $self->set_OID(@_); # pass through doc_obj and any extra arguments
[15018]1033 }
[17026]1034 else {
[15018]1035 # use the default set_OID() in doc.pm
1036 $doc_obj->set_OID();
1037 }
[17026]1038
[15018]1039}
[23457]1040
[15868]1041# The BasePlugin read_into_doc_obj() function. This function does all the
1042# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
[10280]1043# a file and sets up a slew of metadata all saved in doc_obj, which
1044# it then returns as part of a tuple (process_status,doc_obj)
1045#
1046# Much of this functionality used to reside in read, but it was broken
1047# down into a supporting routine to make the code more flexible.
1048#
1049# recursive plugins (e.g. RecPlug) and specialized plugins like those
1050# capable of processing many documents within a single file (e.g.
1051# GMLPlug) will normally want to implement their own version of
1052# read_into_doc_obj()
1053#
1054# Note that $base_dir might be "" and that $file might
1055# include directories
[15868]1056
1057# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
[10280]1058sub read_into_doc_obj {
1059 my $self = shift (@_);
[16390]1060 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[10280]1061
[15868]1062 my $outhandle = $self->{'outhandle'};
[10280]1063
[15868]1064 # should we move this to read? What about secondary plugins?
[23759]1065 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
[15868]1066 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
[23363]1067 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
[23457]1068 if $self->{'verbosity'} > 1;
[10280]1069
[16390]1070 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[23457]1071
[1242]1072 # create a new document
[18320]1073 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
[14961]1074 my $top_section = $doc_obj->get_top_section();
1075
1076 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
[15868]1077 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
[23457]1078
[18469]1079
[23352]1080 my $plugin_filename_encoding = $self->{'filename_encoding'};
[23347]1081 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
[23472]1082 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
[8166]1083
[15868]1084 # plugin specific stuff - what args do we need here??
1085 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1086 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1087 return -1;
[2816]1088 }
[1242]1089
[15868]1090 # include any metadata passed in from previous plugins
1091 # note that this metadata is associated with the top level section
1092 my $section = $doc_obj->get_top_section();
1093 # can we merge these two methods??
1094 $self->add_associated_files($doc_obj, $filename_full_path);
1095 $self->extra_metadata ($doc_obj, $section, $metadata);
1096 $self->auto_extract_metadata($doc_obj);
[1242]1097
[15868]1098 # if we haven't found any Title so far, assign one
1099 # this was shifted to here from inside read()
1100 $self->title_fallback($doc_obj,$section,$filename_no_path);
1101
1102 $self->add_OID($doc_obj);
1103
[21219]1104 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
[15868]1105 return (1,$doc_obj);
1106}
[2785]1107
[21219]1108sub post_process_doc_obj {
1109 my $self = shift (@_);
1110 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1111
1112 return 1;
1113}
1114
[15868]1115sub add_dummy_text {
1116 my $self = shift(@_);
1117 my ($doc_obj, $section) = @_;
[2785]1118
[15868]1119 # add NoText metadata so we can hide this dummy text in format statements
1120 $doc_obj->add_metadata($section, "NoText", "1");
1121 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1122
1123}
[8510]1124
[15868]1125# does nothing. Can be overridden by subclass
1126sub auto_extract_metadata {
1127 my $self = shift(@_);
1128 my ($doc_obj) = @_;
1129}
[11122]1130
[15868]1131# adds cover image, associate_file options stuff. Should be called by sub class
1132# read_into_doc_obj
1133sub add_associated_files {
1134 my $self = shift(@_);
1135 # whatis filename??
1136 my ($doc_obj, $filename) = @_;
1137
1138 # add in the cover image
1139 if ($self->{'cover_image'}) {
1140 $self->associate_cover_image($doc_obj, $filename);
[8716]1141 }
[22215]1142 # store the original (used for eg TextPlugin to store the original for OAI)
1143 if ($self->{'store_original_file'}) {
1144 $self->associate_source_file($doc_obj, $filename);
1145 }
[23457]1146
[15018]1147
[10280]1148}
[1242]1149
[16390]1150# implement this if you are extracting metadata for other documents
1151sub metadata_read {
1152 my $self = shift (@_);
[19493]1153 my ($pluginfo, $base_dir, $file, $block_hash,
1154 $extrametakeys, $extrametadata, $extrametafile,
[23212]1155 $processor, $gli, $aux) = @_;
[16390]1156
1157 # can we process this file??
1158 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[24403]1159 return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
[16390]1160
1161 return 1; # we recognise the file, but don't actually do anything with it
1162}
1163
1164
[15868]1165# The BasePlugin read() function. This function calls read_into_doc_obj()
[10280]1166# to ensure all the right things to make general options work for a
1167# given plugin are done. It then calls the process() function which
1168# does all the work specific to a plugin (like the old read functions
1169# used to do). Most plugins should define their own process() function
1170# and let this read() function keep control.
1171#
1172# recursive plugins (e.g. RecPlug) and specialized plugins like those
1173# capable of processing many documents within a single file (e.g.
1174# GMLPlug) might want to implement their own version of read(), but
1175# more likely need to implement their own version of read_into_doc_obj()
1176#
1177# Return number of files processed, undef if can't recognise, -1 if can't
1178# process
1179
1180sub read {
1181 my $self = shift (@_);
[16390]1182 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[10280]1183
[16390]1184 # can we process this file??
1185 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[20577]1186
[16390]1187 return undef unless $self->can_process_this_file($filename_full_path);
1188
[10280]1189 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
[21219]1190
[10280]1191 if ((defined $process_status) && ($process_status == 1)) {
[21219]1192
[10280]1193 # process the document
1194 $processor->process($doc_obj);
[15868]1195
[10280]1196 $self->{'num_processed'} ++;
1197 undef $doc_obj;
[9398]1198 }
[15868]1199 # delete any temp files that we may have created
1200 $self->clean_up_after_doc_obj_processing();
[9398]1201
[18469]1202
[10280]1203 # if process_status == 1, then the file has been processed.
1204 return $process_status;
1205
[4]1206}
1207
[1244]1208# returns undef if file is rejected by the plugin
[1242]1209sub process {
1210 my $self = shift (@_);
[11089]1211 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1242]1212
[23419]1213 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
[1244]1214
[23457]1215 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
[23419]1216 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1217
[23457]1218 die "\n";
[23419]1219
[1244]1220 return undef; # never gets here
[1242]1221}
1222
[15868]1223# overwrite this method to delete any temp files that we have created
1224sub clean_up_after_doc_obj_processing {
1225 my $self = shift(@_);
[4]1226
[10280]1227}
[16390]1228
[10280]1229# write_file -- used by ConvertToPlug, for example in post processing
1230#
[15868]1231# where should this go, is here the best place??
[10280]1232sub utf8_write_file {
1233 my $self = shift (@_);
1234 my ($textref, $filename) = @_;
1235
[22953]1236 if (!open (FILE, ">:utf8", $filename)) {
[10280]1237 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
[23457]1238 die "\n";
1239 }
[10280]1240 print FILE $$textref;
1241
[1219]1242 close FILE;
1243}
1244
[10280]1245
[7504]1246sub filename_based_title
1247{
1248 my $self = shift (@_);
1249 my ($file) = @_;
1250
1251 my $file_derived_title = $file;
1252 $file_derived_title =~ s/_/ /g;
[23335]1253 $file_derived_title =~ s/\.[^.]+$//;
[7504]1254
1255 return $file_derived_title;
1256}
1257
[9398]1258
[7504]1259sub title_fallback
1260{
1261 my $self = shift (@_);
1262 my ($doc_obj,$section,$file) = @_;
1263
[22705]1264 if (!defined $doc_obj->get_metadata_element ($section, "Title")
[23335]1265 || $doc_obj->get_metadata_element($section, "Title") eq "") {
[7504]1266
[23335]1267 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1268 my $file_derived_title;
1269 if (defined $source_file) {
1270 $file_derived_title = $self->filename_based_title($source_file);
1271 }
1272 else {
1273 # pp = pretty print
1274 my $pp_file = (defined $source_file) ? $source_file : $file;
1275
1276 my $raw_title = $self->filename_based_title($file);
1277 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1278 }
1279
1280
[15868]1281 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1282 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
[9413]1283 }
[15868]1284 else {
[16995]1285 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
[15868]1286 }
[9413]1287 }
[23457]1288
[1844]1289}
[23457]1290
[1219]1291# add any extra metadata that's been passed around from one
1292# plugin to another.
1293# extra_metadata uses add_utf8_metadata so it expects metadata values
1294# to already be in utf8
1295sub extra_metadata {
1296 my $self = shift (@_);
1297 my ($doc_obj, $cursection, $metadata) = @_;
1298
[11122]1299 my $associate_tail_re = $self->{'associate_tail_re'};
1300
[1219]1301 foreach my $field (keys(%$metadata)) {
[839]1302 # $metadata->{$field} may be an array reference
[8510]1303 if ($field eq "gsdlassocfile_tobe") {
1304 # 'gsdlassocfile_tobe' is artificially introduced metadata
1305 # that is used to signal that certain additional files should
1306 # be tied to this document. Useful in situations where a
1307 # metadata pass in the plugin pipeline works out some files
1308 # need to be associated with a document, but the document hasn't
1309 # been formed yet.
1310 my $equiv_form = "";
1311 foreach my $gaf (@{$metadata->{$field}}) {
1312 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
[18171]1313 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1314
1315 # we need to make sure the filename is valid utf-8 - we do
[18320]1316 # this by url or base64 encoding it
[18171]1317 # $tail_filename is the name that we store the file as
[18320]1318 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
[8510]1319 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
[20778]1320 $doc_obj->associate_source_file($full_filename);
[18320]1321 # If the filename is url_encoded, we need to encode the % signs
1322 # in the filename, so that it works in a url
[18404]1323 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
[11122]1324 # work out extended tail extension (i.e. matching tail re)
1325
1326 my ($file_prefix,$file_extended_ext)
[16390]1327 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
[11122]1328 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
[8510]1329 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
[24219]1330
[25345]1331 # the greenstone 2 stuff
1332 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1333 #my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
[24219]1334 my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1335
[8510]1336 my $srcicon = "_icon".$doc_ext."_";
1337 my $end_doclink = "</a>";
[24219]1338
[11122]1339 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1340
[24219]1341
[18171]1342 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
[11122]1343 # for metadata such as [mp3._edited] [mp3._full] ...
1344 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1345 }
1346
1347 # for multiple metadata such as [mp3.assoclink]
1348 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
[24219]1349
[11122]1350 $equiv_form .= " $assoc_form";
[25345]1351
1352 # following are used for greenstone 3,
1353 $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink_gs3);
1354 $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
1355 $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
1356
[8510]1357 }
1358 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1359 }
[23279]1360 elsif ($field eq "gsdlzipfilename") {
1361 # special case for when files have come out of a zip. source_path
1362 # (used for archives dbs and keeping track for incremental import)
1363 # must be set to the zip file name
1364 my $zip_filename = $metadata->{$field};
1365 # overwrite the source_path
[23363]1366 $doc_obj->set_source_path($zip_filename);
[23279]1367 # and set the metadata
1368 $zip_filename = &util::filename_within_collection($zip_filename);
1369 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1370 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1371 }
[8510]1372 elsif (ref ($metadata->{$field}) eq "ARRAY") {
[839]1373 map {
[1219]1374 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
[839]1375 } @{$metadata->{$field}};
1376 } else {
[1219]1377 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
[839]1378 }
1379 }
1380}
1381
[1396]1382
[2785]1383sub compile_stats {
1384 my $self = shift(@_);
1385 my ($stats) = @_;
1386
1387 $stats->{'num_processed'} += $self->{'num_processed'};
1388 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
[2796]1389 $stats->{'num_archives'} += $self->{'num_archives'};
[2785]1390
1391}
[22215]1392sub associate_source_file {
1393 my $self = shift(@_);
1394
1395 my ($doc_obj, $filename) = @_;
1396 my $cursection = $doc_obj->get_top_section();
1397 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1398
1399 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
[24225]1400 # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
[22663]1401 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
[24225]1402 $doc_obj->add_utf8_metadata ($cursection, "srclinkFile", $doc_obj->get_sourcefile());
[22215]1403}
[2785]1404
[2816]1405sub associate_cover_image {
[22215]1406 my $self = shift(@_);
[2816]1407 my ($doc_obj, $filename) = @_;
1408
[23363]1409 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1410
[10833]1411 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
[23363]1412 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1413
1414 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1415 # don't stat() for existence e.g. for multiple document input files
[10833]1416 # (eg SplitPlug)
1417 return;
1418 }
1419
[9413]1420 my $top_section=$doc_obj->get_top_section();
1421
[27306]1422 if (&FileUtils::fileExists($upgraded_filename)) {
[20778]1423 $doc_obj->associate_source_file($filename);
[13968]1424 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
[9413]1425 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[3086]1426 } else {
[10833]1427 my $upper_filename = $filename;
[23363]1428 my $upgraded_upper_filename = $upgraded_filename;
1429
[10833]1430 $upper_filename =~ s/jpg$/JPG/;
[23363]1431 $upgraded_upper_filename =~ s/jpg$/JPG/;
1432
[27306]1433 if (&FileUtils::fileExists($upgraded_upper_filename)) {
[20778]1434 $doc_obj->associate_source_file($upper_filename);
[10833]1435 $doc_obj->associate_file($upper_filename, "cover.jpg",
1436 "image/jpeg");
[9413]1437 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
[10833]1438 } else {
1439 # file doesn't exist, so record the fact that it's missing so
1440 # we don't stat() again (stat is slow)
[23363]1441 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
[3086]1442 }
[2816]1443 }
[10833]1444
[2816]1445}
1446
[11332]1447
1448# Overridden by exploding plugins (eg. ISISPlug)
1449sub clean_up_after_exploding
1450{
1451 my $self = shift(@_);
1452}
1453
1454
[16390]1455
[4]14561;
Note: See TracBrowser for help on using the repository browser.