source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 23387

Last change on this file since 23387 was 23387, checked in by davidb, 13 years ago

Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document look up. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode aware strings rather than utf8 but unware strings.

  • Property svn:keywords set to Author Date Id Revision
File size: 44.0 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34
35use encodings;
36use unicode;
37use doc;
38use ghtml;
39use gsprintf 'gsprintf';
40
41use PrintInfo;
42
43BEGIN {
44 @BasePlugin::ISA = ( 'PrintInfo' );
45}
46
47# the different methods that can be applied when renaming
48# imported documents and their associated files
49our $file_rename_method_list =
50 [ { 'name' => "url",
51 'desc' => "{BasePlugin.rename_method.url}" },
52 { 'name' => "base64",
53 'desc' => "{BasePlugin.rename_method.base64}" },
54 { 'name' => "none",
55 'desc' => "{BasePlugin.rename_method.none}",
56 'hiddengli' => "yes" } ];
57
58our $encoding_list =
59 [ { 'name' => "ascii",
60 'desc' => "{BasePlugin.encoding.ascii}" },
61 { 'name' => "utf8",
62 'desc' => "{BasePlugin.encoding.utf8}" },
63 { 'name' => "unicode",
64 'desc' => "{BasePlugin.encoding.unicode}" } ];
65
66
67my $e = $encodings::encodings;
68foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
69{
70 my $hashEncode =
71 {'name' => $enc,
72 'desc' => $e->{$enc}->{'name'}};
73
74 push(@{$encoding_list},$hashEncode);
75}
76
77our $encoding_plus_auto_list =
78 [ { 'name' => "auto",
79 'desc' => "{BasePlugin.filename_encoding.auto}" },
80 { 'name' => "auto-language-analysis",
81 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
82 { 'name' => "auto-filesystem-encoding",
83 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
84 { 'name' => "auto-fl",
85 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
86 { 'name' => "auto-lf",
87 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
88
89push(@{$encoding_plus_auto_list},@{$encoding_list});
90
91our $oidtype_list =
92 [ { 'name' => "auto",
93 'desc' => "{BasePlugin.OIDtype.auto}" },
94 { 'name' => "hash",
95 'desc' => "{import.OIDtype.hash}" },
96 { 'name' => "assigned",
97 'desc' => "{import.OIDtype.assigned}" },
98 { 'name' => "incremental",
99 'desc' => "{import.OIDtype.incremental}" },
100 { 'name' => "dirname",
101 'desc' => "{import.OIDtype.dirname}" } ];
102
103my $arguments =
104 [ { 'name' => "process_exp",
105 'desc' => "{BasePlugin.process_exp}",
106 'type' => "regexp",
107 'deft' => "",
108 'reqd' => "no" },
109 { 'name' => "no_blocking",
110 'desc' => "{BasePlugin.no_blocking}",
111 'type' => "flag",
112 'reqd' => "no"},
113 { 'name' => "block_exp",
114 'desc' => "{BasePlugin.block_exp}",
115 'type' => "regexp",
116 'deft' => "",
117 'reqd' => "no" },
118 { 'name' => "store_original_file",
119 'desc' => "{BasePlugin.store_original_file}",
120 'type' => "flag",
121 'reqd' => "no" },
122 { 'name' => "associate_ext",
123 'desc' => "{BasePlugin.associate_ext}",
124 'type' => "string",
125 'reqd' => "no" },
126 { 'name' => "associate_tail_re",
127 'desc' => "{BasePlugin.associate_tail_re}",
128 'type' => "string",
129 'reqd' => "no" },
130 { 'name' => "OIDtype",
131 'desc' => "{import.OIDtype}",
132 'type' => "enum",
133 'list' => $oidtype_list,
134 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
135 'deft' => "auto",
136 'reqd' => "no" },
137 { 'name' => "OIDmetadata",
138 'desc' => "{import.OIDmetadata}",
139 'type' => "metadata",
140 'deft' => "dc.Identifier",
141 'reqd' => "no" },
142# { 'name' => "use_as_doc_identifier",
143# 'desc' => "{BasePlugin.use_as_doc_identifier}",
144# 'type' => "string",
145# 'reqd' => "no" ,
146# 'deft' => "" } ,
147 { 'name' => "no_cover_image",
148 'desc' => "{BasePlugin.no_cover_image}",
149 'type' => "flag",
150 'reqd' => "no" },
151 { 'name' => "filename_encoding",
152 'desc' => "{BasePlugin.filename_encoding}",
153 'type' => "enum",
154 'deft' => "auto",
155 'list' => $encoding_plus_auto_list,
156 'reqd' => "no" },
157 { 'name' => "smart_block",
158 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
159 'type' => "flag",
160 'reqd' => "no",
161 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
162 { 'name' => "file_rename_method",
163 'desc' => "{BasePlugin.file_rename_method}",
164 'type' => "enum",
165 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
166 'list' => $file_rename_method_list,
167 'reqd' => "no"
168 }
169
170 ];
171
172
173my $options = { 'name' => "BasePlugin",
174 'desc' => "{BasePlugin.desc}",
175 'abstract' => "yes",
176 'inherits' => "no",
177 'args' => $arguments };
178
179sub new {
180
181 my ($class) = shift (@_);
182 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
183 push(@$pluginlist, $class);
184
185 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
186 push(@{$hashArgOptLists->{"OptList"}},$options);
187
188 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
189
190 if ($self->{'info_only'}) {
191 # don't worry about any options etc
192 return bless $self, $class;
193 }
194
195 if ($self->{'smart_block'}) {
196 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
197 }
198 $self->{'smart_block'} = undef;
199
200 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
201 $self->{'plugin_type'} = $plugin_name;
202
203 # remove ex. from OIDmetadata
204 $self->{'OIDmetadata'} =~ s/^ex\.// if defined $self->{'OIDmetadata'};
205 $self->{'num_processed'} = 0;
206 $self->{'num_not_processed'} = 0;
207 $self->{'num_blocked'} = 0;
208 $self->{'num_archives'} = 0;
209 $self->{'cover_image'} = 1; # cover image is on by default
210 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
211 $self->{'can_process_directories'} = 0;
212 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
213
214 my $associate_ext = $self->{'associate_ext'};
215 if ((defined $associate_ext) && ($associate_ext ne "")) {
216
217 my $associate_tail_re = $self->{'associate_tail_re'};
218 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
219 my $outhandle = $self->{'outhandle'};
220 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
221 print $outhandle " defaulting to 'associate_tail_re'\n";
222 }
223 else {
224 my @exts = split(/,/,$associate_ext);
225
226 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
227 my $associate_tail_re = join("|",@exts_bracketed);
228 $self->{'associate_tail_re'} = $associate_tail_re;
229 }
230
231 delete $self->{'associate_ext'};
232 }
233
234 return bless $self, $class;
235
236}
237
238sub merge_inheritance
239{
240 my $self = {};
241 my @child_selfs = @_;
242
243 foreach my $child_self (@child_selfs) {
244 foreach my $key (keys %$child_self) {
245 if (defined $self->{$key}) {
246 if ($self->{$key} ne $child_self->{$key}) {
247# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
248# print STDERR "Existing stored value = $self->{$key}\n";
249# print STDERR "New (child) value = $child_self->{$key}\n";
250# print STDERR "Keeping existing value\n";
251 # Existing value seems to be option specified in collect.cfg
252
253 ### $self->{$key} = $child_self->{$key};
254
255 }
256 else {
257## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
258 }
259
260 }
261 else {
262 $self->{$key} = $child_self->{$key};
263 }
264 }
265 }
266
267 return $self;
268}
269
270# initialize BasePlugin options
271# if init() is overridden in a sub-class, remember to call BasePlugin::init()
272sub init {
273 my $self = shift (@_);
274 my ($verbosity, $outhandle, $failhandle) = @_;
275
276 # verbosity is passed through from the processor
277 $self->{'verbosity'} = $verbosity;
278
279 # as are the outhandle and failhandle
280 $self->{'outhandle'} = $outhandle if defined $outhandle;
281 $self->{'failhandle'} = $failhandle;
282# $self->SUPER::init(@_);
283
284 # set process_exp and block_exp to defaults unless they were
285 # explicitly set
286
287 if ((!$self->is_recursive()) and
288 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
289
290 $self->{'process_exp'} = $self->get_default_process_exp ();
291 if ($self->{'process_exp'} eq "") {
292 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
293 }
294 }
295
296 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
297 $self->{'block_exp'} = $self->get_default_block_exp ();
298 }
299
300}
301
302sub begin {
303 my $self = shift (@_);
304 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
305
306 if ($self->{'OIDtype'} eq "auto") {
307 # hasn't been set in the plugin, use the processor values
308 $self->{'OIDtype'} = $processor->{'OIDtype'};
309 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
310 }
311 if ($self->{'OIDtype'} eq "hash") {
312 # should we hash on the file or on the doc xml??
313 $self->{'OIDtype'} = $self->get_oid_hash_type();
314 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
315 $self->{'OIDtype'} = "hash_on_file";
316 }
317 }
318}
319
320# This is called once if removeold is set with import.pl. Most plugins will do
321# nothing but if a plugin does any stuff outside of creating doc obj, then
322# it may need to clear something.
323sub remove_all {
324 my $self = shift (@_);
325 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
326}
327
328# This is called per document for docs that have been deleted from the
329# collection. Most plugins will do nothing
330# but if a plugin does any stuff outside of creating doc obj, then it may need
331# to clear something.
332sub remove_one {
333 my $self = shift (@_);
334
335 my ($file, $oids, $archivedir) = @_;
336 return 0 if $self->can_process_this_file($file);
337 return undef;
338}
339
340sub end {
341 # potentially called at the end of each plugin pass
342 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
343
344 my ($self) = shift (@_);
345}
346
347sub deinit {
348 # called only once, after all plugin passes have been done
349
350 my ($self) = @_;
351}
352
353# default hashing type is to hash on the original file (or converted file)
354# override this to return hash_on_ga_xml for filetypes where hashing on the
355# file is no good eg video
356sub get_oid_hash_type {
357
358 my $self = shift (@_);
359
360 return "hash_on_file";
361}
362
363
364# this function should be overridden to return 1
365# in recursive plugins
366sub is_recursive {
367 my $self = shift (@_);
368
369 return 0;
370}
371
372sub get_default_block_exp {
373 my $self = shift (@_);
374
375 return "";
376}
377
378sub get_default_process_exp {
379 my $self = shift (@_);
380
381 return "";
382}
383
384# rename imported files and assoc files using URL encoding by default
385# as this will work for most plugins and give more legible filenames
386sub get_default_file_rename_method() {
387 my $self = shift (@_);
388 return "url";
389}
390
391# returns this plugin's active (possibly user-selected) file_rename_method
392sub get_file_rename_method() {
393 my $self = shift (@_);
394 my $rename_method = $self->{'file_rename_method'};
395 if($rename_method) {
396 return $rename_method;
397 } else {
398 return $self->get_default_file_rename_method();
399 }
400}
401
402# default implementation is to do nothing
403sub store_block_files {
404
405 my $self =shift (@_);
406 my ($filename_full_path, $block_hash) = @_;
407
408}
409
410# put files to block into hash
411sub use_block_expressions {
412
413 my $self =shift (@_);
414 my ($filename_full_path, $block_hash) = @_;
415
416 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
417
418 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
419 $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
420 }
421
422}
423
424#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
425sub block_cover_image
426{
427 my $self =shift;
428 my ($filename, $block_hash) = @_;
429
430 $filename = &util::upgrade_if_dos_filename($filename);
431
432 if ($self->{'cover_image'}) {
433 my $coverfile = $filename;
434 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
435 if (!&util::fd_exists($coverfile)) {
436 $coverfile =~ s/jpg$/JPG/;
437 }
438 if (&util::fd_exists($coverfile)) {
439 $block_hash->{'file_blocks'}->{$coverfile} = 1;
440 }
441 }
442
443 return;
444}
445
446
447# discover all the files that should be blocked by this plugin
448# check the args ...
449sub file_block_read {
450
451 my $self = shift (@_);
452 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
453 # Keep track of filenames with same root but different extensions
454 # Used to support -associate_ext and the more generalised
455 # -associate_tail_re
456 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
457
458 if (!-d $filename_full_path) {
459 $block_hash->{'all_files'}->{$file} = 1;
460 }
461
462 my $associate_tail_re = $self->{'associate_tail_re'};
463 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
464 my ($file_prefix,$file_ext)
465 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
466 if ((defined $file_prefix) && (defined $file_ext)) {
467 my $shared_fileroot = $block_hash->{'shared_fileroot'};
468 if (!defined $shared_fileroot->{$file_prefix}) {
469 my $file_prefix_rec = { 'tie_to' => undef,
470 'exts' => {} };
471 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
472 }
473
474 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
475
476 if ($self->can_process_this_file($filename_full_path)) {
477 # This is the document the others should be tied to
478 $file_prefix_rec->{'tie_to'} = $file_ext;
479 }
480 else {
481 if ($file_ext =~ m/$associate_tail_re$/) {
482 # this file should be associated to the main one
483 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
484 }
485 }
486
487 }
488 }
489
490 # check block expressions
491 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
492
493 # now check whether we are actually processing this
494 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
495 return undef; # can't recognise
496 }
497
498 # if we have a block_exp, then this overrides the normal 'smart' blocking
499 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
500
501 # block the cover image if there is one
502 if ($self->{'cover_image'}) {
503 $self->block_cover_image($filename_full_path, $block_hash);
504 }
505
506 return 1;
507}
508
509# plugins that rely on more than process_exp (eg XML plugins) can override this method
510sub can_process_this_file {
511 my $self = shift(@_);
512 my ($filename) = @_;
513
514 if (-d $filename && !$self->{'can_process_directories'}) {
515 return 0;
516 }
517
518 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
519 return 1;
520 }
521 return 0;
522
523}
524
525# just converts path as is to utf8.
526sub filepath_to_utf8 {
527 my $self = shift (@_);
528 my ($file, $file_encoding) = @_;
529 my $filemeta = $file;
530
531 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
532
533 # Whenever filename-encoding is set to any of the auto settings, we
534 # check if the filename is already in UTF8. If it is, then we're done.
535 if($filename_encoding =~ m/auto/) {
536 if(&unicode::check_is_utf8($filemeta))
537 {
538 $filename_encoding = "utf8";
539 return $filemeta;
540 }
541 }
542
543 # Auto setting, but filename is not utf8
544 if ($filename_encoding eq "auto")
545 {
546 # try textcat
547 $filename_encoding = $self->textcat_encoding($filemeta);
548
549 # check the locale next
550 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
551
552
553 # now try the encoding of the document, if available
554 if ($filename_encoding eq "undefined" && defined $file_encoding) {
555 $filename_encoding = $file_encoding;
556 }
557
558 }
559
560 elsif ($filename_encoding eq "auto-language-analysis")
561 {
562 $filename_encoding = $self->textcat_encoding($filemeta);
563
564 # now try the encoding of the document, if available
565 if ($filename_encoding eq "undefined" && defined $file_encoding) {
566 $filename_encoding = $file_encoding;
567 }
568 }
569
570 elsif ($filename_encoding eq "auto-filesystem-encoding")
571 {
572 # try locale
573 $filename_encoding = $self->locale_encoding();
574 }
575
576 elsif ($filename_encoding eq "auto-fl")
577 {
578 # filesystem-encoding (locale) then language-analysis (textcat)
579 $filename_encoding = $self->locale_encoding();
580
581 # try textcat
582 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
583
584 # else assume filename encoding is encoding of file content, if that's available
585 if ($filename_encoding eq "undefined" && defined $file_encoding) {
586 $filename_encoding = $file_encoding;
587 }
588 }
589
590 elsif ($filename_encoding eq "auto-lf")
591 {
592 # language-analysis (textcat) then filesystem-encoding (locale)
593 $filename_encoding = $self->textcat_encoding($filemeta);
594
595 # guess filename encoding from encoding of file content, if available
596 if ($filename_encoding eq "undefined" && defined $file_encoding) {
597 $filename_encoding = $file_encoding;
598 }
599
600 # try locale
601 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
602 }
603
604 # if still undefined, use utf8 as fallback
605 if ($filename_encoding eq "undefined") {
606 $filename_encoding = "utf8";
607 }
608
609 #print STDERR "**** UTF8 encoding the filename $filemeta ";
610
611 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
612 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
613 # be always utf8 (in which case the filename's encoding is also set as utf8 even
614 # though the filename need not be if it originates from another system)--in such
615 # cases attempt to make the filename utf8 to match.
616 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
617 &unicode::ensure_utf8(\$filemeta);
618 }
619
620 # convert non-unicode encodings to utf8
621 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
622 $filemeta = &unicode::unicode2utf8(
623 &unicode::convert2unicode($filename_encoding, \$filemeta)
624 );
625 }
626
627 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
628 return $filemeta;
629}
630
631# gets the filename with no path, converts to utf8, and then dm safes it.
632# filename_encoding set by user
633sub filename_to_utf8_metadata
634{
635 my $self = shift (@_);
636 my ($file, $file_encoding) = @_;
637
638 my $outhandle = $self->{'outhandle'};
639
640 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
641 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
642 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
643
644
645 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
646 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
647
648 return $filemeta;
649}
650
651sub locale_encoding {
652 my $self = shift(@_);
653
654 if (!defined $self->{'filesystem_encoding'}) {
655 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
656 }
657
658 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
659 return $self->{'filesystem_encoding'}; # can be the string "undefined"
660}
661
662sub textcat_encoding {
663 my $self = shift(@_);
664 my ($filemeta) = @_;
665
666 # analyse filenames without extensions and digits (and trimmed of
667 # surrounding whitespace), so that irrelevant chars don't confuse
668 # textcat
669 my $strictfilemeta = $filemeta;
670 $strictfilemeta =~ s/\.[^\.]+$//g;
671 $strictfilemeta =~ s/\d//g;
672 $strictfilemeta =~ s/^\s*//g;
673 $strictfilemeta =~ s/\s*$//g;
674
675 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
676 if(!defined $filename_encoding) {
677 $filename_encoding = "undefined";
678 }
679
680 return $filename_encoding; # can be the string "undefined"
681}
682
683# performs textcat
684sub encoding_from_language_analysis {
685 my $self = shift(@_);
686 my ($text) = @_;
687
688 my $outhandle = $self->{'outhandle'};
689 my $best_encoding = undef;
690
691 # get the language/encoding of the textstring using textcat
692 require textcat; # Only load the textcat module if it is required
693 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
694 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
695
696
697 if (scalar @$results < 0) {
698 return undef;
699 }
700
701 # We have some results, we choose the first
702 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
703
704 $best_encoding = $encoding;
705 if (!defined $best_encoding) {
706 return undef;
707 }
708
709 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
710 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
711 $best_encoding = 'utf8';
712 }
713
714
715 # check for equivalents where textcat doesn't have some encodings...
716 # eg MS versions of standard encodings
717 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
718 my $iso = $1; # which variant of the iso standard?
719 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
720 if ($text =~ /[\x80-\x9f]/) {
721 # Western Europe
722 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
723 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
724 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
725 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
726 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
727 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
728 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
729 }
730 }
731
732 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
733 !defined $encodings::encodings->{$best_encoding})
734 {
735 if ($self->{'verbosity'}) {
736 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
737 }
738 $best_encoding = undef;
739 }
740
741 return $best_encoding;
742}
743
744# uses locale
745sub get_filesystem_encoding
746{
747
748 my $self = shift(@_);
749
750 my $outhandle = $self->{'outhandle'};
751 my $filesystem_encoding = undef;
752
753 eval {
754 # Works for Windows as well, returning the DOS code page in use
755 use POSIX qw(locale_h);
756
757 # With only one parameter, setlocale retrieves the
758 # current value
759 my $current_locale = setlocale(LC_CTYPE);
760
761 my $char_encoding = undef;
762 if ($current_locale =~ m/\./) {
763 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
764 $char_encoding = lc($char_encoding);
765 }
766 else {
767 if ($current_locale =~ m/^(posix|c)$/i) {
768 $char_encoding = "ascii";
769 }
770 }
771
772 if (defined $char_encoding) {
773 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
774 $char_encoding = "$1\_$2\_$3";
775 }
776
777 $char_encoding =~ s/-/_/g;
778 $char_encoding =~ s/^utf_8$/utf8/;
779
780 if ($char_encoding =~ m/^\d+$/) {
781 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
782 $char_encoding = "windows_$char_encoding";
783 }
784 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
785 $char_encoding = "dos_$char_encoding";
786 }
787 }
788
789 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
790 || (defined $encodings::encodings->{$char_encoding})) {
791 $filesystem_encoding = $char_encoding;
792 }
793 else {
794 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
795 }
796 }
797
798
799 };
800 if ($@) {
801 print $outhandle "$@\n";
802 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
803
804 }
805
806 return $filesystem_encoding;
807}
808
809
810
811sub deduce_filename_encoding
812{
813 my $self = shift (@_);
814 my ($file,$metadata,$plugin_filename_encoding) = @_;
815
816 my $gs_filename_encoding = $metadata->{"gs.filename_encoding"};
817 my $deduced_filename_encoding = undef;
818
819 # Start by looking for manually assigned metadata
820 if (defined $gs_filename_encoding) {
821 if (ref ($gs_filename_encoding) eq "ARRAY") {
822 my $outhandle = $self->{'outhandle'};
823
824 $deduced_filename_encoding = $gs_filename_encoding->[0];
825
826 my $num_vals = scalar(@$gs_filename_encoding);
827 if ($num_vals>1) {
828 print $outhandle "Warning: gs.filename_encoding multiply defined for $file\n";
829 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
830 }
831 }
832 else {
833 $deduced_filename_encoding = $gs_filename_encoding;
834 }
835 }
836
837 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
838 # Look to see if plugin specifies this value
839
840 if (defined $plugin_filename_encoding) {
841 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
842 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
843 my $outhandle = $self->{'outhandle'};
844 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
845 print $outhandle " default to 'auto'\n";
846 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
847 }
848
849 if ($plugin_filename_encoding ne "auto") {
850 # We've been given a specific filenamne encoding
851 # => so use it!
852 $deduced_filename_encoding = $plugin_filename_encoding;
853 }
854 }
855 }
856
857 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
858
859 # Look to file system to provide a character encoding
860
861 # If Windows NTFS, then -- assuming we work with long file names got through
862 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
863
864 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
865 # Can do better than working with the DOS character encoding returned by locale
866 $deduced_filename_encoding = "unicode";
867 }
868 else {
869 # Unix of some form or other
870
871 # See if we can determine the file system encoding through locale
872 $deduced_filename_encoding = $self->locale_encoding();
873
874 # if locale shows us filesystem is utf8, check to see filename is consistent
875 # => if not, then we have an "alien" filename on our hands
876
877 if ($deduced_filename_encoding =~ m/^utf-?8$/i) {
878 if (!&unicode::check_is_utf8($file)) {
879 # "alien" filename, so revert
880 $deduced_filename_encoding = undef;
881 }
882 }
883 }
884 }
885
886# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
887# # Last chance, apply textcat to deduce filename encoding
888# $deduced_filename_encoding = $self->textcat_encoding($file);
889# }
890
891 if ($self->{'verbosity'}>3) {
892 my $outhandle = $self->{'outhandle'};
893
894 if (defined $deduced_filename_encoding) {
895 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
896 }
897 else {
898 print $outhandle " No filename encoding deduced\n";
899 }
900 }
901
902 return $deduced_filename_encoding;
903}
904
905
906
907
908# Notionally written to be called once for each document, it is however safe to
909# call multiple times (as in the case of ImagePlugin) which calls this later on
910# after the original image has potentially been converted to a *new* source image
911# format (e.g. TIFF to PNG)
912
913sub set_Source_metadata {
914 my $self = shift (@_);
915 my ($doc_obj, $raw_filename, $filename_encoding) = @_;
916
917 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
918 # and (as a fallback) using %xx if not for non-ascii chars
919 # 2. Sets the url ref (SourceFile) to the URL encoded version
920 # of filename for generated files
921
922 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
923
924 my $top_section = $doc_obj->get_top_section();
925
926 my $octet_file = $raw_file;
927
928 # UTF-8 version of filename
929# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
930# print STDERR "**** Setting Source Metadata given: $octet_file\n";
931# }
932
933 # Deal with (on Windows) raw filenames that are in their
934 # abbreviated DOS form
935
936 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
937 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
938 if (-e $raw_filename) {
939 my $unicode_filename = Win32::GetLongPathName($raw_filename);
940
941 my $unused_full_uf;
942 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
943 }
944 }
945 }
946
947 my $url_encoded_filename;
948 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
949 # => Generate a pretty print version of filename that is mapped to Unicode
950
951 # Use filename_encoding to map raw filename to a Perl unicode-aware string
952 $url_encoded_filename = decode($filename_encoding,$octet_file);
953 }
954 else {
955 # otherwise generate %xx encoded version of filename for char > 127
956 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
957 }
958
959# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
960# print STDERR "****** saving Source as: $url_encoded_filename\n";
961# }
962
963
964 # Source is the UTF8 display name - not necessarily the name of the file on the system
965 $doc_obj->set_utf8_metadata_element($top_section, "Source", $url_encoded_filename);
966
967 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
968 # If using URL encoding, then SourceFile is the url-reference to url-encoded
969 # renamed_raw_url: it's a url that refers to the actual file on the system
970 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
971
972 $doc_obj->set_utf8_metadata_element($top_section, "SourceFile",
973 $renamed_raw_url);
974
975# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
976# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
977# }
978}
979
980# this should be called by all plugins to set the oid of the doc obj, rather
981# than calling doc_obj->set_OID directly
982sub add_OID {
983 my $self = shift (@_);
984 my ($doc_obj) = @_;
985
986 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
987
988 # see if there is a plugin specific set_OID function
989 if (defined ($self->can('set_OID'))) {
990 $self->set_OID(@_); # pass through doc_obj and any extra arguments
991 }
992 else {
993 # use the default set_OID() in doc.pm
994 $doc_obj->set_OID();
995 }
996
997}
998
999# The BasePlugin read_into_doc_obj() function. This function does all the
1000# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
1001# a file and sets up a slew of metadata all saved in doc_obj, which
1002# it then returns as part of a tuple (process_status,doc_obj)
1003#
1004# Much of this functionality used to reside in read, but it was broken
1005# down into a supporting routine to make the code more flexible.
1006#
1007# recursive plugins (e.g. RecPlug) and specialized plugins like those
1008# capable of processing many documents within a single file (e.g.
1009# GMLPlug) will normally want to implement their own version of
1010# read_into_doc_obj()
1011#
1012# Note that $base_dir might be "" and that $file might
1013# include directories
1014
1015# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1016sub read_into_doc_obj {
1017 my $self = shift (@_);
1018 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1019
1020 my $outhandle = $self->{'outhandle'};
1021
1022 # should we move this to read? What about secondary plugins?
1023 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1024 my $pp_file = &util::prettyprint_file($base_dir,$file);
1025 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1026 if $self->{'verbosity'} > 1;
1027
1028 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1029
1030 # create a new document
1031 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1032 my $top_section = $doc_obj->get_top_section();
1033
1034 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1035 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1036
1037
1038 my $plugin_filename_encoding = $self->{'filename_encoding'};
1039 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1040 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$filename_full_path);
1041
1042 # plugin specific stuff - what args do we need here??
1043 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1044 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1045 return -1;
1046 }
1047
1048 # include any metadata passed in from previous plugins
1049 # note that this metadata is associated with the top level section
1050 my $section = $doc_obj->get_top_section();
1051 # can we merge these two methods??
1052 $self->add_associated_files($doc_obj, $filename_full_path);
1053 $self->extra_metadata ($doc_obj, $section, $metadata);
1054 $self->auto_extract_metadata($doc_obj);
1055
1056 # if we haven't found any Title so far, assign one
1057 # this was shifted to here from inside read()
1058 $self->title_fallback($doc_obj,$section,$filename_no_path);
1059
1060 $self->add_OID($doc_obj);
1061
1062 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1063 return (1,$doc_obj);
1064}
1065
1066sub post_process_doc_obj {
1067 my $self = shift (@_);
1068 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1069
1070 return 1;
1071}
1072
1073sub add_dummy_text {
1074 my $self = shift(@_);
1075 my ($doc_obj, $section) = @_;
1076
1077 # add NoText metadata so we can hide this dummy text in format statements
1078 $doc_obj->add_metadata($section, "NoText", "1");
1079 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1080
1081}
1082
1083# does nothing. Can be overridden by subclass
1084sub auto_extract_metadata {
1085 my $self = shift(@_);
1086 my ($doc_obj) = @_;
1087}
1088
1089# adds cover image, associate_file options stuff. Should be called by sub class
1090# read_into_doc_obj
1091sub add_associated_files {
1092 my $self = shift(@_);
1093 # whatis filename??
1094 my ($doc_obj, $filename) = @_;
1095
1096 # add in the cover image
1097 if ($self->{'cover_image'}) {
1098 $self->associate_cover_image($doc_obj, $filename);
1099 }
1100 # store the original (used for eg TextPlugin to store the original for OAI)
1101 if ($self->{'store_original_file'}) {
1102 $self->associate_source_file($doc_obj, $filename);
1103 }
1104
1105
1106}
1107
1108# implement this if you are extracting metadata for other documents
1109sub metadata_read {
1110 my $self = shift (@_);
1111 my ($pluginfo, $base_dir, $file, $block_hash,
1112 $extrametakeys, $extrametadata, $extrametafile,
1113 $processor, $gli, $aux) = @_;
1114
1115 # can we process this file??
1116 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1117 return undef unless $self->can_process_this_file($filename_full_path);
1118
1119 return 1; # we recognise the file, but don't actually do anything with it
1120}
1121
1122
1123# The BasePlugin read() function. This function calls read_into_doc_obj()
1124# to ensure all the right things to make general options work for a
1125# given plugin are done. It then calls the process() function which
1126# does all the work specific to a plugin (like the old read functions
1127# used to do). Most plugins should define their own process() function
1128# and let this read() function keep control.
1129#
1130# recursive plugins (e.g. RecPlug) and specialized plugins like those
1131# capable of processing many documents within a single file (e.g.
1132# GMLPlug) might want to implement their own version of read(), but
1133# more likely need to implement their own version of read_into_doc_obj()
1134#
1135# Return number of files processed, undef if can't recognise, -1 if can't
1136# process
1137
1138sub read {
1139 my $self = shift (@_);
1140 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1141
1142 # can we process this file??
1143 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1144
1145 return undef unless $self->can_process_this_file($filename_full_path);
1146
1147 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1148
1149 if ((defined $process_status) && ($process_status == 1)) {
1150
1151 # process the document
1152 $processor->process($doc_obj);
1153
1154 $self->{'num_processed'} ++;
1155 undef $doc_obj;
1156 }
1157 # delete any temp files that we may have created
1158 $self->clean_up_after_doc_obj_processing();
1159
1160
1161 # if process_status == 1, then the file has been processed.
1162 return $process_status;
1163
1164}
1165
1166# returns undef if file is rejected by the plugin
1167sub process {
1168 my $self = shift (@_);
1169 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1170
1171 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n";
1172
1173 return undef; # never gets here
1174}
1175
1176# overwrite this method to delete any temp files that we have created
1177sub clean_up_after_doc_obj_processing {
1178 my $self = shift(@_);
1179
1180}
1181
1182# write_file -- used by ConvertToPlug, for example in post processing
1183#
1184# where should this go, is here the best place??
1185sub utf8_write_file {
1186 my $self = shift (@_);
1187 my ($textref, $filename) = @_;
1188
1189 if (!open (FILE, ">:utf8", $filename)) {
1190 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1191 die "\n";
1192 }
1193 print FILE $$textref;
1194
1195 close FILE;
1196}
1197
1198
1199sub filename_based_title
1200{
1201 my $self = shift (@_);
1202 my ($file) = @_;
1203
1204 my $file_derived_title = $file;
1205 $file_derived_title =~ s/_/ /g;
1206 $file_derived_title =~ s/\.[^.]+$//;
1207
1208 return $file_derived_title;
1209}
1210
1211
1212sub title_fallback
1213{
1214 my $self = shift (@_);
1215 my ($doc_obj,$section,$file) = @_;
1216
1217 if (!defined $doc_obj->get_metadata_element ($section, "Title")
1218 || $doc_obj->get_metadata_element($section, "Title") eq "") {
1219
1220 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1221 my $file_derived_title;
1222 if (defined $source_file) {
1223 $file_derived_title = $self->filename_based_title($source_file);
1224 }
1225 else {
1226 # pp = pretty print
1227 my $pp_file = (defined $source_file) ? $source_file : $file;
1228
1229 my $raw_title = $self->filename_based_title($file);
1230 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1231 }
1232
1233
1234 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1235 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1236 }
1237 else {
1238 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1239 }
1240 }
1241
1242}
1243
1244# add any extra metadata that's been passed around from one
1245# plugin to another.
1246# extra_metadata uses add_utf8_metadata so it expects metadata values
1247# to already be in utf8
1248sub extra_metadata {
1249 my $self = shift (@_);
1250 my ($doc_obj, $cursection, $metadata) = @_;
1251
1252 my $associate_tail_re = $self->{'associate_tail_re'};
1253
1254 foreach my $field (keys(%$metadata)) {
1255 # $metadata->{$field} may be an array reference
1256 if ($field eq "gsdlassocfile_tobe") {
1257 # 'gsdlassocfile_tobe' is artificially introduced metadata
1258 # that is used to signal that certain additional files should
1259 # be tied to this document. Useful in situations where a
1260 # metadata pass in the plugin pipeline works out some files
1261 # need to be associated with a document, but the document hasn't
1262 # been formed yet.
1263 my $equiv_form = "";
1264 foreach my $gaf (@{$metadata->{$field}}) {
1265 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1266 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1267
1268 # we need to make sure the filename is valid utf-8 - we do
1269 # this by url or base64 encoding it
1270 # $tail_filename is the name that we store the file as
1271 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1272 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1273 $doc_obj->associate_source_file($full_filename);
1274 # If the filename is url_encoded, we need to encode the % signs
1275 # in the filename, so that it works in a url
1276 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1277 # work out extended tail extension (i.e. matching tail re)
1278
1279 my ($file_prefix,$file_extended_ext)
1280 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1281 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1282 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1283 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1284 my $srcicon = "_icon".$doc_ext."_";
1285 my $end_doclink = "</a>";
1286
1287 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1288
1289 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1290 # for metadata such as [mp3._edited] [mp3._full] ...
1291 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1292 }
1293
1294 # for multiple metadata such as [mp3.assoclink]
1295 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1296
1297 $equiv_form .= " $assoc_form";
1298 }
1299 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1300 }
1301 elsif ($field eq "gsdlzipfilename") {
1302 # special case for when files have come out of a zip. source_path
1303 # (used for archives dbs and keeping track for incremental import)
1304 # must be set to the zip file name
1305 my $zip_filename = $metadata->{$field};
1306 # overwrite the source_path
1307 $doc_obj->set_source_path($zip_filename);
1308 # and set the metadata
1309 $zip_filename = &util::filename_within_collection($zip_filename);
1310 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1311 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1312 }
1313 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1314 map {
1315 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1316 } @{$metadata->{$field}};
1317 } else {
1318 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1319 }
1320 }
1321}
1322
1323
1324sub compile_stats {
1325 my $self = shift(@_);
1326 my ($stats) = @_;
1327
1328 $stats->{'num_processed'} += $self->{'num_processed'};
1329 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1330 $stats->{'num_archives'} += $self->{'num_archives'};
1331
1332}
1333sub associate_source_file {
1334 my $self = shift(@_);
1335
1336 my ($doc_obj, $filename) = @_;
1337 my $cursection = $doc_obj->get_top_section();
1338 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1339
1340 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1341 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1342}
1343
1344sub associate_cover_image {
1345 my $self = shift(@_);
1346 my ($doc_obj, $filename) = @_;
1347
1348 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1349
1350 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1351 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1352
1353 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1354 # don't stat() for existence e.g. for multiple document input files
1355 # (eg SplitPlug)
1356 return;
1357 }
1358
1359 my $top_section=$doc_obj->get_top_section();
1360
1361 if (&util::fd_exists($upgraded_filename)) {
1362 $doc_obj->associate_source_file($filename);
1363 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1364 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1365 } else {
1366 my $upper_filename = $filename;
1367 my $upgraded_upper_filename = $upgraded_filename;
1368
1369 $upper_filename =~ s/jpg$/JPG/;
1370 $upgraded_upper_filename =~ s/jpg$/JPG/;
1371
1372 if (&util::fd_exists($upgraded_upper_filename)) {
1373 $doc_obj->associate_source_file($upper_filename);
1374 $doc_obj->associate_file($upper_filename, "cover.jpg",
1375 "image/jpeg");
1376 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1377 } else {
1378 # file doesn't exist, so record the fact that it's missing so
1379 # we don't stat() again (stat is slow)
1380 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1381 }
1382 }
1383
1384}
1385
1386
1387# Overridden by exploding plugins (eg. ISISPlug)
1388sub clean_up_after_exploding
1389{
1390 my $self = shift(@_);
1391}
1392
1393
1394
13951;
Note: See TracBrowser for help on using the repository browser.