source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 24290

Last change on this file since 24290 was 24290, checked in by sjm84, 13 years ago

Several changes to how Greenstone hashes PDF files and also added several more options to the EmbeddedMetadataPlugin

  • Property svn:keywords set to Author Date Id Revision
File size: 45.6 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34use Unicode::Normalize 'normalize';
35
36use encodings;
37use unicode;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41
42use PrintInfo;
43
44BEGIN {
45 @BasePlugin::ISA = ( 'PrintInfo' );
46}
47
48# the different methods that can be applied when renaming
49# imported documents and their associated files
50our $file_rename_method_list =
51 [ { 'name' => "url",
52 'desc' => "{BasePlugin.rename_method.url}" },
53 { 'name' => "base64",
54 'desc' => "{BasePlugin.rename_method.base64}" },
55 { 'name' => "none",
56 'desc' => "{BasePlugin.rename_method.none}",
57 'hiddengli' => "yes" } ];
58
59our $encoding_list =
60 [ { 'name' => "ascii",
61 'desc' => "{BasePlugin.encoding.ascii}" },
62 { 'name' => "utf8",
63 'desc' => "{BasePlugin.encoding.utf8}" },
64 { 'name' => "unicode",
65 'desc' => "{BasePlugin.encoding.unicode}" } ];
66
67
68my $e = $encodings::encodings;
69foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
70{
71 my $hashEncode =
72 {'name' => $enc,
73 'desc' => $e->{$enc}->{'name'}};
74
75 push(@{$encoding_list},$hashEncode);
76}
77
78our $encoding_plus_auto_list =
79 [ { 'name' => "auto",
80 'desc' => "{BasePlugin.filename_encoding.auto}" },
81 { 'name' => "auto-language-analysis",
82 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
83 { 'name' => "auto-filesystem-encoding",
84 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
85 { 'name' => "auto-fl",
86 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
87 { 'name' => "auto-lf",
88 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
89
90push(@{$encoding_plus_auto_list},@{$encoding_list});
91
92our $oidtype_list =
93 [ { 'name' => "auto",
94 'desc' => "{BasePlugin.OIDtype.auto}" },
95 { 'name' => "hash",
96 'desc' => "{import.OIDtype.hash}" },
97 { 'name' => "hash_on_ga_xml",
98 'desc' => "{import.OIDtype.hash_on_ga_xml}" },
99 { 'name' => "assigned",
100 'desc' => "{import.OIDtype.assigned}" },
101 { 'name' => "incremental",
102 'desc' => "{import.OIDtype.incremental}" },
103 { 'name' => "dirname",
104 'desc' => "{import.OIDtype.dirname}" } ];
105
106my $arguments =
107 [ { 'name' => "process_exp",
108 'desc' => "{BasePlugin.process_exp}",
109 'type' => "regexp",
110 'deft' => "",
111 'reqd' => "no" },
112 { 'name' => "no_blocking",
113 'desc' => "{BasePlugin.no_blocking}",
114 'type' => "flag",
115 'reqd' => "no"},
116 { 'name' => "block_exp",
117 'desc' => "{BasePlugin.block_exp}",
118 'type' => "regexp",
119 'deft' => "",
120 'reqd' => "no" },
121 { 'name' => "store_original_file",
122 'desc' => "{BasePlugin.store_original_file}",
123 'type' => "flag",
124 'reqd' => "no" },
125 { 'name' => "associate_ext",
126 'desc' => "{BasePlugin.associate_ext}",
127 'type' => "string",
128 'reqd' => "no" },
129 { 'name' => "associate_tail_re",
130 'desc' => "{BasePlugin.associate_tail_re}",
131 'type' => "string",
132 'reqd' => "no" },
133 { 'name' => "OIDtype",
134 'desc' => "{import.OIDtype}",
135 'type' => "enum",
136 'list' => $oidtype_list,
137 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
138 'deft' => "auto",
139 'reqd' => "no" },
140 { 'name' => "OIDmetadata",
141 'desc' => "{import.OIDmetadata}",
142 'type' => "metadata",
143 'deft' => "dc.Identifier",
144 'reqd' => "no" },
145# { 'name' => "use_as_doc_identifier",
146# 'desc' => "{BasePlugin.use_as_doc_identifier}",
147# 'type' => "string",
148# 'reqd' => "no" ,
149# 'deft' => "" } ,
150 { 'name' => "no_cover_image",
151 'desc' => "{BasePlugin.no_cover_image}",
152 'type' => "flag",
153 'reqd' => "no" },
154 { 'name' => "filename_encoding",
155 'desc' => "{BasePlugin.filename_encoding}",
156 'type' => "enum",
157 'deft' => "auto",
158 'list' => $encoding_plus_auto_list,
159 'reqd' => "no" },
160 { 'name' => "smart_block",
161 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
162 'type' => "flag",
163 'reqd' => "no",
164 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
165 { 'name' => "file_rename_method",
166 'desc' => "{BasePlugin.file_rename_method}",
167 'type' => "enum",
168 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
169 'list' => $file_rename_method_list,
170 'reqd' => "no"
171 }
172
173 ];
174
175
176my $options = { 'name' => "BasePlugin",
177 'desc' => "{BasePlugin.desc}",
178 'abstract' => "yes",
179 'inherits' => "no",
180 'args' => $arguments };
181
182sub new {
183
184 my ($class) = shift (@_);
185 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
186 push(@$pluginlist, $class);
187
188 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
189 push(@{$hashArgOptLists->{"OptList"}},$options);
190
191 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
192
193 if ($self->{'info_only'}) {
194 # don't worry about any options etc
195 return bless $self, $class;
196 }
197
198 if ($self->{'smart_block'}) {
199 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
200 }
201 $self->{'smart_block'} = undef;
202
203 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
204 $self->{'plugin_type'} = $plugin_name;
205
206 # remove ex. from OIDmetadata
207 $self->{'OIDmetadata'} =~ s/^ex\.// if defined $self->{'OIDmetadata'};
208 $self->{'num_processed'} = 0;
209 $self->{'num_not_processed'} = 0;
210 $self->{'num_blocked'} = 0;
211 $self->{'num_archives'} = 0;
212 $self->{'cover_image'} = 1; # cover image is on by default
213 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
214 $self->{'can_process_directories'} = 0;
215 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
216
217 my $associate_ext = $self->{'associate_ext'};
218 if ((defined $associate_ext) && ($associate_ext ne "")) {
219
220 my $associate_tail_re = $self->{'associate_tail_re'};
221 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
222 my $outhandle = $self->{'outhandle'};
223 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
224 print $outhandle " defaulting to 'associate_tail_re'\n";
225 }
226 else {
227 my @exts = split(/,/,$associate_ext);
228
229 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
230 my $associate_tail_re = join("|",@exts_bracketed);
231 $self->{'associate_tail_re'} = $associate_tail_re;
232 }
233
234 delete $self->{'associate_ext'};
235 }
236
237 return bless $self, $class;
238
239}
240
241sub merge_inheritance
242{
243 my $self = {};
244 my @child_selfs = @_;
245
246 foreach my $child_self (@child_selfs) {
247 foreach my $key (keys %$child_self) {
248 if (defined $self->{$key}) {
249 if ($self->{$key} ne $child_self->{$key}) {
250# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
251# print STDERR "Existing stored value = $self->{$key}\n";
252# print STDERR "New (child) value = $child_self->{$key}\n";
253# print STDERR "Keeping existing value\n";
254 # Existing value seems to be option specified in collect.cfg
255
256 ### $self->{$key} = $child_self->{$key};
257
258 }
259 else {
260## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
261 }
262
263 }
264 else {
265 $self->{$key} = $child_self->{$key};
266 }
267 }
268 }
269
270 return $self;
271}
272
273# initialize BasePlugin options
274# if init() is overridden in a sub-class, remember to call BasePlugin::init()
275sub init {
276 my $self = shift (@_);
277 my ($verbosity, $outhandle, $failhandle) = @_;
278
279 # verbosity is passed through from the processor
280 $self->{'verbosity'} = $verbosity;
281
282 # as are the outhandle and failhandle
283 $self->{'outhandle'} = $outhandle if defined $outhandle;
284 $self->{'failhandle'} = $failhandle;
285# $self->SUPER::init(@_);
286
287 # set process_exp and block_exp to defaults unless they were
288 # explicitly set
289
290 if ((!$self->is_recursive()) and
291 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
292
293 $self->{'process_exp'} = $self->get_default_process_exp ();
294 if ($self->{'process_exp'} eq "") {
295 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
296 }
297 }
298
299 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
300 $self->{'block_exp'} = $self->get_default_block_exp ();
301 }
302
303}
304
305sub begin {
306 my $self = shift (@_);
307 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
308
309 if ($self->{'OIDtype'} eq "auto") {
310 # hasn't been set in the plugin, use the processor values
311 $self->{'OIDtype'} = $processor->{'OIDtype'};
312 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
313 }
314 if ($self->{'OIDtype'} eq "hash") {
315 # should we hash on the file or on the doc xml??
316 $self->{'OIDtype'} = $self->get_oid_hash_type();
317 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
318 $self->{'OIDtype'} = "hash_on_file";
319 }
320 }
321}
322
323# This is called once if removeold is set with import.pl. Most plugins will do
324# nothing but if a plugin does any stuff outside of creating doc obj, then
325# it may need to clear something.
326sub remove_all {
327 my $self = shift (@_);
328 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
329}
330
331# This is called per document for docs that have been deleted from the
332# collection. Most plugins will do nothing
333# but if a plugin does any stuff outside of creating doc obj, then it may need
334# to clear something.
335sub remove_one {
336 my $self = shift (@_);
337
338 my ($file, $oids, $archivedir) = @_;
339 return 0 if $self->can_process_this_file($file);
340 return undef;
341}
342
343sub end {
344 # potentially called at the end of each plugin pass
345 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
346
347 my ($self) = shift (@_);
348}
349
350sub deinit {
351 # called only once, after all plugin passes have been done
352
353 my ($self) = @_;
354}
355
356# default hashing type is to hash on the original file (or converted file)
357# override this to return hash_on_ga_xml for filetypes where hashing on the
358# file is no good eg video
359sub get_oid_hash_type {
360
361 my $self = shift (@_);
362
363 return "hash_on_file";
364}
365
366
367# this function should be overridden to return 1
368# in recursive plugins
369sub is_recursive {
370 my $self = shift (@_);
371
372 return 0;
373}
374
375sub get_default_block_exp {
376 my $self = shift (@_);
377
378 return "";
379}
380
381sub get_default_process_exp {
382 my $self = shift (@_);
383
384 return "";
385}
386
387
388# rename imported files and assoc files using URL encoding by default
389# as this will work for most plugins and give more legible filenames
390sub get_default_file_rename_method() {
391 my $self = shift (@_);
392 return "url";
393}
394
395# returns this plugin's active (possibly user-selected) file_rename_method
396sub get_file_rename_method() {
397 my $self = shift (@_);
398 my $rename_method = $self->{'file_rename_method'};
399 if($rename_method) {
400 return $rename_method;
401 } else {
402 return $self->get_default_file_rename_method();
403 }
404}
405
406# default implementation is to do nothing
407sub store_block_files {
408
409 my $self =shift (@_);
410 my ($filename_full_path, $block_hash) = @_;
411
412}
413
414# put files to block into hash
415sub use_block_expressions {
416
417 my $self =shift (@_);
418 my ($filename_full_path, $block_hash) = @_;
419
420 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
421
422 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
423 &util::block_filename($block_hash,$filename_full_path);
424 }
425
426}
427
428#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
429sub block_cover_image
430{
431 my $self =shift;
432 my ($filename, $block_hash) = @_;
433
434 $filename = &util::upgrade_if_dos_filename($filename);
435
436 if ($self->{'cover_image'}) {
437 my $coverfile = $filename;
438 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
439
440 if (!&util::fd_exists($coverfile)) {
441 $coverfile =~ s/jpg$/JPG/;
442 }
443 if (&util::fd_exists($coverfile)) {
444 &util::block_filename($block_hash,$coverfile);
445 }
446 }
447
448 return;
449}
450
451
452# discover all the files that should be blocked by this plugin
453# check the args ...
454sub file_block_read {
455
456 my $self = shift (@_);
457 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
458 # Keep track of filenames with same root but different extensions
459 # Used to support -associate_ext and the more generalised
460 # -associate_tail_re
461 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
462
463 if (!-d $filename_full_path) {
464 $block_hash->{'all_files'}->{$file} = 1;
465 }
466
467 my $associate_tail_re = $self->{'associate_tail_re'};
468 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
469 my ($file_prefix,$file_ext)
470 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
471 if ((defined $file_prefix) && (defined $file_ext)) {
472 my $shared_fileroot = $block_hash->{'shared_fileroot'};
473 if (!defined $shared_fileroot->{$file_prefix}) {
474 my $file_prefix_rec = { 'tie_to' => undef,
475 'exts' => {} };
476 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
477 }
478
479 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
480
481 if ($self->can_process_this_file($filename_full_path)) {
482 # This is the document the others should be tied to
483 $file_prefix_rec->{'tie_to'} = $file_ext;
484 }
485 else {
486 if ($file_ext =~ m/$associate_tail_re$/) {
487 # this file should be associated to the main one
488 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
489 }
490 }
491
492 }
493 }
494
495 # check block expressions
496 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
497
498 # now check whether we are actually processing this
499 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
500 return undef; # can't recognise
501 }
502
503 # if we have a block_exp, then this overrides the normal 'smart' blocking
504 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
505
506 # block the cover image if there is one
507 if ($self->{'cover_image'}) {
508 $self->block_cover_image($filename_full_path, $block_hash);
509 }
510
511 return 1;
512}
513
514# plugins that rely on more than process_exp (eg XML plugins) can override this method
515sub can_process_this_file {
516 my $self = shift(@_);
517 my ($filename) = @_;
518
519 if (-d $filename && !$self->{'can_process_directories'}) {
520 return 0;
521 }
522
523 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
524 return 1;
525 }
526 return 0;
527
528}
529
530# just converts path as is to utf8.
531sub filepath_to_utf8 {
532 my $self = shift (@_);
533 my ($file, $file_encoding) = @_;
534 my $filemeta = $file;
535
536 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
537
538 # Whenever filename-encoding is set to any of the auto settings, we
539 # check if the filename is already in UTF8. If it is, then we're done.
540 if($filename_encoding =~ m/auto/) {
541 if(&unicode::check_is_utf8($filemeta))
542 {
543 $filename_encoding = "utf8";
544 return $filemeta;
545 }
546 }
547
548 # Auto setting, but filename is not utf8
549 if ($filename_encoding eq "auto")
550 {
551 # try textcat
552 $filename_encoding = $self->textcat_encoding($filemeta);
553
554 # check the locale next
555 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
556
557
558 # now try the encoding of the document, if available
559 if ($filename_encoding eq "undefined" && defined $file_encoding) {
560 $filename_encoding = $file_encoding;
561 }
562
563 }
564
565 elsif ($filename_encoding eq "auto-language-analysis")
566 {
567 $filename_encoding = $self->textcat_encoding($filemeta);
568
569 # now try the encoding of the document, if available
570 if ($filename_encoding eq "undefined" && defined $file_encoding) {
571 $filename_encoding = $file_encoding;
572 }
573 }
574
575 elsif ($filename_encoding eq "auto-filesystem-encoding")
576 {
577 # try locale
578 $filename_encoding = $self->locale_encoding();
579 }
580
581 elsif ($filename_encoding eq "auto-fl")
582 {
583 # filesystem-encoding (locale) then language-analysis (textcat)
584 $filename_encoding = $self->locale_encoding();
585
586 # try textcat
587 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
588
589 # else assume filename encoding is encoding of file content, if that's available
590 if ($filename_encoding eq "undefined" && defined $file_encoding) {
591 $filename_encoding = $file_encoding;
592 }
593 }
594
595 elsif ($filename_encoding eq "auto-lf")
596 {
597 # language-analysis (textcat) then filesystem-encoding (locale)
598 $filename_encoding = $self->textcat_encoding($filemeta);
599
600 # guess filename encoding from encoding of file content, if available
601 if ($filename_encoding eq "undefined" && defined $file_encoding) {
602 $filename_encoding = $file_encoding;
603 }
604
605 # try locale
606 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
607 }
608
609 # if still undefined, use utf8 as fallback
610 if ($filename_encoding eq "undefined") {
611 $filename_encoding = "utf8";
612 }
613
614 #print STDERR "**** UTF8 encoding the filename $filemeta ";
615
616 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
617 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
618 # be always utf8 (in which case the filename's encoding is also set as utf8 even
619 # though the filename need not be if it originates from another system)--in such
620 # cases attempt to make the filename utf8 to match.
621 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
622 &unicode::ensure_utf8(\$filemeta);
623 }
624
625 # convert non-unicode encodings to utf8
626 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
627 $filemeta = &unicode::unicode2utf8(
628 &unicode::convert2unicode($filename_encoding, \$filemeta)
629 );
630 }
631
632 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
633 return $filemeta;
634}
635
636# gets the filename with no path, converts to utf8, and then dm safes it.
637# filename_encoding set by user
638sub filename_to_utf8_metadata
639{
640 my $self = shift (@_);
641 my ($file, $file_encoding) = @_;
642
643 my $outhandle = $self->{'outhandle'};
644
645 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
646 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
647 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
648
649
650 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
651 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
652
653 return $filemeta;
654}
655
656sub locale_encoding {
657 my $self = shift(@_);
658
659 if (!defined $self->{'filesystem_encoding'}) {
660 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
661 }
662
663 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
664 return $self->{'filesystem_encoding'}; # can be the string "undefined"
665}
666
667sub textcat_encoding {
668 my $self = shift(@_);
669 my ($filemeta) = @_;
670
671 # analyse filenames without extensions and digits (and trimmed of
672 # surrounding whitespace), so that irrelevant chars don't confuse
673 # textcat
674 my $strictfilemeta = $filemeta;
675 $strictfilemeta =~ s/\.[^\.]+$//g;
676 $strictfilemeta =~ s/\d//g;
677 $strictfilemeta =~ s/^\s*//g;
678 $strictfilemeta =~ s/\s*$//g;
679
680 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
681 if(!defined $filename_encoding) {
682 $filename_encoding = "undefined";
683 }
684
685 return $filename_encoding; # can be the string "undefined"
686}
687
688# performs textcat
689sub encoding_from_language_analysis {
690 my $self = shift(@_);
691 my ($text) = @_;
692
693 my $outhandle = $self->{'outhandle'};
694 my $best_encoding = undef;
695
696 # get the language/encoding of the textstring using textcat
697 require textcat; # Only load the textcat module if it is required
698 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
699 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
700
701
702 if (scalar @$results < 0) {
703 return undef;
704 }
705
706 # We have some results, we choose the first
707 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
708
709 $best_encoding = $encoding;
710 if (!defined $best_encoding) {
711 return undef;
712 }
713
714 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
715 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
716 $best_encoding = 'utf8';
717 }
718
719
720 # check for equivalents where textcat doesn't have some encodings...
721 # eg MS versions of standard encodings
722 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
723 my $iso = $1; # which variant of the iso standard?
724 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
725 if ($text =~ /[\x80-\x9f]/) {
726 # Western Europe
727 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
728 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
729 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
730 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
731 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
732 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
733 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
734 }
735 }
736
737 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
738 !defined $encodings::encodings->{$best_encoding})
739 {
740 if ($self->{'verbosity'}) {
741 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
742 }
743 $best_encoding = undef;
744 }
745
746 return $best_encoding;
747}
748
749# uses locale
750sub get_filesystem_encoding
751{
752
753 my $self = shift(@_);
754
755 my $outhandle = $self->{'outhandle'};
756 my $filesystem_encoding = undef;
757
758 eval {
759 # Works for Windows as well, returning the DOS code page in use
760 use POSIX qw(locale_h);
761
762 # With only one parameter, setlocale retrieves the
763 # current value
764 my $current_locale = setlocale(LC_CTYPE);
765
766 my $char_encoding = undef;
767 if ($current_locale =~ m/\./) {
768 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
769 $char_encoding = lc($char_encoding);
770 }
771 else {
772 if ($current_locale =~ m/^(posix|c)$/i) {
773 $char_encoding = "ascii";
774 }
775 }
776
777 if (defined $char_encoding) {
778 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
779 $char_encoding = "$1\_$2\_$3";
780 }
781
782 $char_encoding =~ s/-/_/g;
783 $char_encoding =~ s/^utf_8$/utf8/;
784
785 if ($char_encoding =~ m/^\d+$/) {
786 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
787 $char_encoding = "windows_$char_encoding";
788 }
789 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
790 $char_encoding = "dos_$char_encoding";
791 }
792 }
793
794 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
795 || (defined $encodings::encodings->{$char_encoding})) {
796 $filesystem_encoding = $char_encoding;
797 }
798 else {
799 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
800 }
801 }
802
803
804 };
805 if ($@) {
806 print $outhandle "$@\n";
807 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
808
809 }
810
811 return $filesystem_encoding;
812}
813
814
815
816sub deduce_filename_encoding
817{
818 my $self = shift (@_);
819 my ($file,$metadata,$plugin_filename_encoding) = @_;
820
821 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
822 my $deduced_filename_encoding = undef;
823
824 # Start by looking for manually assigned metadata
825 if (defined $gs_filename_encoding) {
826 if (ref ($gs_filename_encoding) eq "ARRAY") {
827 my $outhandle = $self->{'outhandle'};
828
829 $deduced_filename_encoding = $gs_filename_encoding->[0];
830
831 my $num_vals = scalar(@$gs_filename_encoding);
832 if ($num_vals>1) {
833 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
834 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
835 }
836 }
837 else {
838 $deduced_filename_encoding = $gs_filename_encoding;
839 }
840 }
841
842 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
843 # Look to see if plugin specifies this value
844
845 if (defined $plugin_filename_encoding) {
846 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
847 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
848 my $outhandle = $self->{'outhandle'};
849 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
850 print $outhandle " default to 'auto'\n";
851 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
852 }
853
854 if ($plugin_filename_encoding ne "auto") {
855 # We've been given a specific filenamne encoding
856 # => so use it!
857 $deduced_filename_encoding = $plugin_filename_encoding;
858 }
859 }
860 }
861
862 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
863
864 # Look to file system to provide a character encoding
865
866 # If Windows NTFS, then -- assuming we work with long file names got through
867 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
868
869 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
870 # Can do better than working with the DOS character encoding returned by locale
871 $deduced_filename_encoding = "unicode";
872 }
873 else {
874 # Unix of some form or other
875
876 # See if we can determine the file system encoding through locale
877 $deduced_filename_encoding = $self->locale_encoding();
878
879 # if locale shows us filesystem is utf8, check to see filename is consistent
880 # => if not, then we have an "alien" filename on our hands
881
882 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
883 if (!&unicode::check_is_utf8($file)) {
884 # "alien" filename, so revert
885 $deduced_filename_encoding = undef;
886 }
887 }
888 }
889 }
890
891# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
892# # Last chance, apply textcat to deduce filename encoding
893# $deduced_filename_encoding = $self->textcat_encoding($file);
894# }
895
896 if ($self->{'verbosity'}>3) {
897 my $outhandle = $self->{'outhandle'};
898
899 if (defined $deduced_filename_encoding) {
900 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
901 }
902 else {
903 print $outhandle " No filename encoding deduced\n";
904 }
905 }
906
907 return $deduced_filename_encoding;
908}
909
910
911
912
913# Notionally written to be called once for each document, it is however safe to
914# call multiple times (as in the case of ImagePlugin) which calls this later on
915# after the original image has potentially been converted to a *new* source image
916# format (e.g. TIFF to PNG)
917
918sub set_Source_metadata {
919 my $self = shift (@_);
920 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
921
922 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
923 # and (as a fallback) using %xx if not for non-ascii chars
924 # 2. Sets the url ref (SourceFile) to the URL encoded version
925 # of filename for generated files
926
927 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
928
929 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
930
931 my $octet_file = $raw_file;
932
933 # UTF-8 version of filename
934# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
935# print STDERR "**** Setting Source Metadata given: $octet_file\n";
936# }
937
938 # Deal with (on Windows) raw filenames that are in their
939 # abbreviated DOS form
940
941 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
942 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
943 if (-e $raw_filename) {
944 my $unicode_filename = Win32::GetLongPathName($raw_filename);
945
946 my $unused_full_uf;
947 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
948 }
949 }
950 }
951
952 my $url_encoded_filename;
953 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
954 # => Generate a pretty print version of filename that is mapped to Unicode
955
956 # Use filename_encoding to map raw filename to a Perl unicode-aware string
957 $url_encoded_filename = decode($filename_encoding,$octet_file);
958 }
959 else {
960 # otherwise generate %xx encoded version of filename for char > 127
961 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
962 }
963
964# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
965# print STDERR "****** saving Source as: $url_encoded_filename\n";
966# }
967
968 # In the case of converted files and (generalized) exploded documents, there
969 # will already be a source filename => store as OrigSource before overriding
970 my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
971 if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
972 $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
973 }
974
975 # Source is the UTF8 display name - not necessarily the name of the file on the system
976 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
977 # on Darwin want all display strings to be in composed form, then can search on that
978 $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
979 }
980 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
981
982
983 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
984 # If using URL encoding, then SourceFile is the url-reference to url-encoded
985 # renamed_raw_url: it's a url that refers to the actual file on the system
986 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
987
988 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
989 $renamed_raw_url);
990
991# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
992# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
993# }
994}
995
996# this should be called by all plugins to set the oid of the doc obj, rather
997# than calling doc_obj->set_OID directly
998sub add_OID {
999 my $self = shift (@_);
1000 my ($doc_obj) = @_;
1001
1002 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
1003
1004 # see if there is a plugin specific set_OID function
1005 if (defined ($self->can('set_OID'))) {
1006 $self->set_OID(@_); # pass through doc_obj and any extra arguments
1007 }
1008 else {
1009 # use the default set_OID() in doc.pm
1010 $doc_obj->set_OID();
1011 }
1012
1013}
1014
1015# The BasePlugin read_into_doc_obj() function. This function does all the
1016# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
1017# a file and sets up a slew of metadata all saved in doc_obj, which
1018# it then returns as part of a tuple (process_status,doc_obj)
1019#
1020# Much of this functionality used to reside in read, but it was broken
1021# down into a supporting routine to make the code more flexible.
1022#
1023# recursive plugins (e.g. RecPlug) and specialized plugins like those
1024# capable of processing many documents within a single file (e.g.
1025# GMLPlug) will normally want to implement their own version of
1026# read_into_doc_obj()
1027#
1028# Note that $base_dir might be "" and that $file might
1029# include directories
1030
1031# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1032sub read_into_doc_obj {
1033 my $self = shift (@_);
1034 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1035
1036 my $outhandle = $self->{'outhandle'};
1037
1038 # should we move this to read? What about secondary plugins?
1039 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
1040 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1041 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1042 if $self->{'verbosity'} > 1;
1043
1044 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1045
1046 # create a new document
1047 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1048 my $top_section = $doc_obj->get_top_section();
1049
1050 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1051 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1052
1053
1054 my $plugin_filename_encoding = $self->{'filename_encoding'};
1055 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1056 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
1057
1058 # plugin specific stuff - what args do we need here??
1059 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1060 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1061 return -1;
1062 }
1063
1064 # include any metadata passed in from previous plugins
1065 # note that this metadata is associated with the top level section
1066 my $section = $doc_obj->get_top_section();
1067 # can we merge these two methods??
1068 $self->add_associated_files($doc_obj, $filename_full_path);
1069 $self->extra_metadata ($doc_obj, $section, $metadata);
1070 $self->auto_extract_metadata($doc_obj);
1071
1072 # if we haven't found any Title so far, assign one
1073 # this was shifted to here from inside read()
1074 $self->title_fallback($doc_obj,$section,$filename_no_path);
1075
1076 $self->add_OID($doc_obj);
1077
1078 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1079 return (1,$doc_obj);
1080}
1081
1082sub post_process_doc_obj {
1083 my $self = shift (@_);
1084 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1085
1086 return 1;
1087}
1088
1089sub add_dummy_text {
1090 my $self = shift(@_);
1091 my ($doc_obj, $section) = @_;
1092
1093 # add NoText metadata so we can hide this dummy text in format statements
1094 $doc_obj->add_metadata($section, "NoText", "1");
1095 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1096
1097}
1098
1099# does nothing. Can be overridden by subclass
1100sub auto_extract_metadata {
1101 my $self = shift(@_);
1102 my ($doc_obj) = @_;
1103}
1104
1105# adds cover image, associate_file options stuff. Should be called by sub class
1106# read_into_doc_obj
1107sub add_associated_files {
1108 my $self = shift(@_);
1109 # whatis filename??
1110 my ($doc_obj, $filename) = @_;
1111
1112 # add in the cover image
1113 if ($self->{'cover_image'}) {
1114 $self->associate_cover_image($doc_obj, $filename);
1115 }
1116 # store the original (used for eg TextPlugin to store the original for OAI)
1117 if ($self->{'store_original_file'}) {
1118 $self->associate_source_file($doc_obj, $filename);
1119 }
1120
1121
1122}
1123
1124# implement this if you are extracting metadata for other documents
1125sub metadata_read {
1126 my $self = shift (@_);
1127 my ($pluginfo, $base_dir, $file, $block_hash,
1128 $extrametakeys, $extrametadata, $extrametafile,
1129 $processor, $gli, $aux) = @_;
1130
1131 # can we process this file??
1132 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1133 return undef unless $self->can_process_this_file($filename_full_path);
1134
1135 return 1; # we recognise the file, but don't actually do anything with it
1136}
1137
1138
1139# The BasePlugin read() function. This function calls read_into_doc_obj()
1140# to ensure all the right things to make general options work for a
1141# given plugin are done. It then calls the process() function which
1142# does all the work specific to a plugin (like the old read functions
1143# used to do). Most plugins should define their own process() function
1144# and let this read() function keep control.
1145#
1146# recursive plugins (e.g. RecPlug) and specialized plugins like those
1147# capable of processing many documents within a single file (e.g.
1148# GMLPlug) might want to implement their own version of read(), but
1149# more likely need to implement their own version of read_into_doc_obj()
1150#
1151# Return number of files processed, undef if can't recognise, -1 if can't
1152# process
1153
1154sub read {
1155 my $self = shift (@_);
1156 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1157
1158 # can we process this file??
1159 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1160
1161 return undef unless $self->can_process_this_file($filename_full_path);
1162
1163 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1164
1165 if ((defined $process_status) && ($process_status == 1)) {
1166
1167 # process the document
1168 $processor->process($doc_obj);
1169
1170 $self->{'num_processed'} ++;
1171 undef $doc_obj;
1172 }
1173 # delete any temp files that we may have created
1174 $self->clean_up_after_doc_obj_processing();
1175
1176
1177 # if process_status == 1, then the file has been processed.
1178 return $process_status;
1179
1180}
1181
1182# returns undef if file is rejected by the plugin
1183sub process {
1184 my $self = shift (@_);
1185 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1186
1187 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
1188
1189 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1190 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1191
1192 die "\n";
1193
1194 return undef; # never gets here
1195}
1196
1197# overwrite this method to delete any temp files that we have created
1198sub clean_up_after_doc_obj_processing {
1199 my $self = shift(@_);
1200
1201}
1202
1203# write_file -- used by ConvertToPlug, for example in post processing
1204#
1205# where should this go, is here the best place??
1206sub utf8_write_file {
1207 my $self = shift (@_);
1208 my ($textref, $filename) = @_;
1209
1210 if (!open (FILE, ">:utf8", $filename)) {
1211 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1212 die "\n";
1213 }
1214 print FILE $$textref;
1215
1216 close FILE;
1217}
1218
1219
1220sub filename_based_title
1221{
1222 my $self = shift (@_);
1223 my ($file) = @_;
1224
1225 my $file_derived_title = $file;
1226 $file_derived_title =~ s/_/ /g;
1227 $file_derived_title =~ s/\.[^.]+$//;
1228
1229 return $file_derived_title;
1230}
1231
1232
1233sub title_fallback
1234{
1235 my $self = shift (@_);
1236 my ($doc_obj,$section,$file) = @_;
1237
1238 if (!defined $doc_obj->get_metadata_element ($section, "Title")
1239 || $doc_obj->get_metadata_element($section, "Title") eq "") {
1240
1241 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1242 my $file_derived_title;
1243 if (defined $source_file) {
1244 $file_derived_title = $self->filename_based_title($source_file);
1245 }
1246 else {
1247 # pp = pretty print
1248 my $pp_file = (defined $source_file) ? $source_file : $file;
1249
1250 my $raw_title = $self->filename_based_title($file);
1251 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1252 }
1253
1254
1255 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1256 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1257 }
1258 else {
1259 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1260 }
1261 }
1262
1263}
1264
1265# add any extra metadata that's been passed around from one
1266# plugin to another.
1267# extra_metadata uses add_utf8_metadata so it expects metadata values
1268# to already be in utf8
1269sub extra_metadata {
1270 my $self = shift (@_);
1271 my ($doc_obj, $cursection, $metadata) = @_;
1272
1273 my $associate_tail_re = $self->{'associate_tail_re'};
1274
1275 foreach my $field (keys(%$metadata)) {
1276 # $metadata->{$field} may be an array reference
1277 if ($field eq "gsdlassocfile_tobe") {
1278 # 'gsdlassocfile_tobe' is artificially introduced metadata
1279 # that is used to signal that certain additional files should
1280 # be tied to this document. Useful in situations where a
1281 # metadata pass in the plugin pipeline works out some files
1282 # need to be associated with a document, but the document hasn't
1283 # been formed yet.
1284 my $equiv_form = "";
1285 foreach my $gaf (@{$metadata->{$field}}) {
1286 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1287 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1288
1289 # we need to make sure the filename is valid utf-8 - we do
1290 # this by url or base64 encoding it
1291 # $tail_filename is the name that we store the file as
1292 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1293 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1294 $doc_obj->associate_source_file($full_filename);
1295 # If the filename is url_encoded, we need to encode the % signs
1296 # in the filename, so that it works in a url
1297 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1298 # work out extended tail extension (i.e. matching tail re)
1299
1300 my ($file_prefix,$file_extended_ext)
1301 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1302 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1303 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1304
1305## my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1306 my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1307 my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1308
1309 my $srcicon = "_icon".$doc_ext."_";
1310 my $end_doclink = "</a>";
1311
1312 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1313
1314 $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink);
1315 $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
1316 $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
1317
1318 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1319 # for metadata such as [mp3._edited] [mp3._full] ...
1320 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1321 }
1322
1323 # for multiple metadata such as [mp3.assoclink]
1324 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1325
1326 $equiv_form .= " $assoc_form";
1327 }
1328 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1329 }
1330 elsif ($field eq "gsdlzipfilename") {
1331 # special case for when files have come out of a zip. source_path
1332 # (used for archives dbs and keeping track for incremental import)
1333 # must be set to the zip file name
1334 my $zip_filename = $metadata->{$field};
1335 # overwrite the source_path
1336 $doc_obj->set_source_path($zip_filename);
1337 # and set the metadata
1338 $zip_filename = &util::filename_within_collection($zip_filename);
1339 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1340 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1341 }
1342 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1343 map {
1344 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1345 } @{$metadata->{$field}};
1346 } else {
1347 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1348 }
1349 }
1350}
1351
1352
1353sub compile_stats {
1354 my $self = shift(@_);
1355 my ($stats) = @_;
1356
1357 $stats->{'num_processed'} += $self->{'num_processed'};
1358 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1359 $stats->{'num_archives'} += $self->{'num_archives'};
1360
1361}
1362sub associate_source_file {
1363 my $self = shift(@_);
1364
1365 my ($doc_obj, $filename) = @_;
1366 my $cursection = $doc_obj->get_top_section();
1367 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1368
1369 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1370 # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
1371 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1372 $doc_obj->add_utf8_metadata ($cursection, "srclinkFile", $doc_obj->get_sourcefile());
1373}
1374
1375sub associate_cover_image {
1376 my $self = shift(@_);
1377 my ($doc_obj, $filename) = @_;
1378
1379 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1380
1381 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1382 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1383
1384 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1385 # don't stat() for existence e.g. for multiple document input files
1386 # (eg SplitPlug)
1387 return;
1388 }
1389
1390 my $top_section=$doc_obj->get_top_section();
1391
1392 if (&util::fd_exists($upgraded_filename)) {
1393 $doc_obj->associate_source_file($filename);
1394 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1395 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1396 } else {
1397 my $upper_filename = $filename;
1398 my $upgraded_upper_filename = $upgraded_filename;
1399
1400 $upper_filename =~ s/jpg$/JPG/;
1401 $upgraded_upper_filename =~ s/jpg$/JPG/;
1402
1403 if (&util::fd_exists($upgraded_upper_filename)) {
1404 $doc_obj->associate_source_file($upper_filename);
1405 $doc_obj->associate_file($upper_filename, "cover.jpg",
1406 "image/jpeg");
1407 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1408 } else {
1409 # file doesn't exist, so record the fact that it's missing so
1410 # we don't stat() again (stat is slow)
1411 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1412 }
1413 }
1414
1415}
1416
1417
1418# Overridden by exploding plugins (eg. ISISPlug)
1419sub clean_up_after_exploding
1420{
1421 my $self = shift(@_);
1422}
1423
1424
1425
14261;
Note: See TracBrowser for help on using the repository browser.