source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 24219

Last change on this file since 24219 was 24219, checked in by ak19, 13 years ago

Thanks to Sam, Veronica and Dr Bainbridge, can finally commit the changes necessary for ticket 449.

  • Property svn:keywords set to Author Date Id Revision
File size: 45.3 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34use Unicode::Normalize 'normalize';
35
36use encodings;
37use unicode;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41
42use PrintInfo;
43
44BEGIN {
45 @BasePlugin::ISA = ( 'PrintInfo' );
46}
47
48# the different methods that can be applied when renaming
49# imported documents and their associated files
50our $file_rename_method_list =
51 [ { 'name' => "url",
52 'desc' => "{BasePlugin.rename_method.url}" },
53 { 'name' => "base64",
54 'desc' => "{BasePlugin.rename_method.base64}" },
55 { 'name' => "none",
56 'desc' => "{BasePlugin.rename_method.none}",
57 'hiddengli' => "yes" } ];
58
59our $encoding_list =
60 [ { 'name' => "ascii",
61 'desc' => "{BasePlugin.encoding.ascii}" },
62 { 'name' => "utf8",
63 'desc' => "{BasePlugin.encoding.utf8}" },
64 { 'name' => "unicode",
65 'desc' => "{BasePlugin.encoding.unicode}" } ];
66
67
68my $e = $encodings::encodings;
69foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
70{
71 my $hashEncode =
72 {'name' => $enc,
73 'desc' => $e->{$enc}->{'name'}};
74
75 push(@{$encoding_list},$hashEncode);
76}
77
78our $encoding_plus_auto_list =
79 [ { 'name' => "auto",
80 'desc' => "{BasePlugin.filename_encoding.auto}" },
81 { 'name' => "auto-language-analysis",
82 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
83 { 'name' => "auto-filesystem-encoding",
84 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
85 { 'name' => "auto-fl",
86 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
87 { 'name' => "auto-lf",
88 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
89
90push(@{$encoding_plus_auto_list},@{$encoding_list});
91
92our $oidtype_list =
93 [ { 'name' => "auto",
94 'desc' => "{BasePlugin.OIDtype.auto}" },
95 { 'name' => "hash",
96 'desc' => "{import.OIDtype.hash}" },
97 { 'name' => "assigned",
98 'desc' => "{import.OIDtype.assigned}" },
99 { 'name' => "incremental",
100 'desc' => "{import.OIDtype.incremental}" },
101 { 'name' => "dirname",
102 'desc' => "{import.OIDtype.dirname}" } ];
103
104my $arguments =
105 [ { 'name' => "process_exp",
106 'desc' => "{BasePlugin.process_exp}",
107 'type' => "regexp",
108 'deft' => "",
109 'reqd' => "no" },
110 { 'name' => "no_blocking",
111 'desc' => "{BasePlugin.no_blocking}",
112 'type' => "flag",
113 'reqd' => "no"},
114 { 'name' => "block_exp",
115 'desc' => "{BasePlugin.block_exp}",
116 'type' => "regexp",
117 'deft' => "",
118 'reqd' => "no" },
119 { 'name' => "store_original_file",
120 'desc' => "{BasePlugin.store_original_file}",
121 'type' => "flag",
122 'reqd' => "no" },
123 { 'name' => "associate_ext",
124 'desc' => "{BasePlugin.associate_ext}",
125 'type' => "string",
126 'reqd' => "no" },
127 { 'name' => "associate_tail_re",
128 'desc' => "{BasePlugin.associate_tail_re}",
129 'type' => "string",
130 'reqd' => "no" },
131 { 'name' => "OIDtype",
132 'desc' => "{import.OIDtype}",
133 'type' => "enum",
134 'list' => $oidtype_list,
135 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
136 'deft' => "auto",
137 'reqd' => "no" },
138 { 'name' => "OIDmetadata",
139 'desc' => "{import.OIDmetadata}",
140 'type' => "metadata",
141 'deft' => "dc.Identifier",
142 'reqd' => "no" },
143# { 'name' => "use_as_doc_identifier",
144# 'desc' => "{BasePlugin.use_as_doc_identifier}",
145# 'type' => "string",
146# 'reqd' => "no" ,
147# 'deft' => "" } ,
148 { 'name' => "no_cover_image",
149 'desc' => "{BasePlugin.no_cover_image}",
150 'type' => "flag",
151 'reqd' => "no" },
152 { 'name' => "filename_encoding",
153 'desc' => "{BasePlugin.filename_encoding}",
154 'type' => "enum",
155 'deft' => "auto",
156 'list' => $encoding_plus_auto_list,
157 'reqd' => "no" },
158 { 'name' => "smart_block",
159 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
160 'type' => "flag",
161 'reqd' => "no",
162 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
163 { 'name' => "file_rename_method",
164 'desc' => "{BasePlugin.file_rename_method}",
165 'type' => "enum",
166 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
167 'list' => $file_rename_method_list,
168 'reqd' => "no"
169 }
170
171 ];
172
173
174my $options = { 'name' => "BasePlugin",
175 'desc' => "{BasePlugin.desc}",
176 'abstract' => "yes",
177 'inherits' => "no",
178 'args' => $arguments };
179
180sub new {
181
182 my ($class) = shift (@_);
183 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
184 push(@$pluginlist, $class);
185
186 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
187 push(@{$hashArgOptLists->{"OptList"}},$options);
188
189 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
190
191 if ($self->{'info_only'}) {
192 # don't worry about any options etc
193 return bless $self, $class;
194 }
195
196 if ($self->{'smart_block'}) {
197 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
198 }
199 $self->{'smart_block'} = undef;
200
201 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
202 $self->{'plugin_type'} = $plugin_name;
203
204 # remove ex. from OIDmetadata
205 $self->{'OIDmetadata'} =~ s/^ex\.// if defined $self->{'OIDmetadata'};
206 $self->{'num_processed'} = 0;
207 $self->{'num_not_processed'} = 0;
208 $self->{'num_blocked'} = 0;
209 $self->{'num_archives'} = 0;
210 $self->{'cover_image'} = 1; # cover image is on by default
211 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
212 $self->{'can_process_directories'} = 0;
213 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
214
215 my $associate_ext = $self->{'associate_ext'};
216 if ((defined $associate_ext) && ($associate_ext ne "")) {
217
218 my $associate_tail_re = $self->{'associate_tail_re'};
219 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
220 my $outhandle = $self->{'outhandle'};
221 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
222 print $outhandle " defaulting to 'associate_tail_re'\n";
223 }
224 else {
225 my @exts = split(/,/,$associate_ext);
226
227 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
228 my $associate_tail_re = join("|",@exts_bracketed);
229 $self->{'associate_tail_re'} = $associate_tail_re;
230 }
231
232 delete $self->{'associate_ext'};
233 }
234
235 return bless $self, $class;
236
237}
238
239sub merge_inheritance
240{
241 my $self = {};
242 my @child_selfs = @_;
243
244 foreach my $child_self (@child_selfs) {
245 foreach my $key (keys %$child_self) {
246 if (defined $self->{$key}) {
247 if ($self->{$key} ne $child_self->{$key}) {
248# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
249# print STDERR "Existing stored value = $self->{$key}\n";
250# print STDERR "New (child) value = $child_self->{$key}\n";
251# print STDERR "Keeping existing value\n";
252 # Existing value seems to be option specified in collect.cfg
253
254 ### $self->{$key} = $child_self->{$key};
255
256 }
257 else {
258## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
259 }
260
261 }
262 else {
263 $self->{$key} = $child_self->{$key};
264 }
265 }
266 }
267
268 return $self;
269}
270
271# initialize BasePlugin options
272# if init() is overridden in a sub-class, remember to call BasePlugin::init()
273sub init {
274 my $self = shift (@_);
275 my ($verbosity, $outhandle, $failhandle) = @_;
276
277 # verbosity is passed through from the processor
278 $self->{'verbosity'} = $verbosity;
279
280 # as are the outhandle and failhandle
281 $self->{'outhandle'} = $outhandle if defined $outhandle;
282 $self->{'failhandle'} = $failhandle;
283# $self->SUPER::init(@_);
284
285 # set process_exp and block_exp to defaults unless they were
286 # explicitly set
287
288 if ((!$self->is_recursive()) and
289 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
290
291 $self->{'process_exp'} = $self->get_default_process_exp ();
292 if ($self->{'process_exp'} eq "") {
293 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
294 }
295 }
296
297 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
298 $self->{'block_exp'} = $self->get_default_block_exp ();
299 }
300
301}
302
303sub begin {
304 my $self = shift (@_);
305 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
306
307 if ($self->{'OIDtype'} eq "auto") {
308 # hasn't been set in the plugin, use the processor values
309 $self->{'OIDtype'} = $processor->{'OIDtype'};
310 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
311 }
312 if ($self->{'OIDtype'} eq "hash") {
313 # should we hash on the file or on the doc xml??
314 $self->{'OIDtype'} = $self->get_oid_hash_type();
315 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
316 $self->{'OIDtype'} = "hash_on_file";
317 }
318 }
319}
320
321# This is called once if removeold is set with import.pl. Most plugins will do
322# nothing but if a plugin does any stuff outside of creating doc obj, then
323# it may need to clear something.
324sub remove_all {
325 my $self = shift (@_);
326 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
327}
328
329# This is called per document for docs that have been deleted from the
330# collection. Most plugins will do nothing
331# but if a plugin does any stuff outside of creating doc obj, then it may need
332# to clear something.
333sub remove_one {
334 my $self = shift (@_);
335
336 my ($file, $oids, $archivedir) = @_;
337 return 0 if $self->can_process_this_file($file);
338 return undef;
339}
340
341sub end {
342 # potentially called at the end of each plugin pass
343 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
344
345 my ($self) = shift (@_);
346}
347
348sub deinit {
349 # called only once, after all plugin passes have been done
350
351 my ($self) = @_;
352}
353
354# default hashing type is to hash on the original file (or converted file)
355# override this to return hash_on_ga_xml for filetypes where hashing on the
356# file is no good eg video
357sub get_oid_hash_type {
358
359 my $self = shift (@_);
360
361 return "hash_on_file";
362}
363
364
365# this function should be overridden to return 1
366# in recursive plugins
367sub is_recursive {
368 my $self = shift (@_);
369
370 return 0;
371}
372
373sub get_default_block_exp {
374 my $self = shift (@_);
375
376 return "";
377}
378
379sub get_default_process_exp {
380 my $self = shift (@_);
381
382 return "";
383}
384
385
386# rename imported files and assoc files using URL encoding by default
387# as this will work for most plugins and give more legible filenames
388sub get_default_file_rename_method() {
389 my $self = shift (@_);
390 return "url";
391}
392
393# returns this plugin's active (possibly user-selected) file_rename_method
394sub get_file_rename_method() {
395 my $self = shift (@_);
396 my $rename_method = $self->{'file_rename_method'};
397 if($rename_method) {
398 return $rename_method;
399 } else {
400 return $self->get_default_file_rename_method();
401 }
402}
403
404# default implementation is to do nothing
405sub store_block_files {
406
407 my $self =shift (@_);
408 my ($filename_full_path, $block_hash) = @_;
409
410}
411
412# put files to block into hash
413sub use_block_expressions {
414
415 my $self =shift (@_);
416 my ($filename_full_path, $block_hash) = @_;
417
418 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
419
420 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
421 &util::block_filename($block_hash,$filename_full_path);
422 }
423
424}
425
426#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
427sub block_cover_image
428{
429 my $self =shift;
430 my ($filename, $block_hash) = @_;
431
432 $filename = &util::upgrade_if_dos_filename($filename);
433
434 if ($self->{'cover_image'}) {
435 my $coverfile = $filename;
436 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
437
438 if (!&util::fd_exists($coverfile)) {
439 $coverfile =~ s/jpg$/JPG/;
440 }
441 if (&util::fd_exists($coverfile)) {
442 &util::block_filename($block_hash,$coverfile);
443 }
444 }
445
446 return;
447}
448
449
450# discover all the files that should be blocked by this plugin
451# check the args ...
452sub file_block_read {
453
454 my $self = shift (@_);
455 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
456 # Keep track of filenames with same root but different extensions
457 # Used to support -associate_ext and the more generalised
458 # -associate_tail_re
459 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
460
461 if (!-d $filename_full_path) {
462 $block_hash->{'all_files'}->{$file} = 1;
463 }
464
465 my $associate_tail_re = $self->{'associate_tail_re'};
466 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
467 my ($file_prefix,$file_ext)
468 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
469 if ((defined $file_prefix) && (defined $file_ext)) {
470 my $shared_fileroot = $block_hash->{'shared_fileroot'};
471 if (!defined $shared_fileroot->{$file_prefix}) {
472 my $file_prefix_rec = { 'tie_to' => undef,
473 'exts' => {} };
474 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
475 }
476
477 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
478
479 if ($self->can_process_this_file($filename_full_path)) {
480 # This is the document the others should be tied to
481 $file_prefix_rec->{'tie_to'} = $file_ext;
482 }
483 else {
484 if ($file_ext =~ m/$associate_tail_re$/) {
485 # this file should be associated to the main one
486 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
487 }
488 }
489
490 }
491 }
492
493 # check block expressions
494 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
495
496 # now check whether we are actually processing this
497 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
498 return undef; # can't recognise
499 }
500
501 # if we have a block_exp, then this overrides the normal 'smart' blocking
502 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
503
504 # block the cover image if there is one
505 if ($self->{'cover_image'}) {
506 $self->block_cover_image($filename_full_path, $block_hash);
507 }
508
509 return 1;
510}
511
512# plugins that rely on more than process_exp (eg XML plugins) can override this method
513sub can_process_this_file {
514 my $self = shift(@_);
515 my ($filename) = @_;
516
517 if (-d $filename && !$self->{'can_process_directories'}) {
518 return 0;
519 }
520
521 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
522 return 1;
523 }
524 return 0;
525
526}
527
528# just converts path as is to utf8.
529sub filepath_to_utf8 {
530 my $self = shift (@_);
531 my ($file, $file_encoding) = @_;
532 my $filemeta = $file;
533
534 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
535
536 # Whenever filename-encoding is set to any of the auto settings, we
537 # check if the filename is already in UTF8. If it is, then we're done.
538 if($filename_encoding =~ m/auto/) {
539 if(&unicode::check_is_utf8($filemeta))
540 {
541 $filename_encoding = "utf8";
542 return $filemeta;
543 }
544 }
545
546 # Auto setting, but filename is not utf8
547 if ($filename_encoding eq "auto")
548 {
549 # try textcat
550 $filename_encoding = $self->textcat_encoding($filemeta);
551
552 # check the locale next
553 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
554
555
556 # now try the encoding of the document, if available
557 if ($filename_encoding eq "undefined" && defined $file_encoding) {
558 $filename_encoding = $file_encoding;
559 }
560
561 }
562
563 elsif ($filename_encoding eq "auto-language-analysis")
564 {
565 $filename_encoding = $self->textcat_encoding($filemeta);
566
567 # now try the encoding of the document, if available
568 if ($filename_encoding eq "undefined" && defined $file_encoding) {
569 $filename_encoding = $file_encoding;
570 }
571 }
572
573 elsif ($filename_encoding eq "auto-filesystem-encoding")
574 {
575 # try locale
576 $filename_encoding = $self->locale_encoding();
577 }
578
579 elsif ($filename_encoding eq "auto-fl")
580 {
581 # filesystem-encoding (locale) then language-analysis (textcat)
582 $filename_encoding = $self->locale_encoding();
583
584 # try textcat
585 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
586
587 # else assume filename encoding is encoding of file content, if that's available
588 if ($filename_encoding eq "undefined" && defined $file_encoding) {
589 $filename_encoding = $file_encoding;
590 }
591 }
592
593 elsif ($filename_encoding eq "auto-lf")
594 {
595 # language-analysis (textcat) then filesystem-encoding (locale)
596 $filename_encoding = $self->textcat_encoding($filemeta);
597
598 # guess filename encoding from encoding of file content, if available
599 if ($filename_encoding eq "undefined" && defined $file_encoding) {
600 $filename_encoding = $file_encoding;
601 }
602
603 # try locale
604 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
605 }
606
607 # if still undefined, use utf8 as fallback
608 if ($filename_encoding eq "undefined") {
609 $filename_encoding = "utf8";
610 }
611
612 #print STDERR "**** UTF8 encoding the filename $filemeta ";
613
614 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
615 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
616 # be always utf8 (in which case the filename's encoding is also set as utf8 even
617 # though the filename need not be if it originates from another system)--in such
618 # cases attempt to make the filename utf8 to match.
619 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
620 &unicode::ensure_utf8(\$filemeta);
621 }
622
623 # convert non-unicode encodings to utf8
624 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
625 $filemeta = &unicode::unicode2utf8(
626 &unicode::convert2unicode($filename_encoding, \$filemeta)
627 );
628 }
629
630 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
631 return $filemeta;
632}
633
634# gets the filename with no path, converts to utf8, and then dm safes it.
635# filename_encoding set by user
636sub filename_to_utf8_metadata
637{
638 my $self = shift (@_);
639 my ($file, $file_encoding) = @_;
640
641 my $outhandle = $self->{'outhandle'};
642
643 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
644 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
645 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
646
647
648 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
649 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
650
651 return $filemeta;
652}
653
654sub locale_encoding {
655 my $self = shift(@_);
656
657 if (!defined $self->{'filesystem_encoding'}) {
658 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
659 }
660
661 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
662 return $self->{'filesystem_encoding'}; # can be the string "undefined"
663}
664
665sub textcat_encoding {
666 my $self = shift(@_);
667 my ($filemeta) = @_;
668
669 # analyse filenames without extensions and digits (and trimmed of
670 # surrounding whitespace), so that irrelevant chars don't confuse
671 # textcat
672 my $strictfilemeta = $filemeta;
673 $strictfilemeta =~ s/\.[^\.]+$//g;
674 $strictfilemeta =~ s/\d//g;
675 $strictfilemeta =~ s/^\s*//g;
676 $strictfilemeta =~ s/\s*$//g;
677
678 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
679 if(!defined $filename_encoding) {
680 $filename_encoding = "undefined";
681 }
682
683 return $filename_encoding; # can be the string "undefined"
684}
685
686# performs textcat
687sub encoding_from_language_analysis {
688 my $self = shift(@_);
689 my ($text) = @_;
690
691 my $outhandle = $self->{'outhandle'};
692 my $best_encoding = undef;
693
694 # get the language/encoding of the textstring using textcat
695 require textcat; # Only load the textcat module if it is required
696 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
697 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
698
699
700 if (scalar @$results < 0) {
701 return undef;
702 }
703
704 # We have some results, we choose the first
705 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
706
707 $best_encoding = $encoding;
708 if (!defined $best_encoding) {
709 return undef;
710 }
711
712 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
713 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
714 $best_encoding = 'utf8';
715 }
716
717
718 # check for equivalents where textcat doesn't have some encodings...
719 # eg MS versions of standard encodings
720 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
721 my $iso = $1; # which variant of the iso standard?
722 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
723 if ($text =~ /[\x80-\x9f]/) {
724 # Western Europe
725 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
726 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
727 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
728 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
729 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
730 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
731 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
732 }
733 }
734
735 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
736 !defined $encodings::encodings->{$best_encoding})
737 {
738 if ($self->{'verbosity'}) {
739 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
740 }
741 $best_encoding = undef;
742 }
743
744 return $best_encoding;
745}
746
747# uses locale
748sub get_filesystem_encoding
749{
750
751 my $self = shift(@_);
752
753 my $outhandle = $self->{'outhandle'};
754 my $filesystem_encoding = undef;
755
756 eval {
757 # Works for Windows as well, returning the DOS code page in use
758 use POSIX qw(locale_h);
759
760 # With only one parameter, setlocale retrieves the
761 # current value
762 my $current_locale = setlocale(LC_CTYPE);
763
764 my $char_encoding = undef;
765 if ($current_locale =~ m/\./) {
766 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
767 $char_encoding = lc($char_encoding);
768 }
769 else {
770 if ($current_locale =~ m/^(posix|c)$/i) {
771 $char_encoding = "ascii";
772 }
773 }
774
775 if (defined $char_encoding) {
776 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
777 $char_encoding = "$1\_$2\_$3";
778 }
779
780 $char_encoding =~ s/-/_/g;
781 $char_encoding =~ s/^utf_8$/utf8/;
782
783 if ($char_encoding =~ m/^\d+$/) {
784 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
785 $char_encoding = "windows_$char_encoding";
786 }
787 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
788 $char_encoding = "dos_$char_encoding";
789 }
790 }
791
792 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
793 || (defined $encodings::encodings->{$char_encoding})) {
794 $filesystem_encoding = $char_encoding;
795 }
796 else {
797 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
798 }
799 }
800
801
802 };
803 if ($@) {
804 print $outhandle "$@\n";
805 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
806
807 }
808
809 return $filesystem_encoding;
810}
811
812
813
814sub deduce_filename_encoding
815{
816 my $self = shift (@_);
817 my ($file,$metadata,$plugin_filename_encoding) = @_;
818
819 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
820 my $deduced_filename_encoding = undef;
821
822 # Start by looking for manually assigned metadata
823 if (defined $gs_filename_encoding) {
824 if (ref ($gs_filename_encoding) eq "ARRAY") {
825 my $outhandle = $self->{'outhandle'};
826
827 $deduced_filename_encoding = $gs_filename_encoding->[0];
828
829 my $num_vals = scalar(@$gs_filename_encoding);
830 if ($num_vals>1) {
831 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
832 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
833 }
834 }
835 else {
836 $deduced_filename_encoding = $gs_filename_encoding;
837 }
838 }
839
840 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
841 # Look to see if plugin specifies this value
842
843 if (defined $plugin_filename_encoding) {
844 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
845 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
846 my $outhandle = $self->{'outhandle'};
847 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
848 print $outhandle " default to 'auto'\n";
849 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
850 }
851
852 if ($plugin_filename_encoding ne "auto") {
853 # We've been given a specific filenamne encoding
854 # => so use it!
855 $deduced_filename_encoding = $plugin_filename_encoding;
856 }
857 }
858 }
859
860 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
861
862 # Look to file system to provide a character encoding
863
864 # If Windows NTFS, then -- assuming we work with long file names got through
865 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
866
867 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
868 # Can do better than working with the DOS character encoding returned by locale
869 $deduced_filename_encoding = "unicode";
870 }
871 else {
872 # Unix of some form or other
873
874 # See if we can determine the file system encoding through locale
875 $deduced_filename_encoding = $self->locale_encoding();
876
877 # if locale shows us filesystem is utf8, check to see filename is consistent
878 # => if not, then we have an "alien" filename on our hands
879
880 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
881 if (!&unicode::check_is_utf8($file)) {
882 # "alien" filename, so revert
883 $deduced_filename_encoding = undef;
884 }
885 }
886 }
887 }
888
889# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
890# # Last chance, apply textcat to deduce filename encoding
891# $deduced_filename_encoding = $self->textcat_encoding($file);
892# }
893
894 if ($self->{'verbosity'}>3) {
895 my $outhandle = $self->{'outhandle'};
896
897 if (defined $deduced_filename_encoding) {
898 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
899 }
900 else {
901 print $outhandle " No filename encoding deduced\n";
902 }
903 }
904
905 return $deduced_filename_encoding;
906}
907
908
909
910
911# Notionally written to be called once for each document, it is however safe to
912# call multiple times (as in the case of ImagePlugin) which calls this later on
913# after the original image has potentially been converted to a *new* source image
914# format (e.g. TIFF to PNG)
915
916sub set_Source_metadata {
917 my $self = shift (@_);
918 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
919
920 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
921 # and (as a fallback) using %xx if not for non-ascii chars
922 # 2. Sets the url ref (SourceFile) to the URL encoded version
923 # of filename for generated files
924
925 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
926
927 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
928
929 my $octet_file = $raw_file;
930
931 # UTF-8 version of filename
932# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
933# print STDERR "**** Setting Source Metadata given: $octet_file\n";
934# }
935
936 # Deal with (on Windows) raw filenames that are in their
937 # abbreviated DOS form
938
939 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
940 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
941 if (-e $raw_filename) {
942 my $unicode_filename = Win32::GetLongPathName($raw_filename);
943
944 my $unused_full_uf;
945 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
946 }
947 }
948 }
949
950 my $url_encoded_filename;
951 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
952 # => Generate a pretty print version of filename that is mapped to Unicode
953
954 # Use filename_encoding to map raw filename to a Perl unicode-aware string
955 $url_encoded_filename = decode($filename_encoding,$octet_file);
956 }
957 else {
958 # otherwise generate %xx encoded version of filename for char > 127
959 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
960 }
961
962# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
963# print STDERR "****** saving Source as: $url_encoded_filename\n";
964# }
965
966 # In the case of converted files and (generalized) exploded documents, there
967 # will already be a source filename => store as OrigSource before overriding
968 my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
969 if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
970 $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
971 }
972
973 # Source is the UTF8 display name - not necessarily the name of the file on the system
974 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
975 # on Darwin want all display strings to be in composed form, then can search on that
976 $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
977 }
978 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
979
980
981 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
982 # If using URL encoding, then SourceFile is the url-reference to url-encoded
983 # renamed_raw_url: it's a url that refers to the actual file on the system
984 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
985
986 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
987 $renamed_raw_url);
988
989# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
990# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
991# }
992}
993
994# this should be called by all plugins to set the oid of the doc obj, rather
995# than calling doc_obj->set_OID directly
996sub add_OID {
997 my $self = shift (@_);
998 my ($doc_obj) = @_;
999
1000 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
1001
1002 # see if there is a plugin specific set_OID function
1003 if (defined ($self->can('set_OID'))) {
1004 $self->set_OID(@_); # pass through doc_obj and any extra arguments
1005 }
1006 else {
1007 # use the default set_OID() in doc.pm
1008 $doc_obj->set_OID();
1009 }
1010
1011}
1012
1013# The BasePlugin read_into_doc_obj() function. This function does all the
1014# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
1015# a file and sets up a slew of metadata all saved in doc_obj, which
1016# it then returns as part of a tuple (process_status,doc_obj)
1017#
1018# Much of this functionality used to reside in read, but it was broken
1019# down into a supporting routine to make the code more flexible.
1020#
1021# recursive plugins (e.g. RecPlug) and specialized plugins like those
1022# capable of processing many documents within a single file (e.g.
1023# GMLPlug) will normally want to implement their own version of
1024# read_into_doc_obj()
1025#
1026# Note that $base_dir might be "" and that $file might
1027# include directories
1028
1029# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1030sub read_into_doc_obj {
1031 my $self = shift (@_);
1032 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1033
1034 my $outhandle = $self->{'outhandle'};
1035
1036 # should we move this to read? What about secondary plugins?
1037 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
1038 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1039 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1040 if $self->{'verbosity'} > 1;
1041
1042 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1043
1044 # create a new document
1045 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1046 my $top_section = $doc_obj->get_top_section();
1047
1048 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1049 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1050
1051
1052 my $plugin_filename_encoding = $self->{'filename_encoding'};
1053 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1054 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
1055
1056 # plugin specific stuff - what args do we need here??
1057 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1058 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1059 return -1;
1060 }
1061
1062 # include any metadata passed in from previous plugins
1063 # note that this metadata is associated with the top level section
1064 my $section = $doc_obj->get_top_section();
1065 # can we merge these two methods??
1066 $self->add_associated_files($doc_obj, $filename_full_path);
1067 $self->extra_metadata ($doc_obj, $section, $metadata);
1068 $self->auto_extract_metadata($doc_obj);
1069
1070 # if we haven't found any Title so far, assign one
1071 # this was shifted to here from inside read()
1072 $self->title_fallback($doc_obj,$section,$filename_no_path);
1073
1074 $self->add_OID($doc_obj);
1075
1076 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1077 return (1,$doc_obj);
1078}
1079
1080sub post_process_doc_obj {
1081 my $self = shift (@_);
1082 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1083
1084 return 1;
1085}
1086
1087sub add_dummy_text {
1088 my $self = shift(@_);
1089 my ($doc_obj, $section) = @_;
1090
1091 # add NoText metadata so we can hide this dummy text in format statements
1092 $doc_obj->add_metadata($section, "NoText", "1");
1093 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1094
1095}
1096
1097# does nothing. Can be overridden by subclass
1098sub auto_extract_metadata {
1099 my $self = shift(@_);
1100 my ($doc_obj) = @_;
1101}
1102
1103# adds cover image, associate_file options stuff. Should be called by sub class
1104# read_into_doc_obj
1105sub add_associated_files {
1106 my $self = shift(@_);
1107 # whatis filename??
1108 my ($doc_obj, $filename) = @_;
1109
1110 # add in the cover image
1111 if ($self->{'cover_image'}) {
1112 $self->associate_cover_image($doc_obj, $filename);
1113 }
1114 # store the original (used for eg TextPlugin to store the original for OAI)
1115 if ($self->{'store_original_file'}) {
1116 $self->associate_source_file($doc_obj, $filename);
1117 }
1118
1119
1120}
1121
1122# implement this if you are extracting metadata for other documents
1123sub metadata_read {
1124 my $self = shift (@_);
1125 my ($pluginfo, $base_dir, $file, $block_hash,
1126 $extrametakeys, $extrametadata, $extrametafile,
1127 $processor, $gli, $aux) = @_;
1128
1129 # can we process this file??
1130 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1131 return undef unless $self->can_process_this_file($filename_full_path);
1132
1133 return 1; # we recognise the file, but don't actually do anything with it
1134}
1135
1136
1137# The BasePlugin read() function. This function calls read_into_doc_obj()
1138# to ensure all the right things to make general options work for a
1139# given plugin are done. It then calls the process() function which
1140# does all the work specific to a plugin (like the old read functions
1141# used to do). Most plugins should define their own process() function
1142# and let this read() function keep control.
1143#
1144# recursive plugins (e.g. RecPlug) and specialized plugins like those
1145# capable of processing many documents within a single file (e.g.
1146# GMLPlug) might want to implement their own version of read(), but
1147# more likely need to implement their own version of read_into_doc_obj()
1148#
1149# Return number of files processed, undef if can't recognise, -1 if can't
1150# process
1151
1152sub read {
1153 my $self = shift (@_);
1154 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1155
1156 # can we process this file??
1157 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1158
1159 return undef unless $self->can_process_this_file($filename_full_path);
1160
1161 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1162
1163 if ((defined $process_status) && ($process_status == 1)) {
1164
1165 # process the document
1166 $processor->process($doc_obj);
1167
1168 $self->{'num_processed'} ++;
1169 undef $doc_obj;
1170 }
1171 # delete any temp files that we may have created
1172 $self->clean_up_after_doc_obj_processing();
1173
1174
1175 # if process_status == 1, then the file has been processed.
1176 return $process_status;
1177
1178}
1179
1180# returns undef if file is rejected by the plugin
1181sub process {
1182 my $self = shift (@_);
1183 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1184
1185 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
1186
1187 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1188 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1189
1190 die "\n";
1191
1192 return undef; # never gets here
1193}
1194
1195# overwrite this method to delete any temp files that we have created
1196sub clean_up_after_doc_obj_processing {
1197 my $self = shift(@_);
1198
1199}
1200
1201# write_file -- used by ConvertToPlug, for example in post processing
1202#
1203# where should this go, is here the best place??
1204sub utf8_write_file {
1205 my $self = shift (@_);
1206 my ($textref, $filename) = @_;
1207
1208 if (!open (FILE, ">:utf8", $filename)) {
1209 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1210 die "\n";
1211 }
1212 print FILE $$textref;
1213
1214 close FILE;
1215}
1216
1217
1218sub filename_based_title
1219{
1220 my $self = shift (@_);
1221 my ($file) = @_;
1222
1223 my $file_derived_title = $file;
1224 $file_derived_title =~ s/_/ /g;
1225 $file_derived_title =~ s/\.[^.]+$//;
1226
1227 return $file_derived_title;
1228}
1229
1230
1231sub title_fallback
1232{
1233 my $self = shift (@_);
1234 my ($doc_obj,$section,$file) = @_;
1235
1236 if (!defined $doc_obj->get_metadata_element ($section, "Title")
1237 || $doc_obj->get_metadata_element($section, "Title") eq "") {
1238
1239 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1240 my $file_derived_title;
1241 if (defined $source_file) {
1242 $file_derived_title = $self->filename_based_title($source_file);
1243 }
1244 else {
1245 # pp = pretty print
1246 my $pp_file = (defined $source_file) ? $source_file : $file;
1247
1248 my $raw_title = $self->filename_based_title($file);
1249 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1250 }
1251
1252
1253 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1254 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1255 }
1256 else {
1257 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1258 }
1259 }
1260
1261}
1262
1263# add any extra metadata that's been passed around from one
1264# plugin to another.
1265# extra_metadata uses add_utf8_metadata so it expects metadata values
1266# to already be in utf8
1267sub extra_metadata {
1268 my $self = shift (@_);
1269 my ($doc_obj, $cursection, $metadata) = @_;
1270
1271 my $associate_tail_re = $self->{'associate_tail_re'};
1272
1273 foreach my $field (keys(%$metadata)) {
1274 # $metadata->{$field} may be an array reference
1275 if ($field eq "gsdlassocfile_tobe") {
1276 # 'gsdlassocfile_tobe' is artificially introduced metadata
1277 # that is used to signal that certain additional files should
1278 # be tied to this document. Useful in situations where a
1279 # metadata pass in the plugin pipeline works out some files
1280 # need to be associated with a document, but the document hasn't
1281 # been formed yet.
1282 my $equiv_form = "";
1283 foreach my $gaf (@{$metadata->{$field}}) {
1284 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1285 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1286
1287 # we need to make sure the filename is valid utf-8 - we do
1288 # this by url or base64 encoding it
1289 # $tail_filename is the name that we store the file as
1290 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1291 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1292 $doc_obj->associate_source_file($full_filename);
1293 # If the filename is url_encoded, we need to encode the % signs
1294 # in the filename, so that it works in a url
1295 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1296 # work out extended tail extension (i.e. matching tail re)
1297
1298 my ($file_prefix,$file_extended_ext)
1299 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1300 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1301 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1302
1303## my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1304 my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1305 my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1306
1307 my $srcicon = "_icon".$doc_ext."_";
1308 my $end_doclink = "</a>";
1309
1310 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1311
1312 $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink);
1313 $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
1314 $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
1315
1316 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1317 # for metadata such as [mp3._edited] [mp3._full] ...
1318 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1319 }
1320
1321 # for multiple metadata such as [mp3.assoclink]
1322 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1323
1324 $equiv_form .= " $assoc_form";
1325 }
1326 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1327 }
1328 elsif ($field eq "gsdlzipfilename") {
1329 # special case for when files have come out of a zip. source_path
1330 # (used for archives dbs and keeping track for incremental import)
1331 # must be set to the zip file name
1332 my $zip_filename = $metadata->{$field};
1333 # overwrite the source_path
1334 $doc_obj->set_source_path($zip_filename);
1335 # and set the metadata
1336 $zip_filename = &util::filename_within_collection($zip_filename);
1337 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1338 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1339 }
1340 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1341 map {
1342 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1343 } @{$metadata->{$field}};
1344 } else {
1345 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1346 }
1347 }
1348}
1349
1350
1351sub compile_stats {
1352 my $self = shift(@_);
1353 my ($stats) = @_;
1354
1355 $stats->{'num_processed'} += $self->{'num_processed'};
1356 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1357 $stats->{'num_archives'} += $self->{'num_archives'};
1358
1359}
1360sub associate_source_file {
1361 my $self = shift(@_);
1362
1363 my ($doc_obj, $filename) = @_;
1364 my $cursection = $doc_obj->get_top_section();
1365 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1366
1367 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1368 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1369}
1370
1371sub associate_cover_image {
1372 my $self = shift(@_);
1373 my ($doc_obj, $filename) = @_;
1374
1375 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1376
1377 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1378 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1379
1380 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1381 # don't stat() for existence e.g. for multiple document input files
1382 # (eg SplitPlug)
1383 return;
1384 }
1385
1386 my $top_section=$doc_obj->get_top_section();
1387
1388 if (&util::fd_exists($upgraded_filename)) {
1389 $doc_obj->associate_source_file($filename);
1390 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1391 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1392 } else {
1393 my $upper_filename = $filename;
1394 my $upgraded_upper_filename = $upgraded_filename;
1395
1396 $upper_filename =~ s/jpg$/JPG/;
1397 $upgraded_upper_filename =~ s/jpg$/JPG/;
1398
1399 if (&util::fd_exists($upgraded_upper_filename)) {
1400 $doc_obj->associate_source_file($upper_filename);
1401 $doc_obj->associate_file($upper_filename, "cover.jpg",
1402 "image/jpeg");
1403 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1404 } else {
1405 # file doesn't exist, so record the fact that it's missing so
1406 # we don't stat() again (stat is slow)
1407 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1408 }
1409 }
1410
1411}
1412
1413
1414# Overridden by exploding plugins (eg. ISISPlug)
1415sub clean_up_after_exploding
1416{
1417 my $self = shift(@_);
1418}
1419
1420
1421
14221;
Note: See TracBrowser for help on using the repository browser.