source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 23544

Last change on this file since 23544 was 23544, checked in by kjdon, 13 years ago

on windows, if have a .JPG cover image, then a -e xxx.jpg test works, but if that filename is put into the block list, it won't match later on the .JPG, and the file won't be blocked. Solution, lowercase entirefilepath before adding into or checking in the block_hash->file_blocks list. but only for windows. Now don't need the alternative A: a: options for drive letters.

  • Property svn:keywords set to Author Date Id Revision
File size: 44.8 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34
35use encodings;
36use unicode;
37use doc;
38use ghtml;
39use gsprintf 'gsprintf';
40
41use PrintInfo;
42
43BEGIN {
44 @BasePlugin::ISA = ( 'PrintInfo' );
45}
46
47# the different methods that can be applied when renaming
48# imported documents and their associated files
49our $file_rename_method_list =
50 [ { 'name' => "url",
51 'desc' => "{BasePlugin.rename_method.url}" },
52 { 'name' => "base64",
53 'desc' => "{BasePlugin.rename_method.base64}" },
54 { 'name' => "none",
55 'desc' => "{BasePlugin.rename_method.none}",
56 'hiddengli' => "yes" } ];
57
58our $encoding_list =
59 [ { 'name' => "ascii",
60 'desc' => "{BasePlugin.encoding.ascii}" },
61 { 'name' => "utf8",
62 'desc' => "{BasePlugin.encoding.utf8}" },
63 { 'name' => "unicode",
64 'desc' => "{BasePlugin.encoding.unicode}" } ];
65
66
67my $e = $encodings::encodings;
68foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
69{
70 my $hashEncode =
71 {'name' => $enc,
72 'desc' => $e->{$enc}->{'name'}};
73
74 push(@{$encoding_list},$hashEncode);
75}
76
77our $encoding_plus_auto_list =
78 [ { 'name' => "auto",
79 'desc' => "{BasePlugin.filename_encoding.auto}" },
80 { 'name' => "auto-language-analysis",
81 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
82 { 'name' => "auto-filesystem-encoding",
83 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
84 { 'name' => "auto-fl",
85 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
86 { 'name' => "auto-lf",
87 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
88
89push(@{$encoding_plus_auto_list},@{$encoding_list});
90
91our $oidtype_list =
92 [ { 'name' => "auto",
93 'desc' => "{BasePlugin.OIDtype.auto}" },
94 { 'name' => "hash",
95 'desc' => "{import.OIDtype.hash}" },
96 { 'name' => "assigned",
97 'desc' => "{import.OIDtype.assigned}" },
98 { 'name' => "incremental",
99 'desc' => "{import.OIDtype.incremental}" },
100 { 'name' => "dirname",
101 'desc' => "{import.OIDtype.dirname}" } ];
102
103my $arguments =
104 [ { 'name' => "process_exp",
105 'desc' => "{BasePlugin.process_exp}",
106 'type' => "regexp",
107 'deft' => "",
108 'reqd' => "no" },
109 { 'name' => "no_blocking",
110 'desc' => "{BasePlugin.no_blocking}",
111 'type' => "flag",
112 'reqd' => "no"},
113 { 'name' => "block_exp",
114 'desc' => "{BasePlugin.block_exp}",
115 'type' => "regexp",
116 'deft' => "",
117 'reqd' => "no" },
118 { 'name' => "store_original_file",
119 'desc' => "{BasePlugin.store_original_file}",
120 'type' => "flag",
121 'reqd' => "no" },
122 { 'name' => "associate_ext",
123 'desc' => "{BasePlugin.associate_ext}",
124 'type' => "string",
125 'reqd' => "no" },
126 { 'name' => "associate_tail_re",
127 'desc' => "{BasePlugin.associate_tail_re}",
128 'type' => "string",
129 'reqd' => "no" },
130 { 'name' => "OIDtype",
131 'desc' => "{import.OIDtype}",
132 'type' => "enum",
133 'list' => $oidtype_list,
134 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
135 'deft' => "auto",
136 'reqd' => "no" },
137 { 'name' => "OIDmetadata",
138 'desc' => "{import.OIDmetadata}",
139 'type' => "metadata",
140 'deft' => "dc.Identifier",
141 'reqd' => "no" },
142# { 'name' => "use_as_doc_identifier",
143# 'desc' => "{BasePlugin.use_as_doc_identifier}",
144# 'type' => "string",
145# 'reqd' => "no" ,
146# 'deft' => "" } ,
147 { 'name' => "no_cover_image",
148 'desc' => "{BasePlugin.no_cover_image}",
149 'type' => "flag",
150 'reqd' => "no" },
151 { 'name' => "filename_encoding",
152 'desc' => "{BasePlugin.filename_encoding}",
153 'type' => "enum",
154 'deft' => "auto",
155 'list' => $encoding_plus_auto_list,
156 'reqd' => "no" },
157 { 'name' => "smart_block",
158 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
159 'type' => "flag",
160 'reqd' => "no",
161 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
162 { 'name' => "file_rename_method",
163 'desc' => "{BasePlugin.file_rename_method}",
164 'type' => "enum",
165 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
166 'list' => $file_rename_method_list,
167 'reqd' => "no"
168 }
169
170 ];
171
172
173my $options = { 'name' => "BasePlugin",
174 'desc' => "{BasePlugin.desc}",
175 'abstract' => "yes",
176 'inherits' => "no",
177 'args' => $arguments };
178
179sub new {
180
181 my ($class) = shift (@_);
182 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
183 push(@$pluginlist, $class);
184
185 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
186 push(@{$hashArgOptLists->{"OptList"}},$options);
187
188 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
189
190 if ($self->{'info_only'}) {
191 # don't worry about any options etc
192 return bless $self, $class;
193 }
194
195 if ($self->{'smart_block'}) {
196 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
197 }
198 $self->{'smart_block'} = undef;
199
200 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
201 $self->{'plugin_type'} = $plugin_name;
202
203 # remove ex. from OIDmetadata
204 $self->{'OIDmetadata'} =~ s/^ex\.// if defined $self->{'OIDmetadata'};
205 $self->{'num_processed'} = 0;
206 $self->{'num_not_processed'} = 0;
207 $self->{'num_blocked'} = 0;
208 $self->{'num_archives'} = 0;
209 $self->{'cover_image'} = 1; # cover image is on by default
210 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
211 $self->{'can_process_directories'} = 0;
212 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
213
214 my $associate_ext = $self->{'associate_ext'};
215 if ((defined $associate_ext) && ($associate_ext ne "")) {
216
217 my $associate_tail_re = $self->{'associate_tail_re'};
218 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
219 my $outhandle = $self->{'outhandle'};
220 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
221 print $outhandle " defaulting to 'associate_tail_re'\n";
222 }
223 else {
224 my @exts = split(/,/,$associate_ext);
225
226 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
227 my $associate_tail_re = join("|",@exts_bracketed);
228 $self->{'associate_tail_re'} = $associate_tail_re;
229 }
230
231 delete $self->{'associate_ext'};
232 }
233
234 return bless $self, $class;
235
236}
237
238sub merge_inheritance
239{
240 my $self = {};
241 my @child_selfs = @_;
242
243 foreach my $child_self (@child_selfs) {
244 foreach my $key (keys %$child_self) {
245 if (defined $self->{$key}) {
246 if ($self->{$key} ne $child_self->{$key}) {
247# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
248# print STDERR "Existing stored value = $self->{$key}\n";
249# print STDERR "New (child) value = $child_self->{$key}\n";
250# print STDERR "Keeping existing value\n";
251 # Existing value seems to be option specified in collect.cfg
252
253 ### $self->{$key} = $child_self->{$key};
254
255 }
256 else {
257## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
258 }
259
260 }
261 else {
262 $self->{$key} = $child_self->{$key};
263 }
264 }
265 }
266
267 return $self;
268}
269
270# initialize BasePlugin options
271# if init() is overridden in a sub-class, remember to call BasePlugin::init()
272sub init {
273 my $self = shift (@_);
274 my ($verbosity, $outhandle, $failhandle) = @_;
275
276 # verbosity is passed through from the processor
277 $self->{'verbosity'} = $verbosity;
278
279 # as are the outhandle and failhandle
280 $self->{'outhandle'} = $outhandle if defined $outhandle;
281 $self->{'failhandle'} = $failhandle;
282# $self->SUPER::init(@_);
283
284 # set process_exp and block_exp to defaults unless they were
285 # explicitly set
286
287 if ((!$self->is_recursive()) and
288 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
289
290 $self->{'process_exp'} = $self->get_default_process_exp ();
291 if ($self->{'process_exp'} eq "") {
292 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
293 }
294 }
295
296 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
297 $self->{'block_exp'} = $self->get_default_block_exp ();
298 }
299
300}
301
302sub begin {
303 my $self = shift (@_);
304 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
305
306 if ($self->{'OIDtype'} eq "auto") {
307 # hasn't been set in the plugin, use the processor values
308 $self->{'OIDtype'} = $processor->{'OIDtype'};
309 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
310 }
311 if ($self->{'OIDtype'} eq "hash") {
312 # should we hash on the file or on the doc xml??
313 $self->{'OIDtype'} = $self->get_oid_hash_type();
314 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
315 $self->{'OIDtype'} = "hash_on_file";
316 }
317 }
318}
319
320# This is called once if removeold is set with import.pl. Most plugins will do
321# nothing but if a plugin does any stuff outside of creating doc obj, then
322# it may need to clear something.
323sub remove_all {
324 my $self = shift (@_);
325 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
326}
327
328# This is called per document for docs that have been deleted from the
329# collection. Most plugins will do nothing
330# but if a plugin does any stuff outside of creating doc obj, then it may need
331# to clear something.
332sub remove_one {
333 my $self = shift (@_);
334
335 my ($file, $oids, $archivedir) = @_;
336 return 0 if $self->can_process_this_file($file);
337 return undef;
338}
339
340sub end {
341 # potentially called at the end of each plugin pass
342 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
343
344 my ($self) = shift (@_);
345}
346
347sub deinit {
348 # called only once, after all plugin passes have been done
349
350 my ($self) = @_;
351}
352
353# default hashing type is to hash on the original file (or converted file)
354# override this to return hash_on_ga_xml for filetypes where hashing on the
355# file is no good eg video
356sub get_oid_hash_type {
357
358 my $self = shift (@_);
359
360 return "hash_on_file";
361}
362
363
364# this function should be overridden to return 1
365# in recursive plugins
366sub is_recursive {
367 my $self = shift (@_);
368
369 return 0;
370}
371
372sub get_default_block_exp {
373 my $self = shift (@_);
374
375 return "";
376}
377
378sub get_default_process_exp {
379 my $self = shift (@_);
380
381 return "";
382}
383
384sub block_filename
385{
386 my $self = shift(@_);
387 my ($block_hash,$filename) = @_;
388
389 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
390
391 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
392 my $lower_filename = lc($filename);
393 $block_hash->{'file_blocks'}->{$lower_filename} = 1;
394# my $lower_drive = $filename;
395# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
396
397# my $upper_drive = $filename;
398# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
399#
400# $block_hash->{'file_blocks'}->{$lower_drive} = 1;
401# $block_hash->{'file_blocks'}->{$upper_drive} = 1;
402 }
403 else {
404 $block_hash->{'file_blocks'}->{$filename} = 1;
405 }
406}
407
408# rename imported files and assoc files using URL encoding by default
409# as this will work for most plugins and give more legible filenames
410sub get_default_file_rename_method() {
411 my $self = shift (@_);
412 return "url";
413}
414
415# returns this plugin's active (possibly user-selected) file_rename_method
416sub get_file_rename_method() {
417 my $self = shift (@_);
418 my $rename_method = $self->{'file_rename_method'};
419 if($rename_method) {
420 return $rename_method;
421 } else {
422 return $self->get_default_file_rename_method();
423 }
424}
425
426# default implementation is to do nothing
427sub store_block_files {
428
429 my $self =shift (@_);
430 my ($filename_full_path, $block_hash) = @_;
431
432}
433
434# put files to block into hash
435sub use_block_expressions {
436
437 my $self =shift (@_);
438 my ($filename_full_path, $block_hash) = @_;
439
440 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
441
442 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
443 $self->block_filename($block_hash,$filename_full_path);
444 }
445
446}
447
448#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
449sub block_cover_image
450{
451 my $self =shift;
452 my ($filename, $block_hash) = @_;
453
454 $filename = &util::upgrade_if_dos_filename($filename);
455
456 if ($self->{'cover_image'}) {
457 my $coverfile = $filename;
458 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
459 if (!&util::fd_exists($coverfile)) {
460 $coverfile =~ s/jpg$/JPG/;
461 }
462 if (&util::fd_exists($coverfile)) {
463 $self->block_filename($block_hash,$coverfile);
464 }
465 }
466
467 return;
468}
469
470
471# discover all the files that should be blocked by this plugin
472# check the args ...
473sub file_block_read {
474
475 my $self = shift (@_);
476 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
477 # Keep track of filenames with same root but different extensions
478 # Used to support -associate_ext and the more generalised
479 # -associate_tail_re
480 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
481
482 if (!-d $filename_full_path) {
483 $block_hash->{'all_files'}->{$file} = 1;
484 }
485
486 my $associate_tail_re = $self->{'associate_tail_re'};
487 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
488 my ($file_prefix,$file_ext)
489 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
490 if ((defined $file_prefix) && (defined $file_ext)) {
491 my $shared_fileroot = $block_hash->{'shared_fileroot'};
492 if (!defined $shared_fileroot->{$file_prefix}) {
493 my $file_prefix_rec = { 'tie_to' => undef,
494 'exts' => {} };
495 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
496 }
497
498 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
499
500 if ($self->can_process_this_file($filename_full_path)) {
501 # This is the document the others should be tied to
502 $file_prefix_rec->{'tie_to'} = $file_ext;
503 }
504 else {
505 if ($file_ext =~ m/$associate_tail_re$/) {
506 # this file should be associated to the main one
507 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
508 }
509 }
510
511 }
512 }
513
514 # check block expressions
515 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
516
517 # now check whether we are actually processing this
518 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
519 return undef; # can't recognise
520 }
521
522 # if we have a block_exp, then this overrides the normal 'smart' blocking
523 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
524
525 # block the cover image if there is one
526 if ($self->{'cover_image'}) {
527 $self->block_cover_image($filename_full_path, $block_hash);
528 }
529
530 return 1;
531}
532
533# plugins that rely on more than process_exp (eg XML plugins) can override this method
534sub can_process_this_file {
535 my $self = shift(@_);
536 my ($filename) = @_;
537
538 if (-d $filename && !$self->{'can_process_directories'}) {
539 return 0;
540 }
541
542 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
543 return 1;
544 }
545 return 0;
546
547}
548
549# just converts path as is to utf8.
550sub filepath_to_utf8 {
551 my $self = shift (@_);
552 my ($file, $file_encoding) = @_;
553 my $filemeta = $file;
554
555 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
556
557 # Whenever filename-encoding is set to any of the auto settings, we
558 # check if the filename is already in UTF8. If it is, then we're done.
559 if($filename_encoding =~ m/auto/) {
560 if(&unicode::check_is_utf8($filemeta))
561 {
562 $filename_encoding = "utf8";
563 return $filemeta;
564 }
565 }
566
567 # Auto setting, but filename is not utf8
568 if ($filename_encoding eq "auto")
569 {
570 # try textcat
571 $filename_encoding = $self->textcat_encoding($filemeta);
572
573 # check the locale next
574 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
575
576
577 # now try the encoding of the document, if available
578 if ($filename_encoding eq "undefined" && defined $file_encoding) {
579 $filename_encoding = $file_encoding;
580 }
581
582 }
583
584 elsif ($filename_encoding eq "auto-language-analysis")
585 {
586 $filename_encoding = $self->textcat_encoding($filemeta);
587
588 # now try the encoding of the document, if available
589 if ($filename_encoding eq "undefined" && defined $file_encoding) {
590 $filename_encoding = $file_encoding;
591 }
592 }
593
594 elsif ($filename_encoding eq "auto-filesystem-encoding")
595 {
596 # try locale
597 $filename_encoding = $self->locale_encoding();
598 }
599
600 elsif ($filename_encoding eq "auto-fl")
601 {
602 # filesystem-encoding (locale) then language-analysis (textcat)
603 $filename_encoding = $self->locale_encoding();
604
605 # try textcat
606 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
607
608 # else assume filename encoding is encoding of file content, if that's available
609 if ($filename_encoding eq "undefined" && defined $file_encoding) {
610 $filename_encoding = $file_encoding;
611 }
612 }
613
614 elsif ($filename_encoding eq "auto-lf")
615 {
616 # language-analysis (textcat) then filesystem-encoding (locale)
617 $filename_encoding = $self->textcat_encoding($filemeta);
618
619 # guess filename encoding from encoding of file content, if available
620 if ($filename_encoding eq "undefined" && defined $file_encoding) {
621 $filename_encoding = $file_encoding;
622 }
623
624 # try locale
625 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
626 }
627
628 # if still undefined, use utf8 as fallback
629 if ($filename_encoding eq "undefined") {
630 $filename_encoding = "utf8";
631 }
632
633 #print STDERR "**** UTF8 encoding the filename $filemeta ";
634
635 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
636 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
637 # be always utf8 (in which case the filename's encoding is also set as utf8 even
638 # though the filename need not be if it originates from another system)--in such
639 # cases attempt to make the filename utf8 to match.
640 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
641 &unicode::ensure_utf8(\$filemeta);
642 }
643
644 # convert non-unicode encodings to utf8
645 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
646 $filemeta = &unicode::unicode2utf8(
647 &unicode::convert2unicode($filename_encoding, \$filemeta)
648 );
649 }
650
651 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
652 return $filemeta;
653}
654
655# gets the filename with no path, converts to utf8, and then dm safes it.
656# filename_encoding set by user
657sub filename_to_utf8_metadata
658{
659 my $self = shift (@_);
660 my ($file, $file_encoding) = @_;
661
662 my $outhandle = $self->{'outhandle'};
663
664 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
665 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
666 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
667
668
669 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
670 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
671
672 return $filemeta;
673}
674
675sub locale_encoding {
676 my $self = shift(@_);
677
678 if (!defined $self->{'filesystem_encoding'}) {
679 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
680 }
681
682 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
683 return $self->{'filesystem_encoding'}; # can be the string "undefined"
684}
685
686sub textcat_encoding {
687 my $self = shift(@_);
688 my ($filemeta) = @_;
689
690 # analyse filenames without extensions and digits (and trimmed of
691 # surrounding whitespace), so that irrelevant chars don't confuse
692 # textcat
693 my $strictfilemeta = $filemeta;
694 $strictfilemeta =~ s/\.[^\.]+$//g;
695 $strictfilemeta =~ s/\d//g;
696 $strictfilemeta =~ s/^\s*//g;
697 $strictfilemeta =~ s/\s*$//g;
698
699 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
700 if(!defined $filename_encoding) {
701 $filename_encoding = "undefined";
702 }
703
704 return $filename_encoding; # can be the string "undefined"
705}
706
707# performs textcat
708sub encoding_from_language_analysis {
709 my $self = shift(@_);
710 my ($text) = @_;
711
712 my $outhandle = $self->{'outhandle'};
713 my $best_encoding = undef;
714
715 # get the language/encoding of the textstring using textcat
716 require textcat; # Only load the textcat module if it is required
717 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
718 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
719
720
721 if (scalar @$results < 0) {
722 return undef;
723 }
724
725 # We have some results, we choose the first
726 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
727
728 $best_encoding = $encoding;
729 if (!defined $best_encoding) {
730 return undef;
731 }
732
733 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
734 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
735 $best_encoding = 'utf8';
736 }
737
738
739 # check for equivalents where textcat doesn't have some encodings...
740 # eg MS versions of standard encodings
741 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
742 my $iso = $1; # which variant of the iso standard?
743 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
744 if ($text =~ /[\x80-\x9f]/) {
745 # Western Europe
746 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
747 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
748 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
749 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
750 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
751 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
752 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
753 }
754 }
755
756 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
757 !defined $encodings::encodings->{$best_encoding})
758 {
759 if ($self->{'verbosity'}) {
760 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
761 }
762 $best_encoding = undef;
763 }
764
765 return $best_encoding;
766}
767
768# uses locale
769sub get_filesystem_encoding
770{
771
772 my $self = shift(@_);
773
774 my $outhandle = $self->{'outhandle'};
775 my $filesystem_encoding = undef;
776
777 eval {
778 # Works for Windows as well, returning the DOS code page in use
779 use POSIX qw(locale_h);
780
781 # With only one parameter, setlocale retrieves the
782 # current value
783 my $current_locale = setlocale(LC_CTYPE);
784
785 my $char_encoding = undef;
786 if ($current_locale =~ m/\./) {
787 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
788 $char_encoding = lc($char_encoding);
789 }
790 else {
791 if ($current_locale =~ m/^(posix|c)$/i) {
792 $char_encoding = "ascii";
793 }
794 }
795
796 if (defined $char_encoding) {
797 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
798 $char_encoding = "$1\_$2\_$3";
799 }
800
801 $char_encoding =~ s/-/_/g;
802 $char_encoding =~ s/^utf_8$/utf8/;
803
804 if ($char_encoding =~ m/^\d+$/) {
805 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
806 $char_encoding = "windows_$char_encoding";
807 }
808 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
809 $char_encoding = "dos_$char_encoding";
810 }
811 }
812
813 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
814 || (defined $encodings::encodings->{$char_encoding})) {
815 $filesystem_encoding = $char_encoding;
816 }
817 else {
818 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
819 }
820 }
821
822
823 };
824 if ($@) {
825 print $outhandle "$@\n";
826 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
827
828 }
829
830 return $filesystem_encoding;
831}
832
833
834
835sub deduce_filename_encoding
836{
837 my $self = shift (@_);
838 my ($file,$metadata,$plugin_filename_encoding) = @_;
839
840 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
841 my $deduced_filename_encoding = undef;
842
843 # Start by looking for manually assigned metadata
844 if (defined $gs_filename_encoding) {
845 if (ref ($gs_filename_encoding) eq "ARRAY") {
846 my $outhandle = $self->{'outhandle'};
847
848 $deduced_filename_encoding = $gs_filename_encoding->[0];
849
850 my $num_vals = scalar(@$gs_filename_encoding);
851 if ($num_vals>1) {
852 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
853 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
854 }
855 }
856 else {
857 $deduced_filename_encoding = $gs_filename_encoding;
858 }
859 }
860
861 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
862 # Look to see if plugin specifies this value
863
864 if (defined $plugin_filename_encoding) {
865 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
866 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
867 my $outhandle = $self->{'outhandle'};
868 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
869 print $outhandle " default to 'auto'\n";
870 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
871 }
872
873 if ($plugin_filename_encoding ne "auto") {
874 # We've been given a specific filenamne encoding
875 # => so use it!
876 $deduced_filename_encoding = $plugin_filename_encoding;
877 }
878 }
879 }
880
881 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
882
883 # Look to file system to provide a character encoding
884
885 # If Windows NTFS, then -- assuming we work with long file names got through
886 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
887
888 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
889 # Can do better than working with the DOS character encoding returned by locale
890 $deduced_filename_encoding = "unicode";
891 }
892 else {
893 # Unix of some form or other
894
895 # See if we can determine the file system encoding through locale
896 $deduced_filename_encoding = $self->locale_encoding();
897
898 # if locale shows us filesystem is utf8, check to see filename is consistent
899 # => if not, then we have an "alien" filename on our hands
900
901 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
902 if (!&unicode::check_is_utf8($file)) {
903 # "alien" filename, so revert
904 $deduced_filename_encoding = undef;
905 }
906 }
907 }
908 }
909
910# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
911# # Last chance, apply textcat to deduce filename encoding
912# $deduced_filename_encoding = $self->textcat_encoding($file);
913# }
914
915 if ($self->{'verbosity'}>3) {
916 my $outhandle = $self->{'outhandle'};
917
918 if (defined $deduced_filename_encoding) {
919 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
920 }
921 else {
922 print $outhandle " No filename encoding deduced\n";
923 }
924 }
925
926 return $deduced_filename_encoding;
927}
928
929
930
931
932# Notionally written to be called once for each document, it is however safe to
933# call multiple times (as in the case of ImagePlugin) which calls this later on
934# after the original image has potentially been converted to a *new* source image
935# format (e.g. TIFF to PNG)
936
937sub set_Source_metadata {
938 my $self = shift (@_);
939 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
940
941 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
942 # and (as a fallback) using %xx if not for non-ascii chars
943 # 2. Sets the url ref (SourceFile) to the URL encoded version
944 # of filename for generated files
945
946 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
947
948 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
949
950 my $octet_file = $raw_file;
951
952 # UTF-8 version of filename
953# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
954# print STDERR "**** Setting Source Metadata given: $octet_file\n";
955# }
956
957 # Deal with (on Windows) raw filenames that are in their
958 # abbreviated DOS form
959
960 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
961 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
962 if (-e $raw_filename) {
963 my $unicode_filename = Win32::GetLongPathName($raw_filename);
964
965 my $unused_full_uf;
966 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
967 }
968 }
969 }
970
971 my $url_encoded_filename;
972 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
973 # => Generate a pretty print version of filename that is mapped to Unicode
974
975 # Use filename_encoding to map raw filename to a Perl unicode-aware string
976 $url_encoded_filename = decode($filename_encoding,$octet_file);
977 }
978 else {
979 # otherwise generate %xx encoded version of filename for char > 127
980 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
981 }
982
983# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
984# print STDERR "****** saving Source as: $url_encoded_filename\n";
985# }
986
987
988 # Source is the UTF8 display name - not necessarily the name of the file on the system
989 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
990
991 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
992 # If using URL encoding, then SourceFile is the url-reference to url-encoded
993 # renamed_raw_url: it's a url that refers to the actual file on the system
994 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
995
996 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
997 $renamed_raw_url);
998
999# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1000# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
1001# }
1002}
1003
1004# this should be called by all plugins to set the oid of the doc obj, rather
1005# than calling doc_obj->set_OID directly
1006sub add_OID {
1007 my $self = shift (@_);
1008 my ($doc_obj) = @_;
1009
1010 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
1011
1012 # see if there is a plugin specific set_OID function
1013 if (defined ($self->can('set_OID'))) {
1014 $self->set_OID(@_); # pass through doc_obj and any extra arguments
1015 }
1016 else {
1017 # use the default set_OID() in doc.pm
1018 $doc_obj->set_OID();
1019 }
1020
1021}
1022
1023# The BasePlugin read_into_doc_obj() function. This function does all the
1024# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
1025# a file and sets up a slew of metadata all saved in doc_obj, which
1026# it then returns as part of a tuple (process_status,doc_obj)
1027#
1028# Much of this functionality used to reside in read, but it was broken
1029# down into a supporting routine to make the code more flexible.
1030#
1031# recursive plugins (e.g. RecPlug) and specialized plugins like those
1032# capable of processing many documents within a single file (e.g.
1033# GMLPlug) will normally want to implement their own version of
1034# read_into_doc_obj()
1035#
1036# Note that $base_dir might be "" and that $file might
1037# include directories
1038
1039# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1040sub read_into_doc_obj {
1041 my $self = shift (@_);
1042 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1043
1044 my $outhandle = $self->{'outhandle'};
1045
1046 # should we move this to read? What about secondary plugins?
1047 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
1048 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1049 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1050 if $self->{'verbosity'} > 1;
1051
1052 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1053
1054 # create a new document
1055 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1056 my $top_section = $doc_obj->get_top_section();
1057
1058 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1059 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1060
1061
1062 my $plugin_filename_encoding = $self->{'filename_encoding'};
1063 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1064 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
1065
1066 # plugin specific stuff - what args do we need here??
1067 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1068 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1069 return -1;
1070 }
1071
1072 # include any metadata passed in from previous plugins
1073 # note that this metadata is associated with the top level section
1074 my $section = $doc_obj->get_top_section();
1075 # can we merge these two methods??
1076 $self->add_associated_files($doc_obj, $filename_full_path);
1077 $self->extra_metadata ($doc_obj, $section, $metadata);
1078 $self->auto_extract_metadata($doc_obj);
1079
1080 # if we haven't found any Title so far, assign one
1081 # this was shifted to here from inside read()
1082 $self->title_fallback($doc_obj,$section,$filename_no_path);
1083
1084 $self->add_OID($doc_obj);
1085
1086 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1087 return (1,$doc_obj);
1088}
1089
1090sub post_process_doc_obj {
1091 my $self = shift (@_);
1092 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1093
1094 return 1;
1095}
1096
1097sub add_dummy_text {
1098 my $self = shift(@_);
1099 my ($doc_obj, $section) = @_;
1100
1101 # add NoText metadata so we can hide this dummy text in format statements
1102 $doc_obj->add_metadata($section, "NoText", "1");
1103 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1104
1105}
1106
1107# does nothing. Can be overridden by subclass
1108sub auto_extract_metadata {
1109 my $self = shift(@_);
1110 my ($doc_obj) = @_;
1111}
1112
1113# adds cover image, associate_file options stuff. Should be called by sub class
1114# read_into_doc_obj
1115sub add_associated_files {
1116 my $self = shift(@_);
1117 # whatis filename??
1118 my ($doc_obj, $filename) = @_;
1119
1120 # add in the cover image
1121 if ($self->{'cover_image'}) {
1122 $self->associate_cover_image($doc_obj, $filename);
1123 }
1124 # store the original (used for eg TextPlugin to store the original for OAI)
1125 if ($self->{'store_original_file'}) {
1126 $self->associate_source_file($doc_obj, $filename);
1127 }
1128
1129
1130}
1131
1132# implement this if you are extracting metadata for other documents
1133sub metadata_read {
1134 my $self = shift (@_);
1135 my ($pluginfo, $base_dir, $file, $block_hash,
1136 $extrametakeys, $extrametadata, $extrametafile,
1137 $processor, $gli, $aux) = @_;
1138
1139 # can we process this file??
1140 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1141 return undef unless $self->can_process_this_file($filename_full_path);
1142
1143 return 1; # we recognise the file, but don't actually do anything with it
1144}
1145
1146
1147# The BasePlugin read() function. This function calls read_into_doc_obj()
1148# to ensure all the right things to make general options work for a
1149# given plugin are done. It then calls the process() function which
1150# does all the work specific to a plugin (like the old read functions
1151# used to do). Most plugins should define their own process() function
1152# and let this read() function keep control.
1153#
1154# recursive plugins (e.g. RecPlug) and specialized plugins like those
1155# capable of processing many documents within a single file (e.g.
1156# GMLPlug) might want to implement their own version of read(), but
1157# more likely need to implement their own version of read_into_doc_obj()
1158#
1159# Return number of files processed, undef if can't recognise, -1 if can't
1160# process
1161
1162sub read {
1163 my $self = shift (@_);
1164 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1165
1166 # can we process this file??
1167 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1168
1169 return undef unless $self->can_process_this_file($filename_full_path);
1170
1171 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1172
1173 if ((defined $process_status) && ($process_status == 1)) {
1174
1175 # process the document
1176 $processor->process($doc_obj);
1177
1178 $self->{'num_processed'} ++;
1179 undef $doc_obj;
1180 }
1181 # delete any temp files that we may have created
1182 $self->clean_up_after_doc_obj_processing();
1183
1184
1185 # if process_status == 1, then the file has been processed.
1186 return $process_status;
1187
1188}
1189
1190# returns undef if file is rejected by the plugin
1191sub process {
1192 my $self = shift (@_);
1193 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1194
1195 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
1196
1197 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1198 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1199
1200 die "\n";
1201
1202 return undef; # never gets here
1203}
1204
1205# overwrite this method to delete any temp files that we have created
1206sub clean_up_after_doc_obj_processing {
1207 my $self = shift(@_);
1208
1209}
1210
1211# write_file -- used by ConvertToPlug, for example in post processing
1212#
1213# where should this go, is here the best place??
1214sub utf8_write_file {
1215 my $self = shift (@_);
1216 my ($textref, $filename) = @_;
1217
1218 if (!open (FILE, ">:utf8", $filename)) {
1219 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1220 die "\n";
1221 }
1222 print FILE $$textref;
1223
1224 close FILE;
1225}
1226
1227
1228sub filename_based_title
1229{
1230 my $self = shift (@_);
1231 my ($file) = @_;
1232
1233 my $file_derived_title = $file;
1234 $file_derived_title =~ s/_/ /g;
1235 $file_derived_title =~ s/\.[^.]+$//;
1236
1237 return $file_derived_title;
1238}
1239
1240
1241sub title_fallback
1242{
1243 my $self = shift (@_);
1244 my ($doc_obj,$section,$file) = @_;
1245
1246 if (!defined $doc_obj->get_metadata_element ($section, "Title")
1247 || $doc_obj->get_metadata_element($section, "Title") eq "") {
1248
1249 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1250 my $file_derived_title;
1251 if (defined $source_file) {
1252 $file_derived_title = $self->filename_based_title($source_file);
1253 }
1254 else {
1255 # pp = pretty print
1256 my $pp_file = (defined $source_file) ? $source_file : $file;
1257
1258 my $raw_title = $self->filename_based_title($file);
1259 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1260 }
1261
1262
1263 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1264 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1265 }
1266 else {
1267 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1268 }
1269 }
1270
1271}
1272
1273# add any extra metadata that's been passed around from one
1274# plugin to another.
1275# extra_metadata uses add_utf8_metadata so it expects metadata values
1276# to already be in utf8
1277sub extra_metadata {
1278 my $self = shift (@_);
1279 my ($doc_obj, $cursection, $metadata) = @_;
1280
1281 my $associate_tail_re = $self->{'associate_tail_re'};
1282
1283 foreach my $field (keys(%$metadata)) {
1284 # $metadata->{$field} may be an array reference
1285 if ($field eq "gsdlassocfile_tobe") {
1286 # 'gsdlassocfile_tobe' is artificially introduced metadata
1287 # that is used to signal that certain additional files should
1288 # be tied to this document. Useful in situations where a
1289 # metadata pass in the plugin pipeline works out some files
1290 # need to be associated with a document, but the document hasn't
1291 # been formed yet.
1292 my $equiv_form = "";
1293 foreach my $gaf (@{$metadata->{$field}}) {
1294 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1295 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1296
1297 # we need to make sure the filename is valid utf-8 - we do
1298 # this by url or base64 encoding it
1299 # $tail_filename is the name that we store the file as
1300 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1301 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1302 $doc_obj->associate_source_file($full_filename);
1303 # If the filename is url_encoded, we need to encode the % signs
1304 # in the filename, so that it works in a url
1305 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1306 # work out extended tail extension (i.e. matching tail re)
1307
1308 my ($file_prefix,$file_extended_ext)
1309 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1310 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1311 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1312 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1313 my $srcicon = "_icon".$doc_ext."_";
1314 my $end_doclink = "</a>";
1315
1316 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1317
1318 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1319 # for metadata such as [mp3._edited] [mp3._full] ...
1320 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1321 }
1322
1323 # for multiple metadata such as [mp3.assoclink]
1324 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1325
1326 $equiv_form .= " $assoc_form";
1327 }
1328 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1329 }
1330 elsif ($field eq "gsdlzipfilename") {
1331 # special case for when files have come out of a zip. source_path
1332 # (used for archives dbs and keeping track for incremental import)
1333 # must be set to the zip file name
1334 my $zip_filename = $metadata->{$field};
1335 # overwrite the source_path
1336 $doc_obj->set_source_path($zip_filename);
1337 # and set the metadata
1338 $zip_filename = &util::filename_within_collection($zip_filename);
1339 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1340 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1341 }
1342 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1343 map {
1344 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1345 } @{$metadata->{$field}};
1346 } else {
1347 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1348 }
1349 }
1350}
1351
1352
1353sub compile_stats {
1354 my $self = shift(@_);
1355 my ($stats) = @_;
1356
1357 $stats->{'num_processed'} += $self->{'num_processed'};
1358 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1359 $stats->{'num_archives'} += $self->{'num_archives'};
1360
1361}
1362sub associate_source_file {
1363 my $self = shift(@_);
1364
1365 my ($doc_obj, $filename) = @_;
1366 my $cursection = $doc_obj->get_top_section();
1367 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1368
1369 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1370 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1371}
1372
1373sub associate_cover_image {
1374 my $self = shift(@_);
1375 my ($doc_obj, $filename) = @_;
1376
1377 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1378
1379 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1380 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1381
1382 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1383 # don't stat() for existence e.g. for multiple document input files
1384 # (eg SplitPlug)
1385 return;
1386 }
1387
1388 my $top_section=$doc_obj->get_top_section();
1389
1390 if (&util::fd_exists($upgraded_filename)) {
1391 $doc_obj->associate_source_file($filename);
1392 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1393 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1394 } else {
1395 my $upper_filename = $filename;
1396 my $upgraded_upper_filename = $upgraded_filename;
1397
1398 $upper_filename =~ s/jpg$/JPG/;
1399 $upgraded_upper_filename =~ s/jpg$/JPG/;
1400
1401 if (&util::fd_exists($upgraded_upper_filename)) {
1402 $doc_obj->associate_source_file($upper_filename);
1403 $doc_obj->associate_file($upper_filename, "cover.jpg",
1404 "image/jpeg");
1405 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1406 } else {
1407 # file doesn't exist, so record the fact that it's missing so
1408 # we don't stat() again (stat is slow)
1409 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1410 }
1411 }
1412
1413}
1414
1415
1416# Overridden by exploding plugins (eg. ISISPlug)
1417sub clean_up_after_exploding
1418{
1419 my $self = shift(@_);
1420}
1421
1422
1423
14241;
Note: See TracBrowser for help on using the repository browser.