source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 23472

Last change on this file since 23472 was 23472, checked in by ak19, 13 years ago

Erroneous forth argument (a filename) left over from an earlier time, was now being picked up as the *new* 4th parameter (which should have been a $section). Now fixed.

  • Property svn:keywords set to Author Date Id Revision
File size: 44.6 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34
35use encodings;
36use unicode;
37use doc;
38use ghtml;
39use gsprintf 'gsprintf';
40
41use PrintInfo;
42
43BEGIN {
44 @BasePlugin::ISA = ( 'PrintInfo' );
45}
46
47# the different methods that can be applied when renaming
48# imported documents and their associated files
49our $file_rename_method_list =
50 [ { 'name' => "url",
51 'desc' => "{BasePlugin.rename_method.url}" },
52 { 'name' => "base64",
53 'desc' => "{BasePlugin.rename_method.base64}" },
54 { 'name' => "none",
55 'desc' => "{BasePlugin.rename_method.none}",
56 'hiddengli' => "yes" } ];
57
58our $encoding_list =
59 [ { 'name' => "ascii",
60 'desc' => "{BasePlugin.encoding.ascii}" },
61 { 'name' => "utf8",
62 'desc' => "{BasePlugin.encoding.utf8}" },
63 { 'name' => "unicode",
64 'desc' => "{BasePlugin.encoding.unicode}" } ];
65
66
67my $e = $encodings::encodings;
68foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
69{
70 my $hashEncode =
71 {'name' => $enc,
72 'desc' => $e->{$enc}->{'name'}};
73
74 push(@{$encoding_list},$hashEncode);
75}
76
77our $encoding_plus_auto_list =
78 [ { 'name' => "auto",
79 'desc' => "{BasePlugin.filename_encoding.auto}" },
80 { 'name' => "auto-language-analysis",
81 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
82 { 'name' => "auto-filesystem-encoding",
83 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
84 { 'name' => "auto-fl",
85 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
86 { 'name' => "auto-lf",
87 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
88
89push(@{$encoding_plus_auto_list},@{$encoding_list});
90
91our $oidtype_list =
92 [ { 'name' => "auto",
93 'desc' => "{BasePlugin.OIDtype.auto}" },
94 { 'name' => "hash",
95 'desc' => "{import.OIDtype.hash}" },
96 { 'name' => "assigned",
97 'desc' => "{import.OIDtype.assigned}" },
98 { 'name' => "incremental",
99 'desc' => "{import.OIDtype.incremental}" },
100 { 'name' => "dirname",
101 'desc' => "{import.OIDtype.dirname}" } ];
102
103my $arguments =
104 [ { 'name' => "process_exp",
105 'desc' => "{BasePlugin.process_exp}",
106 'type' => "regexp",
107 'deft' => "",
108 'reqd' => "no" },
109 { 'name' => "no_blocking",
110 'desc' => "{BasePlugin.no_blocking}",
111 'type' => "flag",
112 'reqd' => "no"},
113 { 'name' => "block_exp",
114 'desc' => "{BasePlugin.block_exp}",
115 'type' => "regexp",
116 'deft' => "",
117 'reqd' => "no" },
118 { 'name' => "store_original_file",
119 'desc' => "{BasePlugin.store_original_file}",
120 'type' => "flag",
121 'reqd' => "no" },
122 { 'name' => "associate_ext",
123 'desc' => "{BasePlugin.associate_ext}",
124 'type' => "string",
125 'reqd' => "no" },
126 { 'name' => "associate_tail_re",
127 'desc' => "{BasePlugin.associate_tail_re}",
128 'type' => "string",
129 'reqd' => "no" },
130 { 'name' => "OIDtype",
131 'desc' => "{import.OIDtype}",
132 'type' => "enum",
133 'list' => $oidtype_list,
134 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
135 'deft' => "auto",
136 'reqd' => "no" },
137 { 'name' => "OIDmetadata",
138 'desc' => "{import.OIDmetadata}",
139 'type' => "metadata",
140 'deft' => "dc.Identifier",
141 'reqd' => "no" },
142# { 'name' => "use_as_doc_identifier",
143# 'desc' => "{BasePlugin.use_as_doc_identifier}",
144# 'type' => "string",
145# 'reqd' => "no" ,
146# 'deft' => "" } ,
147 { 'name' => "no_cover_image",
148 'desc' => "{BasePlugin.no_cover_image}",
149 'type' => "flag",
150 'reqd' => "no" },
151 { 'name' => "filename_encoding",
152 'desc' => "{BasePlugin.filename_encoding}",
153 'type' => "enum",
154 'deft' => "auto",
155 'list' => $encoding_plus_auto_list,
156 'reqd' => "no" },
157 { 'name' => "smart_block",
158 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
159 'type' => "flag",
160 'reqd' => "no",
161 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
162 { 'name' => "file_rename_method",
163 'desc' => "{BasePlugin.file_rename_method}",
164 'type' => "enum",
165 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
166 'list' => $file_rename_method_list,
167 'reqd' => "no"
168 }
169
170 ];
171
172
173my $options = { 'name' => "BasePlugin",
174 'desc' => "{BasePlugin.desc}",
175 'abstract' => "yes",
176 'inherits' => "no",
177 'args' => $arguments };
178
179sub new {
180
181 my ($class) = shift (@_);
182 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
183 push(@$pluginlist, $class);
184
185 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
186 push(@{$hashArgOptLists->{"OptList"}},$options);
187
188 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
189
190 if ($self->{'info_only'}) {
191 # don't worry about any options etc
192 return bless $self, $class;
193 }
194
195 if ($self->{'smart_block'}) {
196 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
197 }
198 $self->{'smart_block'} = undef;
199
200 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
201 $self->{'plugin_type'} = $plugin_name;
202
203 # remove ex. from OIDmetadata
204 $self->{'OIDmetadata'} =~ s/^ex\.// if defined $self->{'OIDmetadata'};
205 $self->{'num_processed'} = 0;
206 $self->{'num_not_processed'} = 0;
207 $self->{'num_blocked'} = 0;
208 $self->{'num_archives'} = 0;
209 $self->{'cover_image'} = 1; # cover image is on by default
210 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
211 $self->{'can_process_directories'} = 0;
212 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
213
214 my $associate_ext = $self->{'associate_ext'};
215 if ((defined $associate_ext) && ($associate_ext ne "")) {
216
217 my $associate_tail_re = $self->{'associate_tail_re'};
218 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
219 my $outhandle = $self->{'outhandle'};
220 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
221 print $outhandle " defaulting to 'associate_tail_re'\n";
222 }
223 else {
224 my @exts = split(/,/,$associate_ext);
225
226 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
227 my $associate_tail_re = join("|",@exts_bracketed);
228 $self->{'associate_tail_re'} = $associate_tail_re;
229 }
230
231 delete $self->{'associate_ext'};
232 }
233
234 return bless $self, $class;
235
236}
237
238sub merge_inheritance
239{
240 my $self = {};
241 my @child_selfs = @_;
242
243 foreach my $child_self (@child_selfs) {
244 foreach my $key (keys %$child_self) {
245 if (defined $self->{$key}) {
246 if ($self->{$key} ne $child_self->{$key}) {
247# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
248# print STDERR "Existing stored value = $self->{$key}\n";
249# print STDERR "New (child) value = $child_self->{$key}\n";
250# print STDERR "Keeping existing value\n";
251 # Existing value seems to be option specified in collect.cfg
252
253 ### $self->{$key} = $child_self->{$key};
254
255 }
256 else {
257## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
258 }
259
260 }
261 else {
262 $self->{$key} = $child_self->{$key};
263 }
264 }
265 }
266
267 return $self;
268}
269
270# initialize BasePlugin options
271# if init() is overridden in a sub-class, remember to call BasePlugin::init()
272sub init {
273 my $self = shift (@_);
274 my ($verbosity, $outhandle, $failhandle) = @_;
275
276 # verbosity is passed through from the processor
277 $self->{'verbosity'} = $verbosity;
278
279 # as are the outhandle and failhandle
280 $self->{'outhandle'} = $outhandle if defined $outhandle;
281 $self->{'failhandle'} = $failhandle;
282# $self->SUPER::init(@_);
283
284 # set process_exp and block_exp to defaults unless they were
285 # explicitly set
286
287 if ((!$self->is_recursive()) and
288 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
289
290 $self->{'process_exp'} = $self->get_default_process_exp ();
291 if ($self->{'process_exp'} eq "") {
292 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
293 }
294 }
295
296 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
297 $self->{'block_exp'} = $self->get_default_block_exp ();
298 }
299
300}
301
302sub begin {
303 my $self = shift (@_);
304 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
305
306 if ($self->{'OIDtype'} eq "auto") {
307 # hasn't been set in the plugin, use the processor values
308 $self->{'OIDtype'} = $processor->{'OIDtype'};
309 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
310 }
311 if ($self->{'OIDtype'} eq "hash") {
312 # should we hash on the file or on the doc xml??
313 $self->{'OIDtype'} = $self->get_oid_hash_type();
314 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
315 $self->{'OIDtype'} = "hash_on_file";
316 }
317 }
318}
319
320# This is called once if removeold is set with import.pl. Most plugins will do
321# nothing but if a plugin does any stuff outside of creating doc obj, then
322# it may need to clear something.
323sub remove_all {
324 my $self = shift (@_);
325 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
326}
327
328# This is called per document for docs that have been deleted from the
329# collection. Most plugins will do nothing
330# but if a plugin does any stuff outside of creating doc obj, then it may need
331# to clear something.
332sub remove_one {
333 my $self = shift (@_);
334
335 my ($file, $oids, $archivedir) = @_;
336 return 0 if $self->can_process_this_file($file);
337 return undef;
338}
339
340sub end {
341 # potentially called at the end of each plugin pass
342 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
343
344 my ($self) = shift (@_);
345}
346
347sub deinit {
348 # called only once, after all plugin passes have been done
349
350 my ($self) = @_;
351}
352
353# default hashing type is to hash on the original file (or converted file)
354# override this to return hash_on_ga_xml for filetypes where hashing on the
355# file is no good eg video
356sub get_oid_hash_type {
357
358 my $self = shift (@_);
359
360 return "hash_on_file";
361}
362
363
364# this function should be overridden to return 1
365# in recursive plugins
366sub is_recursive {
367 my $self = shift (@_);
368
369 return 0;
370}
371
372sub get_default_block_exp {
373 my $self = shift (@_);
374
375 return "";
376}
377
378sub get_default_process_exp {
379 my $self = shift (@_);
380
381 return "";
382}
383
384sub block_filename
385{
386 my $self = shift(@_);
387 my ($block_hash,$filename) = @_;
388
389 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
390
391 my $lower_drive = $filename;
392 $lower_drive =~ s/^([A-Z]):/\l$1:/i;
393
394 my $upper_drive = $filename;
395 $upper_drive =~ s/^([A-Z]):/\u$1:/i;
396
397 $block_hash->{'file_blocks'}->{$lower_drive} = 1;
398 $block_hash->{'file_blocks'}->{$upper_drive} = 1;
399 }
400 else {
401 $block_hash->{'file_blocks'}->{$filename} = 1;
402 }
403}
404
405# rename imported files and assoc files using URL encoding by default
406# as this will work for most plugins and give more legible filenames
407sub get_default_file_rename_method() {
408 my $self = shift (@_);
409 return "url";
410}
411
412# returns this plugin's active (possibly user-selected) file_rename_method
413sub get_file_rename_method() {
414 my $self = shift (@_);
415 my $rename_method = $self->{'file_rename_method'};
416 if($rename_method) {
417 return $rename_method;
418 } else {
419 return $self->get_default_file_rename_method();
420 }
421}
422
423# default implementation is to do nothing
424sub store_block_files {
425
426 my $self =shift (@_);
427 my ($filename_full_path, $block_hash) = @_;
428
429}
430
431# put files to block into hash
432sub use_block_expressions {
433
434 my $self =shift (@_);
435 my ($filename_full_path, $block_hash) = @_;
436
437 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
438
439 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
440 $self->block_filename($block_hash,$filename_full_path);
441 }
442
443}
444
445#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
446sub block_cover_image
447{
448 my $self =shift;
449 my ($filename, $block_hash) = @_;
450
451 $filename = &util::upgrade_if_dos_filename($filename);
452
453 if ($self->{'cover_image'}) {
454 my $coverfile = $filename;
455 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
456 if (!&util::fd_exists($coverfile)) {
457 $coverfile =~ s/jpg$/JPG/;
458 }
459 if (&util::fd_exists($coverfile)) {
460 $self->block_filename($block_hash,$coverfile);
461 }
462 }
463
464 return;
465}
466
467
468# discover all the files that should be blocked by this plugin
469# check the args ...
470sub file_block_read {
471
472 my $self = shift (@_);
473 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
474 # Keep track of filenames with same root but different extensions
475 # Used to support -associate_ext and the more generalised
476 # -associate_tail_re
477 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
478
479 if (!-d $filename_full_path) {
480 $block_hash->{'all_files'}->{$file} = 1;
481 }
482
483 my $associate_tail_re = $self->{'associate_tail_re'};
484 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
485 my ($file_prefix,$file_ext)
486 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
487 if ((defined $file_prefix) && (defined $file_ext)) {
488 my $shared_fileroot = $block_hash->{'shared_fileroot'};
489 if (!defined $shared_fileroot->{$file_prefix}) {
490 my $file_prefix_rec = { 'tie_to' => undef,
491 'exts' => {} };
492 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
493 }
494
495 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
496
497 if ($self->can_process_this_file($filename_full_path)) {
498 # This is the document the others should be tied to
499 $file_prefix_rec->{'tie_to'} = $file_ext;
500 }
501 else {
502 if ($file_ext =~ m/$associate_tail_re$/) {
503 # this file should be associated to the main one
504 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
505 }
506 }
507
508 }
509 }
510
511 # check block expressions
512 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
513
514 # now check whether we are actually processing this
515 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
516 return undef; # can't recognise
517 }
518
519 # if we have a block_exp, then this overrides the normal 'smart' blocking
520 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
521
522 # block the cover image if there is one
523 if ($self->{'cover_image'}) {
524 $self->block_cover_image($filename_full_path, $block_hash);
525 }
526
527 return 1;
528}
529
530# plugins that rely on more than process_exp (eg XML plugins) can override this method
531sub can_process_this_file {
532 my $self = shift(@_);
533 my ($filename) = @_;
534
535 if (-d $filename && !$self->{'can_process_directories'}) {
536 return 0;
537 }
538
539 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
540 return 1;
541 }
542 return 0;
543
544}
545
546# just converts path as is to utf8.
547sub filepath_to_utf8 {
548 my $self = shift (@_);
549 my ($file, $file_encoding) = @_;
550 my $filemeta = $file;
551
552 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
553
554 # Whenever filename-encoding is set to any of the auto settings, we
555 # check if the filename is already in UTF8. If it is, then we're done.
556 if($filename_encoding =~ m/auto/) {
557 if(&unicode::check_is_utf8($filemeta))
558 {
559 $filename_encoding = "utf8";
560 return $filemeta;
561 }
562 }
563
564 # Auto setting, but filename is not utf8
565 if ($filename_encoding eq "auto")
566 {
567 # try textcat
568 $filename_encoding = $self->textcat_encoding($filemeta);
569
570 # check the locale next
571 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
572
573
574 # now try the encoding of the document, if available
575 if ($filename_encoding eq "undefined" && defined $file_encoding) {
576 $filename_encoding = $file_encoding;
577 }
578
579 }
580
581 elsif ($filename_encoding eq "auto-language-analysis")
582 {
583 $filename_encoding = $self->textcat_encoding($filemeta);
584
585 # now try the encoding of the document, if available
586 if ($filename_encoding eq "undefined" && defined $file_encoding) {
587 $filename_encoding = $file_encoding;
588 }
589 }
590
591 elsif ($filename_encoding eq "auto-filesystem-encoding")
592 {
593 # try locale
594 $filename_encoding = $self->locale_encoding();
595 }
596
597 elsif ($filename_encoding eq "auto-fl")
598 {
599 # filesystem-encoding (locale) then language-analysis (textcat)
600 $filename_encoding = $self->locale_encoding();
601
602 # try textcat
603 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
604
605 # else assume filename encoding is encoding of file content, if that's available
606 if ($filename_encoding eq "undefined" && defined $file_encoding) {
607 $filename_encoding = $file_encoding;
608 }
609 }
610
611 elsif ($filename_encoding eq "auto-lf")
612 {
613 # language-analysis (textcat) then filesystem-encoding (locale)
614 $filename_encoding = $self->textcat_encoding($filemeta);
615
616 # guess filename encoding from encoding of file content, if available
617 if ($filename_encoding eq "undefined" && defined $file_encoding) {
618 $filename_encoding = $file_encoding;
619 }
620
621 # try locale
622 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
623 }
624
625 # if still undefined, use utf8 as fallback
626 if ($filename_encoding eq "undefined") {
627 $filename_encoding = "utf8";
628 }
629
630 #print STDERR "**** UTF8 encoding the filename $filemeta ";
631
632 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
633 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
634 # be always utf8 (in which case the filename's encoding is also set as utf8 even
635 # though the filename need not be if it originates from another system)--in such
636 # cases attempt to make the filename utf8 to match.
637 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
638 &unicode::ensure_utf8(\$filemeta);
639 }
640
641 # convert non-unicode encodings to utf8
642 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
643 $filemeta = &unicode::unicode2utf8(
644 &unicode::convert2unicode($filename_encoding, \$filemeta)
645 );
646 }
647
648 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
649 return $filemeta;
650}
651
652# gets the filename with no path, converts to utf8, and then dm safes it.
653# filename_encoding set by user
654sub filename_to_utf8_metadata
655{
656 my $self = shift (@_);
657 my ($file, $file_encoding) = @_;
658
659 my $outhandle = $self->{'outhandle'};
660
661 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
662 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
663 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
664
665
666 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
667 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
668
669 return $filemeta;
670}
671
672sub locale_encoding {
673 my $self = shift(@_);
674
675 if (!defined $self->{'filesystem_encoding'}) {
676 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
677 }
678
679 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
680 return $self->{'filesystem_encoding'}; # can be the string "undefined"
681}
682
683sub textcat_encoding {
684 my $self = shift(@_);
685 my ($filemeta) = @_;
686
687 # analyse filenames without extensions and digits (and trimmed of
688 # surrounding whitespace), so that irrelevant chars don't confuse
689 # textcat
690 my $strictfilemeta = $filemeta;
691 $strictfilemeta =~ s/\.[^\.]+$//g;
692 $strictfilemeta =~ s/\d//g;
693 $strictfilemeta =~ s/^\s*//g;
694 $strictfilemeta =~ s/\s*$//g;
695
696 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
697 if(!defined $filename_encoding) {
698 $filename_encoding = "undefined";
699 }
700
701 return $filename_encoding; # can be the string "undefined"
702}
703
704# performs textcat
705sub encoding_from_language_analysis {
706 my $self = shift(@_);
707 my ($text) = @_;
708
709 my $outhandle = $self->{'outhandle'};
710 my $best_encoding = undef;
711
712 # get the language/encoding of the textstring using textcat
713 require textcat; # Only load the textcat module if it is required
714 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
715 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
716
717
718 if (scalar @$results < 0) {
719 return undef;
720 }
721
722 # We have some results, we choose the first
723 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
724
725 $best_encoding = $encoding;
726 if (!defined $best_encoding) {
727 return undef;
728 }
729
730 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
731 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
732 $best_encoding = 'utf8';
733 }
734
735
736 # check for equivalents where textcat doesn't have some encodings...
737 # eg MS versions of standard encodings
738 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
739 my $iso = $1; # which variant of the iso standard?
740 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
741 if ($text =~ /[\x80-\x9f]/) {
742 # Western Europe
743 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
744 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
745 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
746 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
747 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
748 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
749 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
750 }
751 }
752
753 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
754 !defined $encodings::encodings->{$best_encoding})
755 {
756 if ($self->{'verbosity'}) {
757 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
758 }
759 $best_encoding = undef;
760 }
761
762 return $best_encoding;
763}
764
765# uses locale
766sub get_filesystem_encoding
767{
768
769 my $self = shift(@_);
770
771 my $outhandle = $self->{'outhandle'};
772 my $filesystem_encoding = undef;
773
774 eval {
775 # Works for Windows as well, returning the DOS code page in use
776 use POSIX qw(locale_h);
777
778 # With only one parameter, setlocale retrieves the
779 # current value
780 my $current_locale = setlocale(LC_CTYPE);
781
782 my $char_encoding = undef;
783 if ($current_locale =~ m/\./) {
784 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
785 $char_encoding = lc($char_encoding);
786 }
787 else {
788 if ($current_locale =~ m/^(posix|c)$/i) {
789 $char_encoding = "ascii";
790 }
791 }
792
793 if (defined $char_encoding) {
794 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
795 $char_encoding = "$1\_$2\_$3";
796 }
797
798 $char_encoding =~ s/-/_/g;
799 $char_encoding =~ s/^utf_8$/utf8/;
800
801 if ($char_encoding =~ m/^\d+$/) {
802 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
803 $char_encoding = "windows_$char_encoding";
804 }
805 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
806 $char_encoding = "dos_$char_encoding";
807 }
808 }
809
810 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
811 || (defined $encodings::encodings->{$char_encoding})) {
812 $filesystem_encoding = $char_encoding;
813 }
814 else {
815 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
816 }
817 }
818
819
820 };
821 if ($@) {
822 print $outhandle "$@\n";
823 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
824
825 }
826
827 return $filesystem_encoding;
828}
829
830
831
832sub deduce_filename_encoding
833{
834 my $self = shift (@_);
835 my ($file,$metadata,$plugin_filename_encoding) = @_;
836
837 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
838 my $deduced_filename_encoding = undef;
839
840 # Start by looking for manually assigned metadata
841 if (defined $gs_filename_encoding) {
842 if (ref ($gs_filename_encoding) eq "ARRAY") {
843 my $outhandle = $self->{'outhandle'};
844
845 $deduced_filename_encoding = $gs_filename_encoding->[0];
846
847 my $num_vals = scalar(@$gs_filename_encoding);
848 if ($num_vals>1) {
849 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
850 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
851 }
852 }
853 else {
854 $deduced_filename_encoding = $gs_filename_encoding;
855 }
856 }
857
858 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
859 # Look to see if plugin specifies this value
860
861 if (defined $plugin_filename_encoding) {
862 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
863 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
864 my $outhandle = $self->{'outhandle'};
865 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
866 print $outhandle " default to 'auto'\n";
867 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
868 }
869
870 if ($plugin_filename_encoding ne "auto") {
871 # We've been given a specific filenamne encoding
872 # => so use it!
873 $deduced_filename_encoding = $plugin_filename_encoding;
874 }
875 }
876 }
877
878 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
879
880 # Look to file system to provide a character encoding
881
882 # If Windows NTFS, then -- assuming we work with long file names got through
883 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
884
885 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
886 # Can do better than working with the DOS character encoding returned by locale
887 $deduced_filename_encoding = "unicode";
888 }
889 else {
890 # Unix of some form or other
891
892 # See if we can determine the file system encoding through locale
893 $deduced_filename_encoding = $self->locale_encoding();
894
895 # if locale shows us filesystem is utf8, check to see filename is consistent
896 # => if not, then we have an "alien" filename on our hands
897
898 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
899 if (!&unicode::check_is_utf8($file)) {
900 # "alien" filename, so revert
901 $deduced_filename_encoding = undef;
902 }
903 }
904 }
905 }
906
907# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
908# # Last chance, apply textcat to deduce filename encoding
909# $deduced_filename_encoding = $self->textcat_encoding($file);
910# }
911
912 if ($self->{'verbosity'}>3) {
913 my $outhandle = $self->{'outhandle'};
914
915 if (defined $deduced_filename_encoding) {
916 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
917 }
918 else {
919 print $outhandle " No filename encoding deduced\n";
920 }
921 }
922
923 return $deduced_filename_encoding;
924}
925
926
927
928
929# Notionally written to be called once for each document, it is however safe to
930# call multiple times (as in the case of ImagePlugin) which calls this later on
931# after the original image has potentially been converted to a *new* source image
932# format (e.g. TIFF to PNG)
933
934sub set_Source_metadata {
935 my $self = shift (@_);
936 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
937
938 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
939 # and (as a fallback) using %xx if not for non-ascii chars
940 # 2. Sets the url ref (SourceFile) to the URL encoded version
941 # of filename for generated files
942
943 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
944
945 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
946
947 my $octet_file = $raw_file;
948
949 # UTF-8 version of filename
950# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
951# print STDERR "**** Setting Source Metadata given: $octet_file\n";
952# }
953
954 # Deal with (on Windows) raw filenames that are in their
955 # abbreviated DOS form
956
957 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
958 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
959 if (-e $raw_filename) {
960 my $unicode_filename = Win32::GetLongPathName($raw_filename);
961
962 my $unused_full_uf;
963 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
964 }
965 }
966 }
967
968 my $url_encoded_filename;
969 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
970 # => Generate a pretty print version of filename that is mapped to Unicode
971
972 # Use filename_encoding to map raw filename to a Perl unicode-aware string
973 $url_encoded_filename = decode($filename_encoding,$octet_file);
974 }
975 else {
976 # otherwise generate %xx encoded version of filename for char > 127
977 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
978 }
979
980# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
981# print STDERR "****** saving Source as: $url_encoded_filename\n";
982# }
983
984
985 # Source is the UTF8 display name - not necessarily the name of the file on the system
986 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
987
988 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
989 # If using URL encoding, then SourceFile is the url-reference to url-encoded
990 # renamed_raw_url: it's a url that refers to the actual file on the system
991 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
992
993 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
994 $renamed_raw_url);
995
996# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
997# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
998# }
999}
1000
1001# this should be called by all plugins to set the oid of the doc obj, rather
1002# than calling doc_obj->set_OID directly
1003sub add_OID {
1004 my $self = shift (@_);
1005 my ($doc_obj) = @_;
1006
1007 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
1008
1009 # see if there is a plugin specific set_OID function
1010 if (defined ($self->can('set_OID'))) {
1011 $self->set_OID(@_); # pass through doc_obj and any extra arguments
1012 }
1013 else {
1014 # use the default set_OID() in doc.pm
1015 $doc_obj->set_OID();
1016 }
1017
1018}
1019
1020# The BasePlugin read_into_doc_obj() function. This function does all the
1021# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
1022# a file and sets up a slew of metadata all saved in doc_obj, which
1023# it then returns as part of a tuple (process_status,doc_obj)
1024#
1025# Much of this functionality used to reside in read, but it was broken
1026# down into a supporting routine to make the code more flexible.
1027#
1028# recursive plugins (e.g. RecPlug) and specialized plugins like those
1029# capable of processing many documents within a single file (e.g.
1030# GMLPlug) will normally want to implement their own version of
1031# read_into_doc_obj()
1032#
1033# Note that $base_dir might be "" and that $file might
1034# include directories
1035
1036# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1037sub read_into_doc_obj {
1038 my $self = shift (@_);
1039 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1040
1041 my $outhandle = $self->{'outhandle'};
1042
1043 # should we move this to read? What about secondary plugins?
1044 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1045 my $pp_file = &util::prettyprint_file($base_dir,$file);
1046 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1047 if $self->{'verbosity'} > 1;
1048
1049 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1050
1051 # create a new document
1052 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1053 my $top_section = $doc_obj->get_top_section();
1054
1055 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1056 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1057
1058
1059 my $plugin_filename_encoding = $self->{'filename_encoding'};
1060 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1061 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
1062
1063 # plugin specific stuff - what args do we need here??
1064 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1065 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1066 return -1;
1067 }
1068
1069 # include any metadata passed in from previous plugins
1070 # note that this metadata is associated with the top level section
1071 my $section = $doc_obj->get_top_section();
1072 # can we merge these two methods??
1073 $self->add_associated_files($doc_obj, $filename_full_path);
1074 $self->extra_metadata ($doc_obj, $section, $metadata);
1075 $self->auto_extract_metadata($doc_obj);
1076
1077 # if we haven't found any Title so far, assign one
1078 # this was shifted to here from inside read()
1079 $self->title_fallback($doc_obj,$section,$filename_no_path);
1080
1081 $self->add_OID($doc_obj);
1082
1083 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1084 return (1,$doc_obj);
1085}
1086
1087sub post_process_doc_obj {
1088 my $self = shift (@_);
1089 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1090
1091 return 1;
1092}
1093
1094sub add_dummy_text {
1095 my $self = shift(@_);
1096 my ($doc_obj, $section) = @_;
1097
1098 # add NoText metadata so we can hide this dummy text in format statements
1099 $doc_obj->add_metadata($section, "NoText", "1");
1100 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1101
1102}
1103
1104# does nothing. Can be overridden by subclass
1105sub auto_extract_metadata {
1106 my $self = shift(@_);
1107 my ($doc_obj) = @_;
1108}
1109
1110# adds cover image, associate_file options stuff. Should be called by sub class
1111# read_into_doc_obj
1112sub add_associated_files {
1113 my $self = shift(@_);
1114 # whatis filename??
1115 my ($doc_obj, $filename) = @_;
1116
1117 # add in the cover image
1118 if ($self->{'cover_image'}) {
1119 $self->associate_cover_image($doc_obj, $filename);
1120 }
1121 # store the original (used for eg TextPlugin to store the original for OAI)
1122 if ($self->{'store_original_file'}) {
1123 $self->associate_source_file($doc_obj, $filename);
1124 }
1125
1126
1127}
1128
1129# implement this if you are extracting metadata for other documents
1130sub metadata_read {
1131 my $self = shift (@_);
1132 my ($pluginfo, $base_dir, $file, $block_hash,
1133 $extrametakeys, $extrametadata, $extrametafile,
1134 $processor, $gli, $aux) = @_;
1135
1136 # can we process this file??
1137 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1138 return undef unless $self->can_process_this_file($filename_full_path);
1139
1140 return 1; # we recognise the file, but don't actually do anything with it
1141}
1142
1143
1144# The BasePlugin read() function. This function calls read_into_doc_obj()
1145# to ensure all the right things to make general options work for a
1146# given plugin are done. It then calls the process() function which
1147# does all the work specific to a plugin (like the old read functions
1148# used to do). Most plugins should define their own process() function
1149# and let this read() function keep control.
1150#
1151# recursive plugins (e.g. RecPlug) and specialized plugins like those
1152# capable of processing many documents within a single file (e.g.
1153# GMLPlug) might want to implement their own version of read(), but
1154# more likely need to implement their own version of read_into_doc_obj()
1155#
1156# Return number of files processed, undef if can't recognise, -1 if can't
1157# process
1158
1159sub read {
1160 my $self = shift (@_);
1161 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1162
1163 # can we process this file??
1164 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1165
1166 return undef unless $self->can_process_this_file($filename_full_path);
1167
1168 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1169
1170 if ((defined $process_status) && ($process_status == 1)) {
1171
1172 # process the document
1173 $processor->process($doc_obj);
1174
1175 $self->{'num_processed'} ++;
1176 undef $doc_obj;
1177 }
1178 # delete any temp files that we may have created
1179 $self->clean_up_after_doc_obj_processing();
1180
1181
1182 # if process_status == 1, then the file has been processed.
1183 return $process_status;
1184
1185}
1186
1187# returns undef if file is rejected by the plugin
1188sub process {
1189 my $self = shift (@_);
1190 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1191
1192 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
1193
1194 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1195 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1196
1197 die "\n";
1198
1199 return undef; # never gets here
1200}
1201
1202# overwrite this method to delete any temp files that we have created
1203sub clean_up_after_doc_obj_processing {
1204 my $self = shift(@_);
1205
1206}
1207
1208# write_file -- used by ConvertToPlug, for example in post processing
1209#
1210# where should this go, is here the best place??
1211sub utf8_write_file {
1212 my $self = shift (@_);
1213 my ($textref, $filename) = @_;
1214
1215 if (!open (FILE, ">:utf8", $filename)) {
1216 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1217 die "\n";
1218 }
1219 print FILE $$textref;
1220
1221 close FILE;
1222}
1223
1224
1225sub filename_based_title
1226{
1227 my $self = shift (@_);
1228 my ($file) = @_;
1229
1230 my $file_derived_title = $file;
1231 $file_derived_title =~ s/_/ /g;
1232 $file_derived_title =~ s/\.[^.]+$//;
1233
1234 return $file_derived_title;
1235}
1236
1237
1238sub title_fallback
1239{
1240 my $self = shift (@_);
1241 my ($doc_obj,$section,$file) = @_;
1242
1243 if (!defined $doc_obj->get_metadata_element ($section, "Title")
1244 || $doc_obj->get_metadata_element($section, "Title") eq "") {
1245
1246 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1247 my $file_derived_title;
1248 if (defined $source_file) {
1249 $file_derived_title = $self->filename_based_title($source_file);
1250 }
1251 else {
1252 # pp = pretty print
1253 my $pp_file = (defined $source_file) ? $source_file : $file;
1254
1255 my $raw_title = $self->filename_based_title($file);
1256 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1257 }
1258
1259
1260 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1261 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1262 }
1263 else {
1264 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1265 }
1266 }
1267
1268}
1269
1270# add any extra metadata that's been passed around from one
1271# plugin to another.
1272# extra_metadata uses add_utf8_metadata so it expects metadata values
1273# to already be in utf8
1274sub extra_metadata {
1275 my $self = shift (@_);
1276 my ($doc_obj, $cursection, $metadata) = @_;
1277
1278 my $associate_tail_re = $self->{'associate_tail_re'};
1279
1280 foreach my $field (keys(%$metadata)) {
1281 # $metadata->{$field} may be an array reference
1282 if ($field eq "gsdlassocfile_tobe") {
1283 # 'gsdlassocfile_tobe' is artificially introduced metadata
1284 # that is used to signal that certain additional files should
1285 # be tied to this document. Useful in situations where a
1286 # metadata pass in the plugin pipeline works out some files
1287 # need to be associated with a document, but the document hasn't
1288 # been formed yet.
1289 my $equiv_form = "";
1290 foreach my $gaf (@{$metadata->{$field}}) {
1291 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1292 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1293
1294 # we need to make sure the filename is valid utf-8 - we do
1295 # this by url or base64 encoding it
1296 # $tail_filename is the name that we store the file as
1297 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1298 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1299 $doc_obj->associate_source_file($full_filename);
1300 # If the filename is url_encoded, we need to encode the % signs
1301 # in the filename, so that it works in a url
1302 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1303 # work out extended tail extension (i.e. matching tail re)
1304
1305 my ($file_prefix,$file_extended_ext)
1306 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1307 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1308 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1309 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1310 my $srcicon = "_icon".$doc_ext."_";
1311 my $end_doclink = "</a>";
1312
1313 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1314
1315 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1316 # for metadata such as [mp3._edited] [mp3._full] ...
1317 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1318 }
1319
1320 # for multiple metadata such as [mp3.assoclink]
1321 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1322
1323 $equiv_form .= " $assoc_form";
1324 }
1325 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1326 }
1327 elsif ($field eq "gsdlzipfilename") {
1328 # special case for when files have come out of a zip. source_path
1329 # (used for archives dbs and keeping track for incremental import)
1330 # must be set to the zip file name
1331 my $zip_filename = $metadata->{$field};
1332 # overwrite the source_path
1333 $doc_obj->set_source_path($zip_filename);
1334 # and set the metadata
1335 $zip_filename = &util::filename_within_collection($zip_filename);
1336 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1337 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1338 }
1339 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1340 map {
1341 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1342 } @{$metadata->{$field}};
1343 } else {
1344 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1345 }
1346 }
1347}
1348
1349
1350sub compile_stats {
1351 my $self = shift(@_);
1352 my ($stats) = @_;
1353
1354 $stats->{'num_processed'} += $self->{'num_processed'};
1355 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1356 $stats->{'num_archives'} += $self->{'num_archives'};
1357
1358}
1359sub associate_source_file {
1360 my $self = shift(@_);
1361
1362 my ($doc_obj, $filename) = @_;
1363 my $cursection = $doc_obj->get_top_section();
1364 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1365
1366 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1367 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1368}
1369
1370sub associate_cover_image {
1371 my $self = shift(@_);
1372 my ($doc_obj, $filename) = @_;
1373
1374 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1375
1376 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1377 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1378
1379 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1380 # don't stat() for existence e.g. for multiple document input files
1381 # (eg SplitPlug)
1382 return;
1383 }
1384
1385 my $top_section=$doc_obj->get_top_section();
1386
1387 if (&util::fd_exists($upgraded_filename)) {
1388 $doc_obj->associate_source_file($filename);
1389 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1390 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1391 } else {
1392 my $upper_filename = $filename;
1393 my $upgraded_upper_filename = $upgraded_filename;
1394
1395 $upper_filename =~ s/jpg$/JPG/;
1396 $upgraded_upper_filename =~ s/jpg$/JPG/;
1397
1398 if (&util::fd_exists($upgraded_upper_filename)) {
1399 $doc_obj->associate_source_file($upper_filename);
1400 $doc_obj->associate_file($upper_filename, "cover.jpg",
1401 "image/jpeg");
1402 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1403 } else {
1404 # file doesn't exist, so record the fact that it's missing so
1405 # we don't stat() again (stat is slow)
1406 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1407 }
1408 }
1409
1410}
1411
1412
1413# Overridden by exploding plugins (eg. ISISPlug)
1414sub clean_up_after_exploding
1415{
1416 my $self = shift(@_);
1417}
1418
1419
1420
14211;
Note: See TracBrowser for help on using the repository browser.