source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 23759

Last change on this file since 23759 was 23759, checked in by davidb, 13 years ago

Now that generalized exploding is coming on-line, some additional checks are needed. In this case, an exploded document will already have an original metadata field called Source. Detect this and retain its value under the metadata name OrigSource

  • Property svn:keywords set to Author Date Id Revision
File size: 44.6 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34
35use encodings;
36use unicode;
37use doc;
38use ghtml;
39use gsprintf 'gsprintf';
40
41use PrintInfo;
42
43BEGIN {
44 @BasePlugin::ISA = ( 'PrintInfo' );
45}
46
47# the different methods that can be applied when renaming
48# imported documents and their associated files
49our $file_rename_method_list =
50 [ { 'name' => "url",
51 'desc' => "{BasePlugin.rename_method.url}" },
52 { 'name' => "base64",
53 'desc' => "{BasePlugin.rename_method.base64}" },
54 { 'name' => "none",
55 'desc' => "{BasePlugin.rename_method.none}",
56 'hiddengli' => "yes" } ];
57
58our $encoding_list =
59 [ { 'name' => "ascii",
60 'desc' => "{BasePlugin.encoding.ascii}" },
61 { 'name' => "utf8",
62 'desc' => "{BasePlugin.encoding.utf8}" },
63 { 'name' => "unicode",
64 'desc' => "{BasePlugin.encoding.unicode}" } ];
65
66
67my $e = $encodings::encodings;
68foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
69{
70 my $hashEncode =
71 {'name' => $enc,
72 'desc' => $e->{$enc}->{'name'}};
73
74 push(@{$encoding_list},$hashEncode);
75}
76
77our $encoding_plus_auto_list =
78 [ { 'name' => "auto",
79 'desc' => "{BasePlugin.filename_encoding.auto}" },
80 { 'name' => "auto-language-analysis",
81 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
82 { 'name' => "auto-filesystem-encoding",
83 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
84 { 'name' => "auto-fl",
85 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
86 { 'name' => "auto-lf",
87 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
88
89push(@{$encoding_plus_auto_list},@{$encoding_list});
90
91our $oidtype_list =
92 [ { 'name' => "auto",
93 'desc' => "{BasePlugin.OIDtype.auto}" },
94 { 'name' => "hash",
95 'desc' => "{import.OIDtype.hash}" },
96 { 'name' => "assigned",
97 'desc' => "{import.OIDtype.assigned}" },
98 { 'name' => "incremental",
99 'desc' => "{import.OIDtype.incremental}" },
100 { 'name' => "dirname",
101 'desc' => "{import.OIDtype.dirname}" } ];
102
103my $arguments =
104 [ { 'name' => "process_exp",
105 'desc' => "{BasePlugin.process_exp}",
106 'type' => "regexp",
107 'deft' => "",
108 'reqd' => "no" },
109 { 'name' => "no_blocking",
110 'desc' => "{BasePlugin.no_blocking}",
111 'type' => "flag",
112 'reqd' => "no"},
113 { 'name' => "block_exp",
114 'desc' => "{BasePlugin.block_exp}",
115 'type' => "regexp",
116 'deft' => "",
117 'reqd' => "no" },
118 { 'name' => "store_original_file",
119 'desc' => "{BasePlugin.store_original_file}",
120 'type' => "flag",
121 'reqd' => "no" },
122 { 'name' => "associate_ext",
123 'desc' => "{BasePlugin.associate_ext}",
124 'type' => "string",
125 'reqd' => "no" },
126 { 'name' => "associate_tail_re",
127 'desc' => "{BasePlugin.associate_tail_re}",
128 'type' => "string",
129 'reqd' => "no" },
130 { 'name' => "OIDtype",
131 'desc' => "{import.OIDtype}",
132 'type' => "enum",
133 'list' => $oidtype_list,
134 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
135 'deft' => "auto",
136 'reqd' => "no" },
137 { 'name' => "OIDmetadata",
138 'desc' => "{import.OIDmetadata}",
139 'type' => "metadata",
140 'deft' => "dc.Identifier",
141 'reqd' => "no" },
142# { 'name' => "use_as_doc_identifier",
143# 'desc' => "{BasePlugin.use_as_doc_identifier}",
144# 'type' => "string",
145# 'reqd' => "no" ,
146# 'deft' => "" } ,
147 { 'name' => "no_cover_image",
148 'desc' => "{BasePlugin.no_cover_image}",
149 'type' => "flag",
150 'reqd' => "no" },
151 { 'name' => "filename_encoding",
152 'desc' => "{BasePlugin.filename_encoding}",
153 'type' => "enum",
154 'deft' => "auto",
155 'list' => $encoding_plus_auto_list,
156 'reqd' => "no" },
157 { 'name' => "smart_block",
158 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
159 'type' => "flag",
160 'reqd' => "no",
161 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
162 { 'name' => "file_rename_method",
163 'desc' => "{BasePlugin.file_rename_method}",
164 'type' => "enum",
165 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
166 'list' => $file_rename_method_list,
167 'reqd' => "no"
168 }
169
170 ];
171
172
173my $options = { 'name' => "BasePlugin",
174 'desc' => "{BasePlugin.desc}",
175 'abstract' => "yes",
176 'inherits' => "no",
177 'args' => $arguments };
178
179sub new {
180
181 my ($class) = shift (@_);
182 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
183 push(@$pluginlist, $class);
184
185 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
186 push(@{$hashArgOptLists->{"OptList"}},$options);
187
188 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
189
190 if ($self->{'info_only'}) {
191 # don't worry about any options etc
192 return bless $self, $class;
193 }
194
195 if ($self->{'smart_block'}) {
196 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
197 }
198 $self->{'smart_block'} = undef;
199
200 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
201 $self->{'plugin_type'} = $plugin_name;
202
203 # remove ex. from OIDmetadata
204 $self->{'OIDmetadata'} =~ s/^ex\.// if defined $self->{'OIDmetadata'};
205 $self->{'num_processed'} = 0;
206 $self->{'num_not_processed'} = 0;
207 $self->{'num_blocked'} = 0;
208 $self->{'num_archives'} = 0;
209 $self->{'cover_image'} = 1; # cover image is on by default
210 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
211 $self->{'can_process_directories'} = 0;
212 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
213
214 my $associate_ext = $self->{'associate_ext'};
215 if ((defined $associate_ext) && ($associate_ext ne "")) {
216
217 my $associate_tail_re = $self->{'associate_tail_re'};
218 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
219 my $outhandle = $self->{'outhandle'};
220 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
221 print $outhandle " defaulting to 'associate_tail_re'\n";
222 }
223 else {
224 my @exts = split(/,/,$associate_ext);
225
226 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
227 my $associate_tail_re = join("|",@exts_bracketed);
228 $self->{'associate_tail_re'} = $associate_tail_re;
229 }
230
231 delete $self->{'associate_ext'};
232 }
233
234 return bless $self, $class;
235
236}
237
238sub merge_inheritance
239{
240 my $self = {};
241 my @child_selfs = @_;
242
243 foreach my $child_self (@child_selfs) {
244 foreach my $key (keys %$child_self) {
245 if (defined $self->{$key}) {
246 if ($self->{$key} ne $child_self->{$key}) {
247# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
248# print STDERR "Existing stored value = $self->{$key}\n";
249# print STDERR "New (child) value = $child_self->{$key}\n";
250# print STDERR "Keeping existing value\n";
251 # Existing value seems to be option specified in collect.cfg
252
253 ### $self->{$key} = $child_self->{$key};
254
255 }
256 else {
257## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
258 }
259
260 }
261 else {
262 $self->{$key} = $child_self->{$key};
263 }
264 }
265 }
266
267 return $self;
268}
269
270# initialize BasePlugin options
271# if init() is overridden in a sub-class, remember to call BasePlugin::init()
272sub init {
273 my $self = shift (@_);
274 my ($verbosity, $outhandle, $failhandle) = @_;
275
276 # verbosity is passed through from the processor
277 $self->{'verbosity'} = $verbosity;
278
279 # as are the outhandle and failhandle
280 $self->{'outhandle'} = $outhandle if defined $outhandle;
281 $self->{'failhandle'} = $failhandle;
282# $self->SUPER::init(@_);
283
284 # set process_exp and block_exp to defaults unless they were
285 # explicitly set
286
287 if ((!$self->is_recursive()) and
288 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
289
290 $self->{'process_exp'} = $self->get_default_process_exp ();
291 if ($self->{'process_exp'} eq "") {
292 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
293 }
294 }
295
296 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
297 $self->{'block_exp'} = $self->get_default_block_exp ();
298 }
299
300}
301
302sub begin {
303 my $self = shift (@_);
304 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
305
306 if ($self->{'OIDtype'} eq "auto") {
307 # hasn't been set in the plugin, use the processor values
308 $self->{'OIDtype'} = $processor->{'OIDtype'};
309 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
310 }
311 if ($self->{'OIDtype'} eq "hash") {
312 # should we hash on the file or on the doc xml??
313 $self->{'OIDtype'} = $self->get_oid_hash_type();
314 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
315 $self->{'OIDtype'} = "hash_on_file";
316 }
317 }
318}
319
320# This is called once if removeold is set with import.pl. Most plugins will do
321# nothing but if a plugin does any stuff outside of creating doc obj, then
322# it may need to clear something.
323sub remove_all {
324 my $self = shift (@_);
325 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
326}
327
328# This is called per document for docs that have been deleted from the
329# collection. Most plugins will do nothing
330# but if a plugin does any stuff outside of creating doc obj, then it may need
331# to clear something.
332sub remove_one {
333 my $self = shift (@_);
334
335 my ($file, $oids, $archivedir) = @_;
336 return 0 if $self->can_process_this_file($file);
337 return undef;
338}
339
340sub end {
341 # potentially called at the end of each plugin pass
342 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
343
344 my ($self) = shift (@_);
345}
346
347sub deinit {
348 # called only once, after all plugin passes have been done
349
350 my ($self) = @_;
351}
352
353# default hashing type is to hash on the original file (or converted file)
354# override this to return hash_on_ga_xml for filetypes where hashing on the
355# file is no good eg video
356sub get_oid_hash_type {
357
358 my $self = shift (@_);
359
360 return "hash_on_file";
361}
362
363
364# this function should be overridden to return 1
365# in recursive plugins
366sub is_recursive {
367 my $self = shift (@_);
368
369 return 0;
370}
371
372sub get_default_block_exp {
373 my $self = shift (@_);
374
375 return "";
376}
377
378sub get_default_process_exp {
379 my $self = shift (@_);
380
381 return "";
382}
383
384
385# rename imported files and assoc files using URL encoding by default
386# as this will work for most plugins and give more legible filenames
387sub get_default_file_rename_method() {
388 my $self = shift (@_);
389 return "url";
390}
391
392# returns this plugin's active (possibly user-selected) file_rename_method
393sub get_file_rename_method() {
394 my $self = shift (@_);
395 my $rename_method = $self->{'file_rename_method'};
396 if($rename_method) {
397 return $rename_method;
398 } else {
399 return $self->get_default_file_rename_method();
400 }
401}
402
403# default implementation is to do nothing
404sub store_block_files {
405
406 my $self =shift (@_);
407 my ($filename_full_path, $block_hash) = @_;
408
409}
410
411# put files to block into hash
412sub use_block_expressions {
413
414 my $self =shift (@_);
415 my ($filename_full_path, $block_hash) = @_;
416
417 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
418
419 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
420 &util::block_filename($block_hash,$filename_full_path);
421 }
422
423}
424
425#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
426sub block_cover_image
427{
428 my $self =shift;
429 my ($filename, $block_hash) = @_;
430
431 $filename = &util::upgrade_if_dos_filename($filename);
432
433 if ($self->{'cover_image'}) {
434 my $coverfile = $filename;
435 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
436
437 if (!&util::fd_exists($coverfile)) {
438 $coverfile =~ s/jpg$/JPG/;
439 }
440 if (&util::fd_exists($coverfile)) {
441 &util::block_filename($block_hash,$coverfile);
442 }
443 }
444
445 return;
446}
447
448
449# discover all the files that should be blocked by this plugin
450# check the args ...
451sub file_block_read {
452
453 my $self = shift (@_);
454 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
455 # Keep track of filenames with same root but different extensions
456 # Used to support -associate_ext and the more generalised
457 # -associate_tail_re
458 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
459
460 if (!-d $filename_full_path) {
461 $block_hash->{'all_files'}->{$file} = 1;
462 }
463
464 my $associate_tail_re = $self->{'associate_tail_re'};
465 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
466 my ($file_prefix,$file_ext)
467 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
468 if ((defined $file_prefix) && (defined $file_ext)) {
469 my $shared_fileroot = $block_hash->{'shared_fileroot'};
470 if (!defined $shared_fileroot->{$file_prefix}) {
471 my $file_prefix_rec = { 'tie_to' => undef,
472 'exts' => {} };
473 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
474 }
475
476 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
477
478 if ($self->can_process_this_file($filename_full_path)) {
479 # This is the document the others should be tied to
480 $file_prefix_rec->{'tie_to'} = $file_ext;
481 }
482 else {
483 if ($file_ext =~ m/$associate_tail_re$/) {
484 # this file should be associated to the main one
485 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
486 }
487 }
488
489 }
490 }
491
492 # check block expressions
493 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
494
495 # now check whether we are actually processing this
496 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
497 return undef; # can't recognise
498 }
499
500 # if we have a block_exp, then this overrides the normal 'smart' blocking
501 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
502
503 # block the cover image if there is one
504 if ($self->{'cover_image'}) {
505 $self->block_cover_image($filename_full_path, $block_hash);
506 }
507
508 return 1;
509}
510
511# plugins that rely on more than process_exp (eg XML plugins) can override this method
512sub can_process_this_file {
513 my $self = shift(@_);
514 my ($filename) = @_;
515
516 if (-d $filename && !$self->{'can_process_directories'}) {
517 return 0;
518 }
519
520 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
521 return 1;
522 }
523 return 0;
524
525}
526
527# just converts path as is to utf8.
528sub filepath_to_utf8 {
529 my $self = shift (@_);
530 my ($file, $file_encoding) = @_;
531 my $filemeta = $file;
532
533 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
534
535 # Whenever filename-encoding is set to any of the auto settings, we
536 # check if the filename is already in UTF8. If it is, then we're done.
537 if($filename_encoding =~ m/auto/) {
538 if(&unicode::check_is_utf8($filemeta))
539 {
540 $filename_encoding = "utf8";
541 return $filemeta;
542 }
543 }
544
545 # Auto setting, but filename is not utf8
546 if ($filename_encoding eq "auto")
547 {
548 # try textcat
549 $filename_encoding = $self->textcat_encoding($filemeta);
550
551 # check the locale next
552 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
553
554
555 # now try the encoding of the document, if available
556 if ($filename_encoding eq "undefined" && defined $file_encoding) {
557 $filename_encoding = $file_encoding;
558 }
559
560 }
561
562 elsif ($filename_encoding eq "auto-language-analysis")
563 {
564 $filename_encoding = $self->textcat_encoding($filemeta);
565
566 # now try the encoding of the document, if available
567 if ($filename_encoding eq "undefined" && defined $file_encoding) {
568 $filename_encoding = $file_encoding;
569 }
570 }
571
572 elsif ($filename_encoding eq "auto-filesystem-encoding")
573 {
574 # try locale
575 $filename_encoding = $self->locale_encoding();
576 }
577
578 elsif ($filename_encoding eq "auto-fl")
579 {
580 # filesystem-encoding (locale) then language-analysis (textcat)
581 $filename_encoding = $self->locale_encoding();
582
583 # try textcat
584 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
585
586 # else assume filename encoding is encoding of file content, if that's available
587 if ($filename_encoding eq "undefined" && defined $file_encoding) {
588 $filename_encoding = $file_encoding;
589 }
590 }
591
592 elsif ($filename_encoding eq "auto-lf")
593 {
594 # language-analysis (textcat) then filesystem-encoding (locale)
595 $filename_encoding = $self->textcat_encoding($filemeta);
596
597 # guess filename encoding from encoding of file content, if available
598 if ($filename_encoding eq "undefined" && defined $file_encoding) {
599 $filename_encoding = $file_encoding;
600 }
601
602 # try locale
603 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
604 }
605
606 # if still undefined, use utf8 as fallback
607 if ($filename_encoding eq "undefined") {
608 $filename_encoding = "utf8";
609 }
610
611 #print STDERR "**** UTF8 encoding the filename $filemeta ";
612
613 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
614 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
615 # be always utf8 (in which case the filename's encoding is also set as utf8 even
616 # though the filename need not be if it originates from another system)--in such
617 # cases attempt to make the filename utf8 to match.
618 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
619 &unicode::ensure_utf8(\$filemeta);
620 }
621
622 # convert non-unicode encodings to utf8
623 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
624 $filemeta = &unicode::unicode2utf8(
625 &unicode::convert2unicode($filename_encoding, \$filemeta)
626 );
627 }
628
629 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
630 return $filemeta;
631}
632
633# gets the filename with no path, converts to utf8, and then dm safes it.
634# filename_encoding set by user
635sub filename_to_utf8_metadata
636{
637 my $self = shift (@_);
638 my ($file, $file_encoding) = @_;
639
640 my $outhandle = $self->{'outhandle'};
641
642 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
643 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
644 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
645
646
647 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
648 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
649
650 return $filemeta;
651}
652
653sub locale_encoding {
654 my $self = shift(@_);
655
656 if (!defined $self->{'filesystem_encoding'}) {
657 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
658 }
659
660 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
661 return $self->{'filesystem_encoding'}; # can be the string "undefined"
662}
663
664sub textcat_encoding {
665 my $self = shift(@_);
666 my ($filemeta) = @_;
667
668 # analyse filenames without extensions and digits (and trimmed of
669 # surrounding whitespace), so that irrelevant chars don't confuse
670 # textcat
671 my $strictfilemeta = $filemeta;
672 $strictfilemeta =~ s/\.[^\.]+$//g;
673 $strictfilemeta =~ s/\d//g;
674 $strictfilemeta =~ s/^\s*//g;
675 $strictfilemeta =~ s/\s*$//g;
676
677 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
678 if(!defined $filename_encoding) {
679 $filename_encoding = "undefined";
680 }
681
682 return $filename_encoding; # can be the string "undefined"
683}
684
685# performs textcat
686sub encoding_from_language_analysis {
687 my $self = shift(@_);
688 my ($text) = @_;
689
690 my $outhandle = $self->{'outhandle'};
691 my $best_encoding = undef;
692
693 # get the language/encoding of the textstring using textcat
694 require textcat; # Only load the textcat module if it is required
695 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
696 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
697
698
699 if (scalar @$results < 0) {
700 return undef;
701 }
702
703 # We have some results, we choose the first
704 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
705
706 $best_encoding = $encoding;
707 if (!defined $best_encoding) {
708 return undef;
709 }
710
711 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
712 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
713 $best_encoding = 'utf8';
714 }
715
716
717 # check for equivalents where textcat doesn't have some encodings...
718 # eg MS versions of standard encodings
719 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
720 my $iso = $1; # which variant of the iso standard?
721 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
722 if ($text =~ /[\x80-\x9f]/) {
723 # Western Europe
724 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
725 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
726 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
727 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
728 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
729 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
730 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
731 }
732 }
733
734 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
735 !defined $encodings::encodings->{$best_encoding})
736 {
737 if ($self->{'verbosity'}) {
738 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
739 }
740 $best_encoding = undef;
741 }
742
743 return $best_encoding;
744}
745
746# uses locale
747sub get_filesystem_encoding
748{
749
750 my $self = shift(@_);
751
752 my $outhandle = $self->{'outhandle'};
753 my $filesystem_encoding = undef;
754
755 eval {
756 # Works for Windows as well, returning the DOS code page in use
757 use POSIX qw(locale_h);
758
759 # With only one parameter, setlocale retrieves the
760 # current value
761 my $current_locale = setlocale(LC_CTYPE);
762
763 my $char_encoding = undef;
764 if ($current_locale =~ m/\./) {
765 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
766 $char_encoding = lc($char_encoding);
767 }
768 else {
769 if ($current_locale =~ m/^(posix|c)$/i) {
770 $char_encoding = "ascii";
771 }
772 }
773
774 if (defined $char_encoding) {
775 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
776 $char_encoding = "$1\_$2\_$3";
777 }
778
779 $char_encoding =~ s/-/_/g;
780 $char_encoding =~ s/^utf_8$/utf8/;
781
782 if ($char_encoding =~ m/^\d+$/) {
783 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
784 $char_encoding = "windows_$char_encoding";
785 }
786 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
787 $char_encoding = "dos_$char_encoding";
788 }
789 }
790
791 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
792 || (defined $encodings::encodings->{$char_encoding})) {
793 $filesystem_encoding = $char_encoding;
794 }
795 else {
796 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
797 }
798 }
799
800
801 };
802 if ($@) {
803 print $outhandle "$@\n";
804 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
805
806 }
807
808 return $filesystem_encoding;
809}
810
811
812
813sub deduce_filename_encoding
814{
815 my $self = shift (@_);
816 my ($file,$metadata,$plugin_filename_encoding) = @_;
817
818 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
819 my $deduced_filename_encoding = undef;
820
821 # Start by looking for manually assigned metadata
822 if (defined $gs_filename_encoding) {
823 if (ref ($gs_filename_encoding) eq "ARRAY") {
824 my $outhandle = $self->{'outhandle'};
825
826 $deduced_filename_encoding = $gs_filename_encoding->[0];
827
828 my $num_vals = scalar(@$gs_filename_encoding);
829 if ($num_vals>1) {
830 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
831 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
832 }
833 }
834 else {
835 $deduced_filename_encoding = $gs_filename_encoding;
836 }
837 }
838
839 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
840 # Look to see if plugin specifies this value
841
842 if (defined $plugin_filename_encoding) {
843 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
844 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
845 my $outhandle = $self->{'outhandle'};
846 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
847 print $outhandle " default to 'auto'\n";
848 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
849 }
850
851 if ($plugin_filename_encoding ne "auto") {
852 # We've been given a specific filenamne encoding
853 # => so use it!
854 $deduced_filename_encoding = $plugin_filename_encoding;
855 }
856 }
857 }
858
859 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
860
861 # Look to file system to provide a character encoding
862
863 # If Windows NTFS, then -- assuming we work with long file names got through
864 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
865
866 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
867 # Can do better than working with the DOS character encoding returned by locale
868 $deduced_filename_encoding = "unicode";
869 }
870 else {
871 # Unix of some form or other
872
873 # See if we can determine the file system encoding through locale
874 $deduced_filename_encoding = $self->locale_encoding();
875
876 # if locale shows us filesystem is utf8, check to see filename is consistent
877 # => if not, then we have an "alien" filename on our hands
878
879 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
880 if (!&unicode::check_is_utf8($file)) {
881 # "alien" filename, so revert
882 $deduced_filename_encoding = undef;
883 }
884 }
885 }
886 }
887
888# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
889# # Last chance, apply textcat to deduce filename encoding
890# $deduced_filename_encoding = $self->textcat_encoding($file);
891# }
892
893 if ($self->{'verbosity'}>3) {
894 my $outhandle = $self->{'outhandle'};
895
896 if (defined $deduced_filename_encoding) {
897 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
898 }
899 else {
900 print $outhandle " No filename encoding deduced\n";
901 }
902 }
903
904 return $deduced_filename_encoding;
905}
906
907
908
909
910# Notionally written to be called once for each document, it is however safe to
911# call multiple times (as in the case of ImagePlugin) which calls this later on
912# after the original image has potentially been converted to a *new* source image
913# format (e.g. TIFF to PNG)
914
915sub set_Source_metadata {
916 my $self = shift (@_);
917 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
918
919 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
920 # and (as a fallback) using %xx if not for non-ascii chars
921 # 2. Sets the url ref (SourceFile) to the URL encoded version
922 # of filename for generated files
923
924 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
925
926 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
927
928 my $octet_file = $raw_file;
929
930 # UTF-8 version of filename
931# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
932# print STDERR "**** Setting Source Metadata given: $octet_file\n";
933# }
934
935 # Deal with (on Windows) raw filenames that are in their
936 # abbreviated DOS form
937
938 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
939 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
940 if (-e $raw_filename) {
941 my $unicode_filename = Win32::GetLongPathName($raw_filename);
942
943 my $unused_full_uf;
944 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
945 }
946 }
947 }
948
949 my $url_encoded_filename;
950 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
951 # => Generate a pretty print version of filename that is mapped to Unicode
952
953 # Use filename_encoding to map raw filename to a Perl unicode-aware string
954 $url_encoded_filename = decode($filename_encoding,$octet_file);
955 }
956 else {
957 # otherwise generate %xx encoded version of filename for char > 127
958 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
959 }
960
961# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
962# print STDERR "****** saving Source as: $url_encoded_filename\n";
963# }
964
965 # In the case of converted files and (generalized) exploded documents, there
966 # will already be a source filename => store as OrigSource before overriding
967 my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
968 if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
969 $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
970 }
971
972 # Source is the UTF8 display name - not necessarily the name of the file on the system
973 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
974
975
976 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
977 # If using URL encoding, then SourceFile is the url-reference to url-encoded
978 # renamed_raw_url: it's a url that refers to the actual file on the system
979 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
980
981 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
982 $renamed_raw_url);
983
984# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
985# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
986# }
987}
988
989# this should be called by all plugins to set the oid of the doc obj, rather
990# than calling doc_obj->set_OID directly
991sub add_OID {
992 my $self = shift (@_);
993 my ($doc_obj) = @_;
994
995 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
996
997 # see if there is a plugin specific set_OID function
998 if (defined ($self->can('set_OID'))) {
999 $self->set_OID(@_); # pass through doc_obj and any extra arguments
1000 }
1001 else {
1002 # use the default set_OID() in doc.pm
1003 $doc_obj->set_OID();
1004 }
1005
1006}
1007
1008# The BasePlugin read_into_doc_obj() function. This function does all the
1009# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
1010# a file and sets up a slew of metadata all saved in doc_obj, which
1011# it then returns as part of a tuple (process_status,doc_obj)
1012#
1013# Much of this functionality used to reside in read, but it was broken
1014# down into a supporting routine to make the code more flexible.
1015#
1016# recursive plugins (e.g. RecPlug) and specialized plugins like those
1017# capable of processing many documents within a single file (e.g.
1018# GMLPlug) will normally want to implement their own version of
1019# read_into_doc_obj()
1020#
1021# Note that $base_dir might be "" and that $file might
1022# include directories
1023
1024# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1025sub read_into_doc_obj {
1026 my $self = shift (@_);
1027 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1028
1029 my $outhandle = $self->{'outhandle'};
1030
1031 # should we move this to read? What about secondary plugins?
1032 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
1033 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1034 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1035 if $self->{'verbosity'} > 1;
1036
1037 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1038
1039 # create a new document
1040 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1041 my $top_section = $doc_obj->get_top_section();
1042
1043 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1044 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1045
1046
1047 my $plugin_filename_encoding = $self->{'filename_encoding'};
1048 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1049 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
1050
1051 # plugin specific stuff - what args do we need here??
1052 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1053 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1054 return -1;
1055 }
1056
1057 # include any metadata passed in from previous plugins
1058 # note that this metadata is associated with the top level section
1059 my $section = $doc_obj->get_top_section();
1060 # can we merge these two methods??
1061 $self->add_associated_files($doc_obj, $filename_full_path);
1062 $self->extra_metadata ($doc_obj, $section, $metadata);
1063 $self->auto_extract_metadata($doc_obj);
1064
1065 # if we haven't found any Title so far, assign one
1066 # this was shifted to here from inside read()
1067 $self->title_fallback($doc_obj,$section,$filename_no_path);
1068
1069 $self->add_OID($doc_obj);
1070
1071 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1072 return (1,$doc_obj);
1073}
1074
1075sub post_process_doc_obj {
1076 my $self = shift (@_);
1077 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1078
1079 return 1;
1080}
1081
1082sub add_dummy_text {
1083 my $self = shift(@_);
1084 my ($doc_obj, $section) = @_;
1085
1086 # add NoText metadata so we can hide this dummy text in format statements
1087 $doc_obj->add_metadata($section, "NoText", "1");
1088 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1089
1090}
1091
1092# does nothing. Can be overridden by subclass
1093sub auto_extract_metadata {
1094 my $self = shift(@_);
1095 my ($doc_obj) = @_;
1096}
1097
1098# adds cover image, associate_file options stuff. Should be called by sub class
1099# read_into_doc_obj
1100sub add_associated_files {
1101 my $self = shift(@_);
1102 # whatis filename??
1103 my ($doc_obj, $filename) = @_;
1104
1105 # add in the cover image
1106 if ($self->{'cover_image'}) {
1107 $self->associate_cover_image($doc_obj, $filename);
1108 }
1109 # store the original (used for eg TextPlugin to store the original for OAI)
1110 if ($self->{'store_original_file'}) {
1111 $self->associate_source_file($doc_obj, $filename);
1112 }
1113
1114
1115}
1116
1117# implement this if you are extracting metadata for other documents
1118sub metadata_read {
1119 my $self = shift (@_);
1120 my ($pluginfo, $base_dir, $file, $block_hash,
1121 $extrametakeys, $extrametadata, $extrametafile,
1122 $processor, $gli, $aux) = @_;
1123
1124 # can we process this file??
1125 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1126 return undef unless $self->can_process_this_file($filename_full_path);
1127
1128 return 1; # we recognise the file, but don't actually do anything with it
1129}
1130
1131
1132# The BasePlugin read() function. This function calls read_into_doc_obj()
1133# to ensure all the right things to make general options work for a
1134# given plugin are done. It then calls the process() function which
1135# does all the work specific to a plugin (like the old read functions
1136# used to do). Most plugins should define their own process() function
1137# and let this read() function keep control.
1138#
1139# recursive plugins (e.g. RecPlug) and specialized plugins like those
1140# capable of processing many documents within a single file (e.g.
1141# GMLPlug) might want to implement their own version of read(), but
1142# more likely need to implement their own version of read_into_doc_obj()
1143#
1144# Return number of files processed, undef if can't recognise, -1 if can't
1145# process
1146
1147sub read {
1148 my $self = shift (@_);
1149 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1150
1151 # can we process this file??
1152 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1153
1154 return undef unless $self->can_process_this_file($filename_full_path);
1155
1156 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1157
1158 if ((defined $process_status) && ($process_status == 1)) {
1159
1160 # process the document
1161 $processor->process($doc_obj);
1162
1163 $self->{'num_processed'} ++;
1164 undef $doc_obj;
1165 }
1166 # delete any temp files that we may have created
1167 $self->clean_up_after_doc_obj_processing();
1168
1169
1170 # if process_status == 1, then the file has been processed.
1171 return $process_status;
1172
1173}
1174
1175# returns undef if file is rejected by the plugin
1176sub process {
1177 my $self = shift (@_);
1178 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1179
1180 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
1181
1182 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1183 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1184
1185 die "\n";
1186
1187 return undef; # never gets here
1188}
1189
1190# overwrite this method to delete any temp files that we have created
1191sub clean_up_after_doc_obj_processing {
1192 my $self = shift(@_);
1193
1194}
1195
1196# write_file -- used by ConvertToPlug, for example in post processing
1197#
1198# where should this go, is here the best place??
1199sub utf8_write_file {
1200 my $self = shift (@_);
1201 my ($textref, $filename) = @_;
1202
1203 if (!open (FILE, ">:utf8", $filename)) {
1204 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1205 die "\n";
1206 }
1207 print FILE $$textref;
1208
1209 close FILE;
1210}
1211
1212
1213sub filename_based_title
1214{
1215 my $self = shift (@_);
1216 my ($file) = @_;
1217
1218 my $file_derived_title = $file;
1219 $file_derived_title =~ s/_/ /g;
1220 $file_derived_title =~ s/\.[^.]+$//;
1221
1222 return $file_derived_title;
1223}
1224
1225
1226sub title_fallback
1227{
1228 my $self = shift (@_);
1229 my ($doc_obj,$section,$file) = @_;
1230
1231 if (!defined $doc_obj->get_metadata_element ($section, "Title")
1232 || $doc_obj->get_metadata_element($section, "Title") eq "") {
1233
1234 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1235 my $file_derived_title;
1236 if (defined $source_file) {
1237 $file_derived_title = $self->filename_based_title($source_file);
1238 }
1239 else {
1240 # pp = pretty print
1241 my $pp_file = (defined $source_file) ? $source_file : $file;
1242
1243 my $raw_title = $self->filename_based_title($file);
1244 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1245 }
1246
1247
1248 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1249 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1250 }
1251 else {
1252 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1253 }
1254 }
1255
1256}
1257
1258# add any extra metadata that's been passed around from one
1259# plugin to another.
1260# extra_metadata uses add_utf8_metadata so it expects metadata values
1261# to already be in utf8
1262sub extra_metadata {
1263 my $self = shift (@_);
1264 my ($doc_obj, $cursection, $metadata) = @_;
1265
1266 my $associate_tail_re = $self->{'associate_tail_re'};
1267
1268 foreach my $field (keys(%$metadata)) {
1269 # $metadata->{$field} may be an array reference
1270 if ($field eq "gsdlassocfile_tobe") {
1271 # 'gsdlassocfile_tobe' is artificially introduced metadata
1272 # that is used to signal that certain additional files should
1273 # be tied to this document. Useful in situations where a
1274 # metadata pass in the plugin pipeline works out some files
1275 # need to be associated with a document, but the document hasn't
1276 # been formed yet.
1277 my $equiv_form = "";
1278 foreach my $gaf (@{$metadata->{$field}}) {
1279 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1280 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1281
1282 # we need to make sure the filename is valid utf-8 - we do
1283 # this by url or base64 encoding it
1284 # $tail_filename is the name that we store the file as
1285 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1286 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1287 $doc_obj->associate_source_file($full_filename);
1288 # If the filename is url_encoded, we need to encode the % signs
1289 # in the filename, so that it works in a url
1290 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1291 # work out extended tail extension (i.e. matching tail re)
1292
1293 my ($file_prefix,$file_extended_ext)
1294 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1295 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1296 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1297 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1298 my $srcicon = "_icon".$doc_ext."_";
1299 my $end_doclink = "</a>";
1300
1301 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1302
1303 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1304 # for metadata such as [mp3._edited] [mp3._full] ...
1305 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1306 }
1307
1308 # for multiple metadata such as [mp3.assoclink]
1309 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1310
1311 $equiv_form .= " $assoc_form";
1312 }
1313 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1314 }
1315 elsif ($field eq "gsdlzipfilename") {
1316 # special case for when files have come out of a zip. source_path
1317 # (used for archives dbs and keeping track for incremental import)
1318 # must be set to the zip file name
1319 my $zip_filename = $metadata->{$field};
1320 # overwrite the source_path
1321 $doc_obj->set_source_path($zip_filename);
1322 # and set the metadata
1323 $zip_filename = &util::filename_within_collection($zip_filename);
1324 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1325 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1326 }
1327 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1328 map {
1329 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1330 } @{$metadata->{$field}};
1331 } else {
1332 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1333 }
1334 }
1335}
1336
1337
1338sub compile_stats {
1339 my $self = shift(@_);
1340 my ($stats) = @_;
1341
1342 $stats->{'num_processed'} += $self->{'num_processed'};
1343 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1344 $stats->{'num_archives'} += $self->{'num_archives'};
1345
1346}
1347sub associate_source_file {
1348 my $self = shift(@_);
1349
1350 my ($doc_obj, $filename) = @_;
1351 my $cursection = $doc_obj->get_top_section();
1352 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1353
1354 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1355 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1356}
1357
1358sub associate_cover_image {
1359 my $self = shift(@_);
1360 my ($doc_obj, $filename) = @_;
1361
1362 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1363
1364 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1365 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1366
1367 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1368 # don't stat() for existence e.g. for multiple document input files
1369 # (eg SplitPlug)
1370 return;
1371 }
1372
1373 my $top_section=$doc_obj->get_top_section();
1374
1375 if (&util::fd_exists($upgraded_filename)) {
1376 $doc_obj->associate_source_file($filename);
1377 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1378 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1379 } else {
1380 my $upper_filename = $filename;
1381 my $upgraded_upper_filename = $upgraded_filename;
1382
1383 $upper_filename =~ s/jpg$/JPG/;
1384 $upgraded_upper_filename =~ s/jpg$/JPG/;
1385
1386 if (&util::fd_exists($upgraded_upper_filename)) {
1387 $doc_obj->associate_source_file($upper_filename);
1388 $doc_obj->associate_file($upper_filename, "cover.jpg",
1389 "image/jpeg");
1390 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1391 } else {
1392 # file doesn't exist, so record the fact that it's missing so
1393 # we don't stat() again (stat is slow)
1394 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1395 }
1396 }
1397
1398}
1399
1400
1401# Overridden by exploding plugins (eg. ISISPlug)
1402sub clean_up_after_exploding
1403{
1404 my $self = shift(@_);
1405}
1406
1407
1408
14091;
Note: See TracBrowser for help on using the repository browser.