root/main/trunk/greenstone2/perllib/plugins/BasePlugin.pm @ 23829

Revision 23829, 44.8 KB (checked in by ak19, 9 years ago)

Display string for Source also needs to be in normalised composed form, since all display strings including doc text are to be stored in that form (so searching is also consistent).

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34
35use encodings;
36use unicode;
37use doc;
38use ghtml;
39use gsprintf 'gsprintf';
40
41use PrintInfo;
42
43BEGIN {
44    @BasePlugin::ISA = ( 'PrintInfo' );
45}
46
47# the different methods that can be applied when renaming
48# imported documents and their associated files
49our $file_rename_method_list =
50    [ { 'name' => "url",
51    'desc' => "{BasePlugin.rename_method.url}" },
52      { 'name' => "base64",
53    'desc' => "{BasePlugin.rename_method.base64}" },
54      { 'name' => "none",
55    'desc' => "{BasePlugin.rename_method.none}",
56    'hiddengli' => "yes" } ];
57
58our $encoding_list =
59    [ { 'name' => "ascii",
60    'desc' => "{BasePlugin.encoding.ascii}" },
61      { 'name' => "utf8",
62    'desc' => "{BasePlugin.encoding.utf8}" },
63      { 'name' => "unicode",
64    'desc' => "{BasePlugin.encoding.unicode}" } ];
65
66
67my $e = $encodings::encodings;
68foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
69{
70    my $hashEncode =
71    {'name' => $enc,
72     'desc' => $e->{$enc}->{'name'}};
73   
74    push(@{$encoding_list},$hashEncode);
75}
76
77our $encoding_plus_auto_list =
78    [ { 'name' => "auto",
79    'desc' => "{BasePlugin.filename_encoding.auto}" },
80      { 'name' => "auto-language-analysis",
81    'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
82      { 'name' => "auto-filesystem-encoding",
83    'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
84      { 'name' => "auto-fl",
85    'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
86      { 'name' => "auto-lf",
87    'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
88
89push(@{$encoding_plus_auto_list},@{$encoding_list});
90
91our $oidtype_list =
92    [ { 'name' => "auto",
93    'desc' => "{BasePlugin.OIDtype.auto}" },
94      { 'name' => "hash",
95        'desc' => "{import.OIDtype.hash}" },
96      { 'name' => "assigned",
97        'desc' => "{import.OIDtype.assigned}" },
98      { 'name' => "incremental",
99        'desc' => "{import.OIDtype.incremental}" },
100      { 'name' => "dirname",
101        'desc' => "{import.OIDtype.dirname}" } ];
102
103my $arguments =
104    [ { 'name' => "process_exp",
105    'desc' => "{BasePlugin.process_exp}",
106    'type' => "regexp",
107    'deft' => "",
108    'reqd' => "no" },
109      { 'name' => "no_blocking",
110    'desc' => "{BasePlugin.no_blocking}",
111    'type' => "flag",
112    'reqd' => "no"},
113      { 'name' => "block_exp",
114    'desc' => "{BasePlugin.block_exp}",
115    'type' => "regexp",
116    'deft' => "",
117    'reqd' => "no" },
118      { 'name' => "store_original_file",
119    'desc' => "{BasePlugin.store_original_file}",
120    'type' => "flag",
121    'reqd' => "no" },
122      { 'name' => "associate_ext",
123    'desc' => "{BasePlugin.associate_ext}",
124    'type' => "string",
125    'reqd' => "no" },
126      { 'name' => "associate_tail_re",
127    'desc' => "{BasePlugin.associate_tail_re}",
128    'type' => "string",
129    'reqd' => "no" },
130      { 'name' => "OIDtype",
131    'desc' => "{import.OIDtype}",
132    'type' => "enum",
133    'list' => $oidtype_list,
134    # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
135    'deft' => "auto",
136    'reqd' => "no" },
137      { 'name' => "OIDmetadata",
138    'desc' => "{import.OIDmetadata}",
139    'type' => "metadata",
140    'deft' => "dc.Identifier",
141    'reqd' => "no" },     
142#      { 'name' => "use_as_doc_identifier",
143#   'desc' => "{BasePlugin.use_as_doc_identifier}",
144#   'type' => "string",
145#   'reqd' => "no" ,
146#   'deft' => "" } ,
147      { 'name' => "no_cover_image",
148    'desc' => "{BasePlugin.no_cover_image}",
149    'type' => "flag",
150    'reqd' => "no" },
151      { 'name' => "filename_encoding",
152    'desc' => "{BasePlugin.filename_encoding}",
153    'type' => "enum",
154    'deft' => "auto",
155    'list' => $encoding_plus_auto_list,
156    'reqd' => "no" },
157      { 'name' => "smart_block",
158        'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
159        'type' => "flag",
160        'reqd' => "no",
161    'hiddengli' => "yes" }, # deprecated, but leave in for old collections
162      { 'name' => "file_rename_method",
163    'desc' => "{BasePlugin.file_rename_method}",
164    'type' => "enum",
165    'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
166    'list' => $file_rename_method_list,
167    'reqd' => "no"
168    }
169     
170      ];
171
172
173my $options = { 'name'     => "BasePlugin",
174        'desc'     => "{BasePlugin.desc}",
175        'abstract' => "yes",
176        'inherits' => "no",
177        'args'     => $arguments };
178
179sub new {
180
181    my ($class) = shift (@_);
182    my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
183    push(@$pluginlist, $class);
184
185    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
186    push(@{$hashArgOptLists->{"OptList"}},$options);
187
188    my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
189   
190    if ($self->{'info_only'}) {
191        # don't worry about any options etc
192        return bless $self, $class;
193    }
194
195    if ($self->{'smart_block'}) {
196    print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
197    }
198    $self->{'smart_block'} = undef;
199
200    my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
201    $self->{'plugin_type'} = $plugin_name;
202
203    # remove ex. from OIDmetadata
204    $self->{'OIDmetadata'} =~ s/^ex\.// if defined $self->{'OIDmetadata'};
205    $self->{'num_processed'} = 0;
206    $self->{'num_not_processed'} = 0;
207    $self->{'num_blocked'} = 0;
208    $self->{'num_archives'} = 0;
209    $self->{'cover_image'} = 1; # cover image is on by default
210    $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
211    $self->{'can_process_directories'} = 0;
212    #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
213   
214    my $associate_ext = $self->{'associate_ext'};
215    if ((defined $associate_ext) && ($associate_ext ne "")) {
216
217    my $associate_tail_re = $self->{'associate_tail_re'};
218    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
219        my $outhandle = $self->{'outhandle'};
220        print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
221        print $outhandle "         defaulting to 'associate_tail_re'\n";
222    }
223    else {
224        my @exts = split(/,/,$associate_ext);
225
226        my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
227        my $associate_tail_re = join("|",@exts_bracketed);
228        $self->{'associate_tail_re'} = $associate_tail_re;
229    }
230
231    delete $self->{'associate_ext'};
232    }
233
234    return bless $self, $class;
235
236}
237
238sub merge_inheritance
239{
240    my $self = {};
241    my @child_selfs = @_;
242
243    foreach my $child_self (@child_selfs) {
244    foreach my $key (keys %$child_self) {
245        if (defined $self->{$key}) {
246        if ($self->{$key} ne $child_self->{$key}) {
247#           print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
248#           print STDERR "Existing stored value = $self->{$key}\n";
249#           print STDERR "New (child) value     = $child_self->{$key}\n";
250#           print STDERR "Keeping existing value\n";
251            # Existing value seems to be option specified in collect.cfg
252
253            ### $self->{$key} = $child_self->{$key};
254           
255        }
256        else {
257##          print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
258        }
259
260        }
261        else {
262        $self->{$key} = $child_self->{$key};
263        }
264    }
265    }
266
267    return $self;   
268}
269
270# initialize BasePlugin options
271# if init() is overridden in a sub-class, remember to call BasePlugin::init()
272sub init {
273    my $self = shift (@_);
274    my ($verbosity, $outhandle, $failhandle) = @_;
275
276    # verbosity is passed through from the processor
277    $self->{'verbosity'} = $verbosity;
278
279    # as are the outhandle and failhandle
280    $self->{'outhandle'} = $outhandle if defined $outhandle;
281    $self->{'failhandle'} = $failhandle;
282#    $self->SUPER::init(@_);
283   
284    # set process_exp and block_exp to defaults unless they were
285    # explicitly set
286
287    if ((!$self->is_recursive()) and
288    (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
289
290    $self->{'process_exp'} = $self->get_default_process_exp ();
291    if ($self->{'process_exp'} eq "") {
292        warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
293    }
294    }
295
296    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
297    $self->{'block_exp'} = $self->get_default_block_exp ();
298    }
299
300}
301
302sub begin {
303    my $self = shift (@_);
304    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
305
306    if ($self->{'OIDtype'} eq "auto") {
307    # hasn't been set in the plugin, use the processor values
308    $self->{'OIDtype'} = $processor->{'OIDtype'};
309    $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
310    }
311    if ($self->{'OIDtype'} eq "hash") {
312    # should we hash on the file or on the doc xml??
313    $self->{'OIDtype'} = $self->get_oid_hash_type();
314    if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
315        $self->{'OIDtype'} = "hash_on_file";
316    }
317    }
318}
319
320# This is called once if removeold is set with import.pl. Most plugins will do
321# nothing but if a plugin does any stuff outside of creating doc obj, then
322# it may need to clear something.
323sub remove_all {
324    my $self = shift (@_);
325    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
326}
327
328# This is called per document for docs that have been deleted from the
329# collection. Most plugins will do nothing
330# but if a plugin does any stuff outside of creating doc obj, then it may need
331# to clear something.
332sub remove_one {
333    my $self = shift (@_);
334   
335    my ($file, $oids, $archivedir) = @_;
336    return 0 if $self->can_process_this_file($file);
337    return undef;
338}
339
340sub end {
341    # potentially called at the end of each plugin pass
342    # import.pl only has one plugin pass, but buildcol.pl has multiple ones
343
344    my ($self) = shift (@_);
345}
346
347sub deinit {
348    # called only once, after all plugin passes have been done
349
350    my ($self) = @_;
351}
352
353# default hashing type is to hash on the original file (or converted file)
354# override this to return hash_on_ga_xml for filetypes where hashing on the
355# file is no good eg video
356sub get_oid_hash_type {
357
358    my $self = shift (@_);
359
360    return "hash_on_file";
361}
362
363
364# this function should be overridden to return 1
365# in recursive plugins
366sub is_recursive {
367    my $self = shift (@_);
368
369    return 0;
370}
371
372sub get_default_block_exp {
373    my $self = shift (@_);
374
375    return "";
376}
377
378sub get_default_process_exp {
379    my $self = shift (@_);
380
381    return "";
382}
383
384
385# rename imported files and assoc files using URL encoding by default
386# as this will work for most plugins and give more legible filenames
387sub get_default_file_rename_method() {
388    my $self = shift (@_);
389    return "url";
390}
391
392# returns this plugin's active (possibly user-selected) file_rename_method
393sub get_file_rename_method() {
394    my $self = shift (@_);
395    my $rename_method = $self->{'file_rename_method'};
396    if($rename_method) {
397    return $rename_method;
398    } else {   
399    return $self->get_default_file_rename_method();
400    }
401}
402
403# default implementation is to do nothing
404sub store_block_files {
405   
406    my $self =shift (@_);
407    my ($filename_full_path, $block_hash) = @_;
408
409}
410
411# put files to block into hash
412sub use_block_expressions {
413
414    my $self =shift (@_);
415    my ($filename_full_path, $block_hash) = @_;
416
417    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
418
419    if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
420    &util::block_filename($block_hash,$filename_full_path);
421    }
422
423}
424
425#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
426sub block_cover_image
427{
428    my $self =shift;
429    my ($filename, $block_hash) = @_;
430
431    $filename = &util::upgrade_if_dos_filename($filename);
432
433    if ($self->{'cover_image'}) {
434    my $coverfile = $filename;
435    $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
436
437    if (!&util::fd_exists($coverfile)) {
438        $coverfile =~ s/jpg$/JPG/;
439    }   
440    if (&util::fd_exists($coverfile)) {
441        &util::block_filename($block_hash,$coverfile);
442    }
443    }
444
445    return;
446}
447
448
449# discover all the files that should be blocked by this plugin
450# check the args ...
451sub file_block_read {
452
453    my $self = shift (@_); 
454    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
455    # Keep track of filenames with same root but different extensions
456    # Used to support -associate_ext and the more generalised
457    # -associate_tail_re
458    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
459
460    if (!-d $filename_full_path) {
461    $block_hash->{'all_files'}->{$file} = 1;
462    }
463
464    my $associate_tail_re = $self->{'associate_tail_re'};
465    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
466    my ($file_prefix,$file_ext)
467        = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
468    if ((defined $file_prefix) && (defined $file_ext)) {
469        my $shared_fileroot = $block_hash->{'shared_fileroot'};
470        if (!defined $shared_fileroot->{$file_prefix}) {
471        my $file_prefix_rec = { 'tie_to'  => undef,
472                        'exts'    => {} };
473        $shared_fileroot->{$file_prefix} = $file_prefix_rec;
474        }
475       
476        my $file_prefix_rec = $shared_fileroot->{$file_prefix};
477
478        if ($self->can_process_this_file($filename_full_path)) {
479        # This is the document the others should be tied to
480        $file_prefix_rec->{'tie_to'} = $file_ext;
481        }
482        else {
483        if ($file_ext =~ m/$associate_tail_re$/) {
484            # this file should be associated to the main one
485            $file_prefix_rec->{'exts'}->{$file_ext} = 1;
486        }
487        }
488
489    }
490    }
491
492    # check block expressions
493    $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
494
495    # now check whether we are actually processing this
496    if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
497    return undef; # can't recognise
498    }
499   
500    # if we have a block_exp, then this overrides the normal 'smart' blocking
501    $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
502
503    # block the cover image if there is one
504    if ($self->{'cover_image'}) {
505    $self->block_cover_image($filename_full_path, $block_hash);
506    }
507   
508    return 1;
509}
510
511# plugins that rely on more than process_exp (eg XML plugins) can override this method
512sub can_process_this_file {
513    my $self = shift(@_);
514    my ($filename) = @_;
515
516    if (-d $filename && !$self->{'can_process_directories'}) {
517    return 0;
518    }
519
520    if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
521    return 1;
522    }
523    return 0;
524   
525}
526
527# just converts path as is to utf8.
528sub filepath_to_utf8 {
529    my $self = shift (@_); 
530    my ($file, $file_encoding) = @_;
531    my $filemeta = $file;
532
533    my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
534
535    # Whenever filename-encoding is set to any of the auto settings, we
536    # check if the filename is already in UTF8. If it is, then we're done.
537    if($filename_encoding =~ m/auto/) {
538    if(&unicode::check_is_utf8($filemeta))
539    {
540        $filename_encoding = "utf8";
541        return $filemeta;
542    }
543    }
544   
545    # Auto setting, but filename is not utf8
546    if ($filename_encoding eq "auto")
547    {
548    # try textcat
549    $filename_encoding = $self->textcat_encoding($filemeta);
550   
551    # check the locale next
552    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
553   
554   
555    # now try the encoding of the document, if available
556    if ($filename_encoding eq "undefined" && defined $file_encoding) {
557        $filename_encoding = $file_encoding;
558    }
559
560    }
561
562    elsif ($filename_encoding eq "auto-language-analysis")
563    {   
564    $filename_encoding = $self->textcat_encoding($filemeta);
565
566    # now try the encoding of the document, if available
567    if ($filename_encoding eq "undefined" && defined $file_encoding) {
568        $filename_encoding = $file_encoding;
569    }
570    }
571
572    elsif ($filename_encoding eq "auto-filesystem-encoding")
573    {   
574    # try locale
575    $filename_encoding = $self->locale_encoding();
576    }
577
578    elsif ($filename_encoding eq "auto-fl")
579    {
580    # filesystem-encoding (locale) then language-analysis (textcat)
581    $filename_encoding = $self->locale_encoding();
582   
583    # try textcat
584    $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
585   
586    # else assume filename encoding is encoding of file content, if that's available
587    if ($filename_encoding eq "undefined" && defined $file_encoding) {
588        $filename_encoding = $file_encoding;
589    }
590    }
591   
592    elsif ($filename_encoding eq "auto-lf")
593    {
594    # language-analysis (textcat) then filesystem-encoding (locale)
595    $filename_encoding = $self->textcat_encoding($filemeta);
596   
597    # guess filename encoding from encoding of file content, if available
598    if ($filename_encoding eq "undefined" && defined $file_encoding) {
599        $filename_encoding = $file_encoding;
600    }
601
602    # try locale
603    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
604    }
605   
606    # if still undefined, use utf8 as fallback
607    if ($filename_encoding eq "undefined") {
608    $filename_encoding = "utf8";
609    }
610
611    #print STDERR "**** UTF8 encoding the filename $filemeta ";
612   
613    # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
614    # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
615    # be always utf8 (in which case the filename's encoding is also set as utf8 even
616    # though the filename need not be if it originates from another system)--in such
617    # cases attempt to make the filename utf8 to match.
618    if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
619    &unicode::ensure_utf8(\$filemeta);
620    }
621
622    # convert non-unicode encodings to utf8
623    if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
624    $filemeta = &unicode::unicode2utf8(
625                       &unicode::convert2unicode($filename_encoding, \$filemeta)
626                       );
627    }
628
629    #print STDERR " from encoding $filename_encoding -> $filemeta\n";
630    return $filemeta;
631}
632
633# gets the filename with no path, converts to utf8, and then dm safes it.
634# filename_encoding set by user
635sub filename_to_utf8_metadata
636{
637    my $self = shift (@_); 
638    my ($file, $file_encoding) = @_;
639
640    my $outhandle = $self->{'outhandle'};
641
642    print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
643    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
644    print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
645
646
647    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
648    $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
649
650    return $filemeta;
651}
652
653sub locale_encoding {
654    my $self = shift(@_);
655   
656    if (!defined $self->{'filesystem_encoding'}) {
657    $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
658    }
659
660    #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
661    return $self->{'filesystem_encoding'}; # can be the string "undefined"
662}
663
664sub textcat_encoding {
665    my $self = shift(@_);
666    my ($filemeta) = @_;
667
668    # analyse filenames without extensions and digits (and trimmed of
669    # surrounding whitespace), so that irrelevant chars don't confuse
670    # textcat
671    my $strictfilemeta = $filemeta;
672    $strictfilemeta =~ s/\.[^\.]+$//g;
673    $strictfilemeta =~ s/\d//g;
674    $strictfilemeta =~ s/^\s*//g;
675    $strictfilemeta =~ s/\s*$//g;
676   
677    my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
678    if(!defined $filename_encoding) {
679    $filename_encoding = "undefined";
680    }
681
682    return $filename_encoding; # can be the string "undefined"
683}
684
685# performs textcat
686sub encoding_from_language_analysis {
687    my $self = shift(@_);
688    my ($text) = @_;
689
690    my $outhandle = $self->{'outhandle'};
691    my $best_encoding = undef;
692   
693    # get the language/encoding of the textstring using textcat
694    require textcat;  # Only load the textcat module if it is required
695    $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
696    my $results = $self->{'textcat'}->classify_cached_filename(\$text);
697
698
699    if (scalar @$results < 0) {
700    return undef;
701    }
702   
703    # We have some results, we choose the first
704    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
705   
706    $best_encoding = $encoding;
707    if (!defined $best_encoding) {
708    return undef;
709    }
710   
711    if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
712    # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
713    $best_encoding = 'utf8';
714    }
715   
716   
717    # check for equivalents where textcat doesn't have some encodings...
718    # eg MS versions of standard encodings
719    if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
720    my $iso = $1; # which variant of the iso standard?
721    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
722    if ($text =~ /[\x80-\x9f]/) {
723        # Western Europe
724        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
725        elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
726        elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
727        elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
728        elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
729        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
730        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
731    }
732    }
733   
734    if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
735    !defined $encodings::encodings->{$best_encoding})
736    {
737    if ($self->{'verbosity'}) {
738        gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
739    }
740    $best_encoding = undef;
741    }
742   
743    return $best_encoding;
744}
745
746# uses locale
747sub get_filesystem_encoding
748{
749
750    my $self = shift(@_);
751
752    my $outhandle = $self->{'outhandle'};
753    my $filesystem_encoding = undef;
754
755    eval {
756    # Works for Windows as well, returning the DOS code page in use
757    use POSIX qw(locale_h);
758   
759    # With only one parameter, setlocale retrieves the
760    # current value
761    my $current_locale = setlocale(LC_CTYPE);
762   
763    my $char_encoding = undef;
764    if ($current_locale =~ m/\./) {
765        ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
766        $char_encoding = lc($char_encoding);
767    }
768    else {
769        if ($current_locale =~ m/^(posix|c)$/i) {
770        $char_encoding = "ascii";
771        }
772    }
773
774    if (defined $char_encoding) {
775        if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
776        $char_encoding = "$1\_$2\_$3";
777        }
778
779        $char_encoding =~ s/-/_/g;
780        $char_encoding =~ s/^utf_8$/utf8/;
781       
782        if ($char_encoding =~ m/^\d+$/) {
783        if (defined $encodings::encodings->{"windows_$char_encoding"}) {
784            $char_encoding = "windows_$char_encoding";
785        }
786        elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
787            $char_encoding = "dos_$char_encoding";
788        }
789        }
790       
791        if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
792        || (defined $encodings::encodings->{$char_encoding})) {
793        $filesystem_encoding = $char_encoding;
794        }
795        else {
796        print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
797        }
798    }
799   
800
801    };
802    if ($@) {
803    print $outhandle "$@\n";
804    print $outhandle "Warning: Unable to establish locale.  Will assume filesystem is UTF-8\n";
805   
806    }
807
808    return $filesystem_encoding;
809}
810
811
812
813sub deduce_filename_encoding
814{
815    my $self = shift (@_); 
816    my ($file,$metadata,$plugin_filename_encoding) = @_;
817
818    my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
819    my $deduced_filename_encoding = undef;
820   
821    # Start by looking for manually assigned metadata
822    if (defined $gs_filename_encoding) {
823    if (ref ($gs_filename_encoding) eq "ARRAY") {
824        my $outhandle = $self->{'outhandle'};
825       
826        $deduced_filename_encoding = $gs_filename_encoding->[0];
827       
828        my $num_vals = scalar(@$gs_filename_encoding);
829        if ($num_vals>1) {
830        print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
831        print $outhandle "         Selecting first value: $deduced_filename_encoding\n";
832        }
833    }
834    else {
835        $deduced_filename_encoding = $gs_filename_encoding;
836    }
837    }
838   
839    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
840    # Look to see if plugin specifies this value
841
842    if (defined $plugin_filename_encoding) {
843        # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
844        if ($plugin_filename_encoding =~ m/^auto-.*$/) {
845        my $outhandle = $self->{'outhandle'};
846        print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
847        print $outhandle "         default to 'auto'\n";
848        $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
849        }
850       
851        if ($plugin_filename_encoding ne "auto") {
852        # We've been given a specific filenamne encoding
853        # => so use it!
854        $deduced_filename_encoding = $plugin_filename_encoding;
855        }
856    }
857    }
858   
859    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
860
861    # Look to file system to provide a character encoding
862
863    # If Windows NTFS, then -- assuming we work with long file names got through
864    # Win32::GetLongFilePath() -- then the underlying file system is UTF16
865
866    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
867        # Can do better than working with the DOS character encoding returned by locale     
868        $deduced_filename_encoding = "unicode";
869    }
870    else {
871        # Unix of some form or other
872
873        # See if we can determine the file system encoding through locale
874        $deduced_filename_encoding = $self->locale_encoding();
875       
876        # if locale shows us filesystem is utf8, check to see filename is consistent
877        # => if not, then we have an "alien" filename on our hands
878
879        if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
880        if (!&unicode::check_is_utf8($file)) {
881            # "alien" filename, so revert
882            $deduced_filename_encoding = undef;
883        }
884        }
885    }
886    }
887   
888#    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
889#       # Last chance, apply textcat to deduce filename encoding
890#       $deduced_filename_encoding = $self->textcat_encoding($file);
891#    }
892
893    if ($self->{'verbosity'}>3) {
894    my $outhandle = $self->{'outhandle'};
895
896    if (defined $deduced_filename_encoding) {
897        print $outhandle "  Deduced filename encoding as: $deduced_filename_encoding\n";
898    }
899    else {
900        print $outhandle "  No filename encoding deduced\n";
901    }
902    }
903   
904    return $deduced_filename_encoding;
905}
906
907
908
909
910# Notionally written to be called once for each document, it is however safe to
911# call multiple times (as in the case of ImagePlugin) which calls this later on
912# after the original image has potentially been converted to a *new* source image
913# format (e.g. TIFF to PNG)
914
915sub set_Source_metadata {
916    my $self = shift (@_); 
917    my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
918   
919    # 1. Sets the filename (Source) for display encoded as Unicode if possible,
920    #    and (as a fallback) using %xx if not for non-ascii chars
921    # 2. Sets the url ref (SourceFile) to the URL encoded version
922    #    of filename for generated files
923   
924    my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
925
926    my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
927
928    my $octet_file = $raw_file;
929
930    # UTF-8 version of filename
931#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
932#   print STDERR "**** Setting Source Metadata given: $octet_file\n";
933#    }
934   
935    # Deal with (on Windows) raw filenames that are in their
936    # abbreviated DOS form
937
938    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
939    if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
940        if (-e $raw_filename) {
941        my $unicode_filename = Win32::GetLongPathName($raw_filename);
942       
943        my $unused_full_uf;
944        ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
945        }
946    }
947    }
948
949    my $url_encoded_filename;
950    if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
951    # => Generate a pretty print version of filename that is mapped to Unicode
952   
953    # Use filename_encoding to map raw filename to a Perl unicode-aware string
954    $url_encoded_filename = decode($filename_encoding,$octet_file);     
955    }
956    else {
957    # otherwise generate %xx encoded version of filename for char > 127
958    $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
959    }
960   
961#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
962#   print STDERR "****** saving Source as:             $url_encoded_filename\n";
963#    }
964
965    # In the case of converted files and (generalized) exploded documents, there
966    # will already be a source filename => store as OrigSource before overriding
967    my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
968    if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
969    $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
970    }
971       
972    # Source is the UTF8 display name - not necessarily the name of the file on the system
973    if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
974    # on Darwin want all display strings to be in composed form, then can search on that
975    $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
976    }
977    $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
978
979   
980    my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
981    # If using URL encoding, then SourceFile is the url-reference to url-encoded
982    # renamed_raw_url: it's a url that refers to the actual file on the system
983    my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
984
985    $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
986                    $renamed_raw_url);
987
988#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
989#   print STDERR "****** saving SourceFile as:         $renamed_raw_url\n";
990#    }
991}
992
993# this should be called by all plugins to set the oid of the doc obj, rather
994# than calling doc_obj->set_OID directly
995sub add_OID {
996    my $self = shift (@_); 
997    my ($doc_obj) = @_;
998
999    $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
1000
1001    # see if there is a plugin specific set_OID function
1002    if (defined ($self->can('set_OID'))) {
1003    $self->set_OID(@_); # pass through doc_obj and any extra arguments
1004    }
1005    else {
1006    # use the default set_OID() in doc.pm
1007    $doc_obj->set_OID();
1008    }
1009
1010}
1011
1012# The BasePlugin read_into_doc_obj() function. This function does all the
1013# right things to make general options work for a given plugin.  It doesn't do anything with the file other than setting reads in
1014# a file and sets up a slew of metadata all saved in doc_obj, which
1015# it then returns as part of a tuple (process_status,doc_obj)
1016#
1017# Much of this functionality used to reside in read, but it was broken
1018# down into a supporting routine to make the code more flexible. 
1019#
1020# recursive plugins (e.g. RecPlug) and specialized plugins like those
1021# capable of processing many documents within a single file (e.g.
1022# GMLPlug) will normally want to implement their own version of
1023# read_into_doc_obj()
1024#
1025# Note that $base_dir might be "" and that $file might
1026# include directories
1027
1028# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1029sub read_into_doc_obj {
1030    my $self = shift (@_); 
1031    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1032
1033    my $outhandle = $self->{'outhandle'};
1034
1035    # should we move this to read? What about secondary plugins?
1036    my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
1037    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1038    print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1039    if $self->{'verbosity'} > 1;
1040
1041    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1042   
1043    # create a new document
1044    my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1045    my $top_section = $doc_obj->get_top_section();
1046
1047    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1048    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1049   
1050
1051    my $plugin_filename_encoding = $self->{'filename_encoding'};
1052    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1053    $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
1054
1055    # plugin specific stuff - what args do we need here??
1056    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1057    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1058    return -1;
1059    }
1060   
1061    # include any metadata passed in from previous plugins
1062    # note that this metadata is associated with the top level section
1063    my $section = $doc_obj->get_top_section();
1064    # can we merge these two methods??
1065    $self->add_associated_files($doc_obj, $filename_full_path);
1066    $self->extra_metadata ($doc_obj, $section, $metadata);
1067    $self->auto_extract_metadata($doc_obj);
1068
1069    # if we haven't found any Title so far, assign one
1070    # this was shifted to here from inside read()
1071    $self->title_fallback($doc_obj,$section,$filename_no_path);
1072   
1073    $self->add_OID($doc_obj);
1074   
1075    $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1076    return (1,$doc_obj);
1077}
1078
1079sub post_process_doc_obj {
1080    my $self = shift (@_); 
1081    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1082
1083    return 1;
1084}
1085
1086sub add_dummy_text {
1087    my $self = shift(@_);
1088    my ($doc_obj, $section) = @_;
1089
1090    # add NoText metadata so we can hide this dummy text in format statements
1091    $doc_obj->add_metadata($section, "NoText", "1");
1092    $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1093   
1094}
1095
1096# does nothing. Can be overridden by subclass
1097sub auto_extract_metadata {
1098    my $self = shift(@_);
1099    my ($doc_obj) = @_;
1100}
1101
1102# adds cover image, associate_file options stuff. Should be called by sub class
1103# read_into_doc_obj
1104sub add_associated_files {
1105    my $self = shift(@_);
1106    # whatis filename??
1107    my ($doc_obj, $filename) = @_;
1108   
1109    # add in the cover image
1110    if ($self->{'cover_image'}) {
1111    $self->associate_cover_image($doc_obj, $filename);
1112    }
1113    # store the original (used for eg TextPlugin to store the original for OAI)
1114    if ($self->{'store_original_file'}) {
1115    $self->associate_source_file($doc_obj, $filename);
1116    }
1117   
1118
1119}
1120
1121# implement this if you are extracting metadata for other documents
1122sub metadata_read {
1123    my $self = shift (@_);
1124    my ($pluginfo, $base_dir, $file, $block_hash,
1125    $extrametakeys, $extrametadata, $extrametafile,
1126    $processor, $gli, $aux) = @_;
1127   
1128    # can we process this file??
1129    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1130    return undef unless $self->can_process_this_file($filename_full_path);
1131
1132    return 1; # we recognise the file, but don't actually do anything with it
1133}
1134
1135
1136# The BasePlugin read() function. This function calls read_into_doc_obj()
1137# to ensure all the right things to make general options work for a
1138# given plugin are done. It then calls the process() function which
1139# does all the work specific to a plugin (like the old read functions
1140# used to do). Most plugins should define their own process() function
1141# and let this read() function keep control. 
1142#
1143# recursive plugins (e.g. RecPlug) and specialized plugins like those
1144# capable of processing many documents within a single file (e.g.
1145# GMLPlug) might want to implement their own version of read(), but
1146# more likely need to implement their own version of read_into_doc_obj()
1147#
1148# Return number of files processed, undef if can't recognise, -1 if can't
1149# process
1150
1151sub read {
1152    my $self = shift (@_); 
1153    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1154
1155    # can we process this file??
1156    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1157
1158    return undef unless $self->can_process_this_file($filename_full_path);
1159   
1160    my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1161   
1162    if ((defined $process_status) && ($process_status == 1)) {
1163   
1164    # process the document
1165    $processor->process($doc_obj);
1166
1167    $self->{'num_processed'} ++;
1168    undef $doc_obj;
1169    }
1170    # delete any temp files that we may have created
1171    $self->clean_up_after_doc_obj_processing();
1172
1173
1174    # if process_status == 1, then the file has been processed.
1175    return $process_status;
1176
1177}
1178
1179# returns undef if file is rejected by the plugin
1180sub process {
1181    my $self = shift (@_);
1182    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1183
1184    gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
1185
1186    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1187    print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1188
1189    die "\n";
1190
1191    return undef; # never gets here
1192}
1193
1194# overwrite this method to delete any temp files that we have created
1195sub clean_up_after_doc_obj_processing {
1196    my $self = shift(@_);
1197
1198}
1199
1200# write_file -- used by ConvertToPlug, for example in post processing
1201#
1202# where should this go, is here the best place??
1203sub utf8_write_file {
1204    my $self = shift (@_);
1205    my ($textref, $filename) = @_;
1206   
1207    if (!open (FILE, ">:utf8", $filename)) {
1208    gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1209    die "\n";
1210    }
1211    print FILE $$textref;
1212   
1213    close FILE;
1214}
1215
1216
1217sub filename_based_title
1218{
1219    my $self = shift (@_);
1220    my ($file) = @_;
1221
1222    my $file_derived_title = $file;
1223    $file_derived_title =~ s/_/ /g;
1224    $file_derived_title =~ s/\.[^.]+$//;
1225
1226    return $file_derived_title;
1227}
1228
1229
1230sub title_fallback
1231{
1232    my $self = shift (@_);
1233    my ($doc_obj,$section,$file) = @_;
1234
1235    if (!defined $doc_obj->get_metadata_element ($section, "Title")
1236    || $doc_obj->get_metadata_element($section, "Title") eq "") {
1237
1238    my $source_file = $doc_obj->get_metadata_element($section, "Source");
1239    my $file_derived_title;
1240    if (defined $source_file) {
1241        $file_derived_title =  $self->filename_based_title($source_file);
1242    }
1243    else {
1244        # pp = pretty print
1245        my $pp_file = (defined $source_file) ? $source_file : $file;
1246
1247        my $raw_title = $self->filename_based_title($file);
1248        my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1249    }
1250
1251
1252    if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1253        $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1254    }
1255    else {
1256        $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1257    }
1258    }
1259   
1260}
1261
1262# add any extra metadata that's been passed around from one
1263# plugin to another.
1264# extra_metadata uses add_utf8_metadata so it expects metadata values
1265# to already be in utf8
1266sub extra_metadata {
1267    my $self = shift (@_);
1268    my ($doc_obj, $cursection, $metadata) = @_;
1269
1270    my $associate_tail_re = $self->{'associate_tail_re'};
1271
1272    foreach my $field (keys(%$metadata)) {
1273    # $metadata->{$field} may be an array reference
1274    if ($field eq "gsdlassocfile_tobe") {
1275        # 'gsdlassocfile_tobe' is artificially introduced metadata
1276        # that is used to signal that certain additional files should
1277        # be tied to this document.  Useful in situations where a
1278        # metadata pass in the plugin pipeline works out some files
1279        # need to be associated with a document, but the document hasn't
1280        # been formed yet.
1281        my $equiv_form = "";
1282        foreach my $gaf (@{$metadata->{$field}}) {
1283        my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1284        my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1285       
1286        # we need to make sure the filename is valid utf-8 - we do
1287        # this by url or base64 encoding it
1288        # $tail_filename is the name that we store the file as
1289        $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1290        $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1291        $doc_obj->associate_source_file($full_filename);
1292        # If the filename is url_encoded, we need to encode the % signs
1293        # in the filename, so that it works in a url
1294        my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1295        # work out extended tail extension (i.e. matching tail re)
1296
1297        my ($file_prefix,$file_extended_ext)
1298            = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1299        my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1300        my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1301        my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1302        my $srcicon = "_icon".$doc_ext."_";
1303        my $end_doclink = "</a>";
1304
1305        my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1306
1307        if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1308            # for metadata such as [mp3._edited] [mp3._full] ...
1309            $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1310        }
1311
1312        # for multiple metadata such as [mp3.assoclink]
1313        $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1314       
1315        $equiv_form .= " $assoc_form"; 
1316        }
1317        $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1318    }
1319    elsif ($field eq "gsdlzipfilename") {
1320        # special case for when files have come out of a zip. source_path
1321        # (used for archives dbs and keeping track for incremental import)
1322        # must be set to the zip file name
1323        my $zip_filename = $metadata->{$field};
1324        # overwrite the source_path
1325        $doc_obj->set_source_path($zip_filename);
1326        # and set the metadata
1327        $zip_filename = &util::filename_within_collection($zip_filename);
1328        $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1329        $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1330    }
1331    elsif (ref ($metadata->{$field}) eq "ARRAY") {
1332        map {
1333        $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1334        } @{$metadata->{$field}};
1335    } else {
1336        $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1337    }
1338    }
1339}
1340
1341
1342sub compile_stats {
1343    my $self = shift(@_);
1344    my ($stats) = @_;
1345
1346    $stats->{'num_processed'} += $self->{'num_processed'};
1347    $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1348    $stats->{'num_archives'} += $self->{'num_archives'};
1349
1350}
1351sub associate_source_file {
1352    my $self = shift(@_);
1353   
1354    my ($doc_obj, $filename) = @_;
1355    my $cursection = $doc_obj->get_top_section();
1356    my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1357   
1358    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1359    $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1360}
1361
1362sub associate_cover_image {
1363    my $self = shift(@_);
1364    my ($doc_obj, $filename) = @_;
1365
1366    my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1367
1368    $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1369    $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1370
1371    if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1372    # don't stat() for existence e.g. for multiple document input files
1373    # (eg SplitPlug)
1374    return;
1375    }
1376
1377    my $top_section=$doc_obj->get_top_section();
1378
1379    if (&util::fd_exists($upgraded_filename)) {
1380    $doc_obj->associate_source_file($filename);
1381        $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1382    $doc_obj->add_utf8_metadata($top_section, "hascover",  1);
1383    } else {
1384    my $upper_filename = $filename;
1385    my $upgraded_upper_filename = $upgraded_filename;
1386
1387    $upper_filename =~ s/jpg$/JPG/;
1388    $upgraded_upper_filename =~ s/jpg$/JPG/;
1389
1390    if (&util::fd_exists($upgraded_upper_filename)) {
1391        $doc_obj->associate_source_file($upper_filename);
1392        $doc_obj->associate_file($upper_filename, "cover.jpg",
1393                     "image/jpeg");
1394        $doc_obj->add_utf8_metadata($top_section, "hascover",  1);
1395    } else {
1396        # file doesn't exist, so record the fact that it's missing so
1397        # we don't stat() again (stat is slow)
1398        $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1399    }
1400    }
1401
1402}
1403
1404
1405# Overridden by exploding plugins (eg. ISISPlug)
1406sub clean_up_after_exploding
1407{
1408    my $self = shift(@_);
1409}
1410
1411
1412
14131;
Note: See TracBrowser for help on using the browser.