root/main/trunk/greenstone2/perllib/plugins/BasePlugin.pm @ 27957

Revision 27957, 46.9 KB (checked in by ak19, 6 years ago)

For now, undoing the change made to BasePlugin? for the diffcol nightly task. Once all the tutorial collections are passed successfully through diffcol, the model collections can be rebuilt or set up to be rebuilt automatically with the line now commented out.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34use Unicode::Normalize 'normalize';
35
36use encodings;
37use unicode;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41use util;
42use FileUtils;
43
44use PrintInfo;
45
46BEGIN {
47    @BasePlugin::ISA = ( 'PrintInfo' );
48}
49
50# the different methods that can be applied when renaming
51# imported documents and their associated files
52our $file_rename_method_list =
53    [ { 'name' => "url",
54    'desc' => "{BasePlugin.rename_method.url}" },
55      { 'name' => "base64",
56    'desc' => "{BasePlugin.rename_method.base64}" },
57      { 'name' => "none",
58    'desc' => "{BasePlugin.rename_method.none}",
59    'hiddengli' => "yes" } ];
60
61our $encoding_list =
62    [ { 'name' => "ascii",
63    'desc' => "{BasePlugin.encoding.ascii}" },
64      { 'name' => "utf8",
65    'desc' => "{BasePlugin.encoding.utf8}" },
66      { 'name' => "unicode",
67    'desc' => "{BasePlugin.encoding.unicode}" } ];
68
69
70my $e = $encodings::encodings;
71foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
72{
73    my $hashEncode =
74    {'name' => $enc,
75     'desc' => $e->{$enc}->{'name'}};
76   
77    push(@{$encoding_list},$hashEncode);
78}
79
80our $encoding_plus_auto_list =
81    [ { 'name' => "auto",
82    'desc' => "{BasePlugin.filename_encoding.auto}" },
83      { 'name' => "auto-language-analysis",
84    'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
85      { 'name' => "auto-filesystem-encoding",
86    'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
87      { 'name' => "auto-fl",
88    'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
89      { 'name' => "auto-lf",
90    'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
91
92push(@{$encoding_plus_auto_list},@{$encoding_list});
93
94our $oidtype_list =
95    [ { 'name' => "auto",
96    'desc' => "{BasePlugin.OIDtype.auto}" },
97      { 'name' => "hash",
98        'desc' => "{import.OIDtype.hash}" },
99      { 'name' => "hash_on_ga_xml",
100        'desc' => "{import.OIDtype.hash_on_ga_xml}" },
101      { 'name' => "hash_on_full_filename",
102        'desc' => "{import.OIDtype.hash_on_full_filename}" },
103      { 'name' => "assigned",
104        'desc' => "{import.OIDtype.assigned}" },
105      { 'name' => "incremental",
106        'desc' => "{import.OIDtype.incremental}" },
107      { 'name' => "filename",
108        'desc' => "{import.OIDtype.filename}" },
109      { 'name' => "dirname",
110        'desc' => "{import.OIDtype.dirname}" },
111      { 'name' => "full_filename",
112        'desc' => "{import.OIDtype.full_filename}" } ];
113
114my $arguments =
115    [ { 'name' => "process_exp",
116    'desc' => "{BasePlugin.process_exp}",
117    'type' => "regexp",
118    'deft' => "",
119    'reqd' => "no" },
120      { 'name' => "no_blocking",
121    'desc' => "{BasePlugin.no_blocking}",
122    'type' => "flag",
123    'reqd' => "no"},
124      { 'name' => "block_exp",
125    'desc' => "{BasePlugin.block_exp}",
126    'type' => "regexp",
127    'deft' => "",
128    'reqd' => "no" },
129      { 'name' => "store_original_file",
130    'desc' => "{BasePlugin.store_original_file}",
131    'type' => "flag",
132    'reqd' => "no" },
133      { 'name' => "associate_ext",
134    'desc' => "{BasePlugin.associate_ext}",
135    'type' => "string",
136    'reqd' => "no" },
137      { 'name' => "associate_tail_re",
138    'desc' => "{BasePlugin.associate_tail_re}",
139    'type' => "string",
140    'reqd' => "no" },
141      { 'name' => "OIDtype",
142    'desc' => "{import.OIDtype}",
143    'type' => "enum",
144    'list' => $oidtype_list,
145    # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
146    'deft' => "auto",
147    'reqd' => "no" },
148      { 'name' => "OIDmetadata",
149    'desc' => "{import.OIDmetadata}",
150    'type' => "metadata",
151    'deft' => "dc.Identifier",
152    'reqd' => "no" },     
153#      { 'name' => "use_as_doc_identifier",
154#   'desc' => "{BasePlugin.use_as_doc_identifier}",
155#   'type' => "string",
156#   'reqd' => "no" ,
157#   'deft' => "" } ,
158      { 'name' => "no_cover_image",
159    'desc' => "{BasePlugin.no_cover_image}",
160    'type' => "flag",
161    'reqd' => "no" },
162      { 'name' => "filename_encoding",
163    'desc' => "{BasePlugin.filename_encoding}",
164    'type' => "enum",
165    'deft' => "auto",
166    'list' => $encoding_plus_auto_list,
167    'reqd' => "no" },
168      { 'name' => "smart_block",
169        'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
170        'type' => "flag",
171        'reqd' => "no",
172    'hiddengli' => "yes" }, # deprecated, but leave in for old collections
173      { 'name' => "file_rename_method",
174    'desc' => "{BasePlugin.file_rename_method}",
175    'type' => "enum",
176    'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
177    'list' => $file_rename_method_list,
178    'reqd' => "no"
179    }
180     
181      ];
182
183
184my $options = { 'name'     => "BasePlugin",
185        'desc'     => "{BasePlugin.desc}",
186        'abstract' => "yes",
187        'inherits' => "no",
188        'args'     => $arguments };
189
190sub new {
191
192    my ($class) = shift (@_);
193    my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
194    push(@$pluginlist, $class);
195
196    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
197    push(@{$hashArgOptLists->{"OptList"}},$options);
198
199    my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
200   
201    if ($self->{'info_only'}) {
202        # don't worry about any options etc
203        return bless $self, $class;
204    }
205
206    if ($self->{'smart_block'}) {
207    print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
208    }
209    $self->{'smart_block'} = undef;
210
211    my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
212    $self->{'plugin_type'} = $plugin_name;
213
214    # remove ex. from OIDmetadata iff it's the only namespace prefix
215    $self->{'OIDmetadata'} =~ s/^ex\.([^.]+)$/$1/ if defined $self->{'OIDmetadata'};
216    $self->{'num_processed'} = 0;
217    $self->{'num_not_processed'} = 0;
218    $self->{'num_blocked'} = 0;
219    $self->{'num_archives'} = 0;
220    $self->{'cover_image'} = 1; # cover image is on by default
221    $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
222    $self->{'can_process_directories'} = 0;
223    #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
224   
225    my $associate_ext = $self->{'associate_ext'};
226    if ((defined $associate_ext) && ($associate_ext ne "")) {
227
228    my $associate_tail_re = $self->{'associate_tail_re'};
229    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
230        my $outhandle = $self->{'outhandle'};
231        print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
232        print $outhandle "         defaulting to 'associate_tail_re'\n";
233    }
234    else {
235        my @exts = split(/,/,$associate_ext);
236
237        my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
238        my $associate_tail_re = join("|",@exts_bracketed);
239        $self->{'associate_tail_re'} = $associate_tail_re;
240    }
241
242    delete $self->{'associate_ext'};
243    }
244
245    return bless $self, $class;
246
247}
248
249sub merge_inheritance
250{
251    my $self = {};
252    my @child_selfs = @_;
253
254    foreach my $child_self (@child_selfs) {
255    foreach my $key (keys %$child_self) {
256        if (defined $self->{$key}) {
257        if ($self->{$key} ne $child_self->{$key}) {
258#           print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
259#           print STDERR "Existing stored value = $self->{$key}\n";
260#           print STDERR "New (child) value     = $child_self->{$key}\n";
261#           print STDERR "Keeping existing value\n";
262            # Existing value seems to be option specified in collect.cfg
263
264            ### $self->{$key} = $child_self->{$key};
265           
266        }
267        else {
268##          print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
269        }
270
271        }
272        else {
273        $self->{$key} = $child_self->{$key};
274        }
275    }
276    }
277
278    return $self;   
279}
280
281# initialize BasePlugin options
282# if init() is overridden in a sub-class, remember to call BasePlugin::init()
283sub init {
284    my $self = shift (@_);
285    my ($verbosity, $outhandle, $failhandle) = @_;
286
287    # verbosity is passed through from the processor
288    $self->{'verbosity'} = $verbosity;
289
290    # as are the outhandle and failhandle
291    $self->{'outhandle'} = $outhandle if defined $outhandle;
292    $self->{'failhandle'} = $failhandle;
293#    $self->SUPER::init(@_);
294   
295    # set process_exp and block_exp to defaults unless they were
296    # explicitly set
297
298    if ((!$self->is_recursive()) and
299    (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
300
301    $self->{'process_exp'} = $self->get_default_process_exp ();
302    if ($self->{'process_exp'} eq "") {
303        warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
304    }
305    }
306
307    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
308    $self->{'block_exp'} = $self->get_default_block_exp ();
309    }
310
311}
312
313sub begin {
314    my $self = shift (@_);
315    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
316
317    if ($self->{'OIDtype'} eq "auto") {
318    # hasn't been set in the plugin, use the processor values
319    $self->{'OIDtype'} = $processor->{'OIDtype'};
320    $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
321    }
322    if ($self->{'OIDtype'} eq "hash") {
323    # should we hash on the file or on the doc xml??
324    $self->{'OIDtype'} = $self->get_oid_hash_type();
325    if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
326        $self->{'OIDtype'} = "hash_on_file";
327    }
328    }
329}
330
331# This is called once if removeold is set with import.pl. Most plugins will do
332# nothing but if a plugin does any stuff outside of creating doc obj, then
333# it may need to clear something.
334sub remove_all {
335    my $self = shift (@_);
336    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
337}
338
339# This is called per document for docs that have been deleted from the
340# collection. Most plugins will do nothing
341# but if a plugin does any stuff outside of creating doc obj, then it may need
342# to clear something.
343sub remove_one {
344    my $self = shift (@_);
345   
346    my ($file, $oids, $archivedir) = @_;
347    return 0 if $self->can_process_this_file($file);
348    return undef;
349}
350
351sub end {
352    # potentially called at the end of each plugin pass
353    # import.pl only has one plugin pass, but buildcol.pl has multiple ones
354
355    my ($self) = shift (@_);
356}
357
358sub deinit {
359    # called only once, after all plugin passes have been done
360
361    my ($self) = @_;
362}
363
364# default hashing type is to hash on the original file (or converted file)
365# override this to return hash_on_ga_xml for filetypes where hashing on the
366# file is no good eg video
367sub get_oid_hash_type {
368
369    my $self = shift (@_);
370
371    return "hash_on_file";
372}
373
374
375# this function should be overridden to return 1
376# in recursive plugins
377sub is_recursive {
378    my $self = shift (@_);
379
380    return 0;
381}
382
383sub get_default_block_exp {
384    my $self = shift (@_);
385
386    return "";
387}
388
389sub get_default_process_exp {
390    my $self = shift (@_);
391
392    return "";
393}
394
395
396# rename imported files and assoc files using URL encoding by default
397# as this will work for most plugins and give more legible filenames
398sub get_default_file_rename_method() {
399    my $self = shift (@_);
400    return "url";
401}
402
403# returns this plugin's active (possibly user-selected) file_rename_method
404sub get_file_rename_method() {
405    my $self = shift (@_);
406    my $rename_method = $self->{'file_rename_method'};
407    if($rename_method) {
408    return $rename_method;
409    } else {   
410    return $self->get_default_file_rename_method();
411    }
412}
413
414# default implementation is to do nothing
415sub store_block_files {
416   
417    my $self =shift (@_);
418    my ($filename_full_path, $block_hash) = @_;
419
420}
421
422# put files to block into hash
423sub use_block_expressions {
424
425    my $self =shift (@_);
426    my ($filename_full_path, $block_hash) = @_;
427
428    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
429
430    if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
431    &util::block_filename($block_hash,$filename_full_path);
432    }
433
434}
435
436#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
437sub block_cover_image
438{
439    my $self =shift;
440    my ($filename, $block_hash) = @_;
441
442    $filename = &util::upgrade_if_dos_filename($filename);
443
444    if ($self->{'cover_image'}) {
445    my $coverfile = $filename;
446    $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
447
448    #if there is no file extension, coverfile will be the same as filename
449    return if $coverfile eq $filename;
450   
451    if (!&FileUtils::fileExists($coverfile)) {
452        $coverfile =~ s/jpg$/JPG/;
453    }   
454    if (&FileUtils::fileExists($coverfile)) {
455        &util::block_filename($block_hash,$coverfile);
456    }
457    }
458
459    return;
460}
461
462
463# discover all the files that should be blocked by this plugin
464# check the args ...
465sub file_block_read {
466
467    my $self = shift (@_); 
468    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
469    # Keep track of filenames with same root but different extensions
470    # Used to support -associate_ext and the more generalised
471    # -associate_tail_re
472    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
473
474    if (!-d $filename_full_path) {
475    $block_hash->{'all_files'}->{$file} = 1;
476    }
477
478    my $associate_tail_re = $self->{'associate_tail_re'};
479    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
480    my ($file_prefix,$file_ext)
481        = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
482    if ((defined $file_prefix) && (defined $file_ext)) {
483        my $shared_fileroot = $block_hash->{'shared_fileroot'};
484        if (!defined $shared_fileroot->{$file_prefix}) {
485        my $file_prefix_rec = { 'tie_to'  => undef,
486                        'exts'    => {} };
487        $shared_fileroot->{$file_prefix} = $file_prefix_rec;
488        }
489       
490        my $file_prefix_rec = $shared_fileroot->{$file_prefix};
491
492        if ($self->can_process_this_file($filename_full_path)) {
493        # This is the document the others should be tied to
494        $file_prefix_rec->{'tie_to'} = $file_ext;
495        }
496        else {
497        if ($file_ext =~ m/$associate_tail_re$/) {
498            # this file should be associated to the main one
499            $file_prefix_rec->{'exts'}->{$file_ext} = 1;
500        }
501        }
502
503    }
504    }
505
506    # check block expressions
507    $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
508
509    # now check whether we are actually processing this
510    if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
511    return undef; # can't recognise
512    }
513   
514    # if we have a block_exp, then this overrides the normal 'smart' blocking
515    $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
516
517    # block the cover image if there is one
518    if ($self->{'cover_image'}) {
519    $self->block_cover_image($filename_full_path, $block_hash);
520    }
521   
522    return 1;
523}
524
525# plugins that rely on more than process_exp (eg XML plugins) can override this method
526sub can_process_this_file {
527    my $self = shift(@_);
528    my ($filename) = @_;
529
530    if (-d $filename && !$self->{'can_process_directories'}) {
531    return 0;
532    }
533
534    if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
535    return 1;
536    }
537    return 0;
538   
539}
540
541# Even if a plugin can extract metadata in its metadata_read pass,
542# make the default return 'undef' so processing of the file continues
543# down the pipeline, so other plugins can also have the opportunity to
544# locate metadata and set it up in the extrametakeys variables that
545# are passed around.
546
547sub can_process_this_file_for_metadata {
548    my $self = shift(@_);
549
550    return undef;
551}
552
553
554# just converts path as is to utf8.
555sub filepath_to_utf8 {
556    my $self = shift (@_); 
557    my ($file, $file_encoding) = @_;
558    my $filemeta = $file;
559
560    my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
561
562    # Whenever filename-encoding is set to any of the auto settings, we
563    # check if the filename is already in UTF8. If it is, then we're done.
564    if($filename_encoding =~ m/auto/) {
565    if(&unicode::check_is_utf8($filemeta))
566    {
567        $filename_encoding = "utf8";
568        return $filemeta;
569    }
570    }
571   
572    # Auto setting, but filename is not utf8
573    if ($filename_encoding eq "auto")
574    {
575    # try textcat
576    $filename_encoding = $self->textcat_encoding($filemeta);
577   
578    # check the locale next
579    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
580   
581   
582    # now try the encoding of the document, if available
583    if ($filename_encoding eq "undefined" && defined $file_encoding) {
584        $filename_encoding = $file_encoding;
585    }
586
587    }
588
589    elsif ($filename_encoding eq "auto-language-analysis")
590    {   
591    $filename_encoding = $self->textcat_encoding($filemeta);
592
593    # now try the encoding of the document, if available
594    if ($filename_encoding eq "undefined" && defined $file_encoding) {
595        $filename_encoding = $file_encoding;
596    }
597    }
598
599    elsif ($filename_encoding eq "auto-filesystem-encoding")
600    {   
601    # try locale
602    $filename_encoding = $self->locale_encoding();
603    }
604
605    elsif ($filename_encoding eq "auto-fl")
606    {
607    # filesystem-encoding (locale) then language-analysis (textcat)
608    $filename_encoding = $self->locale_encoding();
609   
610    # try textcat
611    $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
612   
613    # else assume filename encoding is encoding of file content, if that's available
614    if ($filename_encoding eq "undefined" && defined $file_encoding) {
615        $filename_encoding = $file_encoding;
616    }
617    }
618   
619    elsif ($filename_encoding eq "auto-lf")
620    {
621    # language-analysis (textcat) then filesystem-encoding (locale)
622    $filename_encoding = $self->textcat_encoding($filemeta);
623   
624    # guess filename encoding from encoding of file content, if available
625    if ($filename_encoding eq "undefined" && defined $file_encoding) {
626        $filename_encoding = $file_encoding;
627    }
628
629    # try locale
630    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
631    }
632   
633    # if still undefined, use utf8 as fallback
634    if ($filename_encoding eq "undefined") {
635    $filename_encoding = "utf8";
636    }
637
638    #print STDERR "**** UTF8 encoding the filename $filemeta ";
639   
640    # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
641    # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
642    # be always utf8 (in which case the filename's encoding is also set as utf8 even
643    # though the filename need not be if it originates from another system)--in such
644    # cases attempt to make the filename utf8 to match.
645    if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
646    &unicode::ensure_utf8(\$filemeta);
647    }
648
649    # convert non-unicode encodings to utf8
650    if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
651    $filemeta = &unicode::unicode2utf8(
652                       &unicode::convert2unicode($filename_encoding, \$filemeta)
653                       );
654    }
655
656    #print STDERR " from encoding $filename_encoding -> $filemeta\n";
657    return $filemeta;
658}
659
660# gets the filename with no path, converts to utf8, and then dm safes it.
661# filename_encoding set by user
662sub filename_to_utf8_metadata
663{
664    my $self = shift (@_); 
665    my ($file, $file_encoding) = @_;
666
667    my $outhandle = $self->{'outhandle'};
668
669    print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
670    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
671    print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
672
673
674    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
675    $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
676
677    return $filemeta;
678}
679
680sub locale_encoding {
681    my $self = shift(@_);
682   
683    if (!defined $self->{'filesystem_encoding'}) {
684    $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
685    }
686
687    #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
688    return $self->{'filesystem_encoding'}; # can be the string "undefined"
689}
690
691sub textcat_encoding {
692    my $self = shift(@_);
693    my ($filemeta) = @_;
694
695    # analyse filenames without extensions and digits (and trimmed of
696    # surrounding whitespace), so that irrelevant chars don't confuse
697    # textcat
698    my $strictfilemeta = $filemeta;
699    $strictfilemeta =~ s/\.[^\.]+$//g;
700    $strictfilemeta =~ s/\d//g;
701    $strictfilemeta =~ s/^\s*//g;
702    $strictfilemeta =~ s/\s*$//g;
703   
704    my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
705    if(!defined $filename_encoding) {
706    $filename_encoding = "undefined";
707    }
708
709    return $filename_encoding; # can be the string "undefined"
710}
711
712# performs textcat
713sub encoding_from_language_analysis {
714    my $self = shift(@_);
715    my ($text) = @_;
716
717    my $outhandle = $self->{'outhandle'};
718    my $best_encoding = undef;
719   
720    # get the language/encoding of the textstring using textcat
721    require textcat;  # Only load the textcat module if it is required
722    $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
723    my $results = $self->{'textcat'}->classify_cached_filename(\$text);
724
725
726    if (scalar @$results < 0) {
727    return undef;
728    }
729   
730    # We have some results, we choose the first
731    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
732   
733    $best_encoding = $encoding;
734    if (!defined $best_encoding) {
735    return undef;
736    }
737   
738    if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
739    # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
740    $best_encoding = 'utf8';
741    }
742   
743   
744    # check for equivalents where textcat doesn't have some encodings...
745    # eg MS versions of standard encodings
746    if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
747    my $iso = $1; # which variant of the iso standard?
748    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
749    if ($text =~ /[\x80-\x9f]/) {
750        # Western Europe
751        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
752        elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
753        elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
754        elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
755        elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
756        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
757        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
758    }
759    }
760   
761    if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
762    !defined $encodings::encodings->{$best_encoding})
763    {
764    if ($self->{'verbosity'}) {
765        gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
766    }
767    $best_encoding = undef;
768    }
769   
770    return $best_encoding;
771}
772
773# uses locale
774sub get_filesystem_encoding
775{
776
777    my $self = shift(@_);
778
779    my $outhandle = $self->{'outhandle'};
780    my $filesystem_encoding = undef;
781
782    eval {
783    # Works for Windows as well, returning the DOS code page in use
784    use POSIX qw(locale_h);
785   
786    # With only one parameter, setlocale retrieves the
787    # current value
788    my $current_locale = setlocale(LC_CTYPE);
789   
790    my $char_encoding = undef;
791    if ($current_locale =~ m/\./) {
792        ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
793        $char_encoding = lc($char_encoding);
794    }
795    else {
796        if ($current_locale =~ m/^(posix|c)$/i) {
797        $char_encoding = "ascii";
798        }
799    }
800
801    if (defined $char_encoding) {
802        if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
803        $char_encoding = "$1\_$2\_$3";
804        }
805
806        $char_encoding =~ s/-/_/g;
807        $char_encoding =~ s/^utf_8$/utf8/;
808       
809        if ($char_encoding =~ m/^\d+$/) {
810        if (defined $encodings::encodings->{"windows_$char_encoding"}) {
811            $char_encoding = "windows_$char_encoding";
812        }
813        elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
814            $char_encoding = "dos_$char_encoding";
815        }
816        }
817       
818        if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
819        || (defined $encodings::encodings->{$char_encoding})) {
820        $filesystem_encoding = $char_encoding;
821        }
822        else {
823        print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
824        }
825    }
826   
827
828    };
829    if ($@) {
830    print $outhandle "$@\n";
831    print $outhandle "Warning: Unable to establish locale.  Will assume filesystem is UTF-8\n";
832   
833    }
834
835    return $filesystem_encoding;
836}
837
838
839
840sub deduce_filename_encoding
841{
842    my $self = shift (@_); 
843    my ($file,$metadata,$plugin_filename_encoding) = @_;
844
845    my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
846    my $deduced_filename_encoding = undef;
847   
848    # Start by looking for manually assigned metadata
849    if (defined $gs_filename_encoding) {
850    if (ref ($gs_filename_encoding) eq "ARRAY") {
851        my $outhandle = $self->{'outhandle'};
852       
853        $deduced_filename_encoding = $gs_filename_encoding->[0];
854       
855        my $num_vals = scalar(@$gs_filename_encoding);
856        if ($num_vals>1) {
857        print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
858        print $outhandle "         Selecting first value: $deduced_filename_encoding\n";
859        }
860    }
861    else {
862        $deduced_filename_encoding = $gs_filename_encoding;
863    }
864    }
865   
866    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
867    # Look to see if plugin specifies this value
868
869    if (defined $plugin_filename_encoding) {
870        # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
871        if ($plugin_filename_encoding =~ m/^auto-.*$/) {
872        my $outhandle = $self->{'outhandle'};
873        print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
874        print $outhandle "         default to 'auto'\n";
875        $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
876        }
877       
878        if ($plugin_filename_encoding ne "auto") {
879        # We've been given a specific filenamne encoding
880        # => so use it!
881        $deduced_filename_encoding = $plugin_filename_encoding;
882        }
883    }
884    }
885   
886    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
887
888    # Look to file system to provide a character encoding
889
890    # If Windows NTFS, then -- assuming we work with long file names got through
891    # Win32::GetLongFilePath() -- then the underlying file system is UTF16
892
893    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
894        # Can do better than working with the DOS character encoding returned by locale     
895        $deduced_filename_encoding = "unicode";
896    }
897    else {
898        # Unix of some form or other
899
900        # See if we can determine the file system encoding through locale
901        $deduced_filename_encoding = $self->locale_encoding();
902       
903        # if locale shows us filesystem is utf8, check to see filename is consistent
904        # => if not, then we have an "alien" filename on our hands
905
906        if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
907        if (!&unicode::check_is_utf8($file)) {
908            # "alien" filename, so revert
909            $deduced_filename_encoding = undef;
910        }
911        }
912    }
913    }
914   
915#    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
916#       # Last chance, apply textcat to deduce filename encoding
917#       $deduced_filename_encoding = $self->textcat_encoding($file);
918#    }
919
920    if ($self->{'verbosity'}>3) {
921    my $outhandle = $self->{'outhandle'};
922
923    if (defined $deduced_filename_encoding) {
924        print $outhandle "  Deduced filename encoding as: $deduced_filename_encoding\n";
925    }
926    else {
927        print $outhandle "  No filename encoding deduced\n";
928    }
929    }
930   
931    return $deduced_filename_encoding;
932}
933
934
935
936
937# Notionally written to be called once for each document, it is however safe to
938# call multiple times (as in the case of ImagePlugin) which calls this later on
939# after the original image has potentially been converted to a *new* source image
940# format (e.g. TIFF to PNG)
941
942sub set_Source_metadata {
943    my $self = shift (@_); 
944    my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
945   
946    # 1. Sets the filename (Source) for display encoded as Unicode if possible,
947    #    and (as a fallback) using %xx if not for non-ascii chars
948    # 2. Sets the url ref (SourceFile) to the URL encoded version
949    #    of filename for generated files
950   
951    my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
952
953    my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
954
955    my $octet_file = $raw_file;
956
957    # UTF-8 version of filename
958#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
959#   print STDERR "**** Setting Source Metadata given: $octet_file\n";
960#    }
961   
962    # Deal with (on Windows) raw filenames that are in their
963    # abbreviated DOS form
964
965    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
966    if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
967        if (-e $raw_filename) {
968        my $unicode_filename = Win32::GetLongPathName($raw_filename);
969       
970        my $unused_full_uf;
971        ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
972        }
973    }
974    }
975
976    my $url_encoded_filename;
977    if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
978    # => Generate a pretty print version of filename that is mapped to Unicode
979   
980    # Use filename_encoding to map raw filename to a Perl unicode-aware string
981    $url_encoded_filename = decode($filename_encoding,$octet_file);     
982    }
983    else {
984    # otherwise generate %xx encoded version of filename for char > 127
985    $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
986    }
987   
988#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
989#   print STDERR "****** saving Source as:             $url_encoded_filename\n";
990#    }
991
992    # In the case of converted files and (generalized) exploded documents, there
993    # will already be a source filename => store as OrigSource before overriding
994    my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
995    if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
996    $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
997    }
998       
999    # Source is the UTF8 display name - not necessarily the name of the file on the system
1000    if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
1001    # on Darwin want all display strings to be in composed form, then can search on that
1002    $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
1003    }
1004    $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
1005
1006   
1007    my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
1008    # If using URL encoding, then SourceFile is the url-reference to url-encoded
1009    # renamed_raw_url: it's a url that refers to the actual file on the system
1010    my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
1011
1012    $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
1013                    $renamed_raw_url);
1014
1015#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1016#   print STDERR "****** saving SourceFile as:         $renamed_raw_url\n";
1017#    }
1018}
1019
1020# this should be called by all plugins to set the oid of the doc obj, rather
1021# than calling doc_obj->set_OID directly
1022sub add_OID {
1023    my $self = shift (@_); 
1024    my ($doc_obj, $force) = @_;
1025
1026    # don't add one if there is one already set, unless we are forced to do so
1027    return unless ($doc_obj->get_OID() =~ /^NULL$/ || $force);
1028    $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
1029
1030    # see if there is a plugin specific set_OID function
1031    if (defined ($self->can('set_OID'))) {
1032    $self->set_OID(@_); # pass through doc_obj and any extra arguments
1033    }
1034    else {
1035    # use the default set_OID() in doc.pm
1036    $doc_obj->set_OID();
1037    }
1038
1039}
1040
1041# The BasePlugin read_into_doc_obj() function. This function does all the
1042# right things to make general options work for a given plugin.  It doesn't do anything with the file other than setting reads in
1043# a file and sets up a slew of metadata all saved in doc_obj, which
1044# it then returns as part of a tuple (process_status,doc_obj)
1045#
1046# Much of this functionality used to reside in read, but it was broken
1047# down into a supporting routine to make the code more flexible. 
1048#
1049# recursive plugins (e.g. RecPlug) and specialized plugins like those
1050# capable of processing many documents within a single file (e.g.
1051# GMLPlug) will normally want to implement their own version of
1052# read_into_doc_obj()
1053#
1054# Note that $base_dir might be "" and that $file might
1055# include directories
1056
1057# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1058sub read_into_doc_obj {
1059    my $self = shift (@_); 
1060    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1061
1062    my $outhandle = $self->{'outhandle'};
1063
1064    # should we move this to read? What about secondary plugins?
1065    my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
1066    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1067    print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1068    if $self->{'verbosity'} > 1;
1069
1070    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1071   
1072    # create a new document
1073    my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1074    my $top_section = $doc_obj->get_top_section();
1075
1076    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1077    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1078   
1079
1080    my $plugin_filename_encoding = $self->{'filename_encoding'};
1081    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1082    $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
1083
1084    # plugin specific stuff - what args do we need here??
1085    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1086    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1087    return -1;
1088    }
1089   
1090    # include any metadata passed in from previous plugins
1091    # note that this metadata is associated with the top level section
1092    my $section = $doc_obj->get_top_section();
1093    # can we merge these two methods??
1094    $self->add_associated_files($doc_obj, $filename_full_path);
1095    $self->extra_metadata ($doc_obj, $section, $metadata);
1096    $self->auto_extract_metadata($doc_obj);
1097
1098    # if we haven't found any Title so far, assign one
1099    # this was shifted to here from inside read()
1100    $self->title_fallback($doc_obj,$section,$filename_no_path);
1101   
1102    $self->add_OID($doc_obj);
1103   
1104    $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1105    return (1,$doc_obj);
1106}
1107
1108sub post_process_doc_obj {
1109    my $self = shift (@_); 
1110    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1111
1112    return 1;
1113}
1114
1115sub add_dummy_text {
1116    my $self = shift(@_);
1117    my ($doc_obj, $section) = @_;
1118
1119    # add NoText metadata so we can hide this dummy text in format statements
1120    $doc_obj->add_metadata($section, "NoText", "1");
1121    $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1122   
1123}
1124
1125# does nothing. Can be overridden by subclass
1126sub auto_extract_metadata {
1127    my $self = shift(@_);
1128    my ($doc_obj) = @_;
1129}
1130
1131# adds cover image, associate_file options stuff. Should be called by sub class
1132# read_into_doc_obj
1133sub add_associated_files {
1134    my $self = shift(@_);
1135    # whatis filename??
1136    my ($doc_obj, $filename) = @_;
1137   
1138    # add in the cover image
1139    if ($self->{'cover_image'}) {
1140    $self->associate_cover_image($doc_obj, $filename);
1141    }
1142    # store the original (used for eg TextPlugin to store the original for OAI)
1143    if ($self->{'store_original_file'}) {
1144    $self->associate_source_file($doc_obj, $filename);
1145    }
1146   
1147
1148}
1149
1150# implement this if you are extracting metadata for other documents
1151sub metadata_read {
1152    my $self = shift (@_);
1153    my ($pluginfo, $base_dir, $file, $block_hash,
1154    $extrametakeys, $extrametadata, $extrametafile,
1155    $processor, $gli, $aux) = @_;
1156   
1157    # can we process this file??
1158    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1159    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
1160
1161    return 1; # we recognise the file, but don't actually do anything with it
1162}
1163
1164
1165# The BasePlugin read() function. This function calls read_into_doc_obj()
1166# to ensure all the right things to make general options work for a
1167# given plugin are done. It then calls the process() function which
1168# does all the work specific to a plugin (like the old read functions
1169# used to do). Most plugins should define their own process() function
1170# and let this read() function keep control. 
1171#
1172# recursive plugins (e.g. RecPlug) and specialized plugins like those
1173# capable of processing many documents within a single file (e.g.
1174# GMLPlug) might want to implement their own version of read(), but
1175# more likely need to implement their own version of read_into_doc_obj()
1176#
1177# Return number of files processed, undef if can't recognise, -1 if can't
1178# process
1179
1180sub read {
1181    my $self = shift (@_); 
1182    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1183
1184    # can we process this file??
1185    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1186
1187    return undef unless $self->can_process_this_file($filename_full_path);
1188   
1189    my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1190   
1191    if ((defined $process_status) && ($process_status == 1)) {
1192   
1193    # process the document
1194    $processor->process($doc_obj);
1195
1196    $self->{'num_processed'} ++;
1197    undef $doc_obj;
1198    }
1199    # delete any temp files that we may have created
1200    $self->clean_up_after_doc_obj_processing();
1201
1202
1203    # if process_status == 1, then the file has been processed.
1204    return $process_status;
1205
1206}
1207
1208# returns undef if file is rejected by the plugin
1209sub process {
1210    my $self = shift (@_);
1211    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1212
1213    gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
1214
1215    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1216    print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1217
1218    die "\n";
1219
1220    return undef; # never gets here
1221}
1222
1223# overwrite this method to delete any temp files that we have created
1224sub clean_up_after_doc_obj_processing {
1225    my $self = shift(@_);
1226
1227}
1228
1229# write_file -- used by ConvertToPlug, for example in post processing
1230#
1231# where should this go, is here the best place??
1232sub utf8_write_file {
1233    my $self = shift (@_);
1234    my ($textref, $filename) = @_;
1235   
1236    if (!open (FILE, ">:utf8", $filename)) {
1237    gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1238    die "\n";
1239    }
1240    print FILE $$textref;
1241   
1242    close FILE;
1243}
1244
1245
1246sub filename_based_title
1247{
1248    my $self = shift (@_);
1249    my ($file) = @_;
1250
1251    my $file_derived_title = $file;
1252    $file_derived_title =~ s/_/ /g;
1253    $file_derived_title =~ s/\.[^.]+$//;
1254
1255    return $file_derived_title;
1256}
1257
1258
1259sub title_fallback
1260{
1261    my $self = shift (@_);
1262    my ($doc_obj,$section,$file) = @_;
1263
1264    if (!defined $doc_obj->get_metadata_element ($section, "Title")
1265    || $doc_obj->get_metadata_element($section, "Title") eq "") {
1266
1267    my $source_file = $doc_obj->get_metadata_element($section, "Source");
1268    my $file_derived_title;
1269    if (defined $source_file) {
1270        $file_derived_title =  $self->filename_based_title($source_file);
1271    }
1272    else {
1273        # pp = pretty print
1274        my $pp_file = (defined $source_file) ? $source_file : $file;
1275
1276        my $raw_title = $self->filename_based_title($file);
1277        my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1278    }
1279
1280
1281    if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1282        $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1283    }
1284    else {
1285        $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1286    }
1287    }
1288   
1289}
1290
1291# add any extra metadata that's been passed around from one
1292# plugin to another.
1293# extra_metadata uses add_utf8_metadata so it expects metadata values
1294# to already be in utf8
1295sub extra_metadata {
1296    my $self = shift (@_);
1297    my ($doc_obj, $cursection, $metadata) = @_;
1298
1299    my $associate_tail_re = $self->{'associate_tail_re'};
1300
1301# Sort the extra metadata for diffcol so these meta appear in a consistent order
1302# in doc.xml. Necessary for the ex.PDF.* and ex.File.* meta that's extracted in
1303# the PDFBox collection, as the order of these varies between CentOS and Ubuntu.
1304#    foreach my $field (sort keys(%$metadata)) {
1305    foreach my $field (keys(%$metadata)) {
1306    # $metadata->{$field} may be an array reference
1307    if ($field eq "gsdlassocfile_tobe") {
1308        # 'gsdlassocfile_tobe' is artificially introduced metadata
1309        # that is used to signal that certain additional files should
1310        # be tied to this document.  Useful in situations where a
1311        # metadata pass in the plugin pipeline works out some files
1312        # need to be associated with a document, but the document hasn't
1313        # been formed yet.
1314        my $equiv_form = "";
1315        foreach my $gaf (@{$metadata->{$field}}) {
1316        my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1317        my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1318       
1319        # we need to make sure the filename is valid utf-8 - we do
1320        # this by url or base64 encoding it
1321        # $tail_filename is the name that we store the file as
1322        $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1323        $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1324        $doc_obj->associate_source_file($full_filename);
1325        # If the filename is url_encoded, we need to encode the % signs
1326        # in the filename, so that it works in a url
1327        my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1328        # work out extended tail extension (i.e. matching tail re)
1329
1330        my ($file_prefix,$file_extended_ext)
1331            = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1332        my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1333        my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1334
1335        # the greenstone 2 stuff
1336        my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1337        #my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1338        my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1339
1340        my $srcicon = "_icon".$doc_ext."_";
1341        my $end_doclink = "</a>";
1342       
1343        my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1344
1345
1346        if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1347            # for metadata such as [mp3._edited] [mp3._full] ...
1348            $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1349        }
1350
1351        # for multiple metadata such as [mp3.assoclink]
1352        $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1353
1354        $equiv_form .= " $assoc_form"; 
1355
1356        # following are used for greenstone 3,
1357        $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink_gs3);
1358        $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
1359        $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
1360
1361        }
1362        $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1363    }
1364    elsif ($field eq "gsdlzipfilename") {
1365        # special case for when files have come out of a zip. source_path
1366        # (used for archives dbs and keeping track for incremental import)
1367        # must be set to the zip file name
1368        my $zip_filename = $metadata->{$field};
1369        # overwrite the source_path
1370        $doc_obj->set_source_path($zip_filename);
1371        # and set the metadata
1372        $zip_filename = &util::filename_within_collection($zip_filename);
1373        $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1374        $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1375    }
1376    elsif (ref ($metadata->{$field}) eq "ARRAY") {
1377        map {
1378        $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1379        } @{$metadata->{$field}};
1380    } else {
1381        $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1382    }
1383    }
1384}
1385
1386
1387sub compile_stats {
1388    my $self = shift(@_);
1389    my ($stats) = @_;
1390
1391    $stats->{'num_processed'} += $self->{'num_processed'};
1392    $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1393    $stats->{'num_archives'} += $self->{'num_archives'};
1394
1395}
1396sub associate_source_file {
1397    my $self = shift(@_);
1398   
1399    my ($doc_obj, $filename) = @_;
1400    my $cursection = $doc_obj->get_top_section();
1401    my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1402   
1403    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1404    # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
1405    $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1406    $doc_obj->add_utf8_metadata ($cursection, "srclinkFile", $doc_obj->get_sourcefile());
1407}
1408
1409sub associate_cover_image {
1410    my $self = shift(@_);
1411    my ($doc_obj, $filename) = @_;
1412
1413    my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1414
1415    $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1416    $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1417
1418    if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1419    # don't stat() for existence e.g. for multiple document input files
1420    # (eg SplitPlug)
1421    return;
1422    }
1423
1424    my $top_section=$doc_obj->get_top_section();
1425
1426    if (&FileUtils::fileExists($upgraded_filename)) {
1427    $doc_obj->associate_source_file($filename);
1428        $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1429    $doc_obj->add_utf8_metadata($top_section, "hascover",  1);
1430    } else {
1431    my $upper_filename = $filename;
1432    my $upgraded_upper_filename = $upgraded_filename;
1433
1434    $upper_filename =~ s/jpg$/JPG/;
1435    $upgraded_upper_filename =~ s/jpg$/JPG/;
1436
1437    if (&FileUtils::fileExists($upgraded_upper_filename)) {
1438        $doc_obj->associate_source_file($upper_filename);
1439        $doc_obj->associate_file($upper_filename, "cover.jpg",
1440                     "image/jpeg");
1441        $doc_obj->add_utf8_metadata($top_section, "hascover",  1);
1442    } else {
1443        # file doesn't exist, so record the fact that it's missing so
1444        # we don't stat() again (stat is slow)
1445        $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1446    }
1447    }
1448
1449}
1450
1451
1452# Overridden by exploding plugins (eg. ISISPlug)
1453sub clean_up_after_exploding
1454{
1455    my $self = shift(@_);
1456}
1457
1458
1459
14601;
Note: See TracBrowser for help on using the browser.