root/gsdl/trunk/perllib/plugins/BasePlugin.pm @ 16557

Revision 16557, 32.4 KB (checked in by ak19, 11 years ago)

Auto filename encoding has several additional settings now, these are handled by subroutine filepath_to_utf8 which has changed accordingly. Some additional helper subroutines added. This file BasePlugin?.pm is an intermediate but working version (still has many debug output statements even when most are commented out, but as I want to test the changes out on Windows first, I want to retain the debug statements).

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33
34use multiread;
35use encodings;
36use unicode;
37use textcat;
38use doc;
39eval "require diagnostics"; # some perl distros (eg mac) don't have this
40use ghtml;
41use gsprintf 'gsprintf';
42
43use PrintInfo;
44
45BEGIN {
46    @BasePlugin::ISA = ( 'PrintInfo' );
47}
48
49our $encoding_list =
50    [ { 'name' => "ascii",
51    'desc' => "{BasePlugin.encoding.ascii}" },
52      { 'name' => "utf8",
53    'desc' => "{BasePlugin.encoding.utf8}" },
54      { 'name' => "unicode",
55    'desc' => "{BasePlugin.encoding.unicode}" } ];
56     
57
58my $e = $encodings::encodings;
59foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
60{
61    my $hashEncode =
62    {'name' => $enc,
63     'desc' => $e->{$enc}->{'name'}};
64   
65    push(@{$encoding_list},$hashEncode);
66}
67
68our $encoding_plus_auto_list =
69    [ { 'name' => "auto",
70    'desc' => "{BasePlugin.filename_encoding.auto}" },
71       { 'name' => "auto-language-analysis",
72    'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
73      { 'name' => "auto-filesystem-encoding",
74    'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
75      { 'name' => "auto-fl",
76    'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
77      { 'name' => "auto-lf",
78    'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
79
80push(@{$encoding_plus_auto_list},@{$encoding_list});
81
82my $arguments =
83    [ { 'name' => "process_exp",
84    'desc' => "{BasePlugin.process_exp}",
85    'type' => "regexp",
86    'deft' => "",
87    'reqd' => "no" },
88      { 'name' => "no_blocking",
89    'desc' => "{BasePlugin.no_blocking}",
90    'type' => "flag",
91    'reqd' => "no"},
92      { 'name' => "block_exp",
93    'desc' => "{BasePlugin.block_exp}",
94    'type' => "regexp",
95    'deft' => "",
96    'reqd' => "no" },
97      { 'name' => "associate_ext",
98    'desc' => "{BasePlugin.associate_ext}",
99    'type' => "string",
100    'reqd' => "no" },
101      { 'name' => "associate_tail_re",
102    'desc' => "{BasePlugin.associate_tail_re}",
103    'type' => "string",
104    'reqd' => "no" },
105      { 'name' => "use_as_doc_identifier",
106    'desc' => "{BasePlugin.use_as_doc_identifier}",
107    'type' => "string",
108    'reqd' => "no" ,
109    'deft' => "" } ,
110     { 'name' => "no_cover_image",
111    'desc' => "{BasePlugin.no_cover_image}",
112    'type' => "flag",
113    'reqd' => "no" },
114      { 'name' => "filename_encoding",
115    'desc' => "{BasePlugin.filename_encoding}",
116    'type' => "enum",
117    'deft' => "auto",
118    'list' => $encoding_plus_auto_list,
119    'reqd' => "no" },
120      { 'name' => "smart_block",
121        'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
122        'type' => "flag",
123        'reqd' => "no",
124    'hiddengli' => "yes" } # deprecated, but leave in for old collections
125
126     
127      ];
128
129
130my $options = { 'name'     => "BasePlugin",
131        'desc'     => "{BasePlugin.desc}",
132        'abstract' => "yes",
133        'inherits' => "no",
134        'args'     => $arguments };
135
136
137sub new {
138
139    my ($class) = shift (@_);
140    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
141    push(@$pluginlist, $class);
142
143    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
144    push(@{$hashArgOptLists->{"OptList"}},$options);
145
146    my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists);
147   
148    if ($self->{'info_only'}) {
149        # don't worry about any options etc
150        return bless $self, $class;
151    }
152
153    if ($self->{'smart_block'}) {
154    print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
155    }
156    $self->{'smart_block'} = undef;
157
158    my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
159    $self->{'plugin_type'} = $plugin_name;
160
161    $self->{'num_processed'} = 0;
162    $self->{'num_not_processed'} = 0;
163    $self->{'num_blocked'} = 0;
164    $self->{'num_archives'} = 0;
165    $self->{'cover_image'} = 1; # cover image is on by default
166    $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
167    #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
168   
169    my $associate_ext = $self->{'associate_ext'};
170    if ((defined $associate_ext) && ($associate_ext ne "")) {
171
172    my $associate_tail_re = $self->{'associate_tail_re'};
173    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
174        my $outhandle = $self->{'outhandle'};
175        print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
176        print $outhandle "         defaulting to 'associate_tail_re'\n";
177    }
178    else {
179        my @exts = split(/,/,$associate_ext);
180
181        my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
182        my $associate_tail_re = join("|",@exts_bracketed);
183        $self->{'associate_tail_re'} = $associate_tail_re;
184    }
185
186    delete $self->{'associate_ext'};
187    }
188
189    return bless $self, $class;
190
191}
192
193# initialize BasePlugin options
194# if init() is overridden in a sub-class, remember to call BasePlugin::init()
195sub init {
196    my $self = shift (@_);
197    my ($verbosity, $outhandle, $failhandle) = @_;
198
199    # verbosity is passed through from the processor
200    $self->{'verbosity'} = $verbosity;
201
202    # as are the outhandle and failhandle
203    $self->{'outhandle'} = $outhandle if defined $outhandle;
204    $self->{'failhandle'} = $failhandle;
205#    $self->SUPER::init(@_);
206   
207    # set process_exp and block_exp to defaults unless they were
208    # explicitly set
209
210    if ((!$self->is_recursive()) and
211    (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
212
213    $self->{'process_exp'} = $self->get_default_process_exp ();
214    if ($self->{'process_exp'} eq "") {
215        warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
216    }
217    }
218
219    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
220    $self->{'block_exp'} = $self->get_default_block_exp ();
221    }
222
223}
224
225sub begin {
226    my $self = shift (@_);
227    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
228}
229
230sub end {
231    # potentially called at the end of each plugin pass
232    # import.pl only has one plugin pass, but buildcol.pl has multiple ones
233
234    my ($self) = shift (@_);
235}
236
237sub deinit {
238    # called only once, after all plugin passes have been done
239
240    my ($self) = @_;
241}
242
243sub set_incremental {
244    my $self = shift(@_);
245    my ($incremental) = @_;
246
247    $self->{'incremental'} = $incremental;
248}
249
250# this function should be overridden to return 1
251# in recursive plugins
252sub is_recursive {
253    my $self = shift (@_);
254
255    return 0;
256}
257
258sub get_default_block_exp {
259    my $self = shift (@_);
260
261    return "";
262}
263
264sub get_default_process_exp {
265    my $self = shift (@_);
266
267    return "";
268}
269
270# default implementation is to do nothing
271sub store_block_files {
272   
273    my $self =shift (@_);
274    my ($filename_full_path, $block_hash) = @_;
275
276}
277
278# put files to block into hash
279sub use_block_expressions {
280
281    my $self =shift (@_);
282    my ($filename_full_path, $block_hash) = @_;
283
284    if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
285    $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
286    }
287
288}
289
290#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
291sub block_cover_image
292{
293    my $self =shift;
294    my ($filename, $block_hash) = @_;
295
296    if ($self->{'cover_image'}) {
297    my $coverfile = $filename;
298    $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
299    if (!-e $coverfile) {
300        $coverfile =~ s/jpg$/JPG/;
301    }   
302    if (-e $coverfile) {
303        $block_hash->{'file_blocks'}->{$coverfile} = 1;
304    }
305    }
306
307    return;
308}
309
310
311# discover all the files that should be blocked by this plugin
312# check the args ...
313sub file_block_read {
314
315    my $self = shift (@_); 
316    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
317    # Keep track of filenames with same root but different extensions
318    # Used to support -associate_ext and the more generalised
319    # -associate_tail_re
320    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
321
322    my $associate_tail_re = $self->{'associate_tail_re'};
323    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
324   
325    my ($file_prefix,$file_ext)
326        = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
327   
328    if ((defined $file_prefix) && (defined $file_ext)) {
329        my $shared_fileroot = $block_hash->{'shared_fileroot'};
330        if (!defined $shared_fileroot->{$file_prefix}) {
331        my $file_prefix_rec = { 'tie_to'  => undef,
332                        'exts'    => {} };
333        $shared_fileroot->{$file_prefix} = $file_prefix_rec;
334        }
335       
336        my $file_prefix_rec = $shared_fileroot->{$file_prefix};
337
338        if ($self->can_process_this_file($filename_full_path)) {
339        # This is the document the others should be tied to
340        $file_prefix_rec->{'tie_to'} = $file_ext;
341        }
342        else {
343        if ($file_ext =~ m/$associate_tail_re$/) {
344            # this file should be associated to the main one
345            $file_prefix_rec->{'exts'}->{$file_ext} = 1;
346        }
347        }
348
349    }
350    }
351
352    # check block expressions
353    $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
354
355    # now check whether we are actually processing this
356    if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
357    return undef; # can't recognise
358    }
359   
360    $self->store_block_files($filename_full_path, $block_hash) unless $self->{'no_blocking'};
361
362    # block the cover image if there is one
363    if ($self->{'cover_image'}) {
364    $self->block_cover_image($filename_full_path, $block_hash) unless $self->{'no_blocking'};
365    }
366       
367    return 1;
368}
369
370# plugins that rely on more than process_exp (eg XML plugins) can override this method
371sub can_process_this_file {
372    my $self = shift(@_);
373    my ($filename) = @_;
374
375    if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
376    return 1;
377    }
378    return 0;
379   
380}
381
382# just converts path as is to utf8.
383sub filepath_to_utf8 {
384    my $self = shift (@_); 
385    my ($file, $file_encoding) = @_;
386    my $filemeta = $file;
387
388    my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
389
390##  print STDERR "**** User chose filename encoding setting: $filename_encoding\n";
391   
392    # Whenever filename-encoding is set to any of the auto settings, we
393    # check if the filename is already in UTF8. If it is, then we're done.
394    if($filename_encoding =~ m/auto/) {
395    if(&unicode::check_is_utf8($filemeta))
396    {
397##      print STDERR "**** It is already UTF8\n";
398        $filename_encoding = "utf8";
399        return $filemeta;
400    }
401    }
402   
403    # Auto setting, but filename is not utf8
404    if ($filename_encoding eq "auto")
405    {
406    # try textcat
407    $filename_encoding = $self->textcat_encoding($filemeta);
408   
409    # check the locale next
410    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
411   
412
413    # now try the encoding of the document, if available
414    if ($filename_encoding eq "undefined" && defined $file_encoding) {
415        $filename_encoding = $file_encoding;
416    }
417
418    }
419
420    elsif ($filename_encoding eq "auto-language-analysis")
421    {   
422    $filename_encoding = $self->textcat_encoding($filemeta);
423
424    # now try the encoding of the document, if available
425    if ($filename_encoding eq "undefined" && defined $file_encoding) {
426        $filename_encoding = $file_encoding;
427    }
428    }
429
430    elsif ($filename_encoding eq "auto-filesystem-encoding")
431    {
432    # try locale
433    $filename_encoding = $self->locale_encoding();
434    }
435
436    elsif ($filename_encoding eq "auto-fl")
437    {
438    # filesystem-encoding (locale) then language-analysis (textcat)
439    $filename_encoding = $self->locale_encoding();
440   
441    # try textcat
442    $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
443       
444    # else assume filename encoding is encoding of file content, if that's available
445    if ($filename_encoding eq "undefined" && defined $file_encoding) {
446        $filename_encoding = $file_encoding;
447    }
448    }
449   
450    elsif ($filename_encoding eq "auto-lf")
451    {
452    # language-analysis (textcat) then filesystem-encoding (locale)
453    $filename_encoding = $self->textcat_encoding($filemeta);
454   
455    # guess filename encoding from encoding of file content, if available
456    if ($filename_encoding eq "undefined" && defined $file_encoding) {
457        $filename_encoding = $file_encoding;
458    }
459
460    # try locale
461    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
462    }
463
464##    print STDERR "**** filename_encoding selected: $filename_encoding \n";
465       
466    # if still undefined, use utf8 as fallback
467    if ($filename_encoding eq "undefined") {
468    $filename_encoding = "utf8";
469    }
470
471    # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
472    # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
473    # be always utf8 (in which case the filename's encoding is also set as utf8 even
474    # though the filename need not be if it originates from another system)--in such
475    # cases attempt to make the filename utf8 to match.
476    if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
477##  print STDERR "**** BEFORE utf8 conversion: $filemeta\n";
478    &unicode::ensure_utf8(\$filemeta);
479##  print STDERR "**** AFTER utf8 conversion: $filemeta\n";
480    }
481
482
483    # convert non-unicode encodings to utf8
484    if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
485    $filemeta = &unicode::unicode2utf8(
486      &unicode::convert2unicode($filename_encoding, \$filemeta)
487    );
488    }
489
490    print "*** filename encoding found: $filename_encoding\n";
491    print "*** utf8 encoded filename: $filemeta\n";
492
493    return $filemeta;
494}
495
496# gets the filename with no path, converts to utf8, and then dm safes it.
497#filename_encoding set by user
498sub filename_to_utf8_metadata
499{
500    my $self = shift (@_); 
501    my ($file, $file_encoding) = @_;
502
503    my $outhandle = $self->{'outhandle'};
504
505    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
506    $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
507
508    my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
509
510    return $dmsafe_filemeta;
511
512}
513
514sub locale_encoding {
515    my $self = shift(@_);
516   
517    if (!defined $self->{'filesystem_encoding'}) {
518    $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
519    $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'};
520    }
521
522    print "filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
523    return $self->{'filesystem_encoding'}; # can be the string "undefined"
524}
525
526sub textcat_encoding {
527    my $self = shift(@_);
528    my ($filemeta) = @_;
529
530    # analyse filenames without extensions and digits (and trimmed of surrounding
531    # whitespace), so that irrelevant chars don't confuse textcat
532    my $strictfilemeta = $filemeta;
533    $strictfilemeta =~ s/\.[^\.]+$//g;
534    $strictfilemeta =~ s/\d//g;
535    $strictfilemeta =~ s/^\s*//g;
536    $strictfilemeta =~ s/\s*$//g;
537   
538##    print STDERR "**** strict filename is |$strictfilemeta|\n";
539    my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
540    if(!defined $filename_encoding) {
541    $filename_encoding = "undefined";
542    }
543
544##    print STDERR "**** textcat found filename encoding: " . $file_textcat_encoding_map{$strictfilemeta} . "\n";
545    return $filename_encoding; # can be the string "undefined"
546}
547
548# performs textcat
549sub encoding_from_language_analysis {
550    my $self = shift(@_);
551    my ($text) = @_;
552
553    my $outhandle = $self->{'outhandle'};
554    my $best_encoding = undef;
555   
556    # get the language/encoding of the file using textcat
557    $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
558    #my $results = $self->{'textcat'}->classify(\$text);
559    my $results = $self->{'textcat'}->classify_cached(\$text);
560
561
562    if (scalar @$results < 0) {
563    print STDERR "**** Textcat returned 0 results\n";
564    return undef;
565    }
566   
567    print STDERR "**** TEXTCAT RESULTS for $text: ";
568    print STDERR join(",", @$results);
569    print STDERR "\n";
570
571    # We have some results, we choose the first
572    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
573   
574    $best_encoding = $encoding;
575    if (!defined $best_encoding) {
576##  print STDERR "**** Textcat cannot determine encoding of filename: it's undefined.\n";
577    return undef;
578    }
579       
580    if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
581    # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
582##  print STDERR "*** Filename turns out to be UTF8\n";
583    $best_encoding = 'utf8';
584    }
585   
586   
587    # check for equivalents where textcat doesn't have some encodings...
588    # eg MS versions of standard encodings
589    if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
590##  print STDERR "**** best_encoding is ISO_8859: $best_encoding\n";
591
592    my $iso = $1; # which variant of the iso standard?
593    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
594    if ($text =~ /[\x80-\x9f]/) {
595##      print STDERR "**** best_encoding is some windows value: $best_encoding\n";
596        # Western Europe
597        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
598        elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
599        elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
600        elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
601        elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
602        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
603        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
604##      print STDERR "**** best_encoding windows value: $best_encoding\n";
605    }
606    }
607   
608    if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
609    !defined $encodings::encodings->{$best_encoding})
610    {
611    if ($self->{'verbosity'}) {
612        gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
613    }
614##  print STDERR "***** unsupported encoding: $best_encoding. Setting it to undefined.\n"; 
615    $best_encoding = undef;
616    }
617##    print STDERR "**** language: $language\n" if defined $language;
618##    print STDERR "**** encoding: $best_encoding\n" if defined $encoding;
619   
620    return $best_encoding;
621}
622
623# uses locale
624sub get_filesystem_encoding {
625
626    my $self = shift(@_);
627
628    my $outhandle = $self->{'outhandle'};
629    my $filesystem_encoding = undef;
630
631    eval {
632    use POSIX qw(locale_h);
633   
634    # With only one parameter, setlocale retrieves the
635    # current value
636    my $current_locale = setlocale(LC_CTYPE);
637   
638    if ($current_locale =~ m/^.*\.(.*?)$/) {
639        my $char_encoding = lc($1);
640        if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
641        $char_encoding = "$1\_$2\_$3";
642        }
643
644        $char_encoding =~ s/-/_/g;
645        $char_encoding =~ s/^utf_8$/utf8/;
646       
647        if ($char_encoding =~ m/^\d+$/) {
648        if (defined $encodings::encodings->{"windows_$char_encoding"}) {
649            $char_encoding = "windows_$char_encoding";
650        }
651        elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
652            $char_encoding = "dos_$char_encoding";
653        }
654        }
655       
656        if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
657        || (defined $encodings::encodings->{$char_encoding})) {
658        $filesystem_encoding = $char_encoding;
659        }
660        else {
661        print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
662        }
663    }
664   
665
666    };
667    if ($@) {
668    print $outhandle "$@\n";
669    print $outhandle "Warning: Unable to establish locale.  Will assume filesytem is UTF-8\n";
670   
671    }
672    return $filesystem_encoding;
673}
674
675# is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
676sub set_Source_metadata {
677    my $self = shift (@_); 
678    my ($doc_obj, $filename_no_path, $file_encoding) = @_;
679
680    my $top_section = $doc_obj->get_top_section();
681   
682    # UTF-8 version of filename
683    my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
684    $doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta);
685
686}
687     
688sub add_OID {
689
690    my $self = shift (@_); 
691    my ($doc_obj) = @_;
692
693    # See if a metadata field is specified as the field
694    if ((defined $self->{'use_as_doc_identifier'}) && ($self->{'use_as_doc_identifier'} ne "")) {
695    my $metadata_doc_id = $self->{'use_as_doc_identifier'};
696
697    # Consider "tidying" up metadata_doc_id to be something
698    # suitable in a URL
699    # Could even support a user specified plugin RE for this.
700
701    my $top_section = $doc_obj->get_top_section();
702    my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id);
703##  print STDERR "**** oid = $oid\n";
704        $doc_obj->set_OID($oid);
705    }
706    # See if there is a plugin-specific set_OID function...
707    elsif (defined ($self->can('set_OID'))) {
708    # it will need $doc_obj to set the Identifier metadata...
709    $self->set_OID(@_); # pass through any extra arguments supplied
710    } else {
711    # use the default set_OID() in doc.pm
712    $doc_obj->set_OID();
713    }
714}
715
716
717
718# The BasePlugin read_into_doc_obj() function. This function does all the
719# right things to make general options work for a given plugin.  It doesn't do anything with the file other than setting reads in
720# a file and sets up a slew of metadata all saved in doc_obj, which
721# it then returns as part of a tuple (process_status,doc_obj)
722#
723# Much of this functionality used to reside in read, but it was broken
724# down into a supporting routine to make the code more flexible. 
725#
726# recursive plugins (e.g. RecPlug) and specialized plugins like those
727# capable of processing many documents within a single file (e.g.
728# GMLPlug) will normally want to implement their own version of
729# read_into_doc_obj()
730#
731# Note that $base_dir might be "" and that $file might
732# include directories
733
734# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
735sub read_into_doc_obj {
736    my $self = shift (@_); 
737    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
738
739    my $outhandle = $self->{'outhandle'};
740
741    # should we move this to read? What about secondary plugins?
742    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
743    print $outhandle "$self->{'plugin_type'} processing $file\n"
744        if $self->{'verbosity'} > 1;
745
746    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
747    # create a new document
748    my $doc_obj = new doc ($filename_full_path, "indexed_doc");
749    my $top_section = $doc_obj->get_top_section();
750
751    # this should look at the plugin option too...
752    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});   
753    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
754    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
755 
756    $self->set_Source_metadata($doc_obj, $filename_no_path);
757
758    # plugin specific stuff - what args do we need here??
759    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
760    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
761    return -1;
762    }
763   
764    # include any metadata passed in from previous plugins
765    # note that this metadata is associated with the top level section
766    my $section = $doc_obj->get_top_section();
767    # can we merge these two methods??
768    $self->add_associated_files($doc_obj, $filename_full_path);
769    $self->extra_metadata ($doc_obj, $section, $metadata);
770    $self->auto_extract_metadata($doc_obj);
771
772    # if we haven't found any Title so far, assign one
773    # this was shifted to here from inside read()
774    $self->title_fallback($doc_obj,$section,$filename_no_path);
775   
776    $self->add_OID($doc_obj);
777   
778    return (1,$doc_obj);
779}
780
781sub add_dummy_text {
782    my $self = shift(@_);
783    my ($doc_obj, $section) = @_;
784
785    # add NoText metadata so we can hide this dummy text in format statements
786    $doc_obj->add_metadata($section, "NoText", "1");
787    $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
788   
789}
790
791# does nothing. Can be overridden by subclass
792sub auto_extract_metadata {
793    my $self = shift(@_);
794    my ($doc_obj) = @_;
795}
796
797# adds cover image, associate_file options stuff. Should be called by sub class
798# read_into_doc_obj
799sub add_associated_files {
800    my $self = shift(@_);
801    # whatis filename??
802    my ($doc_obj, $filename) = @_;
803   
804    # add in the cover image
805    if ($self->{'cover_image'}) {
806    $self->associate_cover_image($doc_obj, $filename);
807    }
808   
809
810}
811
812# implement this if you are extracting metadata for other documents
813sub metadata_read {
814    my $self = shift (@_);
815    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
816   
817    # can we process this file??
818    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
819    return undef unless $self->can_process_this_file($filename_full_path);
820
821    return 1; # we recognise the file, but don't actually do anything with it
822}
823
824
825# The BasePlugin read() function. This function calls read_into_doc_obj()
826# to ensure all the right things to make general options work for a
827# given plugin are done. It then calls the process() function which
828# does all the work specific to a plugin (like the old read functions
829# used to do). Most plugins should define their own process() function
830# and let this read() function keep control. 
831#
832# recursive plugins (e.g. RecPlug) and specialized plugins like those
833# capable of processing many documents within a single file (e.g.
834# GMLPlug) might want to implement their own version of read(), but
835# more likely need to implement their own version of read_into_doc_obj()
836#
837# Return number of files processed, undef if can't recognise, -1 if can't
838# process
839
840sub read {
841    my $self = shift (@_); 
842    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
843
844    # can we process this file??
845    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
846    return undef unless $self->can_process_this_file($filename_full_path);
847   
848    my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
849   
850    if ((defined $process_status) && ($process_status == 1)) {
851
852    # process the document
853    $processor->process($doc_obj);
854
855    $self->{'num_processed'} ++;
856    undef $doc_obj;
857    }
858    # delete any temp files that we may have created
859    $self->clean_up_after_doc_obj_processing();
860
861    # if process_status == 1, then the file has been processed.
862    return $process_status;
863
864}
865
866# returns undef if file is rejected by the plugin
867sub process {
868    my $self = shift (@_);
869    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
870
871    gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n";
872
873    return undef; # never gets here
874}
875
876# overwrite this method to delete any temp files that we have created
877sub clean_up_after_doc_obj_processing {
878    my $self = shift(@_);
879
880}
881
882# write_file -- used by ConvertToPlug, for example in post processing
883#
884# where should this go, is here the best place??
885sub utf8_write_file {
886    my $self = shift (@_);
887    my ($textref, $filename) = @_;
888   
889    if (!open (FILE, ">$filename")) {
890    gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
891     die "\n";
892     }
893    print FILE $$textref;
894   
895    close FILE;
896}
897
898
899sub filename_based_title
900{
901    my $self = shift (@_);
902    my ($file) = @_;
903
904    my $file_derived_title = $file;
905    $file_derived_title =~ s/_/ /g;
906    $file_derived_title =~ s/\..*?$//;
907
908    return $file_derived_title;
909}
910
911
912sub title_fallback
913{
914    my $self = shift (@_);
915    my ($doc_obj,$section,$file) = @_;
916
917    if (!defined $doc_obj->get_metadata_element ($section, "Title") or $doc_obj->get_metadata_element($section, "Title") eq "") {
918
919    my $file_derived_title = $self->filename_to_utf8_metadata($self->filename_based_title($file));
920    if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
921        $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
922    }
923    else {
924        $doc_obj->set_utf8_metadata ($section, "Title", $file_derived_title);
925    }
926    }
927       
928}
929 
930# add any extra metadata that's been passed around from one
931# plugin to another.
932# extra_metadata uses add_utf8_metadata so it expects metadata values
933# to already be in utf8
934sub extra_metadata {
935    my $self = shift (@_);
936    my ($doc_obj, $cursection, $metadata) = @_;
937
938    my $associate_tail_re = $self->{'associate_tail_re'};
939
940    foreach my $field (keys(%$metadata)) {
941    # $metadata->{$field} may be an array reference
942    if ($field eq "gsdlassocfile_tobe") {
943        # 'gsdlassocfile_tobe' is artificially introduced metadata
944        # that is used to signal that certain additional files should
945        # be tied to this document.  Useful in situations where a
946        # metadata pass in the plugin pipeline works out some files
947        # need to be associated with a document, but the document hasn't
948        # been formed yet.
949        my $equiv_form = "";
950        foreach my $gaf (@{$metadata->{$field}}) {
951        my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
952        my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
953        my $filename = $full_filename;
954        $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
955
956        # work out extended tail extension (i.e. matching tail re)
957
958        my ($file_prefix,$file_extended_ext)
959            = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
960        my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
961
962        my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
963        my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
964        my $srcicon = "_icon".$doc_ext."_";
965        my $end_doclink = "</a>";
966
967        my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
968
969        if (defined $pre_doc_ext) {
970            # for metadata such as [mp3._edited] [mp3._full] ...
971            $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
972        }
973
974        # for multiple metadata such as [mp3.assoclink]
975        $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
976       
977        $equiv_form .= " $assoc_form"; 
978        }
979        $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
980    }
981    elsif (ref ($metadata->{$field}) eq "ARRAY") {
982        map {
983        $doc_obj->add_utf8_metadata ($cursection, $field, $_);
984        } @{$metadata->{$field}};
985    } else {
986        $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
987    }
988    }
989}
990
991
992sub compile_stats {
993    my $self = shift(@_);
994    my ($stats) = @_;
995
996    $stats->{'num_processed'} += $self->{'num_processed'};
997    $stats->{'num_not_processed'} += $self->{'num_not_processed'};
998    $stats->{'num_archives'} += $self->{'num_archives'};
999
1000}
1001
1002sub associate_cover_image {
1003    my $self = shift;
1004    my ($doc_obj, $filename) = @_;
1005
1006    $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1007    if (exists $self->{'covers_missing_cache'}->{$filename}) {
1008    # don't stat() for existence eg for multiple document input files
1009    # (eg SplitPlug)
1010    return;
1011    }
1012
1013    my $top_section=$doc_obj->get_top_section();
1014
1015    if (-e $filename) {
1016        $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1017    $doc_obj->add_utf8_metadata($top_section, "hascover",  1);
1018    } else {
1019    my $upper_filename = $filename;
1020    $upper_filename =~ s/jpg$/JPG/;
1021    if (-e $upper_filename) {
1022        $doc_obj->associate_file($upper_filename, "cover.jpg",
1023                     "image/jpeg");
1024        $doc_obj->add_utf8_metadata($top_section, "hascover",  1);
1025    } else {
1026        # file doesn't exist, so record the fact that it's missing so
1027        # we don't stat() again (stat is slow)
1028        $self->{'covers_missing_cache'}->{$filename} = 1;
1029    }
1030    }
1031
1032}
1033
1034
1035# Overridden by exploding plugins (eg. ISISPlug)
1036sub clean_up_after_exploding
1037{
1038    my $self = shift(@_);
1039}
1040
1041
1042
10431;
Note: See TracBrowser for help on using the browser.