source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 22705

Last change on this file since 22705 was 22705, checked in by davidb, 14 years ago

User of AutoloadConverterScripting expanded to encompass PowerPoint and Excel. No longer need OOConvertBinaryFile and PBConvertBinaryFile

  • Property svn:keywords set to Author Date Id Revision
File size: 37.1 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33
34use encodings;
35use unicode;
36use doc;
37use ghtml;
38use gsprintf 'gsprintf';
39
40use PrintInfo;
41
42BEGIN {
43 @BasePlugin::ISA = ( 'PrintInfo' );
44}
45
46# the different methods that can be applied when renaming
47# imported documents and their associated files
48our $file_rename_method_list =
49 [ { 'name' => "url",
50 'desc' => "{BasePlugin.rename_method.url}" },
51 { 'name' => "base64",
52 'desc' => "{BasePlugin.rename_method.base64}" },
53 { 'name' => "none",
54 'desc' => "{BasePlugin.rename_method.none}",
55 'hiddengli' => "yes" } ];
56
57our $encoding_list =
58 [ { 'name' => "ascii",
59 'desc' => "{BasePlugin.encoding.ascii}" },
60 { 'name' => "utf8",
61 'desc' => "{BasePlugin.encoding.utf8}" },
62 { 'name' => "unicode",
63 'desc' => "{BasePlugin.encoding.unicode}" } ];
64
65
66my $e = $encodings::encodings;
67foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
68{
69 my $hashEncode =
70 {'name' => $enc,
71 'desc' => $e->{$enc}->{'name'}};
72
73 push(@{$encoding_list},$hashEncode);
74}
75
76our $encoding_plus_auto_list =
77 [ { 'name' => "auto",
78 'desc' => "{BasePlugin.filename_encoding.auto}" },
79 { 'name' => "auto-language-analysis",
80 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
81 { 'name' => "auto-filesystem-encoding",
82 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
83 { 'name' => "auto-fl",
84 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
85 { 'name' => "auto-lf",
86 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
87
88push(@{$encoding_plus_auto_list},@{$encoding_list});
89
90our $oidtype_list =
91 [ { 'name' => "auto",
92 'desc' => "{BasePlugin.OIDtype.auto}" },
93 { 'name' => "hash",
94 'desc' => "{import.OIDtype.hash}" },
95 { 'name' => "assigned",
96 'desc' => "{import.OIDtype.assigned}" },
97 { 'name' => "incremental",
98 'desc' => "{import.OIDtype.incremental}" },
99 { 'name' => "dirname",
100 'desc' => "{import.OIDtype.dirname}" } ];
101
102my $arguments =
103 [ { 'name' => "process_exp",
104 'desc' => "{BasePlugin.process_exp}",
105 'type' => "regexp",
106 'deft' => "",
107 'reqd' => "no" },
108 { 'name' => "no_blocking",
109 'desc' => "{BasePlugin.no_blocking}",
110 'type' => "flag",
111 'reqd' => "no"},
112 { 'name' => "block_exp",
113 'desc' => "{BasePlugin.block_exp}",
114 'type' => "regexp",
115 'deft' => "",
116 'reqd' => "no" },
117 { 'name' => "store_original_file",
118 'desc' => "{BasePlugin.store_original_file}",
119 'type' => "flag",
120 'reqd' => "no" },
121 { 'name' => "associate_ext",
122 'desc' => "{BasePlugin.associate_ext}",
123 'type' => "string",
124 'reqd' => "no" },
125 { 'name' => "associate_tail_re",
126 'desc' => "{BasePlugin.associate_tail_re}",
127 'type' => "string",
128 'reqd' => "no" },
129 { 'name' => "OIDtype",
130 'desc' => "{import.OIDtype}",
131 'type' => "enum",
132 'list' => $oidtype_list,
133 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
134 'deft' => "auto",
135 'reqd' => "no" },
136 { 'name' => "OIDmetadata",
137 'desc' => "{import.OIDmetadata}",
138 'type' => "metadata",
139 'deft' => "dc.Identifier",
140 'reqd' => "no" },
141# { 'name' => "use_as_doc_identifier",
142# 'desc' => "{BasePlugin.use_as_doc_identifier}",
143# 'type' => "string",
144# 'reqd' => "no" ,
145# 'deft' => "" } ,
146 { 'name' => "no_cover_image",
147 'desc' => "{BasePlugin.no_cover_image}",
148 'type' => "flag",
149 'reqd' => "no" },
150 { 'name' => "filename_encoding",
151 'desc' => "{BasePlugin.filename_encoding}",
152 'type' => "enum",
153 'deft' => "auto",
154 'list' => $encoding_plus_auto_list,
155 'reqd' => "no" },
156 { 'name' => "smart_block",
157 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
158 'type' => "flag",
159 'reqd' => "no",
160 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
161 { 'name' => "file_rename_method",
162 'desc' => "{BasePlugin.file_rename_method}",
163 'type' => "enum",
164 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
165 'list' => $file_rename_method_list,
166 'reqd' => "no"
167 }
168
169 ];
170
171
172my $options = { 'name' => "BasePlugin",
173 'desc' => "{BasePlugin.desc}",
174 'abstract' => "yes",
175 'inherits' => "no",
176 'args' => $arguments };
177
178sub new {
179
180 my ($class) = shift (@_);
181 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
182 push(@$pluginlist, $class);
183
184 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
185 push(@{$hashArgOptLists->{"OptList"}},$options);
186
187 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
188
189 if ($self->{'info_only'}) {
190 # don't worry about any options etc
191 return bless $self, $class;
192 }
193
194 if ($self->{'smart_block'}) {
195 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
196 }
197 $self->{'smart_block'} = undef;
198
199 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
200 $self->{'plugin_type'} = $plugin_name;
201
202 # remove ex. from OIDmetadata
203 $self->{'OIDmetadata'} =~ s/^ex\.// if defined $self->{'OIDmetadata'};
204 $self->{'num_processed'} = 0;
205 $self->{'num_not_processed'} = 0;
206 $self->{'num_blocked'} = 0;
207 $self->{'num_archives'} = 0;
208 $self->{'cover_image'} = 1; # cover image is on by default
209 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
210 $self->{'can_process_directories'} = 0;
211 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
212
213 my $associate_ext = $self->{'associate_ext'};
214 if ((defined $associate_ext) && ($associate_ext ne "")) {
215
216 my $associate_tail_re = $self->{'associate_tail_re'};
217 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
218 my $outhandle = $self->{'outhandle'};
219 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
220 print $outhandle " defaulting to 'associate_tail_re'\n";
221 }
222 else {
223 my @exts = split(/,/,$associate_ext);
224
225 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
226 my $associate_tail_re = join("|",@exts_bracketed);
227 $self->{'associate_tail_re'} = $associate_tail_re;
228 }
229
230 delete $self->{'associate_ext'};
231 }
232
233 return bless $self, $class;
234
235}
236
237sub merge_inheritance
238{
239 my $self = {};
240 my @child_selfs = @_;
241
242 foreach my $child_self (@child_selfs) {
243 foreach my $key (keys %$child_self) {
244 if (defined $self->{$key}) {
245 if ($self->{$key} ne $child_self->{$key}) {
246# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
247# print STDERR "Existing stored value = $self->{$key}\n";
248# print STDERR "New (child) value = $child_self->{$key}\n";
249# print STDERR "Keeping existing value\n";
250 # Existing value seems to be option specified in collect.cfg
251
252 ### $self->{$key} = $child_self->{$key};
253
254 }
255 else {
256## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
257 }
258
259 }
260 else {
261 $self->{$key} = $child_self->{$key};
262 }
263 }
264 }
265
266 return $self;
267}
268
269# initialize BasePlugin options
270# if init() is overridden in a sub-class, remember to call BasePlugin::init()
271sub init {
272 my $self = shift (@_);
273 my ($verbosity, $outhandle, $failhandle) = @_;
274
275 # verbosity is passed through from the processor
276 $self->{'verbosity'} = $verbosity;
277
278 # as are the outhandle and failhandle
279 $self->{'outhandle'} = $outhandle if defined $outhandle;
280 $self->{'failhandle'} = $failhandle;
281# $self->SUPER::init(@_);
282
283 # set process_exp and block_exp to defaults unless they were
284 # explicitly set
285
286 if ((!$self->is_recursive()) and
287 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
288
289 $self->{'process_exp'} = $self->get_default_process_exp ();
290 if ($self->{'process_exp'} eq "") {
291 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
292 }
293 }
294
295 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
296 $self->{'block_exp'} = $self->get_default_block_exp ();
297 }
298
299}
300
301sub begin {
302 my $self = shift (@_);
303 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
304
305 if ($self->{'OIDtype'} eq "auto") {
306 # hasn't been set in the plugin, use the processor values
307 $self->{'OIDtype'} = $processor->{'OIDtype'};
308 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
309 }
310 if ($self->{'OIDtype'} eq "hash") {
311 # should we hash on the file or on the doc xml??
312 $self->{'OIDtype'} = $self->get_oid_hash_type();
313 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
314 $self->{'OIDtype'} = "hash_on_file";
315 }
316 }
317}
318
319# This is called once if removeold is set with import.pl. Most plugins will do
320# nothing but if a plugin does any stuff outside of creating doc obj, then
321# it may need to clear something.
322sub remove_all {
323 my $self = shift (@_);
324 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
325}
326
327# This is called per document for docs that have been deleted from the
328# collection. Most plugins will do nothing
329# but if a plugin does any stuff outside of creating doc obj, then it may need
330# to clear something.
331sub remove_one {
332 my $self = shift (@_);
333
334 my ($file, $oids, $archivedir) = @_;
335 return 0 if $self->can_process_this_file($file);
336 return undef;
337}
338
339sub end {
340 # potentially called at the end of each plugin pass
341 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
342
343 my ($self) = shift (@_);
344}
345
346sub deinit {
347 # called only once, after all plugin passes have been done
348
349 my ($self) = @_;
350}
351
352# default hashing type is to hash on the original file (or converted file)
353# override this to return hash_on_ga_xml for filetypes where hashing on the
354# file is no good eg video
355sub get_oid_hash_type {
356
357 my $self = shift (@_);
358
359 return "hash_on_file";
360}
361
362
363# this function should be overridden to return 1
364# in recursive plugins
365sub is_recursive {
366 my $self = shift (@_);
367
368 return 0;
369}
370
371sub get_default_block_exp {
372 my $self = shift (@_);
373
374 return "";
375}
376
377sub get_default_process_exp {
378 my $self = shift (@_);
379
380 return "";
381}
382
383# rename imported files and assoc files using URL encoding by default
384# as this will work for most plugins and give more legible filenames
385sub get_default_file_rename_method() {
386 my $self = shift (@_);
387 return "url";
388}
389
390# returns this plugin's active (possibly user-selected) file_rename_method
391sub get_file_rename_method() {
392 my $self = shift (@_);
393 my $rename_method = $self->{'file_rename_method'};
394 if($rename_method) {
395 return $rename_method;
396 } else {
397 return $self->get_default_file_rename_method();
398 }
399}
400
401# default implementation is to do nothing
402sub store_block_files {
403
404 my $self =shift (@_);
405 my ($filename_full_path, $block_hash) = @_;
406
407}
408
409# put files to block into hash
410sub use_block_expressions {
411
412 my $self =shift (@_);
413 my ($filename_full_path, $block_hash) = @_;
414
415 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
416 $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
417 }
418
419}
420
421#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
422sub block_cover_image
423{
424 my $self =shift;
425 my ($filename, $block_hash) = @_;
426
427 if ($self->{'cover_image'}) {
428 my $coverfile = $filename;
429 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
430 if (!-e $coverfile) {
431 $coverfile =~ s/jpg$/JPG/;
432 }
433 if (-e $coverfile) {
434 $block_hash->{'file_blocks'}->{$coverfile} = 1;
435 }
436 }
437
438 return;
439}
440
441
442# discover all the files that should be blocked by this plugin
443# check the args ...
444sub file_block_read {
445
446 my $self = shift (@_);
447 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
448 # Keep track of filenames with same root but different extensions
449 # Used to support -associate_ext and the more generalised
450 # -associate_tail_re
451 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
452
453 if (!-d $filename_full_path) {
454 $block_hash->{'all_files'}->{$file} = 1;
455 }
456
457 my $associate_tail_re = $self->{'associate_tail_re'};
458 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
459 my ($file_prefix,$file_ext)
460 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
461 if ((defined $file_prefix) && (defined $file_ext)) {
462 my $shared_fileroot = $block_hash->{'shared_fileroot'};
463 if (!defined $shared_fileroot->{$file_prefix}) {
464 my $file_prefix_rec = { 'tie_to' => undef,
465 'exts' => {} };
466 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
467 }
468
469 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
470
471 if ($self->can_process_this_file($filename_full_path)) {
472 # This is the document the others should be tied to
473 $file_prefix_rec->{'tie_to'} = $file_ext;
474 }
475 else {
476 if ($file_ext =~ m/$associate_tail_re$/) {
477 # this file should be associated to the main one
478 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
479 }
480 }
481
482 }
483 }
484
485 # check block expressions
486 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
487
488 # now check whether we are actually processing this
489 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
490 return undef; # can't recognise
491 }
492
493 # if we have a block_exp, then this overrides the normal 'smart' blocking
494 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
495
496 # block the cover image if there is one
497 if ($self->{'cover_image'}) {
498 $self->block_cover_image($filename_full_path, $block_hash);
499 }
500
501 return 1;
502}
503
504# plugins that rely on more than process_exp (eg XML plugins) can override this method
505sub can_process_this_file {
506 my $self = shift(@_);
507 my ($filename) = @_;
508
509 if (-d $filename && !$self->{'can_process_directories'}) {
510 return 0;
511 }
512
513 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
514 return 1;
515 }
516 return 0;
517
518}
519
520# just converts path as is to utf8.
521sub filepath_to_utf8 {
522 my $self = shift (@_);
523 my ($file, $file_encoding) = @_;
524 my $filemeta = $file;
525
526 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
527
528 # Whenever filename-encoding is set to any of the auto settings, we
529 # check if the filename is already in UTF8. If it is, then we're done.
530 if($filename_encoding =~ m/auto/) {
531 if(&unicode::check_is_utf8($filemeta))
532 {
533 $filename_encoding = "utf8";
534 return $filemeta;
535 }
536 }
537
538 # Auto setting, but filename is not utf8
539 if ($filename_encoding eq "auto")
540 {
541 # try textcat
542 $filename_encoding = $self->textcat_encoding($filemeta);
543
544 # check the locale next
545 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
546
547
548 # now try the encoding of the document, if available
549 if ($filename_encoding eq "undefined" && defined $file_encoding) {
550 $filename_encoding = $file_encoding;
551 }
552
553 }
554
555 elsif ($filename_encoding eq "auto-language-analysis")
556 {
557 $filename_encoding = $self->textcat_encoding($filemeta);
558
559 # now try the encoding of the document, if available
560 if ($filename_encoding eq "undefined" && defined $file_encoding) {
561 $filename_encoding = $file_encoding;
562 }
563 }
564
565 elsif ($filename_encoding eq "auto-filesystem-encoding")
566 {
567 # try locale
568 $filename_encoding = $self->locale_encoding();
569 }
570
571 elsif ($filename_encoding eq "auto-fl")
572 {
573 # filesystem-encoding (locale) then language-analysis (textcat)
574 $filename_encoding = $self->locale_encoding();
575
576 # try textcat
577 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
578
579 # else assume filename encoding is encoding of file content, if that's available
580 if ($filename_encoding eq "undefined" && defined $file_encoding) {
581 $filename_encoding = $file_encoding;
582 }
583 }
584
585 elsif ($filename_encoding eq "auto-lf")
586 {
587 # language-analysis (textcat) then filesystem-encoding (locale)
588 $filename_encoding = $self->textcat_encoding($filemeta);
589
590 # guess filename encoding from encoding of file content, if available
591 if ($filename_encoding eq "undefined" && defined $file_encoding) {
592 $filename_encoding = $file_encoding;
593 }
594
595 # try locale
596 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
597 }
598
599 # if still undefined, use utf8 as fallback
600 if ($filename_encoding eq "undefined") {
601 $filename_encoding = "utf8";
602 }
603
604 #print STDERR "**** UTF8 encoding the filename $filemeta ";
605
606 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
607 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
608 # be always utf8 (in which case the filename's encoding is also set as utf8 even
609 # though the filename need not be if it originates from another system)--in such
610 # cases attempt to make the filename utf8 to match.
611 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
612 &unicode::ensure_utf8(\$filemeta);
613 }
614
615 # convert non-unicode encodings to utf8
616 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
617 $filemeta = &unicode::unicode2utf8(
618 &unicode::convert2unicode($filename_encoding, \$filemeta)
619 );
620 }
621
622 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
623 return $filemeta;
624}
625
626# gets the filename with no path, converts to utf8, and then dm safes it.
627# filename_encoding set by user
628sub filename_to_utf8_metadata
629{
630 my $self = shift (@_);
631 my ($file, $file_encoding) = @_;
632
633 my $outhandle = $self->{'outhandle'};
634
635 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
636 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
637
638 return $filemeta;
639}
640
641sub locale_encoding {
642 my $self = shift(@_);
643
644 if (!defined $self->{'filesystem_encoding'}) {
645 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
646 $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'};
647 }
648
649 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
650 return $self->{'filesystem_encoding'}; # can be the string "undefined"
651}
652
653sub textcat_encoding {
654 my $self = shift(@_);
655 my ($filemeta) = @_;
656
657 # analyse filenames without extensions and digits (and trimmed of
658 # surrounding whitespace), so that irrelevant chars don't confuse
659 # textcat
660 my $strictfilemeta = $filemeta;
661 $strictfilemeta =~ s/\.[^\.]+$//g;
662 $strictfilemeta =~ s/\d//g;
663 $strictfilemeta =~ s/^\s*//g;
664 $strictfilemeta =~ s/\s*$//g;
665
666 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
667 if(!defined $filename_encoding) {
668 $filename_encoding = "undefined";
669 }
670
671 return $filename_encoding; # can be the string "undefined"
672}
673
674# performs textcat
675sub encoding_from_language_analysis {
676 my $self = shift(@_);
677 my ($text) = @_;
678
679 my $outhandle = $self->{'outhandle'};
680 my $best_encoding = undef;
681
682 # get the language/encoding of the textstring using textcat
683 require textcat; # Only load the textcat module if it is required
684 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
685 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
686
687
688 if (scalar @$results < 0) {
689 return undef;
690 }
691
692 # We have some results, we choose the first
693 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
694
695 $best_encoding = $encoding;
696 if (!defined $best_encoding) {
697 return undef;
698 }
699
700 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
701 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
702 $best_encoding = 'utf8';
703 }
704
705
706 # check for equivalents where textcat doesn't have some encodings...
707 # eg MS versions of standard encodings
708 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
709 my $iso = $1; # which variant of the iso standard?
710 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
711 if ($text =~ /[\x80-\x9f]/) {
712 # Western Europe
713 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
714 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
715 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
716 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
717 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
718 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
719 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
720 }
721 }
722
723 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
724 !defined $encodings::encodings->{$best_encoding})
725 {
726 if ($self->{'verbosity'}) {
727 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
728 }
729 $best_encoding = undef;
730 }
731
732 return $best_encoding;
733}
734
735# uses locale
736sub get_filesystem_encoding {
737
738 my $self = shift(@_);
739
740 my $outhandle = $self->{'outhandle'};
741 my $filesystem_encoding = undef;
742
743 eval {
744 use POSIX qw(locale_h);
745
746 # With only one parameter, setlocale retrieves the
747 # current value
748 my $current_locale = setlocale(LC_CTYPE);
749
750 if ($current_locale =~ m/^.*\.(.*?)$/) {
751 my $char_encoding = lc($1);
752 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
753 $char_encoding = "$1\_$2\_$3";
754 }
755
756 $char_encoding =~ s/-/_/g;
757 $char_encoding =~ s/^utf_8$/utf8/;
758
759 if ($char_encoding =~ m/^\d+$/) {
760 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
761 $char_encoding = "windows_$char_encoding";
762 }
763 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
764 $char_encoding = "dos_$char_encoding";
765 }
766 }
767
768 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
769 || (defined $encodings::encodings->{$char_encoding})) {
770 $filesystem_encoding = $char_encoding;
771 }
772 else {
773 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
774 }
775 }
776
777
778 };
779 if ($@) {
780 print $outhandle "$@\n";
781 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
782
783 }
784 return $filesystem_encoding;
785}
786
787# is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
788sub set_Source_metadata {
789 my $self = shift (@_);
790 my ($doc_obj, $filename_no_path, $file_encoding) = @_;
791
792 my $top_section = $doc_obj->get_top_section();
793
794 # UTF-8 version of filename
795 my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
796
797 # Source is the UTF8 display name - not necessarily the name of the file on the system
798 $doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta);
799
800 $filemeta = &util::rename_file($filemeta, $self->{'file_rename_method'});
801 # If using URL encoding, then SourceFile is the url-reference to url-encoded
802 # filemeta: it's a url that refers to the actual file on the system
803 $filemeta = &unicode::filename_to_url($filemeta);
804
805 $doc_obj->set_utf8_metadata_element($top_section, "SourceFile", $filemeta);
806}
807
808# this should be called by all plugins to set the oid of the doc obj, rather
809# than calling doc_obj->set_OID directly
810sub add_OID {
811 my $self = shift (@_);
812 my ($doc_obj) = @_;
813
814 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
815
816 # see if there is a plugin specific set_OID function
817 if (defined ($self->can('set_OID'))) {
818 $self->set_OID(@_); # pass through doc_obj and any extra arguments
819 }
820 else {
821 # use the default set_OID() in doc.pm
822 $doc_obj->set_OID();
823 }
824
825}
826
827# The BasePlugin read_into_doc_obj() function. This function does all the
828# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
829# a file and sets up a slew of metadata all saved in doc_obj, which
830# it then returns as part of a tuple (process_status,doc_obj)
831#
832# Much of this functionality used to reside in read, but it was broken
833# down into a supporting routine to make the code more flexible.
834#
835# recursive plugins (e.g. RecPlug) and specialized plugins like those
836# capable of processing many documents within a single file (e.g.
837# GMLPlug) will normally want to implement their own version of
838# read_into_doc_obj()
839#
840# Note that $base_dir might be "" and that $file might
841# include directories
842
843# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
844sub read_into_doc_obj {
845 my $self = shift (@_);
846 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
847
848 my $outhandle = $self->{'outhandle'};
849
850 # should we move this to read? What about secondary plugins?
851 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
852 print $outhandle "$self->{'plugin_type'} processing $file\n"
853 if $self->{'verbosity'} > 1;
854
855 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
856
857 # create a new document
858 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
859 my $top_section = $doc_obj->get_top_section();
860
861 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
862 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
863
864
865 # sets the UTF8 filename (Source) for display and sets the url ref to URL encoded version
866 # of the UTF8 filename (SourceFile) for generated files
867 $self->set_Source_metadata($doc_obj, $filename_no_path);
868
869
870 # plugin specific stuff - what args do we need here??
871 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
872 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
873 return -1;
874 }
875
876 # include any metadata passed in from previous plugins
877 # note that this metadata is associated with the top level section
878 my $section = $doc_obj->get_top_section();
879 # can we merge these two methods??
880 $self->add_associated_files($doc_obj, $filename_full_path);
881 $self->extra_metadata ($doc_obj, $section, $metadata);
882 $self->auto_extract_metadata($doc_obj);
883
884 # if we haven't found any Title so far, assign one
885 # this was shifted to here from inside read()
886 $self->title_fallback($doc_obj,$section,$filename_no_path);
887
888 $self->add_OID($doc_obj);
889
890 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
891 return (1,$doc_obj);
892}
893
894sub post_process_doc_obj {
895 my $self = shift (@_);
896 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
897
898 return 1;
899}
900
901sub add_dummy_text {
902 my $self = shift(@_);
903 my ($doc_obj, $section) = @_;
904
905 # add NoText metadata so we can hide this dummy text in format statements
906 $doc_obj->add_metadata($section, "NoText", "1");
907 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
908
909}
910
911# does nothing. Can be overridden by subclass
912sub auto_extract_metadata {
913 my $self = shift(@_);
914 my ($doc_obj) = @_;
915}
916
917# adds cover image, associate_file options stuff. Should be called by sub class
918# read_into_doc_obj
919sub add_associated_files {
920 my $self = shift(@_);
921 # whatis filename??
922 my ($doc_obj, $filename) = @_;
923
924 # add in the cover image
925 if ($self->{'cover_image'}) {
926 $self->associate_cover_image($doc_obj, $filename);
927 }
928 # store the original (used for eg TextPlugin to store the original for OAI)
929 if ($self->{'store_original_file'}) {
930 $self->associate_source_file($doc_obj, $filename);
931 }
932
933
934}
935
936# implement this if you are extracting metadata for other documents
937sub metadata_read {
938 my $self = shift (@_);
939 my ($pluginfo, $base_dir, $file, $block_hash,
940 $extrametakeys, $extrametadata, $extrametafile,
941 $processor, $maxdocs, $gli) = @_;
942
943 # can we process this file??
944 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
945 return undef unless $self->can_process_this_file($filename_full_path);
946
947 return 1; # we recognise the file, but don't actually do anything with it
948}
949
950
951# The BasePlugin read() function. This function calls read_into_doc_obj()
952# to ensure all the right things to make general options work for a
953# given plugin are done. It then calls the process() function which
954# does all the work specific to a plugin (like the old read functions
955# used to do). Most plugins should define their own process() function
956# and let this read() function keep control.
957#
958# recursive plugins (e.g. RecPlug) and specialized plugins like those
959# capable of processing many documents within a single file (e.g.
960# GMLPlug) might want to implement their own version of read(), but
961# more likely need to implement their own version of read_into_doc_obj()
962#
963# Return number of files processed, undef if can't recognise, -1 if can't
964# process
965
966sub read {
967 my $self = shift (@_);
968 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
969
970 # can we process this file??
971 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
972
973 return undef unless $self->can_process_this_file($filename_full_path);
974
975 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
976
977 if ((defined $process_status) && ($process_status == 1)) {
978
979 # process the document
980 $processor->process($doc_obj);
981
982 $self->{'num_processed'} ++;
983 undef $doc_obj;
984 }
985 # delete any temp files that we may have created
986 $self->clean_up_after_doc_obj_processing();
987
988
989 # if process_status == 1, then the file has been processed.
990 return $process_status;
991
992}
993
994# returns undef if file is rejected by the plugin
995sub process {
996 my $self = shift (@_);
997 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
998
999 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n";
1000
1001 return undef; # never gets here
1002}
1003
1004# overwrite this method to delete any temp files that we have created
1005sub clean_up_after_doc_obj_processing {
1006 my $self = shift(@_);
1007
1008}
1009
1010# write_file -- used by ConvertToPlug, for example in post processing
1011#
1012# where should this go, is here the best place??
1013sub utf8_write_file {
1014 my $self = shift (@_);
1015 my ($textref, $filename) = @_;
1016
1017 if (!open (FILE, ">$filename")) {
1018 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1019 die "\n";
1020 }
1021 print FILE $$textref;
1022
1023 close FILE;
1024}
1025
1026
1027sub filename_based_title
1028{
1029 my $self = shift (@_);
1030 my ($file) = @_;
1031
1032 my $file_derived_title = $file;
1033 $file_derived_title =~ s/_/ /g;
1034 $file_derived_title =~ s/\.[^.]$//;
1035
1036 return $file_derived_title;
1037}
1038
1039
1040sub title_fallback
1041{
1042 my $self = shift (@_);
1043 my ($doc_obj,$section,$file) = @_;
1044
1045 if (!defined $doc_obj->get_metadata_element ($section, "Title")
1046 or $doc_obj->get_metadata_element($section, "Title") eq "") {
1047
1048 my $file_derived_title = $self->filename_to_utf8_metadata($self->filename_based_title($file));
1049 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1050 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1051 }
1052 else {
1053 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1054 }
1055 }
1056
1057}
1058
1059# add any extra metadata that's been passed around from one
1060# plugin to another.
1061# extra_metadata uses add_utf8_metadata so it expects metadata values
1062# to already be in utf8
1063sub extra_metadata {
1064 my $self = shift (@_);
1065 my ($doc_obj, $cursection, $metadata) = @_;
1066
1067 my $associate_tail_re = $self->{'associate_tail_re'};
1068
1069 foreach my $field (keys(%$metadata)) {
1070 # $metadata->{$field} may be an array reference
1071 if ($field eq "gsdlassocfile_tobe") {
1072 # 'gsdlassocfile_tobe' is artificially introduced metadata
1073 # that is used to signal that certain additional files should
1074 # be tied to this document. Useful in situations where a
1075 # metadata pass in the plugin pipeline works out some files
1076 # need to be associated with a document, but the document hasn't
1077 # been formed yet.
1078 my $equiv_form = "";
1079 foreach my $gaf (@{$metadata->{$field}}) {
1080 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1081 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1082
1083 # we need to make sure the filename is valid utf-8 - we do
1084 # this by url or base64 encoding it
1085 # $tail_filename is the name that we store the file as
1086 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1087 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1088 $doc_obj->associate_source_file($full_filename);
1089 # If the filename is url_encoded, we need to encode the % signs
1090 # in the filename, so that it works in a url
1091 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1092 # work out extended tail extension (i.e. matching tail re)
1093
1094 my ($file_prefix,$file_extended_ext)
1095 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1096 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1097 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1098 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1099 my $srcicon = "_icon".$doc_ext."_";
1100 my $end_doclink = "</a>";
1101
1102 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1103
1104 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1105 # for metadata such as [mp3._edited] [mp3._full] ...
1106 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1107 }
1108
1109 # for multiple metadata such as [mp3.assoclink]
1110 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1111
1112 $equiv_form .= " $assoc_form";
1113 }
1114 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1115 }
1116 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1117 map {
1118 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1119 } @{$metadata->{$field}};
1120 } else {
1121 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1122 }
1123 }
1124}
1125
1126
1127sub compile_stats {
1128 my $self = shift(@_);
1129 my ($stats) = @_;
1130
1131 $stats->{'num_processed'} += $self->{'num_processed'};
1132 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1133 $stats->{'num_archives'} += $self->{'num_archives'};
1134
1135}
1136sub associate_source_file {
1137 my $self = shift(@_);
1138
1139 my ($doc_obj, $filename) = @_;
1140 my $cursection = $doc_obj->get_top_section();
1141 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1142
1143 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1144 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1145}
1146
1147sub associate_cover_image {
1148 my $self = shift(@_);
1149 my ($doc_obj, $filename) = @_;
1150
1151 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1152 if (exists $self->{'covers_missing_cache'}->{$filename}) {
1153 # don't stat() for existence eg for multiple document input files
1154 # (eg SplitPlug)
1155 return;
1156 }
1157
1158 my $top_section=$doc_obj->get_top_section();
1159
1160 if (-e $filename) {
1161 $doc_obj->associate_source_file($filename);
1162 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1163 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1164 } else {
1165 my $upper_filename = $filename;
1166 $upper_filename =~ s/jpg$/JPG/;
1167 if (-e $upper_filename) {
1168 $doc_obj->associate_source_file($upper_filename);
1169 $doc_obj->associate_file($upper_filename, "cover.jpg",
1170 "image/jpeg");
1171 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1172 } else {
1173 # file doesn't exist, so record the fact that it's missing so
1174 # we don't stat() again (stat is slow)
1175 $self->{'covers_missing_cache'}->{$filename} = 1;
1176 }
1177 }
1178
1179}
1180
1181
1182# Overridden by exploding plugins (eg. ISISPlug)
1183sub clean_up_after_exploding
1184{
1185 my $self = shift(@_);
1186}
1187
1188
1189
11901;
Note: See TracBrowser for help on using the repository browser.