source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 20999

Last change on this file since 20999 was 20778, checked in by kjdon, 15 years ago

plugins now need to add any auxiliary source files as source assoc files, so we know when to reimport for incremental import. Have started this, but not finished and not tested :-)

  • Property svn:keywords set to Author Date Id Revision
File size: 35.5 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33
34use encodings;
35use unicode;
36use textcat;
37use doc;
38use ghtml;
39use gsprintf 'gsprintf';
40
41use PrintInfo;
42
43BEGIN {
44 @BasePlugin::ISA = ( 'PrintInfo' );
45}
46
47# the different methods that can be applied when renaming
48# imported documents and their associated files
49our $file_rename_method_list =
50 [ { 'name' => "url",
51 'desc' => "{BasePlugin.rename_method.url}" },
52 { 'name' => "base64",
53 'desc' => "{BasePlugin.rename_method.base64}" },
54 { 'name' => "none",
55 'desc' => "{BasePlugin.rename_method.none}",
56 'hiddengli' => "yes" } ];
57
58our $encoding_list =
59 [ { 'name' => "ascii",
60 'desc' => "{BasePlugin.encoding.ascii}" },
61 { 'name' => "utf8",
62 'desc' => "{BasePlugin.encoding.utf8}" },
63 { 'name' => "unicode",
64 'desc' => "{BasePlugin.encoding.unicode}" } ];
65
66
67my $e = $encodings::encodings;
68foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
69{
70 my $hashEncode =
71 {'name' => $enc,
72 'desc' => $e->{$enc}->{'name'}};
73
74 push(@{$encoding_list},$hashEncode);
75}
76
77our $encoding_plus_auto_list =
78 [ { 'name' => "auto",
79 'desc' => "{BasePlugin.filename_encoding.auto}" },
80 { 'name' => "auto-language-analysis",
81 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
82 { 'name' => "auto-filesystem-encoding",
83 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
84 { 'name' => "auto-fl",
85 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
86 { 'name' => "auto-lf",
87 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
88
89push(@{$encoding_plus_auto_list},@{$encoding_list});
90
91our $oidtype_list =
92 [ { 'name' => "auto",
93 'desc' => "{BasePlugin.OIDtype.auto}" },
94 { 'name' => "hash",
95 'desc' => "{import.OIDtype.hash}" },
96 { 'name' => "assigned",
97 'desc' => "{import.OIDtype.assigned}" },
98 { 'name' => "incremental",
99 'desc' => "{import.OIDtype.incremental}" },
100 { 'name' => "dirname",
101 'desc' => "{import.OIDtype.dirname}" } ];
102
103my $arguments =
104 [ { 'name' => "process_exp",
105 'desc' => "{BasePlugin.process_exp}",
106 'type' => "regexp",
107 'deft' => "",
108 'reqd' => "no" },
109 { 'name' => "no_blocking",
110 'desc' => "{BasePlugin.no_blocking}",
111 'type' => "flag",
112 'reqd' => "no"},
113 { 'name' => "block_exp",
114 'desc' => "{BasePlugin.block_exp}",
115 'type' => "regexp",
116 'deft' => "",
117 'reqd' => "no" },
118 { 'name' => "associate_ext",
119 'desc' => "{BasePlugin.associate_ext}",
120 'type' => "string",
121 'reqd' => "no" },
122 { 'name' => "associate_tail_re",
123 'desc' => "{BasePlugin.associate_tail_re}",
124 'type' => "string",
125 'reqd' => "no" },
126 { 'name' => "OIDtype",
127 'desc' => "{import.OIDtype}",
128 'type' => "enum",
129 'list' => $oidtype_list,
130 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
131 'deft' => "auto",
132 'reqd' => "no" },
133 { 'name' => "OIDmetadata",
134 'desc' => "{import.OIDmetadata}",
135 'type' => "metadata",
136 'deft' => "dc.Identifier",
137 'reqd' => "no" },
138# { 'name' => "use_as_doc_identifier",
139# 'desc' => "{BasePlugin.use_as_doc_identifier}",
140# 'type' => "string",
141# 'reqd' => "no" ,
142# 'deft' => "" } ,
143 { 'name' => "no_cover_image",
144 'desc' => "{BasePlugin.no_cover_image}",
145 'type' => "flag",
146 'reqd' => "no" },
147 { 'name' => "filename_encoding",
148 'desc' => "{BasePlugin.filename_encoding}",
149 'type' => "enum",
150 'deft' => "auto",
151 'list' => $encoding_plus_auto_list,
152 'reqd' => "no" },
153 { 'name' => "smart_block",
154 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
155 'type' => "flag",
156 'reqd' => "no",
157 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
158 { 'name' => "file_rename_method",
159 'desc' => "{BasePlugin.file_rename_method}",
160 'type' => "enum",
161 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
162 'list' => $file_rename_method_list,
163 'reqd' => "no"
164 }
165
166 ];
167
168
169my $options = { 'name' => "BasePlugin",
170 'desc' => "{BasePlugin.desc}",
171 'abstract' => "yes",
172 'inherits' => "no",
173 'args' => $arguments };
174
175sub new {
176
177 my ($class) = shift (@_);
178 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
179 push(@$pluginlist, $class);
180
181 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
182 push(@{$hashArgOptLists->{"OptList"}},$options);
183
184 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
185
186 if ($self->{'info_only'}) {
187 # don't worry about any options etc
188 return bless $self, $class;
189 }
190
191 if ($self->{'smart_block'}) {
192 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
193 }
194 $self->{'smart_block'} = undef;
195
196 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
197 $self->{'plugin_type'} = $plugin_name;
198
199 # remove ex. from OIDmetadata
200 $self->{'OIDmetadata'} =~ s/^ex\.// if defined $self->{'OIDmetadata'};
201 $self->{'num_processed'} = 0;
202 $self->{'num_not_processed'} = 0;
203 $self->{'num_blocked'} = 0;
204 $self->{'num_archives'} = 0;
205 $self->{'cover_image'} = 1; # cover image is on by default
206 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
207 $self->{'can_process_directories'} = 0;
208 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
209
210 my $associate_ext = $self->{'associate_ext'};
211 if ((defined $associate_ext) && ($associate_ext ne "")) {
212
213 my $associate_tail_re = $self->{'associate_tail_re'};
214 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
215 my $outhandle = $self->{'outhandle'};
216 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
217 print $outhandle " defaulting to 'associate_tail_re'\n";
218 }
219 else {
220 my @exts = split(/,/,$associate_ext);
221
222 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
223 my $associate_tail_re = join("|",@exts_bracketed);
224 $self->{'associate_tail_re'} = $associate_tail_re;
225 }
226
227 delete $self->{'associate_ext'};
228 }
229
230 return bless $self, $class;
231
232}
233
234sub merge_inheritance
235{
236 my $self = {};
237 my @child_selfs = @_;
238
239 foreach my $child_self (@child_selfs) {
240 foreach my $key (keys %$child_self) {
241 if (defined $self->{$key}) {
242 if ($self->{$key} ne $child_self->{$key}) {
243# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
244# print STDERR "Existing stored value = $self->{$key}\n";
245# print STDERR "New (child) value = $child_self->{$key}\n";
246# print STDERR "Keeping existing value\n";
247 # Existing value seems to be option specified in collect.cfg
248
249 ### $self->{$key} = $child_self->{$key};
250
251 }
252 else {
253## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
254 }
255
256 }
257 else {
258 $self->{$key} = $child_self->{$key};
259 }
260 }
261 }
262
263 return $self;
264}
265
266# initialize BasePlugin options
267# if init() is overridden in a sub-class, remember to call BasePlugin::init()
268sub init {
269 my $self = shift (@_);
270 my ($verbosity, $outhandle, $failhandle) = @_;
271
272 # verbosity is passed through from the processor
273 $self->{'verbosity'} = $verbosity;
274
275 # as are the outhandle and failhandle
276 $self->{'outhandle'} = $outhandle if defined $outhandle;
277 $self->{'failhandle'} = $failhandle;
278# $self->SUPER::init(@_);
279
280 # set process_exp and block_exp to defaults unless they were
281 # explicitly set
282
283 if ((!$self->is_recursive()) and
284 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
285
286 $self->{'process_exp'} = $self->get_default_process_exp ();
287 if ($self->{'process_exp'} eq "") {
288 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
289 }
290 }
291
292 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
293 $self->{'block_exp'} = $self->get_default_block_exp ();
294 }
295
296}
297
298sub begin {
299 my $self = shift (@_);
300 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
301
302 if ($self->{'OIDtype'} eq "auto") {
303 # hasn't been set in the plugin, use the processor values
304 $self->{'OIDtype'} = $processor->{'OIDtype'};
305 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
306 }
307 if ($self->{'OIDtype'} eq "hash") {
308 # should we hash on the file or on the doc xml??
309 $self->{'OIDtype'} = $self->get_oid_hash_type();
310 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
311 $self->{'OIDtype'} = "hash_on_file";
312 }
313 }
314}
315
316sub end {
317 # potentially called at the end of each plugin pass
318 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
319
320 my ($self) = shift (@_);
321}
322
323sub deinit {
324 # called only once, after all plugin passes have been done
325
326 my ($self) = @_;
327}
328
329# default hashing type is to hash on the original file (or converted file)
330# override this to return hash_on_ga_xml for filetypes where hashing on the
331# file is no good eg video
332sub get_oid_hash_type {
333
334 my $self = shift (@_);
335
336 return "hash_on_file";
337}
338
339
340# this function should be overridden to return 1
341# in recursive plugins
342sub is_recursive {
343 my $self = shift (@_);
344
345 return 0;
346}
347
348sub get_default_block_exp {
349 my $self = shift (@_);
350
351 return "";
352}
353
354sub get_default_process_exp {
355 my $self = shift (@_);
356
357 return "";
358}
359
360# rename imported files and assoc files using URL encoding by default
361# as this will work for most plugins and give more legible filenames
362sub get_default_file_rename_method() {
363 my $self = shift (@_);
364 return "url";
365}
366
367# returns this plugin's active (possibly user-selected) file_rename_method
368sub get_file_rename_method() {
369 my $self = shift (@_);
370 my $rename_method = $self->{'file_rename_method'};
371 if($rename_method) {
372 return $rename_method;
373 } else {
374 return $self->get_default_file_rename_method();
375 }
376}
377
378# default implementation is to do nothing
379sub store_block_files {
380
381 my $self =shift (@_);
382 my ($filename_full_path, $block_hash) = @_;
383
384}
385
386# put files to block into hash
387sub use_block_expressions {
388
389 my $self =shift (@_);
390 my ($filename_full_path, $block_hash) = @_;
391
392 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
393 $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
394 }
395
396}
397
398#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
399sub block_cover_image
400{
401 my $self =shift;
402 my ($filename, $block_hash) = @_;
403
404 if ($self->{'cover_image'}) {
405 my $coverfile = $filename;
406 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
407 if (!-e $coverfile) {
408 $coverfile =~ s/jpg$/JPG/;
409 }
410 if (-e $coverfile) {
411 $block_hash->{'file_blocks'}->{$coverfile} = 1;
412 }
413 }
414
415 return;
416}
417
418
419# discover all the files that should be blocked by this plugin
420# check the args ...
421sub file_block_read {
422
423 my $self = shift (@_);
424 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
425 # Keep track of filenames with same root but different extensions
426 # Used to support -associate_ext and the more generalised
427 # -associate_tail_re
428 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
429
430 if (!-d $filename_full_path) {
431 $block_hash->{'all_files'}->{$file} = 1;
432 }
433
434 my $associate_tail_re = $self->{'associate_tail_re'};
435 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
436 my ($file_prefix,$file_ext)
437 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
438 if ((defined $file_prefix) && (defined $file_ext)) {
439 my $shared_fileroot = $block_hash->{'shared_fileroot'};
440 if (!defined $shared_fileroot->{$file_prefix}) {
441 my $file_prefix_rec = { 'tie_to' => undef,
442 'exts' => {} };
443 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
444 }
445
446 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
447
448 if ($self->can_process_this_file($filename_full_path)) {
449 # This is the document the others should be tied to
450 $file_prefix_rec->{'tie_to'} = $file_ext;
451 }
452 else {
453 if ($file_ext =~ m/$associate_tail_re$/) {
454 # this file should be associated to the main one
455 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
456 }
457 }
458
459 }
460 }
461
462 # check block expressions
463 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
464
465 # now check whether we are actually processing this
466 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
467 return undef; # can't recognise
468 }
469
470 # if we have a block_exp, then this overrides the normal 'smart' blocking
471 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
472
473 # block the cover image if there is one
474 if ($self->{'cover_image'}) {
475 $self->block_cover_image($filename_full_path, $block_hash);
476 }
477
478 return 1;
479}
480
481# plugins that rely on more than process_exp (eg XML plugins) can override this method
482sub can_process_this_file {
483 my $self = shift(@_);
484 my ($filename) = @_;
485
486 if (-d $filename && !$self->{'can_process_directories'}) {
487 return 0;
488 }
489 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
490 return 1;
491 }
492 return 0;
493
494}
495
496# just converts path as is to utf8.
497sub filepath_to_utf8 {
498 my $self = shift (@_);
499 my ($file, $file_encoding) = @_;
500 my $filemeta = $file;
501
502 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
503
504 # Whenever filename-encoding is set to any of the auto settings, we
505 # check if the filename is already in UTF8. If it is, then we're done.
506 if($filename_encoding =~ m/auto/) {
507 if(&unicode::check_is_utf8($filemeta))
508 {
509 $filename_encoding = "utf8";
510 return $filemeta;
511 }
512 }
513
514 # Auto setting, but filename is not utf8
515 if ($filename_encoding eq "auto")
516 {
517 # try textcat
518 $filename_encoding = $self->textcat_encoding($filemeta);
519
520 # check the locale next
521 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
522
523
524 # now try the encoding of the document, if available
525 if ($filename_encoding eq "undefined" && defined $file_encoding) {
526 $filename_encoding = $file_encoding;
527 }
528
529 }
530
531 elsif ($filename_encoding eq "auto-language-analysis")
532 {
533 $filename_encoding = $self->textcat_encoding($filemeta);
534
535 # now try the encoding of the document, if available
536 if ($filename_encoding eq "undefined" && defined $file_encoding) {
537 $filename_encoding = $file_encoding;
538 }
539 }
540
541 elsif ($filename_encoding eq "auto-filesystem-encoding")
542 {
543 # try locale
544 $filename_encoding = $self->locale_encoding();
545 }
546
547 elsif ($filename_encoding eq "auto-fl")
548 {
549 # filesystem-encoding (locale) then language-analysis (textcat)
550 $filename_encoding = $self->locale_encoding();
551
552 # try textcat
553 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
554
555 # else assume filename encoding is encoding of file content, if that's available
556 if ($filename_encoding eq "undefined" && defined $file_encoding) {
557 $filename_encoding = $file_encoding;
558 }
559 }
560
561 elsif ($filename_encoding eq "auto-lf")
562 {
563 # language-analysis (textcat) then filesystem-encoding (locale)
564 $filename_encoding = $self->textcat_encoding($filemeta);
565
566 # guess filename encoding from encoding of file content, if available
567 if ($filename_encoding eq "undefined" && defined $file_encoding) {
568 $filename_encoding = $file_encoding;
569 }
570
571 # try locale
572 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
573 }
574
575 # if still undefined, use utf8 as fallback
576 if ($filename_encoding eq "undefined") {
577 $filename_encoding = "utf8";
578 }
579
580 #print STDERR "**** UTF8 encoding the filename $filemeta ";
581
582 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
583 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
584 # be always utf8 (in which case the filename's encoding is also set as utf8 even
585 # though the filename need not be if it originates from another system)--in such
586 # cases attempt to make the filename utf8 to match.
587 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
588 &unicode::ensure_utf8(\$filemeta);
589 }
590
591 # convert non-unicode encodings to utf8
592 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
593 $filemeta = &unicode::unicode2utf8(
594 &unicode::convert2unicode($filename_encoding, \$filemeta)
595 );
596 }
597
598 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
599 return $filemeta;
600}
601
602# gets the filename with no path, converts to utf8, and then dm safes it.
603# filename_encoding set by user
604sub filename_to_utf8_metadata
605{
606 my $self = shift (@_);
607 my ($file, $file_encoding) = @_;
608
609 my $outhandle = $self->{'outhandle'};
610
611 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
612 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
613
614 my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
615
616 return $dmsafe_filemeta;
617
618}
619
620sub locale_encoding {
621 my $self = shift(@_);
622
623 if (!defined $self->{'filesystem_encoding'}) {
624 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
625 $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'};
626 }
627
628 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
629 return $self->{'filesystem_encoding'}; # can be the string "undefined"
630}
631
632sub textcat_encoding {
633 my $self = shift(@_);
634 my ($filemeta) = @_;
635
636 # analyse filenames without extensions and digits (and trimmed of surrounding
637 # whitespace), so that irrelevant chars don't confuse textcat
638 my $strictfilemeta = $filemeta;
639 $strictfilemeta =~ s/\.[^\.]+$//g;
640 $strictfilemeta =~ s/\d//g;
641 $strictfilemeta =~ s/^\s*//g;
642 $strictfilemeta =~ s/\s*$//g;
643
644 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
645 if(!defined $filename_encoding) {
646 $filename_encoding = "undefined";
647 }
648
649 return $filename_encoding; # can be the string "undefined"
650}
651
652# performs textcat
653sub encoding_from_language_analysis {
654 my $self = shift(@_);
655 my ($text) = @_;
656
657 my $outhandle = $self->{'outhandle'};
658 my $best_encoding = undef;
659
660 # get the language/encoding of the textstring using textcat
661 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
662 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
663
664
665 if (scalar @$results < 0) {
666 return undef;
667 }
668
669 # We have some results, we choose the first
670 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
671
672 $best_encoding = $encoding;
673 if (!defined $best_encoding) {
674 return undef;
675 }
676
677 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
678 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
679 $best_encoding = 'utf8';
680 }
681
682
683 # check for equivalents where textcat doesn't have some encodings...
684 # eg MS versions of standard encodings
685 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
686 my $iso = $1; # which variant of the iso standard?
687 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
688 if ($text =~ /[\x80-\x9f]/) {
689 # Western Europe
690 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
691 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
692 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
693 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
694 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
695 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
696 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
697 }
698 }
699
700 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
701 !defined $encodings::encodings->{$best_encoding})
702 {
703 if ($self->{'verbosity'}) {
704 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
705 }
706 $best_encoding = undef;
707 }
708
709 return $best_encoding;
710}
711
712# uses locale
713sub get_filesystem_encoding {
714
715 my $self = shift(@_);
716
717 my $outhandle = $self->{'outhandle'};
718 my $filesystem_encoding = undef;
719
720 eval {
721 use POSIX qw(locale_h);
722
723 # With only one parameter, setlocale retrieves the
724 # current value
725 my $current_locale = setlocale(LC_CTYPE);
726
727 if ($current_locale =~ m/^.*\.(.*?)$/) {
728 my $char_encoding = lc($1);
729 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
730 $char_encoding = "$1\_$2\_$3";
731 }
732
733 $char_encoding =~ s/-/_/g;
734 $char_encoding =~ s/^utf_8$/utf8/;
735
736 if ($char_encoding =~ m/^\d+$/) {
737 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
738 $char_encoding = "windows_$char_encoding";
739 }
740 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
741 $char_encoding = "dos_$char_encoding";
742 }
743 }
744
745 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
746 || (defined $encodings::encodings->{$char_encoding})) {
747 $filesystem_encoding = $char_encoding;
748 }
749 else {
750 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
751 }
752 }
753
754
755 };
756 if ($@) {
757 print $outhandle "$@\n";
758 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
759
760 }
761 return $filesystem_encoding;
762}
763
764# is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
765sub set_Source_metadata {
766 my $self = shift (@_);
767 my ($doc_obj, $filename_no_path, $file_encoding) = @_;
768
769 my $top_section = $doc_obj->get_top_section();
770
771 # UTF-8 version of filename
772 my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
773
774 # Source is the UTF8 display name - not necessarily the name of the file on the system
775 $doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta);
776
777 $filemeta = &util::rename_file($filemeta, $self->{'file_rename_method'});
778 # If using URL encoding, then SourceFile is the url-reference to url-encoded
779 # filemeta: it's a url that refers to the actual file on the system
780 $filemeta = &unicode::filename_to_url($filemeta);
781
782 $doc_obj->set_utf8_metadata_element($top_section, "SourceFile", $filemeta);
783}
784
785# this should be called by all plugins to set the oid of the doc obj, rather
786# than calling doc_obj->set_OID directly
787sub add_OID {
788 my $self = shift (@_);
789 my ($doc_obj) = @_;
790
791 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
792
793 # see if there is a plugin specific set_OID function
794 if (defined ($self->can('set_OID'))) {
795 $self->set_OID(@_); # pass through doc_obj and any extra arguments
796 }
797 else {
798 # use the default set_OID() in doc.pm
799 $doc_obj->set_OID();
800 }
801
802}
803
804# The BasePlugin read_into_doc_obj() function. This function does all the
805# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
806# a file and sets up a slew of metadata all saved in doc_obj, which
807# it then returns as part of a tuple (process_status,doc_obj)
808#
809# Much of this functionality used to reside in read, but it was broken
810# down into a supporting routine to make the code more flexible.
811#
812# recursive plugins (e.g. RecPlug) and specialized plugins like those
813# capable of processing many documents within a single file (e.g.
814# GMLPlug) will normally want to implement their own version of
815# read_into_doc_obj()
816#
817# Note that $base_dir might be "" and that $file might
818# include directories
819
820# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
821sub read_into_doc_obj {
822 my $self = shift (@_);
823 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
824
825 my $outhandle = $self->{'outhandle'};
826
827 # should we move this to read? What about secondary plugins?
828 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
829 print $outhandle "$self->{'plugin_type'} processing $file\n"
830 if $self->{'verbosity'} > 1;
831
832 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
833
834 # create a new document
835 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
836 my $top_section = $doc_obj->get_top_section();
837
838 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
839 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
840
841
842 # sets the UTF8 filename (Source) for display and sets the url ref to URL encoded version
843 # of the UTF8 filename (SourceFile) for generated files
844 $self->set_Source_metadata($doc_obj, $filename_no_path);
845
846
847 # plugin specific stuff - what args do we need here??
848 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
849 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
850 return -1;
851 }
852
853 # include any metadata passed in from previous plugins
854 # note that this metadata is associated with the top level section
855 my $section = $doc_obj->get_top_section();
856 # can we merge these two methods??
857 $self->add_associated_files($doc_obj, $filename_full_path);
858 $self->extra_metadata ($doc_obj, $section, $metadata);
859 $self->auto_extract_metadata($doc_obj);
860
861 # if we haven't found any Title so far, assign one
862 # this was shifted to here from inside read()
863 $self->title_fallback($doc_obj,$section,$filename_no_path);
864
865 $self->add_OID($doc_obj);
866
867 return (1,$doc_obj);
868}
869
870sub add_dummy_text {
871 my $self = shift(@_);
872 my ($doc_obj, $section) = @_;
873
874 # add NoText metadata so we can hide this dummy text in format statements
875 $doc_obj->add_metadata($section, "NoText", "1");
876 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
877
878}
879
880# does nothing. Can be overridden by subclass
881sub auto_extract_metadata {
882 my $self = shift(@_);
883 my ($doc_obj) = @_;
884}
885
886# adds cover image, associate_file options stuff. Should be called by sub class
887# read_into_doc_obj
888sub add_associated_files {
889 my $self = shift(@_);
890 # whatis filename??
891 my ($doc_obj, $filename) = @_;
892
893 # add in the cover image
894 if ($self->{'cover_image'}) {
895 $self->associate_cover_image($doc_obj, $filename);
896 }
897
898
899}
900
901# implement this if you are extracting metadata for other documents
902sub metadata_read {
903 my $self = shift (@_);
904 my ($pluginfo, $base_dir, $file, $block_hash,
905 $extrametakeys, $extrametadata, $extrametafile,
906 $processor, $maxdocs, $gli) = @_;
907
908 # can we process this file??
909 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
910 return undef unless $self->can_process_this_file($filename_full_path);
911
912 return 1; # we recognise the file, but don't actually do anything with it
913}
914
915
916# The BasePlugin read() function. This function calls read_into_doc_obj()
917# to ensure all the right things to make general options work for a
918# given plugin are done. It then calls the process() function which
919# does all the work specific to a plugin (like the old read functions
920# used to do). Most plugins should define their own process() function
921# and let this read() function keep control.
922#
923# recursive plugins (e.g. RecPlug) and specialized plugins like those
924# capable of processing many documents within a single file (e.g.
925# GMLPlug) might want to implement their own version of read(), but
926# more likely need to implement their own version of read_into_doc_obj()
927#
928# Return number of files processed, undef if can't recognise, -1 if can't
929# process
930
931sub read {
932 my $self = shift (@_);
933 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
934
935 # can we process this file??
936 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
937
938 return undef unless $self->can_process_this_file($filename_full_path);
939
940 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
941
942 if ((defined $process_status) && ($process_status == 1)) {
943
944 # process the document
945 $processor->process($doc_obj);
946
947 $self->{'num_processed'} ++;
948 undef $doc_obj;
949 }
950 # delete any temp files that we may have created
951 $self->clean_up_after_doc_obj_processing();
952
953
954 # if process_status == 1, then the file has been processed.
955 return $process_status;
956
957}
958
959# returns undef if file is rejected by the plugin
960sub process {
961 my $self = shift (@_);
962 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
963
964 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n";
965
966 return undef; # never gets here
967}
968
969# overwrite this method to delete any temp files that we have created
970sub clean_up_after_doc_obj_processing {
971 my $self = shift(@_);
972
973}
974
975# write_file -- used by ConvertToPlug, for example in post processing
976#
977# where should this go, is here the best place??
978sub utf8_write_file {
979 my $self = shift (@_);
980 my ($textref, $filename) = @_;
981
982 if (!open (FILE, ">$filename")) {
983 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
984 die "\n";
985 }
986 print FILE $$textref;
987
988 close FILE;
989}
990
991
992sub filename_based_title
993{
994 my $self = shift (@_);
995 my ($file) = @_;
996
997 my $file_derived_title = $file;
998 $file_derived_title =~ s/_/ /g;
999 $file_derived_title =~ s/\..*?$//;
1000
1001 return $file_derived_title;
1002}
1003
1004
1005sub title_fallback
1006{
1007 my $self = shift (@_);
1008 my ($doc_obj,$section,$file) = @_;
1009
1010 if (!defined $doc_obj->get_metadata_element ($section, "Title") or $doc_obj->get_metadata_element($section, "Title") eq "") {
1011
1012 my $file_derived_title = $self->filename_to_utf8_metadata($self->filename_based_title($file));
1013 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1014 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1015 }
1016 else {
1017 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1018 }
1019 }
1020
1021}
1022
1023# add any extra metadata that's been passed around from one
1024# plugin to another.
1025# extra_metadata uses add_utf8_metadata so it expects metadata values
1026# to already be in utf8
1027sub extra_metadata {
1028 my $self = shift (@_);
1029 my ($doc_obj, $cursection, $metadata) = @_;
1030
1031 my $associate_tail_re = $self->{'associate_tail_re'};
1032
1033 foreach my $field (keys(%$metadata)) {
1034 # $metadata->{$field} may be an array reference
1035 if ($field eq "gsdlassocfile_tobe") {
1036 # 'gsdlassocfile_tobe' is artificially introduced metadata
1037 # that is used to signal that certain additional files should
1038 # be tied to this document. Useful in situations where a
1039 # metadata pass in the plugin pipeline works out some files
1040 # need to be associated with a document, but the document hasn't
1041 # been formed yet.
1042 my $equiv_form = "";
1043 foreach my $gaf (@{$metadata->{$field}}) {
1044 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1045 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1046
1047 # we need to make sure the filename is valid utf-8 - we do
1048 # this by url or base64 encoding it
1049 # $tail_filename is the name that we store the file as
1050 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1051 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1052 $doc_obj->associate_source_file($full_filename);
1053 # If the filename is url_encoded, we need to encode the % signs
1054 # in the filename, so that it works in a url
1055 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1056 # work out extended tail extension (i.e. matching tail re)
1057
1058 my ($file_prefix,$file_extended_ext)
1059 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1060 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1061 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1062 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$url_tail_filename\">";
1063 my $srcicon = "_icon".$doc_ext."_";
1064 my $end_doclink = "</a>";
1065
1066 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1067
1068 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1069 # for metadata such as [mp3._edited] [mp3._full] ...
1070 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1071 }
1072
1073 # for multiple metadata such as [mp3.assoclink]
1074 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1075
1076 $equiv_form .= " $assoc_form";
1077 }
1078 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1079 }
1080 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1081 map {
1082 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1083 } @{$metadata->{$field}};
1084 } else {
1085 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1086 }
1087 }
1088}
1089
1090
1091sub compile_stats {
1092 my $self = shift(@_);
1093 my ($stats) = @_;
1094
1095 $stats->{'num_processed'} += $self->{'num_processed'};
1096 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1097 $stats->{'num_archives'} += $self->{'num_archives'};
1098
1099}
1100
1101sub associate_cover_image {
1102 my $self = shift;
1103 my ($doc_obj, $filename) = @_;
1104
1105 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1106 if (exists $self->{'covers_missing_cache'}->{$filename}) {
1107 # don't stat() for existence eg for multiple document input files
1108 # (eg SplitPlug)
1109 return;
1110 }
1111
1112 my $top_section=$doc_obj->get_top_section();
1113
1114 if (-e $filename) {
1115 $doc_obj->associate_source_file($filename);
1116 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1117 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1118 } else {
1119 my $upper_filename = $filename;
1120 $upper_filename =~ s/jpg$/JPG/;
1121 if (-e $upper_filename) {
1122 $doc_obj->associate_source_file($upper_filename);
1123 $doc_obj->associate_file($upper_filename, "cover.jpg",
1124 "image/jpeg");
1125 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1126 } else {
1127 # file doesn't exist, so record the fact that it's missing so
1128 # we don't stat() again (stat is slow)
1129 $self->{'covers_missing_cache'}->{$filename} = 1;
1130 }
1131 }
1132
1133}
1134
1135
1136# Overridden by exploding plugins (eg. ISISPlug)
1137sub clean_up_after_exploding
1138{
1139 my $self = shift(@_);
1140}
1141
1142
1143
11441;
Note: See TracBrowser for help on using the repository browser.