source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago

Moving the critical file-related functions (copy, rm, etc) out of util.pm into their own proper class FileUtils. Use of the old functions in util.pm will prompt deprecated warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering supporting other filesystems (HTTP, HDFS, WebDav, etc). Updated some key files to use the new functions so now deprecated messages thrown when importing/building demo collection 'out of the box'

  • Property svn:keywords set to Author Date Id Revision
File size: 46.7 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34use Unicode::Normalize 'normalize';
35
36use encodings;
37use unicode;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41use util;
42use FileUtils;
43
44use PrintInfo;
45
46BEGIN {
47 @BasePlugin::ISA = ( 'PrintInfo' );
48}
49
50# the different methods that can be applied when renaming
51# imported documents and their associated files
52our $file_rename_method_list =
53 [ { 'name' => "url",
54 'desc' => "{BasePlugin.rename_method.url}" },
55 { 'name' => "base64",
56 'desc' => "{BasePlugin.rename_method.base64}" },
57 { 'name' => "none",
58 'desc' => "{BasePlugin.rename_method.none}",
59 'hiddengli' => "yes" } ];
60
61our $encoding_list =
62 [ { 'name' => "ascii",
63 'desc' => "{BasePlugin.encoding.ascii}" },
64 { 'name' => "utf8",
65 'desc' => "{BasePlugin.encoding.utf8}" },
66 { 'name' => "unicode",
67 'desc' => "{BasePlugin.encoding.unicode}" } ];
68
69
70my $e = $encodings::encodings;
71foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
72{
73 my $hashEncode =
74 {'name' => $enc,
75 'desc' => $e->{$enc}->{'name'}};
76
77 push(@{$encoding_list},$hashEncode);
78}
79
80our $encoding_plus_auto_list =
81 [ { 'name' => "auto",
82 'desc' => "{BasePlugin.filename_encoding.auto}" },
83 { 'name' => "auto-language-analysis",
84 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
85 { 'name' => "auto-filesystem-encoding",
86 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
87 { 'name' => "auto-fl",
88 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
89 { 'name' => "auto-lf",
90 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
91
92push(@{$encoding_plus_auto_list},@{$encoding_list});
93
94our $oidtype_list =
95 [ { 'name' => "auto",
96 'desc' => "{BasePlugin.OIDtype.auto}" },
97 { 'name' => "hash",
98 'desc' => "{import.OIDtype.hash}" },
99 { 'name' => "hash_on_ga_xml",
100 'desc' => "{import.OIDtype.hash_on_ga_xml}" },
101 { 'name' => "hash_on_full_filename",
102 'desc' => "{import.OIDtype.hash_on_full_filename}" },
103 { 'name' => "assigned",
104 'desc' => "{import.OIDtype.assigned}" },
105 { 'name' => "incremental",
106 'desc' => "{import.OIDtype.incremental}" },
107 { 'name' => "filename",
108 'desc' => "{import.OIDtype.filename}" },
109 { 'name' => "dirname",
110 'desc' => "{import.OIDtype.dirname}" },
111 { 'name' => "full_filename",
112 'desc' => "{import.OIDtype.full_filename}" } ];
113
114my $arguments =
115 [ { 'name' => "process_exp",
116 'desc' => "{BasePlugin.process_exp}",
117 'type' => "regexp",
118 'deft' => "",
119 'reqd' => "no" },
120 { 'name' => "no_blocking",
121 'desc' => "{BasePlugin.no_blocking}",
122 'type' => "flag",
123 'reqd' => "no"},
124 { 'name' => "block_exp",
125 'desc' => "{BasePlugin.block_exp}",
126 'type' => "regexp",
127 'deft' => "",
128 'reqd' => "no" },
129 { 'name' => "store_original_file",
130 'desc' => "{BasePlugin.store_original_file}",
131 'type' => "flag",
132 'reqd' => "no" },
133 { 'name' => "associate_ext",
134 'desc' => "{BasePlugin.associate_ext}",
135 'type' => "string",
136 'reqd' => "no" },
137 { 'name' => "associate_tail_re",
138 'desc' => "{BasePlugin.associate_tail_re}",
139 'type' => "string",
140 'reqd' => "no" },
141 { 'name' => "OIDtype",
142 'desc' => "{import.OIDtype}",
143 'type' => "enum",
144 'list' => $oidtype_list,
145 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
146 'deft' => "auto",
147 'reqd' => "no" },
148 { 'name' => "OIDmetadata",
149 'desc' => "{import.OIDmetadata}",
150 'type' => "metadata",
151 'deft' => "dc.Identifier",
152 'reqd' => "no" },
153# { 'name' => "use_as_doc_identifier",
154# 'desc' => "{BasePlugin.use_as_doc_identifier}",
155# 'type' => "string",
156# 'reqd' => "no" ,
157# 'deft' => "" } ,
158 { 'name' => "no_cover_image",
159 'desc' => "{BasePlugin.no_cover_image}",
160 'type' => "flag",
161 'reqd' => "no" },
162 { 'name' => "filename_encoding",
163 'desc' => "{BasePlugin.filename_encoding}",
164 'type' => "enum",
165 'deft' => "auto",
166 'list' => $encoding_plus_auto_list,
167 'reqd' => "no" },
168 { 'name' => "smart_block",
169 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
170 'type' => "flag",
171 'reqd' => "no",
172 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
173 { 'name' => "file_rename_method",
174 'desc' => "{BasePlugin.file_rename_method}",
175 'type' => "enum",
176 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
177 'list' => $file_rename_method_list,
178 'reqd' => "no"
179 }
180
181 ];
182
183
184my $options = { 'name' => "BasePlugin",
185 'desc' => "{BasePlugin.desc}",
186 'abstract' => "yes",
187 'inherits' => "no",
188 'args' => $arguments };
189
190sub new {
191
192 my ($class) = shift (@_);
193 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
194 push(@$pluginlist, $class);
195
196 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
197 push(@{$hashArgOptLists->{"OptList"}},$options);
198
199 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
200
201 if ($self->{'info_only'}) {
202 # don't worry about any options etc
203 return bless $self, $class;
204 }
205
206 if ($self->{'smart_block'}) {
207 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
208 }
209 $self->{'smart_block'} = undef;
210
211 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
212 $self->{'plugin_type'} = $plugin_name;
213
214 # remove ex. from OIDmetadata iff it's the only namespace prefix
215 $self->{'OIDmetadata'} =~ s/^ex\.([^.]+)$/$1/ if defined $self->{'OIDmetadata'};
216 $self->{'num_processed'} = 0;
217 $self->{'num_not_processed'} = 0;
218 $self->{'num_blocked'} = 0;
219 $self->{'num_archives'} = 0;
220 $self->{'cover_image'} = 1; # cover image is on by default
221 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
222 $self->{'can_process_directories'} = 0;
223 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
224
225 my $associate_ext = $self->{'associate_ext'};
226 if ((defined $associate_ext) && ($associate_ext ne "")) {
227
228 my $associate_tail_re = $self->{'associate_tail_re'};
229 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
230 my $outhandle = $self->{'outhandle'};
231 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
232 print $outhandle " defaulting to 'associate_tail_re'\n";
233 }
234 else {
235 my @exts = split(/,/,$associate_ext);
236
237 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
238 my $associate_tail_re = join("|",@exts_bracketed);
239 $self->{'associate_tail_re'} = $associate_tail_re;
240 }
241
242 delete $self->{'associate_ext'};
243 }
244
245 return bless $self, $class;
246
247}
248
249sub merge_inheritance
250{
251 my $self = {};
252 my @child_selfs = @_;
253
254 foreach my $child_self (@child_selfs) {
255 foreach my $key (keys %$child_self) {
256 if (defined $self->{$key}) {
257 if ($self->{$key} ne $child_self->{$key}) {
258# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
259# print STDERR "Existing stored value = $self->{$key}\n";
260# print STDERR "New (child) value = $child_self->{$key}\n";
261# print STDERR "Keeping existing value\n";
262 # Existing value seems to be option specified in collect.cfg
263
264 ### $self->{$key} = $child_self->{$key};
265
266 }
267 else {
268## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
269 }
270
271 }
272 else {
273 $self->{$key} = $child_self->{$key};
274 }
275 }
276 }
277
278 return $self;
279}
280
281# initialize BasePlugin options
282# if init() is overridden in a sub-class, remember to call BasePlugin::init()
283sub init {
284 my $self = shift (@_);
285 my ($verbosity, $outhandle, $failhandle) = @_;
286
287 # verbosity is passed through from the processor
288 $self->{'verbosity'} = $verbosity;
289
290 # as are the outhandle and failhandle
291 $self->{'outhandle'} = $outhandle if defined $outhandle;
292 $self->{'failhandle'} = $failhandle;
293# $self->SUPER::init(@_);
294
295 # set process_exp and block_exp to defaults unless they were
296 # explicitly set
297
298 if ((!$self->is_recursive()) and
299 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
300
301 $self->{'process_exp'} = $self->get_default_process_exp ();
302 if ($self->{'process_exp'} eq "") {
303 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
304 }
305 }
306
307 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
308 $self->{'block_exp'} = $self->get_default_block_exp ();
309 }
310
311}
312
313sub begin {
314 my $self = shift (@_);
315 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
316
317 if ($self->{'OIDtype'} eq "auto") {
318 # hasn't been set in the plugin, use the processor values
319 $self->{'OIDtype'} = $processor->{'OIDtype'};
320 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
321 }
322 if ($self->{'OIDtype'} eq "hash") {
323 # should we hash on the file or on the doc xml??
324 $self->{'OIDtype'} = $self->get_oid_hash_type();
325 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
326 $self->{'OIDtype'} = "hash_on_file";
327 }
328 }
329}
330
331# This is called once if removeold is set with import.pl. Most plugins will do
332# nothing but if a plugin does any stuff outside of creating doc obj, then
333# it may need to clear something.
334sub remove_all {
335 my $self = shift (@_);
336 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
337}
338
339# This is called per document for docs that have been deleted from the
340# collection. Most plugins will do nothing
341# but if a plugin does any stuff outside of creating doc obj, then it may need
342# to clear something.
343sub remove_one {
344 my $self = shift (@_);
345
346 my ($file, $oids, $archivedir) = @_;
347 return 0 if $self->can_process_this_file($file);
348 return undef;
349}
350
351sub end {
352 # potentially called at the end of each plugin pass
353 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
354
355 my ($self) = shift (@_);
356}
357
358sub deinit {
359 # called only once, after all plugin passes have been done
360
361 my ($self) = @_;
362}
363
364# default hashing type is to hash on the original file (or converted file)
365# override this to return hash_on_ga_xml for filetypes where hashing on the
366# file is no good eg video
367sub get_oid_hash_type {
368
369 my $self = shift (@_);
370
371 return "hash_on_file";
372}
373
374
375# this function should be overridden to return 1
376# in recursive plugins
377sub is_recursive {
378 my $self = shift (@_);
379
380 return 0;
381}
382
383sub get_default_block_exp {
384 my $self = shift (@_);
385
386 return "";
387}
388
389sub get_default_process_exp {
390 my $self = shift (@_);
391
392 return "";
393}
394
395
396# rename imported files and assoc files using URL encoding by default
397# as this will work for most plugins and give more legible filenames
398sub get_default_file_rename_method() {
399 my $self = shift (@_);
400 return "url";
401}
402
403# returns this plugin's active (possibly user-selected) file_rename_method
404sub get_file_rename_method() {
405 my $self = shift (@_);
406 my $rename_method = $self->{'file_rename_method'};
407 if($rename_method) {
408 return $rename_method;
409 } else {
410 return $self->get_default_file_rename_method();
411 }
412}
413
414# default implementation is to do nothing
415sub store_block_files {
416
417 my $self =shift (@_);
418 my ($filename_full_path, $block_hash) = @_;
419
420}
421
422# put files to block into hash
423sub use_block_expressions {
424
425 my $self =shift (@_);
426 my ($filename_full_path, $block_hash) = @_;
427
428 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
429
430 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
431 &util::block_filename($block_hash,$filename_full_path);
432 }
433
434}
435
436#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
437sub block_cover_image
438{
439 my $self =shift;
440 my ($filename, $block_hash) = @_;
441
442 $filename = &util::upgrade_if_dos_filename($filename);
443
444 if ($self->{'cover_image'}) {
445 my $coverfile = $filename;
446 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
447
448 #if there is no file extension, coverfile will be the same as filename
449 return if $coverfile eq $filename;
450
451 if (!&FileUtils::fileExists($coverfile)) {
452 $coverfile =~ s/jpg$/JPG/;
453 }
454 if (&FileUtils::fileExists($coverfile)) {
455 &util::block_filename($block_hash,$coverfile);
456 }
457 }
458
459 return;
460}
461
462
463# discover all the files that should be blocked by this plugin
464# check the args ...
465sub file_block_read {
466
467 my $self = shift (@_);
468 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
469 # Keep track of filenames with same root but different extensions
470 # Used to support -associate_ext and the more generalised
471 # -associate_tail_re
472 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
473
474 if (!-d $filename_full_path) {
475 $block_hash->{'all_files'}->{$file} = 1;
476 }
477
478 my $associate_tail_re = $self->{'associate_tail_re'};
479 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
480 my ($file_prefix,$file_ext)
481 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
482 if ((defined $file_prefix) && (defined $file_ext)) {
483 my $shared_fileroot = $block_hash->{'shared_fileroot'};
484 if (!defined $shared_fileroot->{$file_prefix}) {
485 my $file_prefix_rec = { 'tie_to' => undef,
486 'exts' => {} };
487 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
488 }
489
490 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
491
492 if ($self->can_process_this_file($filename_full_path)) {
493 # This is the document the others should be tied to
494 $file_prefix_rec->{'tie_to'} = $file_ext;
495 }
496 else {
497 if ($file_ext =~ m/$associate_tail_re$/) {
498 # this file should be associated to the main one
499 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
500 }
501 }
502
503 }
504 }
505
506 # check block expressions
507 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
508
509 # now check whether we are actually processing this
510 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
511 return undef; # can't recognise
512 }
513
514 # if we have a block_exp, then this overrides the normal 'smart' blocking
515 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
516
517 # block the cover image if there is one
518 if ($self->{'cover_image'}) {
519 $self->block_cover_image($filename_full_path, $block_hash);
520 }
521
522 return 1;
523}
524
525# plugins that rely on more than process_exp (eg XML plugins) can override this method
526sub can_process_this_file {
527 my $self = shift(@_);
528 my ($filename) = @_;
529
530 if (-d $filename && !$self->{'can_process_directories'}) {
531 return 0;
532 }
533
534 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
535 return 1;
536 }
537 return 0;
538
539}
540
541# Even if a plugin can extract metadata in its metadata_read pass,
542# make the default return 'undef' so processing of the file continues
543# down the pipeline, so other plugins can also have the opportunity to
544# locate metadata and set it up in the extrametakeys variables that
545# are passed around.
546
547sub can_process_this_file_for_metadata {
548 my $self = shift(@_);
549
550 return undef;
551}
552
553
554# just converts path as is to utf8.
555sub filepath_to_utf8 {
556 my $self = shift (@_);
557 my ($file, $file_encoding) = @_;
558 my $filemeta = $file;
559
560 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
561
562 # Whenever filename-encoding is set to any of the auto settings, we
563 # check if the filename is already in UTF8. If it is, then we're done.
564 if($filename_encoding =~ m/auto/) {
565 if(&unicode::check_is_utf8($filemeta))
566 {
567 $filename_encoding = "utf8";
568 return $filemeta;
569 }
570 }
571
572 # Auto setting, but filename is not utf8
573 if ($filename_encoding eq "auto")
574 {
575 # try textcat
576 $filename_encoding = $self->textcat_encoding($filemeta);
577
578 # check the locale next
579 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
580
581
582 # now try the encoding of the document, if available
583 if ($filename_encoding eq "undefined" && defined $file_encoding) {
584 $filename_encoding = $file_encoding;
585 }
586
587 }
588
589 elsif ($filename_encoding eq "auto-language-analysis")
590 {
591 $filename_encoding = $self->textcat_encoding($filemeta);
592
593 # now try the encoding of the document, if available
594 if ($filename_encoding eq "undefined" && defined $file_encoding) {
595 $filename_encoding = $file_encoding;
596 }
597 }
598
599 elsif ($filename_encoding eq "auto-filesystem-encoding")
600 {
601 # try locale
602 $filename_encoding = $self->locale_encoding();
603 }
604
605 elsif ($filename_encoding eq "auto-fl")
606 {
607 # filesystem-encoding (locale) then language-analysis (textcat)
608 $filename_encoding = $self->locale_encoding();
609
610 # try textcat
611 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
612
613 # else assume filename encoding is encoding of file content, if that's available
614 if ($filename_encoding eq "undefined" && defined $file_encoding) {
615 $filename_encoding = $file_encoding;
616 }
617 }
618
619 elsif ($filename_encoding eq "auto-lf")
620 {
621 # language-analysis (textcat) then filesystem-encoding (locale)
622 $filename_encoding = $self->textcat_encoding($filemeta);
623
624 # guess filename encoding from encoding of file content, if available
625 if ($filename_encoding eq "undefined" && defined $file_encoding) {
626 $filename_encoding = $file_encoding;
627 }
628
629 # try locale
630 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
631 }
632
633 # if still undefined, use utf8 as fallback
634 if ($filename_encoding eq "undefined") {
635 $filename_encoding = "utf8";
636 }
637
638 #print STDERR "**** UTF8 encoding the filename $filemeta ";
639
640 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
641 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
642 # be always utf8 (in which case the filename's encoding is also set as utf8 even
643 # though the filename need not be if it originates from another system)--in such
644 # cases attempt to make the filename utf8 to match.
645 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
646 &unicode::ensure_utf8(\$filemeta);
647 }
648
649 # convert non-unicode encodings to utf8
650 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
651 $filemeta = &unicode::unicode2utf8(
652 &unicode::convert2unicode($filename_encoding, \$filemeta)
653 );
654 }
655
656 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
657 return $filemeta;
658}
659
660# gets the filename with no path, converts to utf8, and then dm safes it.
661# filename_encoding set by user
662sub filename_to_utf8_metadata
663{
664 my $self = shift (@_);
665 my ($file, $file_encoding) = @_;
666
667 my $outhandle = $self->{'outhandle'};
668
669 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
670 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
671 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
672
673
674 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
675 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
676
677 return $filemeta;
678}
679
680sub locale_encoding {
681 my $self = shift(@_);
682
683 if (!defined $self->{'filesystem_encoding'}) {
684 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
685 }
686
687 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
688 return $self->{'filesystem_encoding'}; # can be the string "undefined"
689}
690
691sub textcat_encoding {
692 my $self = shift(@_);
693 my ($filemeta) = @_;
694
695 # analyse filenames without extensions and digits (and trimmed of
696 # surrounding whitespace), so that irrelevant chars don't confuse
697 # textcat
698 my $strictfilemeta = $filemeta;
699 $strictfilemeta =~ s/\.[^\.]+$//g;
700 $strictfilemeta =~ s/\d//g;
701 $strictfilemeta =~ s/^\s*//g;
702 $strictfilemeta =~ s/\s*$//g;
703
704 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
705 if(!defined $filename_encoding) {
706 $filename_encoding = "undefined";
707 }
708
709 return $filename_encoding; # can be the string "undefined"
710}
711
712# performs textcat
713sub encoding_from_language_analysis {
714 my $self = shift(@_);
715 my ($text) = @_;
716
717 my $outhandle = $self->{'outhandle'};
718 my $best_encoding = undef;
719
720 # get the language/encoding of the textstring using textcat
721 require textcat; # Only load the textcat module if it is required
722 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
723 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
724
725
726 if (scalar @$results < 0) {
727 return undef;
728 }
729
730 # We have some results, we choose the first
731 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
732
733 $best_encoding = $encoding;
734 if (!defined $best_encoding) {
735 return undef;
736 }
737
738 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
739 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
740 $best_encoding = 'utf8';
741 }
742
743
744 # check for equivalents where textcat doesn't have some encodings...
745 # eg MS versions of standard encodings
746 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
747 my $iso = $1; # which variant of the iso standard?
748 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
749 if ($text =~ /[\x80-\x9f]/) {
750 # Western Europe
751 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
752 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
753 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
754 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
755 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
756 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
757 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
758 }
759 }
760
761 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
762 !defined $encodings::encodings->{$best_encoding})
763 {
764 if ($self->{'verbosity'}) {
765 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
766 }
767 $best_encoding = undef;
768 }
769
770 return $best_encoding;
771}
772
773# uses locale
774sub get_filesystem_encoding
775{
776
777 my $self = shift(@_);
778
779 my $outhandle = $self->{'outhandle'};
780 my $filesystem_encoding = undef;
781
782 eval {
783 # Works for Windows as well, returning the DOS code page in use
784 use POSIX qw(locale_h);
785
786 # With only one parameter, setlocale retrieves the
787 # current value
788 my $current_locale = setlocale(LC_CTYPE);
789
790 my $char_encoding = undef;
791 if ($current_locale =~ m/\./) {
792 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
793 $char_encoding = lc($char_encoding);
794 }
795 else {
796 if ($current_locale =~ m/^(posix|c)$/i) {
797 $char_encoding = "ascii";
798 }
799 }
800
801 if (defined $char_encoding) {
802 if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
803 $char_encoding = "$1\_$2\_$3";
804 }
805
806 $char_encoding =~ s/-/_/g;
807 $char_encoding =~ s/^utf_8$/utf8/;
808
809 if ($char_encoding =~ m/^\d+$/) {
810 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
811 $char_encoding = "windows_$char_encoding";
812 }
813 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
814 $char_encoding = "dos_$char_encoding";
815 }
816 }
817
818 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
819 || (defined $encodings::encodings->{$char_encoding})) {
820 $filesystem_encoding = $char_encoding;
821 }
822 else {
823 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
824 }
825 }
826
827
828 };
829 if ($@) {
830 print $outhandle "$@\n";
831 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
832
833 }
834
835 return $filesystem_encoding;
836}
837
838
839
840sub deduce_filename_encoding
841{
842 my $self = shift (@_);
843 my ($file,$metadata,$plugin_filename_encoding) = @_;
844
845 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
846 my $deduced_filename_encoding = undef;
847
848 # Start by looking for manually assigned metadata
849 if (defined $gs_filename_encoding) {
850 if (ref ($gs_filename_encoding) eq "ARRAY") {
851 my $outhandle = $self->{'outhandle'};
852
853 $deduced_filename_encoding = $gs_filename_encoding->[0];
854
855 my $num_vals = scalar(@$gs_filename_encoding);
856 if ($num_vals>1) {
857 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
858 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
859 }
860 }
861 else {
862 $deduced_filename_encoding = $gs_filename_encoding;
863 }
864 }
865
866 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
867 # Look to see if plugin specifies this value
868
869 if (defined $plugin_filename_encoding) {
870 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
871 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
872 my $outhandle = $self->{'outhandle'};
873 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
874 print $outhandle " default to 'auto'\n";
875 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
876 }
877
878 if ($plugin_filename_encoding ne "auto") {
879 # We've been given a specific filenamne encoding
880 # => so use it!
881 $deduced_filename_encoding = $plugin_filename_encoding;
882 }
883 }
884 }
885
886 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
887
888 # Look to file system to provide a character encoding
889
890 # If Windows NTFS, then -- assuming we work with long file names got through
891 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
892
893 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
894 # Can do better than working with the DOS character encoding returned by locale
895 $deduced_filename_encoding = "unicode";
896 }
897 else {
898 # Unix of some form or other
899
900 # See if we can determine the file system encoding through locale
901 $deduced_filename_encoding = $self->locale_encoding();
902
903 # if locale shows us filesystem is utf8, check to see filename is consistent
904 # => if not, then we have an "alien" filename on our hands
905
906 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
907 if (!&unicode::check_is_utf8($file)) {
908 # "alien" filename, so revert
909 $deduced_filename_encoding = undef;
910 }
911 }
912 }
913 }
914
915# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
916# # Last chance, apply textcat to deduce filename encoding
917# $deduced_filename_encoding = $self->textcat_encoding($file);
918# }
919
920 if ($self->{'verbosity'}>3) {
921 my $outhandle = $self->{'outhandle'};
922
923 if (defined $deduced_filename_encoding) {
924 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
925 }
926 else {
927 print $outhandle " No filename encoding deduced\n";
928 }
929 }
930
931 return $deduced_filename_encoding;
932}
933
934
935
936
937# Notionally written to be called once for each document, it is however safe to
938# call multiple times (as in the case of ImagePlugin) which calls this later on
939# after the original image has potentially been converted to a *new* source image
940# format (e.g. TIFF to PNG)
941
942sub set_Source_metadata {
943 my $self = shift (@_);
944 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
945
946 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
947 # and (as a fallback) using %xx if not for non-ascii chars
948 # 2. Sets the url ref (SourceFile) to the URL encoded version
949 # of filename for generated files
950
951 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
952
953 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
954
955 my $octet_file = $raw_file;
956
957 # UTF-8 version of filename
958# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
959# print STDERR "**** Setting Source Metadata given: $octet_file\n";
960# }
961
962 # Deal with (on Windows) raw filenames that are in their
963 # abbreviated DOS form
964
965 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
966 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
967 if (-e $raw_filename) {
968 my $unicode_filename = Win32::GetLongPathName($raw_filename);
969
970 my $unused_full_uf;
971 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
972 }
973 }
974 }
975
976 my $url_encoded_filename;
977 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
978 # => Generate a pretty print version of filename that is mapped to Unicode
979
980 # Use filename_encoding to map raw filename to a Perl unicode-aware string
981 $url_encoded_filename = decode($filename_encoding,$octet_file);
982 }
983 else {
984 # otherwise generate %xx encoded version of filename for char > 127
985 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
986 }
987
988# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
989# print STDERR "****** saving Source as: $url_encoded_filename\n";
990# }
991
992 # In the case of converted files and (generalized) exploded documents, there
993 # will already be a source filename => store as OrigSource before overriding
994 my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
995 if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
996 $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
997 }
998
999 # Source is the UTF8 display name - not necessarily the name of the file on the system
1000 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
1001 # on Darwin want all display strings to be in composed form, then can search on that
1002 $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
1003 }
1004 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
1005
1006
1007 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
1008 # If using URL encoding, then SourceFile is the url-reference to url-encoded
1009 # renamed_raw_url: it's a url that refers to the actual file on the system
1010 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
1011
1012 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
1013 $renamed_raw_url);
1014
1015# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1016# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
1017# }
1018}
1019
1020# this should be called by all plugins to set the oid of the doc obj, rather
1021# than calling doc_obj->set_OID directly
1022sub add_OID {
1023 my $self = shift (@_);
1024 my ($doc_obj, $force) = @_;
1025
1026 # don't add one if there is one already set, unless we are forced to do so
1027 return unless ($doc_obj->get_OID() =~ /^NULL$/ || $force);
1028 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
1029
1030 # see if there is a plugin specific set_OID function
1031 if (defined ($self->can('set_OID'))) {
1032 $self->set_OID(@_); # pass through doc_obj and any extra arguments
1033 }
1034 else {
1035 # use the default set_OID() in doc.pm
1036 $doc_obj->set_OID();
1037 }
1038
1039}
1040
1041# The BasePlugin read_into_doc_obj() function. This function does all the
1042# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
1043# a file and sets up a slew of metadata all saved in doc_obj, which
1044# it then returns as part of a tuple (process_status,doc_obj)
1045#
1046# Much of this functionality used to reside in read, but it was broken
1047# down into a supporting routine to make the code more flexible.
1048#
1049# recursive plugins (e.g. RecPlug) and specialized plugins like those
1050# capable of processing many documents within a single file (e.g.
1051# GMLPlug) will normally want to implement their own version of
1052# read_into_doc_obj()
1053#
1054# Note that $base_dir might be "" and that $file might
1055# include directories
1056
1057# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1058sub read_into_doc_obj {
1059 my $self = shift (@_);
1060 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1061
1062 my $outhandle = $self->{'outhandle'};
1063
1064 # should we move this to read? What about secondary plugins?
1065 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
1066 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1067 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1068 if $self->{'verbosity'} > 1;
1069
1070 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1071
1072 # create a new document
1073 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1074 my $top_section = $doc_obj->get_top_section();
1075
1076 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1077 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1078
1079
1080 my $plugin_filename_encoding = $self->{'filename_encoding'};
1081 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1082 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
1083
1084 # plugin specific stuff - what args do we need here??
1085 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1086 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1087 return -1;
1088 }
1089
1090 # include any metadata passed in from previous plugins
1091 # note that this metadata is associated with the top level section
1092 my $section = $doc_obj->get_top_section();
1093 # can we merge these two methods??
1094 $self->add_associated_files($doc_obj, $filename_full_path);
1095 $self->extra_metadata ($doc_obj, $section, $metadata);
1096 $self->auto_extract_metadata($doc_obj);
1097
1098 # if we haven't found any Title so far, assign one
1099 # this was shifted to here from inside read()
1100 $self->title_fallback($doc_obj,$section,$filename_no_path);
1101
1102 $self->add_OID($doc_obj);
1103
1104 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1105 return (1,$doc_obj);
1106}
1107
1108sub post_process_doc_obj {
1109 my $self = shift (@_);
1110 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1111
1112 return 1;
1113}
1114
1115sub add_dummy_text {
1116 my $self = shift(@_);
1117 my ($doc_obj, $section) = @_;
1118
1119 # add NoText metadata so we can hide this dummy text in format statements
1120 $doc_obj->add_metadata($section, "NoText", "1");
1121 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1122
1123}
1124
1125# does nothing. Can be overridden by subclass
1126sub auto_extract_metadata {
1127 my $self = shift(@_);
1128 my ($doc_obj) = @_;
1129}
1130
1131# adds cover image, associate_file options stuff. Should be called by sub class
1132# read_into_doc_obj
1133sub add_associated_files {
1134 my $self = shift(@_);
1135 # whatis filename??
1136 my ($doc_obj, $filename) = @_;
1137
1138 # add in the cover image
1139 if ($self->{'cover_image'}) {
1140 $self->associate_cover_image($doc_obj, $filename);
1141 }
1142 # store the original (used for eg TextPlugin to store the original for OAI)
1143 if ($self->{'store_original_file'}) {
1144 $self->associate_source_file($doc_obj, $filename);
1145 }
1146
1147
1148}
1149
1150# implement this if you are extracting metadata for other documents
1151sub metadata_read {
1152 my $self = shift (@_);
1153 my ($pluginfo, $base_dir, $file, $block_hash,
1154 $extrametakeys, $extrametadata, $extrametafile,
1155 $processor, $gli, $aux) = @_;
1156
1157 # can we process this file??
1158 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1159 return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
1160
1161 return 1; # we recognise the file, but don't actually do anything with it
1162}
1163
1164
1165# The BasePlugin read() function. This function calls read_into_doc_obj()
1166# to ensure all the right things to make general options work for a
1167# given plugin are done. It then calls the process() function which
1168# does all the work specific to a plugin (like the old read functions
1169# used to do). Most plugins should define their own process() function
1170# and let this read() function keep control.
1171#
1172# recursive plugins (e.g. RecPlug) and specialized plugins like those
1173# capable of processing many documents within a single file (e.g.
1174# GMLPlug) might want to implement their own version of read(), but
1175# more likely need to implement their own version of read_into_doc_obj()
1176#
1177# Return number of files processed, undef if can't recognise, -1 if can't
1178# process
1179
1180sub read {
1181 my $self = shift (@_);
1182 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1183
1184 # can we process this file??
1185 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1186
1187 return undef unless $self->can_process_this_file($filename_full_path);
1188
1189 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1190
1191 if ((defined $process_status) && ($process_status == 1)) {
1192
1193 # process the document
1194 $processor->process($doc_obj);
1195
1196 $self->{'num_processed'} ++;
1197 undef $doc_obj;
1198 }
1199 # delete any temp files that we may have created
1200 $self->clean_up_after_doc_obj_processing();
1201
1202
1203 # if process_status == 1, then the file has been processed.
1204 return $process_status;
1205
1206}
1207
1208# returns undef if file is rejected by the plugin
1209sub process {
1210 my $self = shift (@_);
1211 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1212
1213 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
1214
1215 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1216 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1217
1218 die "\n";
1219
1220 return undef; # never gets here
1221}
1222
1223# overwrite this method to delete any temp files that we have created
1224sub clean_up_after_doc_obj_processing {
1225 my $self = shift(@_);
1226
1227}
1228
1229# write_file -- used by ConvertToPlug, for example in post processing
1230#
1231# where should this go, is here the best place??
1232sub utf8_write_file {
1233 my $self = shift (@_);
1234 my ($textref, $filename) = @_;
1235
1236 if (!open (FILE, ">:utf8", $filename)) {
1237 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1238 die "\n";
1239 }
1240 print FILE $$textref;
1241
1242 close FILE;
1243}
1244
1245
1246sub filename_based_title
1247{
1248 my $self = shift (@_);
1249 my ($file) = @_;
1250
1251 my $file_derived_title = $file;
1252 $file_derived_title =~ s/_/ /g;
1253 $file_derived_title =~ s/\.[^.]+$//;
1254
1255 return $file_derived_title;
1256}
1257
1258
1259sub title_fallback
1260{
1261 my $self = shift (@_);
1262 my ($doc_obj,$section,$file) = @_;
1263
1264 if (!defined $doc_obj->get_metadata_element ($section, "Title")
1265 || $doc_obj->get_metadata_element($section, "Title") eq "") {
1266
1267 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1268 my $file_derived_title;
1269 if (defined $source_file) {
1270 $file_derived_title = $self->filename_based_title($source_file);
1271 }
1272 else {
1273 # pp = pretty print
1274 my $pp_file = (defined $source_file) ? $source_file : $file;
1275
1276 my $raw_title = $self->filename_based_title($file);
1277 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1278 }
1279
1280
1281 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1282 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1283 }
1284 else {
1285 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1286 }
1287 }
1288
1289}
1290
1291# add any extra metadata that's been passed around from one
1292# plugin to another.
1293# extra_metadata uses add_utf8_metadata so it expects metadata values
1294# to already be in utf8
1295sub extra_metadata {
1296 my $self = shift (@_);
1297 my ($doc_obj, $cursection, $metadata) = @_;
1298
1299 my $associate_tail_re = $self->{'associate_tail_re'};
1300
1301 foreach my $field (keys(%$metadata)) {
1302 # $metadata->{$field} may be an array reference
1303 if ($field eq "gsdlassocfile_tobe") {
1304 # 'gsdlassocfile_tobe' is artificially introduced metadata
1305 # that is used to signal that certain additional files should
1306 # be tied to this document. Useful in situations where a
1307 # metadata pass in the plugin pipeline works out some files
1308 # need to be associated with a document, but the document hasn't
1309 # been formed yet.
1310 my $equiv_form = "";
1311 foreach my $gaf (@{$metadata->{$field}}) {
1312 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1313 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1314
1315 # we need to make sure the filename is valid utf-8 - we do
1316 # this by url or base64 encoding it
1317 # $tail_filename is the name that we store the file as
1318 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1319 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1320 $doc_obj->associate_source_file($full_filename);
1321 # If the filename is url_encoded, we need to encode the % signs
1322 # in the filename, so that it works in a url
1323 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1324 # work out extended tail extension (i.e. matching tail re)
1325
1326 my ($file_prefix,$file_extended_ext)
1327 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1328 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1329 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1330
1331 # the greenstone 2 stuff
1332 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1333 #my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1334 my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1335
1336 my $srcicon = "_icon".$doc_ext."_";
1337 my $end_doclink = "</a>";
1338
1339 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1340
1341
1342 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1343 # for metadata such as [mp3._edited] [mp3._full] ...
1344 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1345 }
1346
1347 # for multiple metadata such as [mp3.assoclink]
1348 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1349
1350 $equiv_form .= " $assoc_form";
1351
1352 # following are used for greenstone 3,
1353 $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink_gs3);
1354 $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
1355 $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
1356
1357 }
1358 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1359 }
1360 elsif ($field eq "gsdlzipfilename") {
1361 # special case for when files have come out of a zip. source_path
1362 # (used for archives dbs and keeping track for incremental import)
1363 # must be set to the zip file name
1364 my $zip_filename = $metadata->{$field};
1365 # overwrite the source_path
1366 $doc_obj->set_source_path($zip_filename);
1367 # and set the metadata
1368 $zip_filename = &util::filename_within_collection($zip_filename);
1369 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1370 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1371 }
1372 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1373 map {
1374 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1375 } @{$metadata->{$field}};
1376 } else {
1377 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1378 }
1379 }
1380}
1381
1382
1383sub compile_stats {
1384 my $self = shift(@_);
1385 my ($stats) = @_;
1386
1387 $stats->{'num_processed'} += $self->{'num_processed'};
1388 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1389 $stats->{'num_archives'} += $self->{'num_archives'};
1390
1391}
1392sub associate_source_file {
1393 my $self = shift(@_);
1394
1395 my ($doc_obj, $filename) = @_;
1396 my $cursection = $doc_obj->get_top_section();
1397 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1398
1399 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1400 # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
1401 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1402 $doc_obj->add_utf8_metadata ($cursection, "srclinkFile", $doc_obj->get_sourcefile());
1403}
1404
1405sub associate_cover_image {
1406 my $self = shift(@_);
1407 my ($doc_obj, $filename) = @_;
1408
1409 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1410
1411 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1412 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1413
1414 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1415 # don't stat() for existence e.g. for multiple document input files
1416 # (eg SplitPlug)
1417 return;
1418 }
1419
1420 my $top_section=$doc_obj->get_top_section();
1421
1422 if (&FileUtils::fileExists($upgraded_filename)) {
1423 $doc_obj->associate_source_file($filename);
1424 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1425 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1426 } else {
1427 my $upper_filename = $filename;
1428 my $upgraded_upper_filename = $upgraded_filename;
1429
1430 $upper_filename =~ s/jpg$/JPG/;
1431 $upgraded_upper_filename =~ s/jpg$/JPG/;
1432
1433 if (&FileUtils::fileExists($upgraded_upper_filename)) {
1434 $doc_obj->associate_source_file($upper_filename);
1435 $doc_obj->associate_file($upper_filename, "cover.jpg",
1436 "image/jpeg");
1437 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1438 } else {
1439 # file doesn't exist, so record the fact that it's missing so
1440 # we don't stat() again (stat is slow)
1441 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1442 }
1443 }
1444
1445}
1446
1447
1448# Overridden by exploding plugins (eg. ISISPlug)
1449sub clean_up_after_exploding
1450{
1451 my $self = shift(@_);
1452}
1453
1454
1455
14561;
Note: See TracBrowser for help on using the repository browser.