source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 26893

Last change on this file since 26893 was 26893, checked in by kjdon, 11 years ago

ConvertBinaryFile needs to reset the doc OID after all the processing has been done. This will mean it uses the top level plugin OIDtype settings, rather than the secondary plugin ones.

  • Property svn:keywords set to Author Date Id Revision
File size: 46.6 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34use Unicode::Normalize 'normalize';
35
36use encodings;
37use unicode;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41
42use PrintInfo;
43
44BEGIN {
45 @BasePlugin::ISA = ( 'PrintInfo' );
46}
47
48# the different methods that can be applied when renaming
49# imported documents and their associated files
50our $file_rename_method_list =
51 [ { 'name' => "url",
52 'desc' => "{BasePlugin.rename_method.url}" },
53 { 'name' => "base64",
54 'desc' => "{BasePlugin.rename_method.base64}" },
55 { 'name' => "none",
56 'desc' => "{BasePlugin.rename_method.none}",
57 'hiddengli' => "yes" } ];
58
59our $encoding_list =
60 [ { 'name' => "ascii",
61 'desc' => "{BasePlugin.encoding.ascii}" },
62 { 'name' => "utf8",
63 'desc' => "{BasePlugin.encoding.utf8}" },
64 { 'name' => "unicode",
65 'desc' => "{BasePlugin.encoding.unicode}" } ];
66
67
68my $e = $encodings::encodings;
69foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
70{
71 my $hashEncode =
72 {'name' => $enc,
73 'desc' => $e->{$enc}->{'name'}};
74
75 push(@{$encoding_list},$hashEncode);
76}
77
78our $encoding_plus_auto_list =
79 [ { 'name' => "auto",
80 'desc' => "{BasePlugin.filename_encoding.auto}" },
81 { 'name' => "auto-language-analysis",
82 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
83 { 'name' => "auto-filesystem-encoding",
84 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
85 { 'name' => "auto-fl",
86 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
87 { 'name' => "auto-lf",
88 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
89
90push(@{$encoding_plus_auto_list},@{$encoding_list});
91
92our $oidtype_list =
93 [ { 'name' => "auto",
94 'desc' => "{BasePlugin.OIDtype.auto}" },
95 { 'name' => "hash",
96 'desc' => "{import.OIDtype.hash}" },
97 { 'name' => "hash_on_ga_xml",
98 'desc' => "{import.OIDtype.hash_on_ga_xml}" },
99 { 'name' => "hash_on_full_filename",
100 'desc' => "{import.OIDtype.hash_on_full_filename}" },
101 { 'name' => "assigned",
102 'desc' => "{import.OIDtype.assigned}" },
103 { 'name' => "incremental",
104 'desc' => "{import.OIDtype.incremental}" },
105 { 'name' => "filename",
106 'desc' => "{import.OIDtype.filename}" },
107 { 'name' => "dirname",
108 'desc' => "{import.OIDtype.dirname}" },
109 { 'name' => "full_filename",
110 'desc' => "{import.OIDtype.full_filename}" } ];
111
112my $arguments =
113 [ { 'name' => "process_exp",
114 'desc' => "{BasePlugin.process_exp}",
115 'type' => "regexp",
116 'deft' => "",
117 'reqd' => "no" },
118 { 'name' => "no_blocking",
119 'desc' => "{BasePlugin.no_blocking}",
120 'type' => "flag",
121 'reqd' => "no"},
122 { 'name' => "block_exp",
123 'desc' => "{BasePlugin.block_exp}",
124 'type' => "regexp",
125 'deft' => "",
126 'reqd' => "no" },
127 { 'name' => "store_original_file",
128 'desc' => "{BasePlugin.store_original_file}",
129 'type' => "flag",
130 'reqd' => "no" },
131 { 'name' => "associate_ext",
132 'desc' => "{BasePlugin.associate_ext}",
133 'type' => "string",
134 'reqd' => "no" },
135 { 'name' => "associate_tail_re",
136 'desc' => "{BasePlugin.associate_tail_re}",
137 'type' => "string",
138 'reqd' => "no" },
139 { 'name' => "OIDtype",
140 'desc' => "{import.OIDtype}",
141 'type' => "enum",
142 'list' => $oidtype_list,
143 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
144 'deft' => "auto",
145 'reqd' => "no" },
146 { 'name' => "OIDmetadata",
147 'desc' => "{import.OIDmetadata}",
148 'type' => "metadata",
149 'deft' => "dc.Identifier",
150 'reqd' => "no" },
151# { 'name' => "use_as_doc_identifier",
152# 'desc' => "{BasePlugin.use_as_doc_identifier}",
153# 'type' => "string",
154# 'reqd' => "no" ,
155# 'deft' => "" } ,
156 { 'name' => "no_cover_image",
157 'desc' => "{BasePlugin.no_cover_image}",
158 'type' => "flag",
159 'reqd' => "no" },
160 { 'name' => "filename_encoding",
161 'desc' => "{BasePlugin.filename_encoding}",
162 'type' => "enum",
163 'deft' => "auto",
164 'list' => $encoding_plus_auto_list,
165 'reqd' => "no" },
166 { 'name' => "smart_block",
167 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
168 'type' => "flag",
169 'reqd' => "no",
170 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
171 { 'name' => "file_rename_method",
172 'desc' => "{BasePlugin.file_rename_method}",
173 'type' => "enum",
174 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
175 'list' => $file_rename_method_list,
176 'reqd' => "no"
177 }
178
179 ];
180
181
182my $options = { 'name' => "BasePlugin",
183 'desc' => "{BasePlugin.desc}",
184 'abstract' => "yes",
185 'inherits' => "no",
186 'args' => $arguments };
187
188sub new {
189
190 my ($class) = shift (@_);
191 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
192 push(@$pluginlist, $class);
193
194 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
195 push(@{$hashArgOptLists->{"OptList"}},$options);
196
197 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
198
199 if ($self->{'info_only'}) {
200 # don't worry about any options etc
201 return bless $self, $class;
202 }
203
204 if ($self->{'smart_block'}) {
205 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
206 }
207 $self->{'smart_block'} = undef;
208
209 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
210 $self->{'plugin_type'} = $plugin_name;
211
212 # remove ex. from OIDmetadata iff it's the only namespace prefix
213 $self->{'OIDmetadata'} =~ s/^ex\.([^.]+)$/$1/ if defined $self->{'OIDmetadata'};
214 $self->{'num_processed'} = 0;
215 $self->{'num_not_processed'} = 0;
216 $self->{'num_blocked'} = 0;
217 $self->{'num_archives'} = 0;
218 $self->{'cover_image'} = 1; # cover image is on by default
219 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
220 $self->{'can_process_directories'} = 0;
221 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
222
223 my $associate_ext = $self->{'associate_ext'};
224 if ((defined $associate_ext) && ($associate_ext ne "")) {
225
226 my $associate_tail_re = $self->{'associate_tail_re'};
227 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
228 my $outhandle = $self->{'outhandle'};
229 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
230 print $outhandle " defaulting to 'associate_tail_re'\n";
231 }
232 else {
233 my @exts = split(/,/,$associate_ext);
234
235 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
236 my $associate_tail_re = join("|",@exts_bracketed);
237 $self->{'associate_tail_re'} = $associate_tail_re;
238 }
239
240 delete $self->{'associate_ext'};
241 }
242
243 return bless $self, $class;
244
245}
246
247sub merge_inheritance
248{
249 my $self = {};
250 my @child_selfs = @_;
251
252 foreach my $child_self (@child_selfs) {
253 foreach my $key (keys %$child_self) {
254 if (defined $self->{$key}) {
255 if ($self->{$key} ne $child_self->{$key}) {
256# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
257# print STDERR "Existing stored value = $self->{$key}\n";
258# print STDERR "New (child) value = $child_self->{$key}\n";
259# print STDERR "Keeping existing value\n";
260 # Existing value seems to be option specified in collect.cfg
261
262 ### $self->{$key} = $child_self->{$key};
263
264 }
265 else {
266## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
267 }
268
269 }
270 else {
271 $self->{$key} = $child_self->{$key};
272 }
273 }
274 }
275
276 return $self;
277}
278
279# initialize BasePlugin options
280# if init() is overridden in a sub-class, remember to call BasePlugin::init()
281sub init {
282 my $self = shift (@_);
283 my ($verbosity, $outhandle, $failhandle) = @_;
284
285 # verbosity is passed through from the processor
286 $self->{'verbosity'} = $verbosity;
287
288 # as are the outhandle and failhandle
289 $self->{'outhandle'} = $outhandle if defined $outhandle;
290 $self->{'failhandle'} = $failhandle;
291# $self->SUPER::init(@_);
292
293 # set process_exp and block_exp to defaults unless they were
294 # explicitly set
295
296 if ((!$self->is_recursive()) and
297 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
298
299 $self->{'process_exp'} = $self->get_default_process_exp ();
300 if ($self->{'process_exp'} eq "") {
301 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
302 }
303 }
304
305 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
306 $self->{'block_exp'} = $self->get_default_block_exp ();
307 }
308
309}
310
311sub begin {
312 my $self = shift (@_);
313 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
314
315 if ($self->{'OIDtype'} eq "auto") {
316 # hasn't been set in the plugin, use the processor values
317 $self->{'OIDtype'} = $processor->{'OIDtype'};
318 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
319 }
320 if ($self->{'OIDtype'} eq "hash") {
321 # should we hash on the file or on the doc xml??
322 $self->{'OIDtype'} = $self->get_oid_hash_type();
323 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
324 $self->{'OIDtype'} = "hash_on_file";
325 }
326 }
327}
328
329# This is called once if removeold is set with import.pl. Most plugins will do
330# nothing but if a plugin does any stuff outside of creating doc obj, then
331# it may need to clear something.
332sub remove_all {
333 my $self = shift (@_);
334 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
335}
336
337# This is called per document for docs that have been deleted from the
338# collection. Most plugins will do nothing
339# but if a plugin does any stuff outside of creating doc obj, then it may need
340# to clear something.
341sub remove_one {
342 my $self = shift (@_);
343
344 my ($file, $oids, $archivedir) = @_;
345 return 0 if $self->can_process_this_file($file);
346 return undef;
347}
348
349sub end {
350 # potentially called at the end of each plugin pass
351 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
352
353 my ($self) = shift (@_);
354}
355
356sub deinit {
357 # called only once, after all plugin passes have been done
358
359 my ($self) = @_;
360}
361
362# default hashing type is to hash on the original file (or converted file)
363# override this to return hash_on_ga_xml for filetypes where hashing on the
364# file is no good eg video
365sub get_oid_hash_type {
366
367 my $self = shift (@_);
368
369 return "hash_on_file";
370}
371
372
373# this function should be overridden to return 1
374# in recursive plugins
375sub is_recursive {
376 my $self = shift (@_);
377
378 return 0;
379}
380
381sub get_default_block_exp {
382 my $self = shift (@_);
383
384 return "";
385}
386
387sub get_default_process_exp {
388 my $self = shift (@_);
389
390 return "";
391}
392
393
394# rename imported files and assoc files using URL encoding by default
395# as this will work for most plugins and give more legible filenames
396sub get_default_file_rename_method() {
397 my $self = shift (@_);
398 return "url";
399}
400
401# returns this plugin's active (possibly user-selected) file_rename_method
402sub get_file_rename_method() {
403 my $self = shift (@_);
404 my $rename_method = $self->{'file_rename_method'};
405 if($rename_method) {
406 return $rename_method;
407 } else {
408 return $self->get_default_file_rename_method();
409 }
410}
411
412# default implementation is to do nothing
413sub store_block_files {
414
415 my $self =shift (@_);
416 my ($filename_full_path, $block_hash) = @_;
417
418}
419
420# put files to block into hash
421sub use_block_expressions {
422
423 my $self =shift (@_);
424 my ($filename_full_path, $block_hash) = @_;
425
426 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
427
428 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
429 &util::block_filename($block_hash,$filename_full_path);
430 }
431
432}
433
434#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
435sub block_cover_image
436{
437 my $self =shift;
438 my ($filename, $block_hash) = @_;
439
440 $filename = &util::upgrade_if_dos_filename($filename);
441
442 if ($self->{'cover_image'}) {
443 my $coverfile = $filename;
444 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
445
446 #if there is no file extension, coverfile will be the same as filename
447 return if $coverfile eq $filename;
448
449 if (!&util::fd_exists($coverfile)) {
450 $coverfile =~ s/jpg$/JPG/;
451 }
452 if (&util::fd_exists($coverfile)) {
453 &util::block_filename($block_hash,$coverfile);
454 }
455 }
456
457 return;
458}
459
460
461# discover all the files that should be blocked by this plugin
462# check the args ...
463sub file_block_read {
464
465 my $self = shift (@_);
466 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
467 # Keep track of filenames with same root but different extensions
468 # Used to support -associate_ext and the more generalised
469 # -associate_tail_re
470 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
471
472 if (!-d $filename_full_path) {
473 $block_hash->{'all_files'}->{$file} = 1;
474 }
475
476 my $associate_tail_re = $self->{'associate_tail_re'};
477 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
478 my ($file_prefix,$file_ext)
479 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
480 if ((defined $file_prefix) && (defined $file_ext)) {
481 my $shared_fileroot = $block_hash->{'shared_fileroot'};
482 if (!defined $shared_fileroot->{$file_prefix}) {
483 my $file_prefix_rec = { 'tie_to' => undef,
484 'exts' => {} };
485 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
486 }
487
488 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
489
490 if ($self->can_process_this_file($filename_full_path)) {
491 # This is the document the others should be tied to
492 $file_prefix_rec->{'tie_to'} = $file_ext;
493 }
494 else {
495 if ($file_ext =~ m/$associate_tail_re$/) {
496 # this file should be associated to the main one
497 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
498 }
499 }
500
501 }
502 }
503
504 # check block expressions
505 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
506
507 # now check whether we are actually processing this
508 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
509 return undef; # can't recognise
510 }
511
512 # if we have a block_exp, then this overrides the normal 'smart' blocking
513 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
514
515 # block the cover image if there is one
516 if ($self->{'cover_image'}) {
517 $self->block_cover_image($filename_full_path, $block_hash);
518 }
519
520 return 1;
521}
522
523# plugins that rely on more than process_exp (eg XML plugins) can override this method
524sub can_process_this_file {
525 my $self = shift(@_);
526 my ($filename) = @_;
527
528 if (-d $filename && !$self->{'can_process_directories'}) {
529 return 0;
530 }
531
532 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
533 return 1;
534 }
535 return 0;
536
537}
538
539# Even if a plugin can extract metadata in its metadata_read pass,
540# make the default return 'undef' so processing of the file continues
541# down the pipeline, so other plugins can also have the opportunity to
542# locate metadata and set it up in the extrametakeys variables that
543# are passed around.
544
545sub can_process_this_file_for_metadata {
546 my $self = shift(@_);
547
548 return undef;
549}
550
551
552# just converts path as is to utf8.
553sub filepath_to_utf8 {
554 my $self = shift (@_);
555 my ($file, $file_encoding) = @_;
556 my $filemeta = $file;
557
558 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
559
560 # Whenever filename-encoding is set to any of the auto settings, we
561 # check if the filename is already in UTF8. If it is, then we're done.
562 if($filename_encoding =~ m/auto/) {
563 if(&unicode::check_is_utf8($filemeta))
564 {
565 $filename_encoding = "utf8";
566 return $filemeta;
567 }
568 }
569
570 # Auto setting, but filename is not utf8
571 if ($filename_encoding eq "auto")
572 {
573 # try textcat
574 $filename_encoding = $self->textcat_encoding($filemeta);
575
576 # check the locale next
577 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
578
579
580 # now try the encoding of the document, if available
581 if ($filename_encoding eq "undefined" && defined $file_encoding) {
582 $filename_encoding = $file_encoding;
583 }
584
585 }
586
587 elsif ($filename_encoding eq "auto-language-analysis")
588 {
589 $filename_encoding = $self->textcat_encoding($filemeta);
590
591 # now try the encoding of the document, if available
592 if ($filename_encoding eq "undefined" && defined $file_encoding) {
593 $filename_encoding = $file_encoding;
594 }
595 }
596
597 elsif ($filename_encoding eq "auto-filesystem-encoding")
598 {
599 # try locale
600 $filename_encoding = $self->locale_encoding();
601 }
602
603 elsif ($filename_encoding eq "auto-fl")
604 {
605 # filesystem-encoding (locale) then language-analysis (textcat)
606 $filename_encoding = $self->locale_encoding();
607
608 # try textcat
609 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
610
611 # else assume filename encoding is encoding of file content, if that's available
612 if ($filename_encoding eq "undefined" && defined $file_encoding) {
613 $filename_encoding = $file_encoding;
614 }
615 }
616
617 elsif ($filename_encoding eq "auto-lf")
618 {
619 # language-analysis (textcat) then filesystem-encoding (locale)
620 $filename_encoding = $self->textcat_encoding($filemeta);
621
622 # guess filename encoding from encoding of file content, if available
623 if ($filename_encoding eq "undefined" && defined $file_encoding) {
624 $filename_encoding = $file_encoding;
625 }
626
627 # try locale
628 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
629 }
630
631 # if still undefined, use utf8 as fallback
632 if ($filename_encoding eq "undefined") {
633 $filename_encoding = "utf8";
634 }
635
636 #print STDERR "**** UTF8 encoding the filename $filemeta ";
637
638 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
639 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
640 # be always utf8 (in which case the filename's encoding is also set as utf8 even
641 # though the filename need not be if it originates from another system)--in such
642 # cases attempt to make the filename utf8 to match.
643 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
644 &unicode::ensure_utf8(\$filemeta);
645 }
646
647 # convert non-unicode encodings to utf8
648 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
649 $filemeta = &unicode::unicode2utf8(
650 &unicode::convert2unicode($filename_encoding, \$filemeta)
651 );
652 }
653
654 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
655 return $filemeta;
656}
657
658# gets the filename with no path, converts to utf8, and then dm safes it.
659# filename_encoding set by user
660sub filename_to_utf8_metadata
661{
662 my $self = shift (@_);
663 my ($file, $file_encoding) = @_;
664
665 my $outhandle = $self->{'outhandle'};
666
667 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
668 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
669 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
670
671
672 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
673 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
674
675 return $filemeta;
676}
677
678sub locale_encoding {
679 my $self = shift(@_);
680
681 if (!defined $self->{'filesystem_encoding'}) {
682 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
683 }
684
685 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
686 return $self->{'filesystem_encoding'}; # can be the string "undefined"
687}
688
689sub textcat_encoding {
690 my $self = shift(@_);
691 my ($filemeta) = @_;
692
693 # analyse filenames without extensions and digits (and trimmed of
694 # surrounding whitespace), so that irrelevant chars don't confuse
695 # textcat
696 my $strictfilemeta = $filemeta;
697 $strictfilemeta =~ s/\.[^\.]+$//g;
698 $strictfilemeta =~ s/\d//g;
699 $strictfilemeta =~ s/^\s*//g;
700 $strictfilemeta =~ s/\s*$//g;
701
702 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
703 if(!defined $filename_encoding) {
704 $filename_encoding = "undefined";
705 }
706
707 return $filename_encoding; # can be the string "undefined"
708}
709
710# performs textcat
711sub encoding_from_language_analysis {
712 my $self = shift(@_);
713 my ($text) = @_;
714
715 my $outhandle = $self->{'outhandle'};
716 my $best_encoding = undef;
717
718 # get the language/encoding of the textstring using textcat
719 require textcat; # Only load the textcat module if it is required
720 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
721 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
722
723
724 if (scalar @$results < 0) {
725 return undef;
726 }
727
728 # We have some results, we choose the first
729 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
730
731 $best_encoding = $encoding;
732 if (!defined $best_encoding) {
733 return undef;
734 }
735
736 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
737 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
738 $best_encoding = 'utf8';
739 }
740
741
742 # check for equivalents where textcat doesn't have some encodings...
743 # eg MS versions of standard encodings
744 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
745 my $iso = $1; # which variant of the iso standard?
746 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
747 if ($text =~ /[\x80-\x9f]/) {
748 # Western Europe
749 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
750 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
751 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
752 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
753 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
754 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
755 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
756 }
757 }
758
759 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
760 !defined $encodings::encodings->{$best_encoding})
761 {
762 if ($self->{'verbosity'}) {
763 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
764 }
765 $best_encoding = undef;
766 }
767
768 return $best_encoding;
769}
770
771# uses locale
772sub get_filesystem_encoding
773{
774
775 my $self = shift(@_);
776
777 my $outhandle = $self->{'outhandle'};
778 my $filesystem_encoding = undef;
779
780 eval {
781 # Works for Windows as well, returning the DOS code page in use
782 use POSIX qw(locale_h);
783
784 # With only one parameter, setlocale retrieves the
785 # current value
786 my $current_locale = setlocale(LC_CTYPE);
787
788 my $char_encoding = undef;
789 if ($current_locale =~ m/\./) {
790 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
791 $char_encoding = lc($char_encoding);
792 }
793 else {
794 if ($current_locale =~ m/^(posix|c)$/i) {
795 $char_encoding = "ascii";
796 }
797 }
798
799 if (defined $char_encoding) {
800 if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
801 $char_encoding = "$1\_$2\_$3";
802 }
803
804 $char_encoding =~ s/-/_/g;
805 $char_encoding =~ s/^utf_8$/utf8/;
806
807 if ($char_encoding =~ m/^\d+$/) {
808 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
809 $char_encoding = "windows_$char_encoding";
810 }
811 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
812 $char_encoding = "dos_$char_encoding";
813 }
814 }
815
816 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
817 || (defined $encodings::encodings->{$char_encoding})) {
818 $filesystem_encoding = $char_encoding;
819 }
820 else {
821 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
822 }
823 }
824
825
826 };
827 if ($@) {
828 print $outhandle "$@\n";
829 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
830
831 }
832
833 return $filesystem_encoding;
834}
835
836
837
838sub deduce_filename_encoding
839{
840 my $self = shift (@_);
841 my ($file,$metadata,$plugin_filename_encoding) = @_;
842
843 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
844 my $deduced_filename_encoding = undef;
845
846 # Start by looking for manually assigned metadata
847 if (defined $gs_filename_encoding) {
848 if (ref ($gs_filename_encoding) eq "ARRAY") {
849 my $outhandle = $self->{'outhandle'};
850
851 $deduced_filename_encoding = $gs_filename_encoding->[0];
852
853 my $num_vals = scalar(@$gs_filename_encoding);
854 if ($num_vals>1) {
855 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
856 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
857 }
858 }
859 else {
860 $deduced_filename_encoding = $gs_filename_encoding;
861 }
862 }
863
864 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
865 # Look to see if plugin specifies this value
866
867 if (defined $plugin_filename_encoding) {
868 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
869 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
870 my $outhandle = $self->{'outhandle'};
871 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
872 print $outhandle " default to 'auto'\n";
873 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
874 }
875
876 if ($plugin_filename_encoding ne "auto") {
877 # We've been given a specific filenamne encoding
878 # => so use it!
879 $deduced_filename_encoding = $plugin_filename_encoding;
880 }
881 }
882 }
883
884 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
885
886 # Look to file system to provide a character encoding
887
888 # If Windows NTFS, then -- assuming we work with long file names got through
889 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
890
891 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
892 # Can do better than working with the DOS character encoding returned by locale
893 $deduced_filename_encoding = "unicode";
894 }
895 else {
896 # Unix of some form or other
897
898 # See if we can determine the file system encoding through locale
899 $deduced_filename_encoding = $self->locale_encoding();
900
901 # if locale shows us filesystem is utf8, check to see filename is consistent
902 # => if not, then we have an "alien" filename on our hands
903
904 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
905 if (!&unicode::check_is_utf8($file)) {
906 # "alien" filename, so revert
907 $deduced_filename_encoding = undef;
908 }
909 }
910 }
911 }
912
913# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
914# # Last chance, apply textcat to deduce filename encoding
915# $deduced_filename_encoding = $self->textcat_encoding($file);
916# }
917
918 if ($self->{'verbosity'}>3) {
919 my $outhandle = $self->{'outhandle'};
920
921 if (defined $deduced_filename_encoding) {
922 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
923 }
924 else {
925 print $outhandle " No filename encoding deduced\n";
926 }
927 }
928
929 return $deduced_filename_encoding;
930}
931
932
933
934
935# Notionally written to be called once for each document, it is however safe to
936# call multiple times (as in the case of ImagePlugin) which calls this later on
937# after the original image has potentially been converted to a *new* source image
938# format (e.g. TIFF to PNG)
939
940sub set_Source_metadata {
941 my $self = shift (@_);
942 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
943
944 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
945 # and (as a fallback) using %xx if not for non-ascii chars
946 # 2. Sets the url ref (SourceFile) to the URL encoded version
947 # of filename for generated files
948
949 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
950
951 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
952
953 my $octet_file = $raw_file;
954
955 # UTF-8 version of filename
956# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
957# print STDERR "**** Setting Source Metadata given: $octet_file\n";
958# }
959
960 # Deal with (on Windows) raw filenames that are in their
961 # abbreviated DOS form
962
963 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
964 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
965 if (-e $raw_filename) {
966 my $unicode_filename = Win32::GetLongPathName($raw_filename);
967
968 my $unused_full_uf;
969 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
970 }
971 }
972 }
973
974 my $url_encoded_filename;
975 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
976 # => Generate a pretty print version of filename that is mapped to Unicode
977
978 # Use filename_encoding to map raw filename to a Perl unicode-aware string
979 $url_encoded_filename = decode($filename_encoding,$octet_file);
980 }
981 else {
982 # otherwise generate %xx encoded version of filename for char > 127
983 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
984 }
985
986# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
987# print STDERR "****** saving Source as: $url_encoded_filename\n";
988# }
989
990 # In the case of converted files and (generalized) exploded documents, there
991 # will already be a source filename => store as OrigSource before overriding
992 my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
993 if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
994 $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
995 }
996
997 # Source is the UTF8 display name - not necessarily the name of the file on the system
998 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
999 # on Darwin want all display strings to be in composed form, then can search on that
1000 $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
1001 }
1002 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
1003
1004
1005 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
1006 # If using URL encoding, then SourceFile is the url-reference to url-encoded
1007 # renamed_raw_url: it's a url that refers to the actual file on the system
1008 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
1009
1010 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
1011 $renamed_raw_url);
1012
1013# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1014# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
1015# }
1016}
1017
1018# this should be called by all plugins to set the oid of the doc obj, rather
1019# than calling doc_obj->set_OID directly
1020sub add_OID {
1021 my $self = shift (@_);
1022 my ($doc_obj, $force) = @_;
1023
1024 # don't add one if there is one already set, unless we are forced to do so
1025 return unless ($doc_obj->get_OID() =~ /^NULL$/ || $force);
1026 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
1027
1028 # see if there is a plugin specific set_OID function
1029 if (defined ($self->can('set_OID'))) {
1030 $self->set_OID(@_); # pass through doc_obj and any extra arguments
1031 }
1032 else {
1033 # use the default set_OID() in doc.pm
1034 $doc_obj->set_OID();
1035 }
1036
1037}
1038
1039# The BasePlugin read_into_doc_obj() function. This function does all the
1040# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
1041# a file and sets up a slew of metadata all saved in doc_obj, which
1042# it then returns as part of a tuple (process_status,doc_obj)
1043#
1044# Much of this functionality used to reside in read, but it was broken
1045# down into a supporting routine to make the code more flexible.
1046#
1047# recursive plugins (e.g. RecPlug) and specialized plugins like those
1048# capable of processing many documents within a single file (e.g.
1049# GMLPlug) will normally want to implement their own version of
1050# read_into_doc_obj()
1051#
1052# Note that $base_dir might be "" and that $file might
1053# include directories
1054
1055# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1056sub read_into_doc_obj {
1057 my $self = shift (@_);
1058 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1059
1060 my $outhandle = $self->{'outhandle'};
1061
1062 # should we move this to read? What about secondary plugins?
1063 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
1064 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1065 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1066 if $self->{'verbosity'} > 1;
1067
1068 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1069
1070 # create a new document
1071 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1072 my $top_section = $doc_obj->get_top_section();
1073
1074 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1075 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1076
1077
1078 my $plugin_filename_encoding = $self->{'filename_encoding'};
1079 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1080 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
1081
1082 # plugin specific stuff - what args do we need here??
1083 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1084 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1085 return -1;
1086 }
1087
1088 # include any metadata passed in from previous plugins
1089 # note that this metadata is associated with the top level section
1090 my $section = $doc_obj->get_top_section();
1091 # can we merge these two methods??
1092 $self->add_associated_files($doc_obj, $filename_full_path);
1093 $self->extra_metadata ($doc_obj, $section, $metadata);
1094 $self->auto_extract_metadata($doc_obj);
1095
1096 # if we haven't found any Title so far, assign one
1097 # this was shifted to here from inside read()
1098 $self->title_fallback($doc_obj,$section,$filename_no_path);
1099
1100 $self->add_OID($doc_obj);
1101
1102 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1103 return (1,$doc_obj);
1104}
1105
1106sub post_process_doc_obj {
1107 my $self = shift (@_);
1108 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1109
1110 return 1;
1111}
1112
1113sub add_dummy_text {
1114 my $self = shift(@_);
1115 my ($doc_obj, $section) = @_;
1116
1117 # add NoText metadata so we can hide this dummy text in format statements
1118 $doc_obj->add_metadata($section, "NoText", "1");
1119 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1120
1121}
1122
1123# does nothing. Can be overridden by subclass
1124sub auto_extract_metadata {
1125 my $self = shift(@_);
1126 my ($doc_obj) = @_;
1127}
1128
1129# adds cover image, associate_file options stuff. Should be called by sub class
1130# read_into_doc_obj
1131sub add_associated_files {
1132 my $self = shift(@_);
1133 # whatis filename??
1134 my ($doc_obj, $filename) = @_;
1135
1136 # add in the cover image
1137 if ($self->{'cover_image'}) {
1138 $self->associate_cover_image($doc_obj, $filename);
1139 }
1140 # store the original (used for eg TextPlugin to store the original for OAI)
1141 if ($self->{'store_original_file'}) {
1142 $self->associate_source_file($doc_obj, $filename);
1143 }
1144
1145
1146}
1147
1148# implement this if you are extracting metadata for other documents
1149sub metadata_read {
1150 my $self = shift (@_);
1151 my ($pluginfo, $base_dir, $file, $block_hash,
1152 $extrametakeys, $extrametadata, $extrametafile,
1153 $processor, $gli, $aux) = @_;
1154
1155 # can we process this file??
1156 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1157 return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
1158
1159 return 1; # we recognise the file, but don't actually do anything with it
1160}
1161
1162
1163# The BasePlugin read() function. This function calls read_into_doc_obj()
1164# to ensure all the right things to make general options work for a
1165# given plugin are done. It then calls the process() function which
1166# does all the work specific to a plugin (like the old read functions
1167# used to do). Most plugins should define their own process() function
1168# and let this read() function keep control.
1169#
1170# recursive plugins (e.g. RecPlug) and specialized plugins like those
1171# capable of processing many documents within a single file (e.g.
1172# GMLPlug) might want to implement their own version of read(), but
1173# more likely need to implement their own version of read_into_doc_obj()
1174#
1175# Return number of files processed, undef if can't recognise, -1 if can't
1176# process
1177
1178sub read {
1179 my $self = shift (@_);
1180 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1181
1182 # can we process this file??
1183 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1184
1185 return undef unless $self->can_process_this_file($filename_full_path);
1186
1187 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1188
1189 if ((defined $process_status) && ($process_status == 1)) {
1190
1191 # process the document
1192 $processor->process($doc_obj);
1193
1194 $self->{'num_processed'} ++;
1195 undef $doc_obj;
1196 }
1197 # delete any temp files that we may have created
1198 $self->clean_up_after_doc_obj_processing();
1199
1200
1201 # if process_status == 1, then the file has been processed.
1202 return $process_status;
1203
1204}
1205
1206# returns undef if file is rejected by the plugin
1207sub process {
1208 my $self = shift (@_);
1209 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1210
1211 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
1212
1213 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1214 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1215
1216 die "\n";
1217
1218 return undef; # never gets here
1219}
1220
1221# overwrite this method to delete any temp files that we have created
1222sub clean_up_after_doc_obj_processing {
1223 my $self = shift(@_);
1224
1225}
1226
1227# write_file -- used by ConvertToPlug, for example in post processing
1228#
1229# where should this go, is here the best place??
1230sub utf8_write_file {
1231 my $self = shift (@_);
1232 my ($textref, $filename) = @_;
1233
1234 if (!open (FILE, ">:utf8", $filename)) {
1235 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1236 die "\n";
1237 }
1238 print FILE $$textref;
1239
1240 close FILE;
1241}
1242
1243
1244sub filename_based_title
1245{
1246 my $self = shift (@_);
1247 my ($file) = @_;
1248
1249 my $file_derived_title = $file;
1250 $file_derived_title =~ s/_/ /g;
1251 $file_derived_title =~ s/\.[^.]+$//;
1252
1253 return $file_derived_title;
1254}
1255
1256
1257sub title_fallback
1258{
1259 my $self = shift (@_);
1260 my ($doc_obj,$section,$file) = @_;
1261
1262 if (!defined $doc_obj->get_metadata_element ($section, "Title")
1263 || $doc_obj->get_metadata_element($section, "Title") eq "") {
1264
1265 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1266 my $file_derived_title;
1267 if (defined $source_file) {
1268 $file_derived_title = $self->filename_based_title($source_file);
1269 }
1270 else {
1271 # pp = pretty print
1272 my $pp_file = (defined $source_file) ? $source_file : $file;
1273
1274 my $raw_title = $self->filename_based_title($file);
1275 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1276 }
1277
1278
1279 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1280 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1281 }
1282 else {
1283 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1284 }
1285 }
1286
1287}
1288
1289# add any extra metadata that's been passed around from one
1290# plugin to another.
1291# extra_metadata uses add_utf8_metadata so it expects metadata values
1292# to already be in utf8
1293sub extra_metadata {
1294 my $self = shift (@_);
1295 my ($doc_obj, $cursection, $metadata) = @_;
1296
1297 my $associate_tail_re = $self->{'associate_tail_re'};
1298
1299 foreach my $field (keys(%$metadata)) {
1300 # $metadata->{$field} may be an array reference
1301 if ($field eq "gsdlassocfile_tobe") {
1302 # 'gsdlassocfile_tobe' is artificially introduced metadata
1303 # that is used to signal that certain additional files should
1304 # be tied to this document. Useful in situations where a
1305 # metadata pass in the plugin pipeline works out some files
1306 # need to be associated with a document, but the document hasn't
1307 # been formed yet.
1308 my $equiv_form = "";
1309 foreach my $gaf (@{$metadata->{$field}}) {
1310 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1311 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1312
1313 # we need to make sure the filename is valid utf-8 - we do
1314 # this by url or base64 encoding it
1315 # $tail_filename is the name that we store the file as
1316 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1317 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1318 $doc_obj->associate_source_file($full_filename);
1319 # If the filename is url_encoded, we need to encode the % signs
1320 # in the filename, so that it works in a url
1321 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1322 # work out extended tail extension (i.e. matching tail re)
1323
1324 my ($file_prefix,$file_extended_ext)
1325 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1326 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1327 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1328
1329 # the greenstone 2 stuff
1330 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1331 #my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1332 my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1333
1334 my $srcicon = "_icon".$doc_ext."_";
1335 my $end_doclink = "</a>";
1336
1337 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1338
1339
1340 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1341 # for metadata such as [mp3._edited] [mp3._full] ...
1342 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1343 }
1344
1345 # for multiple metadata such as [mp3.assoclink]
1346 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1347
1348 $equiv_form .= " $assoc_form";
1349
1350 # following are used for greenstone 3,
1351 $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink_gs3);
1352 $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
1353 $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
1354
1355 }
1356 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1357 }
1358 elsif ($field eq "gsdlzipfilename") {
1359 # special case for when files have come out of a zip. source_path
1360 # (used for archives dbs and keeping track for incremental import)
1361 # must be set to the zip file name
1362 my $zip_filename = $metadata->{$field};
1363 # overwrite the source_path
1364 $doc_obj->set_source_path($zip_filename);
1365 # and set the metadata
1366 $zip_filename = &util::filename_within_collection($zip_filename);
1367 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1368 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1369 }
1370 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1371 map {
1372 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1373 } @{$metadata->{$field}};
1374 } else {
1375 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1376 }
1377 }
1378}
1379
1380
1381sub compile_stats {
1382 my $self = shift(@_);
1383 my ($stats) = @_;
1384
1385 $stats->{'num_processed'} += $self->{'num_processed'};
1386 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1387 $stats->{'num_archives'} += $self->{'num_archives'};
1388
1389}
1390sub associate_source_file {
1391 my $self = shift(@_);
1392
1393 my ($doc_obj, $filename) = @_;
1394 my $cursection = $doc_obj->get_top_section();
1395 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1396
1397 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1398 # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
1399 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1400 $doc_obj->add_utf8_metadata ($cursection, "srclinkFile", $doc_obj->get_sourcefile());
1401}
1402
1403sub associate_cover_image {
1404 my $self = shift(@_);
1405 my ($doc_obj, $filename) = @_;
1406
1407 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1408
1409 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1410 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1411
1412 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1413 # don't stat() for existence e.g. for multiple document input files
1414 # (eg SplitPlug)
1415 return;
1416 }
1417
1418 my $top_section=$doc_obj->get_top_section();
1419
1420 if (&util::fd_exists($upgraded_filename)) {
1421 $doc_obj->associate_source_file($filename);
1422 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1423 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1424 } else {
1425 my $upper_filename = $filename;
1426 my $upgraded_upper_filename = $upgraded_filename;
1427
1428 $upper_filename =~ s/jpg$/JPG/;
1429 $upgraded_upper_filename =~ s/jpg$/JPG/;
1430
1431 if (&util::fd_exists($upgraded_upper_filename)) {
1432 $doc_obj->associate_source_file($upper_filename);
1433 $doc_obj->associate_file($upper_filename, "cover.jpg",
1434 "image/jpeg");
1435 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1436 } else {
1437 # file doesn't exist, so record the fact that it's missing so
1438 # we don't stat() again (stat is slow)
1439 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1440 }
1441 }
1442
1443}
1444
1445
1446# Overridden by exploding plugins (eg. ISISPlug)
1447sub clean_up_after_exploding
1448{
1449 my $self = shift(@_);
1450}
1451
1452
1453
14541;
Note: See TracBrowser for help on using the repository browser.