source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 25345

Last change on this file since 25345 was 25345, checked in by kjdon, 12 years ago

working on equiv link stuff. Have changed xxx.assoclink and equivlink back to having parent(Top):assocfilepath in it as this is needed for gs2. Have left equivDocLink as it was as I think that is used for gs3. I'll change this later.

  • Property svn:keywords set to Author Date Id Revision
File size: 46.1 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34use Unicode::Normalize 'normalize';
35
36use encodings;
37use unicode;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41
42use PrintInfo;
43
44BEGIN {
45 @BasePlugin::ISA = ( 'PrintInfo' );
46}
47
48# the different methods that can be applied when renaming
49# imported documents and their associated files
50our $file_rename_method_list =
51 [ { 'name' => "url",
52 'desc' => "{BasePlugin.rename_method.url}" },
53 { 'name' => "base64",
54 'desc' => "{BasePlugin.rename_method.base64}" },
55 { 'name' => "none",
56 'desc' => "{BasePlugin.rename_method.none}",
57 'hiddengli' => "yes" } ];
58
59our $encoding_list =
60 [ { 'name' => "ascii",
61 'desc' => "{BasePlugin.encoding.ascii}" },
62 { 'name' => "utf8",
63 'desc' => "{BasePlugin.encoding.utf8}" },
64 { 'name' => "unicode",
65 'desc' => "{BasePlugin.encoding.unicode}" } ];
66
67
68my $e = $encodings::encodings;
69foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
70{
71 my $hashEncode =
72 {'name' => $enc,
73 'desc' => $e->{$enc}->{'name'}};
74
75 push(@{$encoding_list},$hashEncode);
76}
77
78our $encoding_plus_auto_list =
79 [ { 'name' => "auto",
80 'desc' => "{BasePlugin.filename_encoding.auto}" },
81 { 'name' => "auto-language-analysis",
82 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
83 { 'name' => "auto-filesystem-encoding",
84 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
85 { 'name' => "auto-fl",
86 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
87 { 'name' => "auto-lf",
88 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
89
90push(@{$encoding_plus_auto_list},@{$encoding_list});
91
92our $oidtype_list =
93 [ { 'name' => "auto",
94 'desc' => "{BasePlugin.OIDtype.auto}" },
95 { 'name' => "hash",
96 'desc' => "{import.OIDtype.hash}" },
97 { 'name' => "hash_on_ga_xml",
98 'desc' => "{import.OIDtype.hash_on_ga_xml}" },
99 { 'name' => "assigned",
100 'desc' => "{import.OIDtype.assigned}" },
101 { 'name' => "incremental",
102 'desc' => "{import.OIDtype.incremental}" },
103 { 'name' => "dirname",
104 'desc' => "{import.OIDtype.dirname}" } ];
105
106my $arguments =
107 [ { 'name' => "process_exp",
108 'desc' => "{BasePlugin.process_exp}",
109 'type' => "regexp",
110 'deft' => "",
111 'reqd' => "no" },
112 { 'name' => "no_blocking",
113 'desc' => "{BasePlugin.no_blocking}",
114 'type' => "flag",
115 'reqd' => "no"},
116 { 'name' => "block_exp",
117 'desc' => "{BasePlugin.block_exp}",
118 'type' => "regexp",
119 'deft' => "",
120 'reqd' => "no" },
121 { 'name' => "store_original_file",
122 'desc' => "{BasePlugin.store_original_file}",
123 'type' => "flag",
124 'reqd' => "no" },
125 { 'name' => "associate_ext",
126 'desc' => "{BasePlugin.associate_ext}",
127 'type' => "string",
128 'reqd' => "no" },
129 { 'name' => "associate_tail_re",
130 'desc' => "{BasePlugin.associate_tail_re}",
131 'type' => "string",
132 'reqd' => "no" },
133 { 'name' => "OIDtype",
134 'desc' => "{import.OIDtype}",
135 'type' => "enum",
136 'list' => $oidtype_list,
137 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
138 'deft' => "auto",
139 'reqd' => "no" },
140 { 'name' => "OIDmetadata",
141 'desc' => "{import.OIDmetadata}",
142 'type' => "metadata",
143 'deft' => "dc.Identifier",
144 'reqd' => "no" },
145# { 'name' => "use_as_doc_identifier",
146# 'desc' => "{BasePlugin.use_as_doc_identifier}",
147# 'type' => "string",
148# 'reqd' => "no" ,
149# 'deft' => "" } ,
150 { 'name' => "no_cover_image",
151 'desc' => "{BasePlugin.no_cover_image}",
152 'type' => "flag",
153 'reqd' => "no" },
154 { 'name' => "filename_encoding",
155 'desc' => "{BasePlugin.filename_encoding}",
156 'type' => "enum",
157 'deft' => "auto",
158 'list' => $encoding_plus_auto_list,
159 'reqd' => "no" },
160 { 'name' => "smart_block",
161 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
162 'type' => "flag",
163 'reqd' => "no",
164 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
165 { 'name' => "file_rename_method",
166 'desc' => "{BasePlugin.file_rename_method}",
167 'type' => "enum",
168 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
169 'list' => $file_rename_method_list,
170 'reqd' => "no"
171 }
172
173 ];
174
175
176my $options = { 'name' => "BasePlugin",
177 'desc' => "{BasePlugin.desc}",
178 'abstract' => "yes",
179 'inherits' => "no",
180 'args' => $arguments };
181
182sub new {
183
184 my ($class) = shift (@_);
185 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
186 push(@$pluginlist, $class);
187
188 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
189 push(@{$hashArgOptLists->{"OptList"}},$options);
190
191 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
192
193 if ($self->{'info_only'}) {
194 # don't worry about any options etc
195 return bless $self, $class;
196 }
197
198 if ($self->{'smart_block'}) {
199 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
200 }
201 $self->{'smart_block'} = undef;
202
203 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
204 $self->{'plugin_type'} = $plugin_name;
205
206 # remove ex. from OIDmetadata iff it's the only namespace prefix
207 $self->{'OIDmetadata'} =~ s/^ex\.([^.]+)$/$1/ if defined $self->{'OIDmetadata'};
208 $self->{'num_processed'} = 0;
209 $self->{'num_not_processed'} = 0;
210 $self->{'num_blocked'} = 0;
211 $self->{'num_archives'} = 0;
212 $self->{'cover_image'} = 1; # cover image is on by default
213 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
214 $self->{'can_process_directories'} = 0;
215 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
216
217 my $associate_ext = $self->{'associate_ext'};
218 if ((defined $associate_ext) && ($associate_ext ne "")) {
219
220 my $associate_tail_re = $self->{'associate_tail_re'};
221 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
222 my $outhandle = $self->{'outhandle'};
223 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
224 print $outhandle " defaulting to 'associate_tail_re'\n";
225 }
226 else {
227 my @exts = split(/,/,$associate_ext);
228
229 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
230 my $associate_tail_re = join("|",@exts_bracketed);
231 $self->{'associate_tail_re'} = $associate_tail_re;
232 }
233
234 delete $self->{'associate_ext'};
235 }
236
237 return bless $self, $class;
238
239}
240
241sub merge_inheritance
242{
243 my $self = {};
244 my @child_selfs = @_;
245
246 foreach my $child_self (@child_selfs) {
247 foreach my $key (keys %$child_self) {
248 if (defined $self->{$key}) {
249 if ($self->{$key} ne $child_self->{$key}) {
250# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
251# print STDERR "Existing stored value = $self->{$key}\n";
252# print STDERR "New (child) value = $child_self->{$key}\n";
253# print STDERR "Keeping existing value\n";
254 # Existing value seems to be option specified in collect.cfg
255
256 ### $self->{$key} = $child_self->{$key};
257
258 }
259 else {
260## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
261 }
262
263 }
264 else {
265 $self->{$key} = $child_self->{$key};
266 }
267 }
268 }
269
270 return $self;
271}
272
273# initialize BasePlugin options
274# if init() is overridden in a sub-class, remember to call BasePlugin::init()
275sub init {
276 my $self = shift (@_);
277 my ($verbosity, $outhandle, $failhandle) = @_;
278
279 # verbosity is passed through from the processor
280 $self->{'verbosity'} = $verbosity;
281
282 # as are the outhandle and failhandle
283 $self->{'outhandle'} = $outhandle if defined $outhandle;
284 $self->{'failhandle'} = $failhandle;
285# $self->SUPER::init(@_);
286
287 # set process_exp and block_exp to defaults unless they were
288 # explicitly set
289
290 if ((!$self->is_recursive()) and
291 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
292
293 $self->{'process_exp'} = $self->get_default_process_exp ();
294 if ($self->{'process_exp'} eq "") {
295 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
296 }
297 }
298
299 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
300 $self->{'block_exp'} = $self->get_default_block_exp ();
301 }
302
303}
304
305sub begin {
306 my $self = shift (@_);
307 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
308
309 if ($self->{'OIDtype'} eq "auto") {
310 # hasn't been set in the plugin, use the processor values
311 $self->{'OIDtype'} = $processor->{'OIDtype'};
312 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
313 }
314 if ($self->{'OIDtype'} eq "hash") {
315 # should we hash on the file or on the doc xml??
316 $self->{'OIDtype'} = $self->get_oid_hash_type();
317 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
318 $self->{'OIDtype'} = "hash_on_file";
319 }
320 }
321}
322
323# This is called once if removeold is set with import.pl. Most plugins will do
324# nothing but if a plugin does any stuff outside of creating doc obj, then
325# it may need to clear something.
326sub remove_all {
327 my $self = shift (@_);
328 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
329}
330
331# This is called per document for docs that have been deleted from the
332# collection. Most plugins will do nothing
333# but if a plugin does any stuff outside of creating doc obj, then it may need
334# to clear something.
335sub remove_one {
336 my $self = shift (@_);
337
338 my ($file, $oids, $archivedir) = @_;
339 return 0 if $self->can_process_this_file($file);
340 return undef;
341}
342
343sub end {
344 # potentially called at the end of each plugin pass
345 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
346
347 my ($self) = shift (@_);
348}
349
350sub deinit {
351 # called only once, after all plugin passes have been done
352
353 my ($self) = @_;
354}
355
356# default hashing type is to hash on the original file (or converted file)
357# override this to return hash_on_ga_xml for filetypes where hashing on the
358# file is no good eg video
359sub get_oid_hash_type {
360
361 my $self = shift (@_);
362
363 return "hash_on_file";
364}
365
366
367# this function should be overridden to return 1
368# in recursive plugins
369sub is_recursive {
370 my $self = shift (@_);
371
372 return 0;
373}
374
375sub get_default_block_exp {
376 my $self = shift (@_);
377
378 return "";
379}
380
381sub get_default_process_exp {
382 my $self = shift (@_);
383
384 return "";
385}
386
387
388# rename imported files and assoc files using URL encoding by default
389# as this will work for most plugins and give more legible filenames
390sub get_default_file_rename_method() {
391 my $self = shift (@_);
392 return "url";
393}
394
395# returns this plugin's active (possibly user-selected) file_rename_method
396sub get_file_rename_method() {
397 my $self = shift (@_);
398 my $rename_method = $self->{'file_rename_method'};
399 if($rename_method) {
400 return $rename_method;
401 } else {
402 return $self->get_default_file_rename_method();
403 }
404}
405
406# default implementation is to do nothing
407sub store_block_files {
408
409 my $self =shift (@_);
410 my ($filename_full_path, $block_hash) = @_;
411
412}
413
414# put files to block into hash
415sub use_block_expressions {
416
417 my $self =shift (@_);
418 my ($filename_full_path, $block_hash) = @_;
419
420 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
421
422 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
423 &util::block_filename($block_hash,$filename_full_path);
424 }
425
426}
427
428#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
429sub block_cover_image
430{
431 my $self =shift;
432 my ($filename, $block_hash) = @_;
433
434 $filename = &util::upgrade_if_dos_filename($filename);
435
436 if ($self->{'cover_image'}) {
437 my $coverfile = $filename;
438 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
439
440 if (!&util::fd_exists($coverfile)) {
441 $coverfile =~ s/jpg$/JPG/;
442 }
443 if (&util::fd_exists($coverfile)) {
444 &util::block_filename($block_hash,$coverfile);
445 }
446 }
447
448 return;
449}
450
451
452# discover all the files that should be blocked by this plugin
453# check the args ...
454sub file_block_read {
455
456 my $self = shift (@_);
457 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
458 # Keep track of filenames with same root but different extensions
459 # Used to support -associate_ext and the more generalised
460 # -associate_tail_re
461 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
462
463 if (!-d $filename_full_path) {
464 $block_hash->{'all_files'}->{$file} = 1;
465 }
466
467 my $associate_tail_re = $self->{'associate_tail_re'};
468 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
469 my ($file_prefix,$file_ext)
470 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
471 if ((defined $file_prefix) && (defined $file_ext)) {
472 my $shared_fileroot = $block_hash->{'shared_fileroot'};
473 if (!defined $shared_fileroot->{$file_prefix}) {
474 my $file_prefix_rec = { 'tie_to' => undef,
475 'exts' => {} };
476 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
477 }
478
479 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
480
481 if ($self->can_process_this_file($filename_full_path)) {
482 # This is the document the others should be tied to
483 $file_prefix_rec->{'tie_to'} = $file_ext;
484 }
485 else {
486 if ($file_ext =~ m/$associate_tail_re$/) {
487 # this file should be associated to the main one
488 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
489 }
490 }
491
492 }
493 }
494
495 # check block expressions
496 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
497
498 # now check whether we are actually processing this
499 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
500 return undef; # can't recognise
501 }
502
503 # if we have a block_exp, then this overrides the normal 'smart' blocking
504 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
505
506 # block the cover image if there is one
507 if ($self->{'cover_image'}) {
508 $self->block_cover_image($filename_full_path, $block_hash);
509 }
510
511 return 1;
512}
513
514# plugins that rely on more than process_exp (eg XML plugins) can override this method
515sub can_process_this_file {
516 my $self = shift(@_);
517 my ($filename) = @_;
518
519 if (-d $filename && !$self->{'can_process_directories'}) {
520 return 0;
521 }
522
523 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
524 return 1;
525 }
526 return 0;
527
528}
529
530# Even if a plugin can extract metadata in its metadata_read pass,
531# make the default return 'undef' so processing of the file continues
532# down the pipeline, so other plugins can also have the opportunity to
533# locate metadata and set it up in the extrametakeys variables that
534# are passed around.
535
536sub can_process_this_file_for_metadata {
537 my $self = shift(@_);
538
539 return undef;
540}
541
542
543# just converts path as is to utf8.
544sub filepath_to_utf8 {
545 my $self = shift (@_);
546 my ($file, $file_encoding) = @_;
547 my $filemeta = $file;
548
549 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
550
551 # Whenever filename-encoding is set to any of the auto settings, we
552 # check if the filename is already in UTF8. If it is, then we're done.
553 if($filename_encoding =~ m/auto/) {
554 if(&unicode::check_is_utf8($filemeta))
555 {
556 $filename_encoding = "utf8";
557 return $filemeta;
558 }
559 }
560
561 # Auto setting, but filename is not utf8
562 if ($filename_encoding eq "auto")
563 {
564 # try textcat
565 $filename_encoding = $self->textcat_encoding($filemeta);
566
567 # check the locale next
568 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
569
570
571 # now try the encoding of the document, if available
572 if ($filename_encoding eq "undefined" && defined $file_encoding) {
573 $filename_encoding = $file_encoding;
574 }
575
576 }
577
578 elsif ($filename_encoding eq "auto-language-analysis")
579 {
580 $filename_encoding = $self->textcat_encoding($filemeta);
581
582 # now try the encoding of the document, if available
583 if ($filename_encoding eq "undefined" && defined $file_encoding) {
584 $filename_encoding = $file_encoding;
585 }
586 }
587
588 elsif ($filename_encoding eq "auto-filesystem-encoding")
589 {
590 # try locale
591 $filename_encoding = $self->locale_encoding();
592 }
593
594 elsif ($filename_encoding eq "auto-fl")
595 {
596 # filesystem-encoding (locale) then language-analysis (textcat)
597 $filename_encoding = $self->locale_encoding();
598
599 # try textcat
600 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
601
602 # else assume filename encoding is encoding of file content, if that's available
603 if ($filename_encoding eq "undefined" && defined $file_encoding) {
604 $filename_encoding = $file_encoding;
605 }
606 }
607
608 elsif ($filename_encoding eq "auto-lf")
609 {
610 # language-analysis (textcat) then filesystem-encoding (locale)
611 $filename_encoding = $self->textcat_encoding($filemeta);
612
613 # guess filename encoding from encoding of file content, if available
614 if ($filename_encoding eq "undefined" && defined $file_encoding) {
615 $filename_encoding = $file_encoding;
616 }
617
618 # try locale
619 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
620 }
621
622 # if still undefined, use utf8 as fallback
623 if ($filename_encoding eq "undefined") {
624 $filename_encoding = "utf8";
625 }
626
627 #print STDERR "**** UTF8 encoding the filename $filemeta ";
628
629 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
630 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
631 # be always utf8 (in which case the filename's encoding is also set as utf8 even
632 # though the filename need not be if it originates from another system)--in such
633 # cases attempt to make the filename utf8 to match.
634 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
635 &unicode::ensure_utf8(\$filemeta);
636 }
637
638 # convert non-unicode encodings to utf8
639 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
640 $filemeta = &unicode::unicode2utf8(
641 &unicode::convert2unicode($filename_encoding, \$filemeta)
642 );
643 }
644
645 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
646 return $filemeta;
647}
648
649# gets the filename with no path, converts to utf8, and then dm safes it.
650# filename_encoding set by user
651sub filename_to_utf8_metadata
652{
653 my $self = shift (@_);
654 my ($file, $file_encoding) = @_;
655
656 my $outhandle = $self->{'outhandle'};
657
658 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
659 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
660 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
661
662
663 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
664 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
665
666 return $filemeta;
667}
668
669sub locale_encoding {
670 my $self = shift(@_);
671
672 if (!defined $self->{'filesystem_encoding'}) {
673 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
674 }
675
676 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
677 return $self->{'filesystem_encoding'}; # can be the string "undefined"
678}
679
680sub textcat_encoding {
681 my $self = shift(@_);
682 my ($filemeta) = @_;
683
684 # analyse filenames without extensions and digits (and trimmed of
685 # surrounding whitespace), so that irrelevant chars don't confuse
686 # textcat
687 my $strictfilemeta = $filemeta;
688 $strictfilemeta =~ s/\.[^\.]+$//g;
689 $strictfilemeta =~ s/\d//g;
690 $strictfilemeta =~ s/^\s*//g;
691 $strictfilemeta =~ s/\s*$//g;
692
693 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
694 if(!defined $filename_encoding) {
695 $filename_encoding = "undefined";
696 }
697
698 return $filename_encoding; # can be the string "undefined"
699}
700
701# performs textcat
702sub encoding_from_language_analysis {
703 my $self = shift(@_);
704 my ($text) = @_;
705
706 my $outhandle = $self->{'outhandle'};
707 my $best_encoding = undef;
708
709 # get the language/encoding of the textstring using textcat
710 require textcat; # Only load the textcat module if it is required
711 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
712 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
713
714
715 if (scalar @$results < 0) {
716 return undef;
717 }
718
719 # We have some results, we choose the first
720 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
721
722 $best_encoding = $encoding;
723 if (!defined $best_encoding) {
724 return undef;
725 }
726
727 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
728 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
729 $best_encoding = 'utf8';
730 }
731
732
733 # check for equivalents where textcat doesn't have some encodings...
734 # eg MS versions of standard encodings
735 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
736 my $iso = $1; # which variant of the iso standard?
737 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
738 if ($text =~ /[\x80-\x9f]/) {
739 # Western Europe
740 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
741 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
742 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
743 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
744 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
745 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
746 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
747 }
748 }
749
750 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
751 !defined $encodings::encodings->{$best_encoding})
752 {
753 if ($self->{'verbosity'}) {
754 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
755 }
756 $best_encoding = undef;
757 }
758
759 return $best_encoding;
760}
761
762# uses locale
763sub get_filesystem_encoding
764{
765
766 my $self = shift(@_);
767
768 my $outhandle = $self->{'outhandle'};
769 my $filesystem_encoding = undef;
770
771 eval {
772 # Works for Windows as well, returning the DOS code page in use
773 use POSIX qw(locale_h);
774
775 # With only one parameter, setlocale retrieves the
776 # current value
777 my $current_locale = setlocale(LC_CTYPE);
778
779 my $char_encoding = undef;
780 if ($current_locale =~ m/\./) {
781 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
782 $char_encoding = lc($char_encoding);
783 }
784 else {
785 if ($current_locale =~ m/^(posix|c)$/i) {
786 $char_encoding = "ascii";
787 }
788 }
789
790 if (defined $char_encoding) {
791 if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
792 $char_encoding = "$1\_$2\_$3";
793 }
794
795 $char_encoding =~ s/-/_/g;
796 $char_encoding =~ s/^utf_8$/utf8/;
797
798 if ($char_encoding =~ m/^\d+$/) {
799 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
800 $char_encoding = "windows_$char_encoding";
801 }
802 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
803 $char_encoding = "dos_$char_encoding";
804 }
805 }
806
807 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
808 || (defined $encodings::encodings->{$char_encoding})) {
809 $filesystem_encoding = $char_encoding;
810 }
811 else {
812 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
813 }
814 }
815
816
817 };
818 if ($@) {
819 print $outhandle "$@\n";
820 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
821
822 }
823
824 return $filesystem_encoding;
825}
826
827
828
829sub deduce_filename_encoding
830{
831 my $self = shift (@_);
832 my ($file,$metadata,$plugin_filename_encoding) = @_;
833
834 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
835 my $deduced_filename_encoding = undef;
836
837 # Start by looking for manually assigned metadata
838 if (defined $gs_filename_encoding) {
839 if (ref ($gs_filename_encoding) eq "ARRAY") {
840 my $outhandle = $self->{'outhandle'};
841
842 $deduced_filename_encoding = $gs_filename_encoding->[0];
843
844 my $num_vals = scalar(@$gs_filename_encoding);
845 if ($num_vals>1) {
846 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
847 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
848 }
849 }
850 else {
851 $deduced_filename_encoding = $gs_filename_encoding;
852 }
853 }
854
855 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
856 # Look to see if plugin specifies this value
857
858 if (defined $plugin_filename_encoding) {
859 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
860 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
861 my $outhandle = $self->{'outhandle'};
862 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
863 print $outhandle " default to 'auto'\n";
864 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
865 }
866
867 if ($plugin_filename_encoding ne "auto") {
868 # We've been given a specific filenamne encoding
869 # => so use it!
870 $deduced_filename_encoding = $plugin_filename_encoding;
871 }
872 }
873 }
874
875 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
876
877 # Look to file system to provide a character encoding
878
879 # If Windows NTFS, then -- assuming we work with long file names got through
880 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
881
882 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
883 # Can do better than working with the DOS character encoding returned by locale
884 $deduced_filename_encoding = "unicode";
885 }
886 else {
887 # Unix of some form or other
888
889 # See if we can determine the file system encoding through locale
890 $deduced_filename_encoding = $self->locale_encoding();
891
892 # if locale shows us filesystem is utf8, check to see filename is consistent
893 # => if not, then we have an "alien" filename on our hands
894
895 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
896 if (!&unicode::check_is_utf8($file)) {
897 # "alien" filename, so revert
898 $deduced_filename_encoding = undef;
899 }
900 }
901 }
902 }
903
904# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
905# # Last chance, apply textcat to deduce filename encoding
906# $deduced_filename_encoding = $self->textcat_encoding($file);
907# }
908
909 if ($self->{'verbosity'}>3) {
910 my $outhandle = $self->{'outhandle'};
911
912 if (defined $deduced_filename_encoding) {
913 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
914 }
915 else {
916 print $outhandle " No filename encoding deduced\n";
917 }
918 }
919
920 return $deduced_filename_encoding;
921}
922
923
924
925
926# Notionally written to be called once for each document, it is however safe to
927# call multiple times (as in the case of ImagePlugin) which calls this later on
928# after the original image has potentially been converted to a *new* source image
929# format (e.g. TIFF to PNG)
930
931sub set_Source_metadata {
932 my $self = shift (@_);
933 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
934
935 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
936 # and (as a fallback) using %xx if not for non-ascii chars
937 # 2. Sets the url ref (SourceFile) to the URL encoded version
938 # of filename for generated files
939
940 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
941
942 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
943
944 my $octet_file = $raw_file;
945
946 # UTF-8 version of filename
947# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
948# print STDERR "**** Setting Source Metadata given: $octet_file\n";
949# }
950
951 # Deal with (on Windows) raw filenames that are in their
952 # abbreviated DOS form
953
954 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
955 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
956 if (-e $raw_filename) {
957 my $unicode_filename = Win32::GetLongPathName($raw_filename);
958
959 my $unused_full_uf;
960 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
961 }
962 }
963 }
964
965 my $url_encoded_filename;
966 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
967 # => Generate a pretty print version of filename that is mapped to Unicode
968
969 # Use filename_encoding to map raw filename to a Perl unicode-aware string
970 $url_encoded_filename = decode($filename_encoding,$octet_file);
971 }
972 else {
973 # otherwise generate %xx encoded version of filename for char > 127
974 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
975 }
976
977# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
978# print STDERR "****** saving Source as: $url_encoded_filename\n";
979# }
980
981 # In the case of converted files and (generalized) exploded documents, there
982 # will already be a source filename => store as OrigSource before overriding
983 my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
984 if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
985 $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
986 }
987
988 # Source is the UTF8 display name - not necessarily the name of the file on the system
989 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
990 # on Darwin want all display strings to be in composed form, then can search on that
991 $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
992 }
993 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
994
995
996 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
997 # If using URL encoding, then SourceFile is the url-reference to url-encoded
998 # renamed_raw_url: it's a url that refers to the actual file on the system
999 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
1000
1001 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
1002 $renamed_raw_url);
1003
1004# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1005# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
1006# }
1007}
1008
1009# this should be called by all plugins to set the oid of the doc obj, rather
1010# than calling doc_obj->set_OID directly
1011sub add_OID {
1012 my $self = shift (@_);
1013 my ($doc_obj) = @_;
1014
1015 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
1016
1017 # see if there is a plugin specific set_OID function
1018 if (defined ($self->can('set_OID'))) {
1019 $self->set_OID(@_); # pass through doc_obj and any extra arguments
1020 }
1021 else {
1022 # use the default set_OID() in doc.pm
1023 $doc_obj->set_OID();
1024 }
1025
1026}
1027
1028# The BasePlugin read_into_doc_obj() function. This function does all the
1029# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
1030# a file and sets up a slew of metadata all saved in doc_obj, which
1031# it then returns as part of a tuple (process_status,doc_obj)
1032#
1033# Much of this functionality used to reside in read, but it was broken
1034# down into a supporting routine to make the code more flexible.
1035#
1036# recursive plugins (e.g. RecPlug) and specialized plugins like those
1037# capable of processing many documents within a single file (e.g.
1038# GMLPlug) will normally want to implement their own version of
1039# read_into_doc_obj()
1040#
1041# Note that $base_dir might be "" and that $file might
1042# include directories
1043
1044# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1045sub read_into_doc_obj {
1046 my $self = shift (@_);
1047 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1048
1049 my $outhandle = $self->{'outhandle'};
1050
1051 # should we move this to read? What about secondary plugins?
1052 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
1053 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1054 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1055 if $self->{'verbosity'} > 1;
1056
1057 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1058
1059 # create a new document
1060 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1061 my $top_section = $doc_obj->get_top_section();
1062
1063 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1064 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1065
1066
1067 my $plugin_filename_encoding = $self->{'filename_encoding'};
1068 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1069 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
1070
1071 # plugin specific stuff - what args do we need here??
1072 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1073 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1074 return -1;
1075 }
1076
1077 # include any metadata passed in from previous plugins
1078 # note that this metadata is associated with the top level section
1079 my $section = $doc_obj->get_top_section();
1080 # can we merge these two methods??
1081 $self->add_associated_files($doc_obj, $filename_full_path);
1082 $self->extra_metadata ($doc_obj, $section, $metadata);
1083 $self->auto_extract_metadata($doc_obj);
1084
1085 # if we haven't found any Title so far, assign one
1086 # this was shifted to here from inside read()
1087 $self->title_fallback($doc_obj,$section,$filename_no_path);
1088
1089 $self->add_OID($doc_obj);
1090
1091 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1092 return (1,$doc_obj);
1093}
1094
1095sub post_process_doc_obj {
1096 my $self = shift (@_);
1097 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1098
1099 return 1;
1100}
1101
1102sub add_dummy_text {
1103 my $self = shift(@_);
1104 my ($doc_obj, $section) = @_;
1105
1106 # add NoText metadata so we can hide this dummy text in format statements
1107 $doc_obj->add_metadata($section, "NoText", "1");
1108 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1109
1110}
1111
1112# does nothing. Can be overridden by subclass
1113sub auto_extract_metadata {
1114 my $self = shift(@_);
1115 my ($doc_obj) = @_;
1116}
1117
1118# adds cover image, associate_file options stuff. Should be called by sub class
1119# read_into_doc_obj
1120sub add_associated_files {
1121 my $self = shift(@_);
1122 # whatis filename??
1123 my ($doc_obj, $filename) = @_;
1124
1125 # add in the cover image
1126 if ($self->{'cover_image'}) {
1127 $self->associate_cover_image($doc_obj, $filename);
1128 }
1129 # store the original (used for eg TextPlugin to store the original for OAI)
1130 if ($self->{'store_original_file'}) {
1131 $self->associate_source_file($doc_obj, $filename);
1132 }
1133
1134
1135}
1136
1137# implement this if you are extracting metadata for other documents
1138sub metadata_read {
1139 my $self = shift (@_);
1140 my ($pluginfo, $base_dir, $file, $block_hash,
1141 $extrametakeys, $extrametadata, $extrametafile,
1142 $processor, $gli, $aux) = @_;
1143
1144 # can we process this file??
1145 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1146 return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
1147
1148 return 1; # we recognise the file, but don't actually do anything with it
1149}
1150
1151
1152# The BasePlugin read() function. This function calls read_into_doc_obj()
1153# to ensure all the right things to make general options work for a
1154# given plugin are done. It then calls the process() function which
1155# does all the work specific to a plugin (like the old read functions
1156# used to do). Most plugins should define their own process() function
1157# and let this read() function keep control.
1158#
1159# recursive plugins (e.g. RecPlug) and specialized plugins like those
1160# capable of processing many documents within a single file (e.g.
1161# GMLPlug) might want to implement their own version of read(), but
1162# more likely need to implement their own version of read_into_doc_obj()
1163#
1164# Return number of files processed, undef if can't recognise, -1 if can't
1165# process
1166
1167sub read {
1168 my $self = shift (@_);
1169 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1170
1171 # can we process this file??
1172 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1173
1174 return undef unless $self->can_process_this_file($filename_full_path);
1175
1176 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1177
1178 if ((defined $process_status) && ($process_status == 1)) {
1179
1180 # process the document
1181 $processor->process($doc_obj);
1182
1183 $self->{'num_processed'} ++;
1184 undef $doc_obj;
1185 }
1186 # delete any temp files that we may have created
1187 $self->clean_up_after_doc_obj_processing();
1188
1189
1190 # if process_status == 1, then the file has been processed.
1191 return $process_status;
1192
1193}
1194
1195# returns undef if file is rejected by the plugin
1196sub process {
1197 my $self = shift (@_);
1198 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1199
1200 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
1201
1202 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1203 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1204
1205 die "\n";
1206
1207 return undef; # never gets here
1208}
1209
1210# overwrite this method to delete any temp files that we have created
1211sub clean_up_after_doc_obj_processing {
1212 my $self = shift(@_);
1213
1214}
1215
1216# write_file -- used by ConvertToPlug, for example in post processing
1217#
1218# where should this go, is here the best place??
1219sub utf8_write_file {
1220 my $self = shift (@_);
1221 my ($textref, $filename) = @_;
1222
1223 if (!open (FILE, ">:utf8", $filename)) {
1224 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1225 die "\n";
1226 }
1227 print FILE $$textref;
1228
1229 close FILE;
1230}
1231
1232
1233sub filename_based_title
1234{
1235 my $self = shift (@_);
1236 my ($file) = @_;
1237
1238 my $file_derived_title = $file;
1239 $file_derived_title =~ s/_/ /g;
1240 $file_derived_title =~ s/\.[^.]+$//;
1241
1242 return $file_derived_title;
1243}
1244
1245
1246sub title_fallback
1247{
1248 my $self = shift (@_);
1249 my ($doc_obj,$section,$file) = @_;
1250
1251 if (!defined $doc_obj->get_metadata_element ($section, "Title")
1252 || $doc_obj->get_metadata_element($section, "Title") eq "") {
1253
1254 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1255 my $file_derived_title;
1256 if (defined $source_file) {
1257 $file_derived_title = $self->filename_based_title($source_file);
1258 }
1259 else {
1260 # pp = pretty print
1261 my $pp_file = (defined $source_file) ? $source_file : $file;
1262
1263 my $raw_title = $self->filename_based_title($file);
1264 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1265 }
1266
1267
1268 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1269 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1270 }
1271 else {
1272 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1273 }
1274 }
1275
1276}
1277
1278# add any extra metadata that's been passed around from one
1279# plugin to another.
1280# extra_metadata uses add_utf8_metadata so it expects metadata values
1281# to already be in utf8
1282sub extra_metadata {
1283 my $self = shift (@_);
1284 my ($doc_obj, $cursection, $metadata) = @_;
1285
1286 my $associate_tail_re = $self->{'associate_tail_re'};
1287
1288 foreach my $field (keys(%$metadata)) {
1289 # $metadata->{$field} may be an array reference
1290 if ($field eq "gsdlassocfile_tobe") {
1291 # 'gsdlassocfile_tobe' is artificially introduced metadata
1292 # that is used to signal that certain additional files should
1293 # be tied to this document. Useful in situations where a
1294 # metadata pass in the plugin pipeline works out some files
1295 # need to be associated with a document, but the document hasn't
1296 # been formed yet.
1297 my $equiv_form = "";
1298 foreach my $gaf (@{$metadata->{$field}}) {
1299 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1300 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1301
1302 # we need to make sure the filename is valid utf-8 - we do
1303 # this by url or base64 encoding it
1304 # $tail_filename is the name that we store the file as
1305 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1306 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1307 $doc_obj->associate_source_file($full_filename);
1308 # If the filename is url_encoded, we need to encode the % signs
1309 # in the filename, so that it works in a url
1310 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1311 # work out extended tail extension (i.e. matching tail re)
1312
1313 my ($file_prefix,$file_extended_ext)
1314 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1315 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1316 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1317
1318 # the greenstone 2 stuff
1319 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1320 #my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1321 my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1322
1323 my $srcicon = "_icon".$doc_ext."_";
1324 my $end_doclink = "</a>";
1325
1326 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1327
1328
1329 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1330 # for metadata such as [mp3._edited] [mp3._full] ...
1331 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1332 }
1333
1334 # for multiple metadata such as [mp3.assoclink]
1335 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1336
1337 $equiv_form .= " $assoc_form";
1338
1339 # following are used for greenstone 3,
1340 $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink_gs3);
1341 $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
1342 $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
1343
1344 }
1345 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1346 }
1347 elsif ($field eq "gsdlzipfilename") {
1348 # special case for when files have come out of a zip. source_path
1349 # (used for archives dbs and keeping track for incremental import)
1350 # must be set to the zip file name
1351 my $zip_filename = $metadata->{$field};
1352 # overwrite the source_path
1353 $doc_obj->set_source_path($zip_filename);
1354 # and set the metadata
1355 $zip_filename = &util::filename_within_collection($zip_filename);
1356 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1357 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1358 }
1359 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1360 map {
1361 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1362 } @{$metadata->{$field}};
1363 } else {
1364 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1365 }
1366 }
1367}
1368
1369
1370sub compile_stats {
1371 my $self = shift(@_);
1372 my ($stats) = @_;
1373
1374 $stats->{'num_processed'} += $self->{'num_processed'};
1375 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1376 $stats->{'num_archives'} += $self->{'num_archives'};
1377
1378}
1379sub associate_source_file {
1380 my $self = shift(@_);
1381
1382 my ($doc_obj, $filename) = @_;
1383 my $cursection = $doc_obj->get_top_section();
1384 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1385
1386 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1387 # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
1388 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1389 $doc_obj->add_utf8_metadata ($cursection, "srclinkFile", $doc_obj->get_sourcefile());
1390}
1391
1392sub associate_cover_image {
1393 my $self = shift(@_);
1394 my ($doc_obj, $filename) = @_;
1395
1396 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1397
1398 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1399 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1400
1401 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1402 # don't stat() for existence e.g. for multiple document input files
1403 # (eg SplitPlug)
1404 return;
1405 }
1406
1407 my $top_section=$doc_obj->get_top_section();
1408
1409 if (&util::fd_exists($upgraded_filename)) {
1410 $doc_obj->associate_source_file($filename);
1411 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1412 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1413 } else {
1414 my $upper_filename = $filename;
1415 my $upgraded_upper_filename = $upgraded_filename;
1416
1417 $upper_filename =~ s/jpg$/JPG/;
1418 $upgraded_upper_filename =~ s/jpg$/JPG/;
1419
1420 if (&util::fd_exists($upgraded_upper_filename)) {
1421 $doc_obj->associate_source_file($upper_filename);
1422 $doc_obj->associate_file($upper_filename, "cover.jpg",
1423 "image/jpeg");
1424 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1425 } else {
1426 # file doesn't exist, so record the fact that it's missing so
1427 # we don't stat() again (stat is slow)
1428 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1429 }
1430 }
1431
1432}
1433
1434
1435# Overridden by exploding plugins (eg. ISISPlug)
1436sub clean_up_after_exploding
1437{
1438 my $self = shift(@_);
1439}
1440
1441
1442
14431;
Note: See TracBrowser for help on using the repository browser.