source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 28782

Last change on this file since 28782 was 28375, checked in by davidb, 11 years ago

A set of changes to help Greenstone building code (perl) run under Cygwin. The test is designed to be mutually to when run natively on Windows. In effect the refined test is saying: if you're windows but not cygwin then do as you used to do for Windows, otherwise go with Unix (as Cygwin is effectively giving you a Unix like operating system to run in)

  • Property svn:keywords set to Author Date Id Revision
File size: 47.0 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34use Unicode::Normalize 'normalize';
35
36use encodings;
37use unicode;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41use util;
42use FileUtils;
43
44use PrintInfo;
45
46BEGIN {
47 @BasePlugin::ISA = ( 'PrintInfo' );
48}
49
50# the different methods that can be applied when renaming
51# imported documents and their associated files
52our $file_rename_method_list =
53 [ { 'name' => "url",
54 'desc' => "{BasePlugin.rename_method.url}" },
55 { 'name' => "base64",
56 'desc' => "{BasePlugin.rename_method.base64}" },
57 { 'name' => "none",
58 'desc' => "{BasePlugin.rename_method.none}",
59 'hiddengli' => "yes" } ];
60
61our $encoding_list =
62 [ { 'name' => "ascii",
63 'desc' => "{BasePlugin.encoding.ascii}" },
64 { 'name' => "utf8",
65 'desc' => "{BasePlugin.encoding.utf8}" },
66 { 'name' => "unicode",
67 'desc' => "{BasePlugin.encoding.unicode}" } ];
68
69
70my $e = $encodings::encodings;
71foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
72{
73 my $hashEncode =
74 {'name' => $enc,
75 'desc' => $e->{$enc}->{'name'}};
76
77 push(@{$encoding_list},$hashEncode);
78}
79
80our $encoding_plus_auto_list =
81 [ { 'name' => "auto",
82 'desc' => "{BasePlugin.filename_encoding.auto}" },
83 { 'name' => "auto-language-analysis",
84 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
85 { 'name' => "auto-filesystem-encoding",
86 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
87 { 'name' => "auto-fl",
88 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
89 { 'name' => "auto-lf",
90 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
91
92push(@{$encoding_plus_auto_list},@{$encoding_list});
93
94our $oidtype_list =
95 [ { 'name' => "auto",
96 'desc' => "{BasePlugin.OIDtype.auto}" },
97 { 'name' => "hash",
98 'desc' => "{import.OIDtype.hash}" },
99 { 'name' => "hash_on_ga_xml",
100 'desc' => "{import.OIDtype.hash_on_ga_xml}" },
101 { 'name' => "hash_on_full_filename",
102 'desc' => "{import.OIDtype.hash_on_full_filename}" },
103 { 'name' => "assigned",
104 'desc' => "{import.OIDtype.assigned}" },
105 { 'name' => "incremental",
106 'desc' => "{import.OIDtype.incremental}" },
107 { 'name' => "filename",
108 'desc' => "{import.OIDtype.filename}" },
109 { 'name' => "dirname",
110 'desc' => "{import.OIDtype.dirname}" },
111 { 'name' => "full_filename",
112 'desc' => "{import.OIDtype.full_filename}" } ];
113
114my $arguments =
115 [ { 'name' => "process_exp",
116 'desc' => "{BasePlugin.process_exp}",
117 'type' => "regexp",
118 'deft' => "",
119 'reqd' => "no" },
120 { 'name' => "no_blocking",
121 'desc' => "{BasePlugin.no_blocking}",
122 'type' => "flag",
123 'reqd' => "no"},
124 { 'name' => "block_exp",
125 'desc' => "{BasePlugin.block_exp}",
126 'type' => "regexp",
127 'deft' => "",
128 'reqd' => "no" },
129 { 'name' => "store_original_file",
130 'desc' => "{BasePlugin.store_original_file}",
131 'type' => "flag",
132 'reqd' => "no" },
133 { 'name' => "associate_ext",
134 'desc' => "{BasePlugin.associate_ext}",
135 'type' => "string",
136 'reqd' => "no" },
137 { 'name' => "associate_tail_re",
138 'desc' => "{BasePlugin.associate_tail_re}",
139 'type' => "string",
140 'reqd' => "no" },
141 { 'name' => "OIDtype",
142 'desc' => "{import.OIDtype}",
143 'type' => "enum",
144 'list' => $oidtype_list,
145 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
146 'deft' => "auto",
147 'reqd' => "no" },
148 { 'name' => "OIDmetadata",
149 'desc' => "{import.OIDmetadata}",
150 'type' => "metadata",
151 'deft' => "dc.Identifier",
152 'reqd' => "no" },
153# { 'name' => "use_as_doc_identifier",
154# 'desc' => "{BasePlugin.use_as_doc_identifier}",
155# 'type' => "string",
156# 'reqd' => "no" ,
157# 'deft' => "" } ,
158 { 'name' => "no_cover_image",
159 'desc' => "{BasePlugin.no_cover_image}",
160 'type' => "flag",
161 'reqd' => "no" },
162 { 'name' => "filename_encoding",
163 'desc' => "{BasePlugin.filename_encoding}",
164 'type' => "enum",
165 'deft' => "auto",
166 'list' => $encoding_plus_auto_list,
167 'reqd' => "no" },
168 { 'name' => "smart_block",
169 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
170 'type' => "flag",
171 'reqd' => "no",
172 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
173 { 'name' => "file_rename_method",
174 'desc' => "{BasePlugin.file_rename_method}",
175 'type' => "enum",
176 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
177 'list' => $file_rename_method_list,
178 'reqd' => "no"
179 }
180
181 ];
182
183
184my $options = { 'name' => "BasePlugin",
185 'desc' => "{BasePlugin.desc}",
186 'abstract' => "yes",
187 'inherits' => "no",
188 'args' => $arguments };
189
190sub new {
191
192 my ($class) = shift (@_);
193 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
194 push(@$pluginlist, $class);
195
196 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
197 push(@{$hashArgOptLists->{"OptList"}},$options);
198
199 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
200
201 if ($self->{'info_only'}) {
202 # don't worry about any options etc
203 return bless $self, $class;
204 }
205
206 if ($self->{'smart_block'}) {
207 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
208 }
209 $self->{'smart_block'} = undef;
210
211 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
212 $self->{'plugin_type'} = $plugin_name;
213
214 # remove ex. from OIDmetadata iff it's the only namespace prefix
215 $self->{'OIDmetadata'} =~ s/^ex\.([^.]+)$/$1/ if defined $self->{'OIDmetadata'};
216 $self->{'num_processed'} = 0;
217 $self->{'num_not_processed'} = 0;
218 $self->{'num_blocked'} = 0;
219 $self->{'num_archives'} = 0;
220 $self->{'cover_image'} = 1; # cover image is on by default
221 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
222 $self->{'can_process_directories'} = 0;
223 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
224
225 my $associate_ext = $self->{'associate_ext'};
226 if ((defined $associate_ext) && ($associate_ext ne "")) {
227
228 my $associate_tail_re = $self->{'associate_tail_re'};
229 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
230 my $outhandle = $self->{'outhandle'};
231 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
232 print $outhandle " defaulting to 'associate_tail_re'\n";
233 }
234 else {
235 my @exts = split(/,/,$associate_ext);
236
237 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
238 my $associate_tail_re = join("|",@exts_bracketed);
239 $self->{'associate_tail_re'} = $associate_tail_re;
240 }
241
242 delete $self->{'associate_ext'};
243 }
244
245 return bless $self, $class;
246
247}
248
249sub merge_inheritance
250{
251 my $self = {};
252 my @child_selfs = @_;
253
254 foreach my $child_self (@child_selfs) {
255 foreach my $key (keys %$child_self) {
256 if (defined $self->{$key}) {
257 if ($self->{$key} ne $child_self->{$key}) {
258# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
259# print STDERR "Existing stored value = $self->{$key}\n";
260# print STDERR "New (child) value = $child_self->{$key}\n";
261# print STDERR "Keeping existing value\n";
262 # Existing value seems to be option specified in collect.cfg
263
264 ### $self->{$key} = $child_self->{$key};
265
266 }
267 else {
268## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
269 }
270
271 }
272 else {
273 $self->{$key} = $child_self->{$key};
274 }
275 }
276 }
277
278 return $self;
279}
280
281# initialize BasePlugin options
282# if init() is overridden in a sub-class, remember to call BasePlugin::init()
283sub init {
284 my $self = shift (@_);
285 my ($verbosity, $outhandle, $failhandle) = @_;
286
287 # verbosity is passed through from the processor
288 $self->{'verbosity'} = $verbosity;
289
290 # as are the outhandle and failhandle
291 $self->{'outhandle'} = $outhandle if defined $outhandle;
292 $self->{'failhandle'} = $failhandle;
293# $self->SUPER::init(@_);
294
295 # set process_exp and block_exp to defaults unless they were
296 # explicitly set
297
298 if ((!$self->is_recursive()) and
299 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
300
301 $self->{'process_exp'} = $self->get_default_process_exp ();
302 if ($self->{'process_exp'} eq "") {
303 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
304 }
305 }
306
307 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
308 $self->{'block_exp'} = $self->get_default_block_exp ();
309 }
310
311}
312
313sub begin {
314 my $self = shift (@_);
315 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
316
317 if ($self->{'OIDtype'} eq "auto") {
318 # hasn't been set in the plugin, use the processor values
319 $self->{'OIDtype'} = $processor->{'OIDtype'};
320 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
321 }
322 if ($self->{'OIDtype'} eq "hash") {
323 # should we hash on the file or on the doc xml??
324 $self->{'OIDtype'} = $self->get_oid_hash_type();
325 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
326 $self->{'OIDtype'} = "hash_on_file";
327 }
328 }
329}
330
331# This is called once if removeold is set with import.pl. Most plugins will do
332# nothing but if a plugin does any stuff outside of creating doc obj, then
333# it may need to clear something.
334sub remove_all {
335 my $self = shift (@_);
336 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
337}
338
339# This is called per document for docs that have been deleted from the
340# collection. Most plugins will do nothing
341# but if a plugin does any stuff outside of creating doc obj, then it may need
342# to clear something.
343sub remove_one {
344 my $self = shift (@_);
345
346 my ($file, $oids, $archivedir) = @_;
347 return 0 if $self->can_process_this_file($file);
348 return undef;
349}
350
351sub end {
352 # potentially called at the end of each plugin pass
353 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
354
355 my ($self) = shift (@_);
356}
357
358sub deinit {
359 # called only once, after all plugin passes have been done
360
361 my ($self) = @_;
362}
363
364# default hashing type is to hash on the original file (or converted file)
365# override this to return hash_on_ga_xml for filetypes where hashing on the
366# file is no good eg video
367sub get_oid_hash_type {
368
369 my $self = shift (@_);
370
371 return "hash_on_file";
372}
373
374
375# this function should be overridden to return 1
376# in recursive plugins
377sub is_recursive {
378 my $self = shift (@_);
379
380 return 0;
381}
382
383sub get_default_block_exp {
384 my $self = shift (@_);
385
386 return "";
387}
388
389sub get_default_process_exp {
390 my $self = shift (@_);
391
392 return "";
393}
394
395
396# rename imported files and assoc files using URL encoding by default
397# as this will work for most plugins and give more legible filenames
398sub get_default_file_rename_method() {
399 my $self = shift (@_);
400 return "url";
401}
402
403# returns this plugin's active (possibly user-selected) file_rename_method
404sub get_file_rename_method() {
405 my $self = shift (@_);
406 my $rename_method = $self->{'file_rename_method'};
407 if($rename_method) {
408 return $rename_method;
409 } else {
410 return $self->get_default_file_rename_method();
411 }
412}
413
414# default implementation is to do nothing
415sub store_block_files {
416
417 my $self =shift (@_);
418 my ($filename_full_path, $block_hash) = @_;
419
420}
421
422# put files to block into hash
423sub use_block_expressions {
424
425 my $self =shift (@_);
426 my ($filename_full_path, $block_hash) = @_;
427
428 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
429
430 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
431 &util::block_filename($block_hash,$filename_full_path);
432 }
433
434}
435
436#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
437sub block_cover_image
438{
439 my $self =shift;
440 my ($filename, $block_hash) = @_;
441
442 $filename = &util::upgrade_if_dos_filename($filename);
443
444 if ($self->{'cover_image'}) {
445 my $coverfile = $filename;
446 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
447
448 #if there is no file extension, coverfile will be the same as filename
449 return if $coverfile eq $filename;
450
451 if (!&FileUtils::fileExists($coverfile)) {
452 $coverfile =~ s/jpg$/JPG/;
453 }
454 if (&FileUtils::fileExists($coverfile)) {
455 &util::block_filename($block_hash,$coverfile);
456 }
457 }
458
459 return;
460}
461
462
463# discover all the files that should be blocked by this plugin
464# check the args ...
465sub file_block_read {
466
467 my $self = shift (@_);
468 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
469 # Keep track of filenames with same root but different extensions
470 # Used to support -associate_ext and the more generalised
471 # -associate_tail_re
472 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
473
474 if (!-d $filename_full_path) {
475 $block_hash->{'all_files'}->{$file} = 1;
476 }
477
478 my $associate_tail_re = $self->{'associate_tail_re'};
479 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
480 my ($file_prefix,$file_ext)
481 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
482 if ((defined $file_prefix) && (defined $file_ext)) {
483 my $shared_fileroot = $block_hash->{'shared_fileroot'};
484 if (!defined $shared_fileroot->{$file_prefix}) {
485 my $file_prefix_rec = { 'tie_to' => undef,
486 'exts' => {} };
487 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
488 }
489
490 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
491
492 if ($self->can_process_this_file($filename_full_path)) {
493 # This is the document the others should be tied to
494 $file_prefix_rec->{'tie_to'} = $file_ext;
495 }
496 else {
497 if ($file_ext =~ m/$associate_tail_re$/) {
498 # this file should be associated to the main one
499 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
500 }
501 }
502
503 }
504 }
505
506 # check block expressions
507 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
508
509 # now check whether we are actually processing this
510 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
511 return undef; # can't recognise
512 }
513
514 # if we have a block_exp, then this overrides the normal 'smart' blocking
515 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
516
517 # block the cover image if there is one
518 if ($self->{'cover_image'}) {
519 $self->block_cover_image($filename_full_path, $block_hash);
520 }
521
522 return 1;
523}
524
525# plugins that rely on more than process_exp (eg XML plugins) can override this method
526sub can_process_this_file {
527 my $self = shift(@_);
528 my ($filename) = @_;
529
530 if (-d $filename && !$self->{'can_process_directories'}) {
531 return 0;
532 }
533
534 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
535 return 1;
536 }
537 return 0;
538
539}
540
541# Even if a plugin can extract metadata in its metadata_read pass,
542# make the default return 'undef' so processing of the file continues
543# down the pipeline, so other plugins can also have the opportunity to
544# locate metadata and set it up in the extrametakeys variables that
545# are passed around.
546
547sub can_process_this_file_for_metadata {
548 my $self = shift(@_);
549
550 return undef;
551}
552
553
554# just converts path as is to utf8.
555sub filepath_to_utf8 {
556 my $self = shift (@_);
557 my ($file, $file_encoding) = @_;
558 my $filemeta = $file;
559
560 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
561
562 # Whenever filename-encoding is set to any of the auto settings, we
563 # check if the filename is already in UTF8. If it is, then we're done.
564 if($filename_encoding =~ m/auto/) {
565 if(&unicode::check_is_utf8($filemeta))
566 {
567 $filename_encoding = "utf8";
568 return $filemeta;
569 }
570 }
571
572 # Auto setting, but filename is not utf8
573 if ($filename_encoding eq "auto")
574 {
575 # try textcat
576 $filename_encoding = $self->textcat_encoding($filemeta);
577
578 # check the locale next
579 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
580
581
582 # now try the encoding of the document, if available
583 if ($filename_encoding eq "undefined" && defined $file_encoding) {
584 $filename_encoding = $file_encoding;
585 }
586
587 }
588
589 elsif ($filename_encoding eq "auto-language-analysis")
590 {
591 $filename_encoding = $self->textcat_encoding($filemeta);
592
593 # now try the encoding of the document, if available
594 if ($filename_encoding eq "undefined" && defined $file_encoding) {
595 $filename_encoding = $file_encoding;
596 }
597 }
598
599 elsif ($filename_encoding eq "auto-filesystem-encoding")
600 {
601 # try locale
602 $filename_encoding = $self->locale_encoding();
603 }
604
605 elsif ($filename_encoding eq "auto-fl")
606 {
607 # filesystem-encoding (locale) then language-analysis (textcat)
608 $filename_encoding = $self->locale_encoding();
609
610 # try textcat
611 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
612
613 # else assume filename encoding is encoding of file content, if that's available
614 if ($filename_encoding eq "undefined" && defined $file_encoding) {
615 $filename_encoding = $file_encoding;
616 }
617 }
618
619 elsif ($filename_encoding eq "auto-lf")
620 {
621 # language-analysis (textcat) then filesystem-encoding (locale)
622 $filename_encoding = $self->textcat_encoding($filemeta);
623
624 # guess filename encoding from encoding of file content, if available
625 if ($filename_encoding eq "undefined" && defined $file_encoding) {
626 $filename_encoding = $file_encoding;
627 }
628
629 # try locale
630 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
631 }
632
633 # if still undefined, use utf8 as fallback
634 if ($filename_encoding eq "undefined") {
635 $filename_encoding = "utf8";
636 }
637
638 #print STDERR "**** UTF8 encoding the filename $filemeta ";
639
640 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
641 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
642 # be always utf8 (in which case the filename's encoding is also set as utf8 even
643 # though the filename need not be if it originates from another system)--in such
644 # cases attempt to make the filename utf8 to match.
645 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
646 &unicode::ensure_utf8(\$filemeta);
647 }
648
649 # convert non-unicode encodings to utf8
650 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
651 $filemeta = &unicode::unicode2utf8(
652 &unicode::convert2unicode($filename_encoding, \$filemeta)
653 );
654 }
655
656 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
657 return $filemeta;
658}
659
660# gets the filename with no path, converts to utf8, and then dm safes it.
661# filename_encoding set by user
662sub filename_to_utf8_metadata
663{
664 my $self = shift (@_);
665 my ($file, $file_encoding) = @_;
666
667 my $outhandle = $self->{'outhandle'};
668
669 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
670 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
671 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
672
673 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
674 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
675
676 return $filemeta;
677}
678
679sub locale_encoding {
680 my $self = shift(@_);
681
682 if (!defined $self->{'filesystem_encoding'}) {
683 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
684 }
685
686 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
687 return $self->{'filesystem_encoding'}; # can be the string "undefined"
688}
689
690sub textcat_encoding {
691 my $self = shift(@_);
692 my ($filemeta) = @_;
693
694 # analyse filenames without extensions and digits (and trimmed of
695 # surrounding whitespace), so that irrelevant chars don't confuse
696 # textcat
697 my $strictfilemeta = $filemeta;
698 $strictfilemeta =~ s/\.[^\.]+$//g;
699 $strictfilemeta =~ s/\d//g;
700 $strictfilemeta =~ s/^\s*//g;
701 $strictfilemeta =~ s/\s*$//g;
702
703 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
704 if(!defined $filename_encoding) {
705 $filename_encoding = "undefined";
706 }
707
708 return $filename_encoding; # can be the string "undefined"
709}
710
711# performs textcat
712sub encoding_from_language_analysis {
713 my $self = shift(@_);
714 my ($text) = @_;
715
716 my $outhandle = $self->{'outhandle'};
717 my $best_encoding = undef;
718
719 # get the language/encoding of the textstring using textcat
720 require textcat; # Only load the textcat module if it is required
721 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
722 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
723
724
725 if (scalar @$results < 0) {
726 return undef;
727 }
728
729 # We have some results, we choose the first
730 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
731
732 $best_encoding = $encoding;
733 if (!defined $best_encoding) {
734 return undef;
735 }
736
737 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
738 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
739 $best_encoding = 'utf8';
740 }
741
742
743 # check for equivalents where textcat doesn't have some encodings...
744 # eg MS versions of standard encodings
745 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
746 my $iso = $1; # which variant of the iso standard?
747 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
748 if ($text =~ /[\x80-\x9f]/) {
749 # Western Europe
750 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
751 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
752 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
753 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
754 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
755 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
756 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
757 }
758 }
759
760 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
761 !defined $encodings::encodings->{$best_encoding})
762 {
763 if ($self->{'verbosity'}) {
764 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
765 }
766 $best_encoding = undef;
767 }
768
769 return $best_encoding;
770}
771
772# uses locale
773sub get_filesystem_encoding
774{
775
776 my $self = shift(@_);
777
778 my $outhandle = $self->{'outhandle'};
779 my $filesystem_encoding = undef;
780
781 eval {
782 # Works for Windows as well, returning the DOS code page in use
783 use POSIX qw(locale_h);
784
785 # With only one parameter, setlocale retrieves the
786 # current value
787 my $current_locale = setlocale(LC_CTYPE);
788
789 my $char_encoding = undef;
790 if ($current_locale =~ m/\./) {
791 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
792 $char_encoding = lc($char_encoding);
793 }
794 else {
795 if ($current_locale =~ m/^(posix|c)$/i) {
796 $char_encoding = "ascii";
797 }
798 }
799
800 if (defined $char_encoding) {
801 if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
802 $char_encoding = "$1\_$2\_$3";
803 }
804
805 $char_encoding =~ s/-/_/g;
806 $char_encoding =~ s/^utf_8$/utf8/;
807
808 if ($char_encoding =~ m/^\d+$/) {
809 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
810 $char_encoding = "windows_$char_encoding";
811 }
812 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
813 $char_encoding = "dos_$char_encoding";
814 }
815 }
816
817 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
818 || (defined $encodings::encodings->{$char_encoding})) {
819 $filesystem_encoding = $char_encoding;
820 }
821 else {
822 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
823 }
824 }
825
826
827 };
828 if ($@) {
829 print $outhandle "$@\n";
830 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
831
832 }
833
834 return $filesystem_encoding;
835}
836
837
838
839sub deduce_filename_encoding
840{
841 my $self = shift (@_);
842 my ($file,$metadata,$plugin_filename_encoding) = @_;
843
844 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
845 my $deduced_filename_encoding = undef;
846
847 # Start by looking for manually assigned metadata
848 if (defined $gs_filename_encoding) {
849 if (ref ($gs_filename_encoding) eq "ARRAY") {
850 my $outhandle = $self->{'outhandle'};
851
852 $deduced_filename_encoding = $gs_filename_encoding->[0];
853
854 my $num_vals = scalar(@$gs_filename_encoding);
855 if ($num_vals>1) {
856 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
857 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
858 }
859 }
860 else {
861 $deduced_filename_encoding = $gs_filename_encoding;
862 }
863 }
864
865 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
866 # Look to see if plugin specifies this value
867
868 if (defined $plugin_filename_encoding) {
869 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
870 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
871 my $outhandle = $self->{'outhandle'};
872 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
873 print $outhandle " default to 'auto'\n";
874 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
875 }
876
877 if ($plugin_filename_encoding ne "auto") {
878 # We've been given a specific filenamne encoding
879 # => so use it!
880 $deduced_filename_encoding = $plugin_filename_encoding;
881 }
882 }
883 }
884
885 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
886
887 # Look to file system to provide a character encoding
888
889 # If Windows NTFS, then -- assuming we work with long file names got through
890 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
891
892 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
893 # Can do better than working with the DOS character encoding returned by locale
894 $deduced_filename_encoding = "unicode";
895 }
896 else {
897 # Unix of some form or other
898
899 # See if we can determine the file system encoding through locale
900 $deduced_filename_encoding = $self->locale_encoding();
901
902 # if locale shows us filesystem is utf8, check to see filename is consistent
903 # => if not, then we have an "alien" filename on our hands
904
905 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
906 if (!&unicode::check_is_utf8($file)) {
907 # "alien" filename, so revert
908 $deduced_filename_encoding = undef;
909 }
910 }
911 }
912 }
913
914# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
915# # Last chance, apply textcat to deduce filename encoding
916# $deduced_filename_encoding = $self->textcat_encoding($file);
917# }
918
919 if ($self->{'verbosity'}>3) {
920 my $outhandle = $self->{'outhandle'};
921
922 if (defined $deduced_filename_encoding) {
923 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
924 }
925 else {
926 print $outhandle " No filename encoding deduced\n";
927 }
928 }
929
930 return $deduced_filename_encoding;
931}
932
933
934
935
936# Notionally written to be called once for each document, it is however safe to
937# call multiple times (as in the case of ImagePlugin) which calls this later on
938# after the original image has potentially been converted to a *new* source image
939# format (e.g. TIFF to PNG)
940
941sub set_Source_metadata {
942 my $self = shift (@_);
943 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
944
945 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
946 # and (as a fallback) using %xx if not for non-ascii chars
947 # 2. Sets the url ref (SourceFile) to the URL encoded version
948 # of filename for generated files
949
950 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
951
952 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
953
954 my $octet_file = $raw_file;
955
956 # UTF-8 version of filename
957# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
958# print STDERR "**** Setting Source Metadata given: $octet_file\n";
959# }
960
961 # Deal with (on Windows) raw filenames that are in their
962 # abbreviated DOS form
963
964 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
965 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
966 if (-e $raw_filename) {
967 my $unicode_filename = Win32::GetLongPathName($raw_filename);
968
969 my $unused_full_uf;
970 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
971 }
972 }
973 }
974
975 my $url_encoded_filename;
976 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
977 # => Generate a pretty print version of filename that is mapped to Unicode
978
979 # Use filename_encoding to map raw filename to a Perl unicode-aware string
980 $url_encoded_filename = decode($filename_encoding,$octet_file);
981 }
982 else {
983 # otherwise generate %xx encoded version of filename for char > 127
984 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
985 }
986
987# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
988# print STDERR "****** saving Source as: $url_encoded_filename\n";
989# }
990
991 # In the case of converted files and (generalized) exploded documents, there
992 # will already be a source filename => store as OrigSource before overriding
993 my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
994 if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
995 $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
996 }
997
998 # Source is the UTF8 display name - not necessarily the name of the file on the system
999 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
1000 # on Darwin want all display strings to be in composed form, then can search on that
1001 $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
1002 }
1003 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
1004
1005
1006 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
1007 # If using URL encoding, then SourceFile is the url-reference to url-encoded
1008 # renamed_raw_url: it's a url that refers to the actual file on the system
1009 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
1010
1011 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
1012 $renamed_raw_url);
1013
1014# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1015# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
1016# }
1017}
1018
1019# this should be called by all plugins to set the oid of the doc obj, rather
1020# than calling doc_obj->set_OID directly
1021sub add_OID {
1022 my $self = shift (@_);
1023 my ($doc_obj, $force) = @_;
1024
1025 # don't add one if there is one already set, unless we are forced to do so
1026 return unless ($doc_obj->get_OID() =~ /^NULL$/ || $force);
1027 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
1028
1029 # see if there is a plugin specific set_OID function
1030 if (defined ($self->can('set_OID'))) {
1031 $self->set_OID(@_); # pass through doc_obj and any extra arguments
1032 }
1033 else {
1034 # use the default set_OID() in doc.pm
1035 $doc_obj->set_OID();
1036 }
1037
1038}
1039
1040# The BasePlugin read_into_doc_obj() function. This function does all the
1041# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
1042# a file and sets up a slew of metadata all saved in doc_obj, which
1043# it then returns as part of a tuple (process_status,doc_obj)
1044#
1045# Much of this functionality used to reside in read, but it was broken
1046# down into a supporting routine to make the code more flexible.
1047#
1048# recursive plugins (e.g. RecPlug) and specialized plugins like those
1049# capable of processing many documents within a single file (e.g.
1050# GMLPlug) will normally want to implement their own version of
1051# read_into_doc_obj()
1052#
1053# Note that $base_dir might be "" and that $file might
1054# include directories
1055
1056# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1057sub read_into_doc_obj {
1058 my $self = shift (@_);
1059 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1060
1061 my $outhandle = $self->{'outhandle'};
1062
1063 # should we move this to read? What about secondary plugins?
1064 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
1065 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1066 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1067 if $self->{'verbosity'} > 1;
1068
1069 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1070
1071 # create a new document
1072 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1073 my $top_section = $doc_obj->get_top_section();
1074
1075 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1076 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1077
1078
1079 my $plugin_filename_encoding = $self->{'filename_encoding'};
1080 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1081 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
1082
1083 # plugin specific stuff - what args do we need here??
1084 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1085 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1086 return -1;
1087 }
1088
1089 # include any metadata passed in from previous plugins
1090 # note that this metadata is associated with the top level section
1091 my $section = $doc_obj->get_top_section();
1092 # can we merge these two methods??
1093 $self->add_associated_files($doc_obj, $filename_full_path);
1094 $self->extra_metadata ($doc_obj, $section, $metadata);
1095 $self->auto_extract_metadata($doc_obj);
1096
1097 # if we haven't found any Title so far, assign one
1098 # this was shifted to here from inside read()
1099 $self->title_fallback($doc_obj,$section,$filename_no_path);
1100
1101 $self->add_OID($doc_obj);
1102
1103 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1104 return (1,$doc_obj);
1105}
1106
1107sub post_process_doc_obj {
1108 my $self = shift (@_);
1109 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1110
1111 return 1;
1112}
1113
1114sub add_dummy_text {
1115 my $self = shift(@_);
1116 my ($doc_obj, $section) = @_;
1117
1118 # add NoText metadata so we can hide this dummy text in format statements
1119 $doc_obj->add_metadata($section, "NoText", "1");
1120 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1121
1122}
1123
1124# does nothing. Can be overridden by subclass
1125sub auto_extract_metadata {
1126 my $self = shift(@_);
1127 my ($doc_obj) = @_;
1128}
1129
1130# adds cover image, associate_file options stuff. Should be called by sub class
1131# read_into_doc_obj
1132sub add_associated_files {
1133 my $self = shift(@_);
1134 # whatis filename??
1135 my ($doc_obj, $filename) = @_;
1136
1137 # add in the cover image
1138 if ($self->{'cover_image'}) {
1139 $self->associate_cover_image($doc_obj, $filename);
1140 }
1141 # store the original (used for eg TextPlugin to store the original for OAI)
1142 if ($self->{'store_original_file'}) {
1143 $self->associate_source_file($doc_obj, $filename);
1144 }
1145
1146
1147}
1148
1149# implement this if you are extracting metadata for other documents
1150sub metadata_read {
1151 my $self = shift (@_);
1152 my ($pluginfo, $base_dir, $file, $block_hash,
1153 $extrametakeys, $extrametadata, $extrametafile,
1154 $processor, $gli, $aux) = @_;
1155
1156 # can we process this file??
1157 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1158 return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
1159
1160 return 1; # we recognise the file, but don't actually do anything with it
1161}
1162
1163
1164# The BasePlugin read() function. This function calls read_into_doc_obj()
1165# to ensure all the right things to make general options work for a
1166# given plugin are done. It then calls the process() function which
1167# does all the work specific to a plugin (like the old read functions
1168# used to do). Most plugins should define their own process() function
1169# and let this read() function keep control.
1170#
1171# recursive plugins (e.g. RecPlug) and specialized plugins like those
1172# capable of processing many documents within a single file (e.g.
1173# GMLPlug) might want to implement their own version of read(), but
1174# more likely need to implement their own version of read_into_doc_obj()
1175#
1176# Return number of files processed, undef if can't recognise, -1 if can't
1177# process
1178
1179sub read {
1180 my $self = shift (@_);
1181 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1182
1183 # can we process this file??
1184 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1185
1186 return undef unless $self->can_process_this_file($filename_full_path);
1187
1188 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1189
1190 if ((defined $process_status) && ($process_status == 1)) {
1191
1192 # process the document
1193 $processor->process($doc_obj);
1194
1195 $self->{'num_processed'} ++;
1196 undef $doc_obj;
1197 }
1198 # delete any temp files that we may have created
1199 $self->clean_up_after_doc_obj_processing();
1200
1201
1202 # if process_status == 1, then the file has been processed.
1203 return $process_status;
1204
1205}
1206
1207# returns undef if file is rejected by the plugin
1208sub process {
1209 my $self = shift (@_);
1210 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1211
1212 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
1213
1214 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1215 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1216
1217 die "\n";
1218
1219 return undef; # never gets here
1220}
1221
1222# overwrite this method to delete any temp files that we have created
1223sub clean_up_after_doc_obj_processing {
1224 my $self = shift(@_);
1225
1226}
1227
1228# write_file -- used by ConvertToPlug, for example in post processing
1229#
1230# where should this go, is here the best place??
1231sub utf8_write_file {
1232 my $self = shift (@_);
1233 my ($textref, $filename) = @_;
1234
1235 if (!open (FILE, ">:utf8", $filename)) {
1236 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1237 die "\n";
1238 }
1239 print FILE $$textref;
1240
1241 close FILE;
1242}
1243
1244
1245sub filename_based_title
1246{
1247 my $self = shift (@_);
1248 my ($file) = @_;
1249
1250 my $file_derived_title = $file;
1251 $file_derived_title =~ s/_/ /g;
1252 $file_derived_title =~ s/\.[^.]+$//;
1253
1254 return $file_derived_title;
1255}
1256
1257
1258sub title_fallback
1259{
1260 my $self = shift (@_);
1261 my ($doc_obj,$section,$file) = @_;
1262
1263 if (!defined $doc_obj->get_metadata_element ($section, "Title")
1264 || $doc_obj->get_metadata_element($section, "Title") eq "") {
1265
1266 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1267 my $file_derived_title;
1268 if (defined $source_file) {
1269 $file_derived_title = $self->filename_based_title($source_file);
1270 }
1271 else {
1272 # pp = pretty print
1273 my $pp_file = (defined $source_file) ? $source_file : $file;
1274
1275 my $raw_title = $self->filename_based_title($file);
1276 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1277 }
1278
1279
1280 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1281 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1282 }
1283 else {
1284 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1285 }
1286 }
1287
1288}
1289
1290# add any extra metadata that's been passed around from one
1291# plugin to another.
1292# extra_metadata uses add_utf8_metadata so it expects metadata values
1293# to already be in utf8
1294sub extra_metadata {
1295 my $self = shift (@_);
1296 my ($doc_obj, $cursection, $metadata) = @_;
1297
1298 my $associate_tail_re = $self->{'associate_tail_re'};
1299
1300# Sort the extra metadata for diffcol so these meta appear in a consistent order
1301# in doc.xml. Necessary for the ex.PDF.* and ex.File.* meta that's extracted in
1302# the PDFBox collection, as the order of these varies between CentOS and Ubuntu.
1303 foreach my $field (sort keys(%$metadata)) {
1304# foreach my $field (keys(%$metadata)) {
1305 # $metadata->{$field} may be an array reference
1306 if ($field eq "gsdlassocfile_tobe") {
1307 # 'gsdlassocfile_tobe' is artificially introduced metadata
1308 # that is used to signal that certain additional files should
1309 # be tied to this document. Useful in situations where a
1310 # metadata pass in the plugin pipeline works out some files
1311 # need to be associated with a document, but the document hasn't
1312 # been formed yet.
1313 my $equiv_form = "";
1314 foreach my $gaf (@{$metadata->{$field}}) {
1315 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1316 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1317
1318 # we need to make sure the filename is valid utf-8 - we do
1319 # this by url or base64 encoding it
1320 # $tail_filename is the name that we store the file as
1321 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1322 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1323 $doc_obj->associate_source_file($full_filename);
1324 # If the filename is url_encoded, we need to encode the % signs
1325 # in the filename, so that it works in a url
1326 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1327 # work out extended tail extension (i.e. matching tail re)
1328
1329 my ($file_prefix,$file_extended_ext)
1330 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1331 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1332 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1333
1334 # the greenstone 2 stuff
1335 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1336 #my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1337 my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1338
1339 my $srcicon = "_icon".$doc_ext."_";
1340 my $end_doclink = "</a>";
1341
1342 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1343
1344
1345 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1346 # for metadata such as [mp3._edited] [mp3._full] ...
1347 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1348 }
1349
1350 # for multiple metadata such as [mp3.assoclink]
1351 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1352
1353 $equiv_form .= " $assoc_form";
1354
1355 # following are used for greenstone 3,
1356 $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink_gs3);
1357 $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
1358 $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
1359
1360 }
1361 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1362 }
1363 elsif ($field eq "gsdlzipfilename") {
1364 # special case for when files have come out of a zip. source_path
1365 # (used for archives dbs and keeping track for incremental import)
1366 # must be set to the zip file name
1367 my $zip_filename = $metadata->{$field};
1368 # overwrite the source_path
1369 $doc_obj->set_source_path($zip_filename);
1370 # and set the metadata
1371 $zip_filename = &util::filename_within_collection($zip_filename);
1372 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1373 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1374 }
1375 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1376 map {
1377 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1378 } @{$metadata->{$field}};
1379 } else {
1380 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1381 }
1382 }
1383}
1384
1385
1386sub compile_stats {
1387 my $self = shift(@_);
1388 my ($stats) = @_;
1389
1390 $stats->{'num_processed'} += $self->{'num_processed'};
1391 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1392 $stats->{'num_archives'} += $self->{'num_archives'};
1393
1394}
1395sub associate_source_file {
1396 my $self = shift(@_);
1397
1398 my ($doc_obj, $filename) = @_;
1399 my $cursection = $doc_obj->get_top_section();
1400 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1401
1402 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1403 # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
1404 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1405 $doc_obj->add_utf8_metadata ($cursection, "srclinkFile", $doc_obj->get_sourcefile());
1406}
1407
1408sub associate_cover_image {
1409 my $self = shift(@_);
1410 my ($doc_obj, $filename) = @_;
1411
1412 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1413
1414 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1415 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1416
1417 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1418 # don't stat() for existence e.g. for multiple document input files
1419 # (eg SplitPlug)
1420 return;
1421 }
1422
1423 my $top_section=$doc_obj->get_top_section();
1424
1425 if (&FileUtils::fileExists($upgraded_filename)) {
1426 $doc_obj->associate_source_file($filename);
1427 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1428 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1429 } else {
1430 my $upper_filename = $filename;
1431 my $upgraded_upper_filename = $upgraded_filename;
1432
1433 $upper_filename =~ s/jpg$/JPG/;
1434 $upgraded_upper_filename =~ s/jpg$/JPG/;
1435
1436 if (&FileUtils::fileExists($upgraded_upper_filename)) {
1437 $doc_obj->associate_source_file($upper_filename);
1438 $doc_obj->associate_file($upper_filename, "cover.jpg",
1439 "image/jpeg");
1440 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1441 } else {
1442 # file doesn't exist, so record the fact that it's missing so
1443 # we don't stat() again (stat is slow)
1444 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1445 }
1446 }
1447
1448}
1449
1450
1451# Overridden by exploding plugins (eg. ISISPlug)
1452sub clean_up_after_exploding
1453{
1454 my $self = shift(@_);
1455}
1456
1457
1458
14591;
Note: See TracBrowser for help on using the repository browser.