source: main/trunk/greenstone2/perllib/plugins/BasePlugin.pm@ 26221

Last change on this file since 26221 was 26221, checked in by kjdon, 12 years ago

new OIDtype, filename, will use the file name without any folders or file extension. Must be unique filenames in the collection. BasePlugin add_OID method returns if an id has already been set

  • Property svn:keywords set to Author Date Id Revision
File size: 46.4 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34use Unicode::Normalize 'normalize';
35
36use encodings;
37use unicode;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41
42use PrintInfo;
43
44BEGIN {
45 @BasePlugin::ISA = ( 'PrintInfo' );
46}
47
48# the different methods that can be applied when renaming
49# imported documents and their associated files
50our $file_rename_method_list =
51 [ { 'name' => "url",
52 'desc' => "{BasePlugin.rename_method.url}" },
53 { 'name' => "base64",
54 'desc' => "{BasePlugin.rename_method.base64}" },
55 { 'name' => "none",
56 'desc' => "{BasePlugin.rename_method.none}",
57 'hiddengli' => "yes" } ];
58
59our $encoding_list =
60 [ { 'name' => "ascii",
61 'desc' => "{BasePlugin.encoding.ascii}" },
62 { 'name' => "utf8",
63 'desc' => "{BasePlugin.encoding.utf8}" },
64 { 'name' => "unicode",
65 'desc' => "{BasePlugin.encoding.unicode}" } ];
66
67
68my $e = $encodings::encodings;
69foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
70{
71 my $hashEncode =
72 {'name' => $enc,
73 'desc' => $e->{$enc}->{'name'}};
74
75 push(@{$encoding_list},$hashEncode);
76}
77
78our $encoding_plus_auto_list =
79 [ { 'name' => "auto",
80 'desc' => "{BasePlugin.filename_encoding.auto}" },
81 { 'name' => "auto-language-analysis",
82 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
83 { 'name' => "auto-filesystem-encoding",
84 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
85 { 'name' => "auto-fl",
86 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
87 { 'name' => "auto-lf",
88 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
89
90push(@{$encoding_plus_auto_list},@{$encoding_list});
91
92our $oidtype_list =
93 [ { 'name' => "auto",
94 'desc' => "{BasePlugin.OIDtype.auto}" },
95 { 'name' => "hash",
96 'desc' => "{import.OIDtype.hash}" },
97 { 'name' => "hash_on_ga_xml",
98 'desc' => "{import.OIDtype.hash_on_ga_xml}" },
99 { 'name' => "assigned",
100 'desc' => "{import.OIDtype.assigned}" },
101 { 'name' => "incremental",
102 'desc' => "{import.OIDtype.incremental}" },
103 { 'name' => "filename",
104 'desc' => "{import.OIDtype.filename}" },
105 { 'name' => "dirname",
106 'desc' => "{import.OIDtype.dirname}" } ];
107
108my $arguments =
109 [ { 'name' => "process_exp",
110 'desc' => "{BasePlugin.process_exp}",
111 'type' => "regexp",
112 'deft' => "",
113 'reqd' => "no" },
114 { 'name' => "no_blocking",
115 'desc' => "{BasePlugin.no_blocking}",
116 'type' => "flag",
117 'reqd' => "no"},
118 { 'name' => "block_exp",
119 'desc' => "{BasePlugin.block_exp}",
120 'type' => "regexp",
121 'deft' => "",
122 'reqd' => "no" },
123 { 'name' => "store_original_file",
124 'desc' => "{BasePlugin.store_original_file}",
125 'type' => "flag",
126 'reqd' => "no" },
127 { 'name' => "associate_ext",
128 'desc' => "{BasePlugin.associate_ext}",
129 'type' => "string",
130 'reqd' => "no" },
131 { 'name' => "associate_tail_re",
132 'desc' => "{BasePlugin.associate_tail_re}",
133 'type' => "string",
134 'reqd' => "no" },
135 { 'name' => "OIDtype",
136 'desc' => "{import.OIDtype}",
137 'type' => "enum",
138 'list' => $oidtype_list,
139 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
140 'deft' => "auto",
141 'reqd' => "no" },
142 { 'name' => "OIDmetadata",
143 'desc' => "{import.OIDmetadata}",
144 'type' => "metadata",
145 'deft' => "dc.Identifier",
146 'reqd' => "no" },
147# { 'name' => "use_as_doc_identifier",
148# 'desc' => "{BasePlugin.use_as_doc_identifier}",
149# 'type' => "string",
150# 'reqd' => "no" ,
151# 'deft' => "" } ,
152 { 'name' => "no_cover_image",
153 'desc' => "{BasePlugin.no_cover_image}",
154 'type' => "flag",
155 'reqd' => "no" },
156 { 'name' => "filename_encoding",
157 'desc' => "{BasePlugin.filename_encoding}",
158 'type' => "enum",
159 'deft' => "auto",
160 'list' => $encoding_plus_auto_list,
161 'reqd' => "no" },
162 { 'name' => "smart_block",
163 'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
164 'type' => "flag",
165 'reqd' => "no",
166 'hiddengli' => "yes" }, # deprecated, but leave in for old collections
167 { 'name' => "file_rename_method",
168 'desc' => "{BasePlugin.file_rename_method}",
169 'type' => "enum",
170 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
171 'list' => $file_rename_method_list,
172 'reqd' => "no"
173 }
174
175 ];
176
177
178my $options = { 'name' => "BasePlugin",
179 'desc' => "{BasePlugin.desc}",
180 'abstract' => "yes",
181 'inherits' => "no",
182 'args' => $arguments };
183
184sub new {
185
186 my ($class) = shift (@_);
187 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
188 push(@$pluginlist, $class);
189
190 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
191 push(@{$hashArgOptLists->{"OptList"}},$options);
192
193 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
194
195 if ($self->{'info_only'}) {
196 # don't worry about any options etc
197 return bless $self, $class;
198 }
199
200 if ($self->{'smart_block'}) {
201 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
202 }
203 $self->{'smart_block'} = undef;
204
205 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
206 $self->{'plugin_type'} = $plugin_name;
207
208 # remove ex. from OIDmetadata iff it's the only namespace prefix
209 $self->{'OIDmetadata'} =~ s/^ex\.([^.]+)$/$1/ if defined $self->{'OIDmetadata'};
210 $self->{'num_processed'} = 0;
211 $self->{'num_not_processed'} = 0;
212 $self->{'num_blocked'} = 0;
213 $self->{'num_archives'} = 0;
214 $self->{'cover_image'} = 1; # cover image is on by default
215 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
216 $self->{'can_process_directories'} = 0;
217 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
218
219 my $associate_ext = $self->{'associate_ext'};
220 if ((defined $associate_ext) && ($associate_ext ne "")) {
221
222 my $associate_tail_re = $self->{'associate_tail_re'};
223 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
224 my $outhandle = $self->{'outhandle'};
225 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
226 print $outhandle " defaulting to 'associate_tail_re'\n";
227 }
228 else {
229 my @exts = split(/,/,$associate_ext);
230
231 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
232 my $associate_tail_re = join("|",@exts_bracketed);
233 $self->{'associate_tail_re'} = $associate_tail_re;
234 }
235
236 delete $self->{'associate_ext'};
237 }
238
239 return bless $self, $class;
240
241}
242
243sub merge_inheritance
244{
245 my $self = {};
246 my @child_selfs = @_;
247
248 foreach my $child_self (@child_selfs) {
249 foreach my $key (keys %$child_self) {
250 if (defined $self->{$key}) {
251 if ($self->{$key} ne $child_self->{$key}) {
252# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
253# print STDERR "Existing stored value = $self->{$key}\n";
254# print STDERR "New (child) value = $child_self->{$key}\n";
255# print STDERR "Keeping existing value\n";
256 # Existing value seems to be option specified in collect.cfg
257
258 ### $self->{$key} = $child_self->{$key};
259
260 }
261 else {
262## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
263 }
264
265 }
266 else {
267 $self->{$key} = $child_self->{$key};
268 }
269 }
270 }
271
272 return $self;
273}
274
275# initialize BasePlugin options
276# if init() is overridden in a sub-class, remember to call BasePlugin::init()
277sub init {
278 my $self = shift (@_);
279 my ($verbosity, $outhandle, $failhandle) = @_;
280
281 # verbosity is passed through from the processor
282 $self->{'verbosity'} = $verbosity;
283
284 # as are the outhandle and failhandle
285 $self->{'outhandle'} = $outhandle if defined $outhandle;
286 $self->{'failhandle'} = $failhandle;
287# $self->SUPER::init(@_);
288
289 # set process_exp and block_exp to defaults unless they were
290 # explicitly set
291
292 if ((!$self->is_recursive()) and
293 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
294
295 $self->{'process_exp'} = $self->get_default_process_exp ();
296 if ($self->{'process_exp'} eq "") {
297 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
298 }
299 }
300
301 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
302 $self->{'block_exp'} = $self->get_default_block_exp ();
303 }
304
305}
306
307sub begin {
308 my $self = shift (@_);
309 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
310
311 if ($self->{'OIDtype'} eq "auto") {
312 # hasn't been set in the plugin, use the processor values
313 $self->{'OIDtype'} = $processor->{'OIDtype'};
314 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
315 }
316 if ($self->{'OIDtype'} eq "hash") {
317 # should we hash on the file or on the doc xml??
318 $self->{'OIDtype'} = $self->get_oid_hash_type();
319 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
320 $self->{'OIDtype'} = "hash_on_file";
321 }
322 }
323}
324
325# This is called once if removeold is set with import.pl. Most plugins will do
326# nothing but if a plugin does any stuff outside of creating doc obj, then
327# it may need to clear something.
328sub remove_all {
329 my $self = shift (@_);
330 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
331}
332
333# This is called per document for docs that have been deleted from the
334# collection. Most plugins will do nothing
335# but if a plugin does any stuff outside of creating doc obj, then it may need
336# to clear something.
337sub remove_one {
338 my $self = shift (@_);
339
340 my ($file, $oids, $archivedir) = @_;
341 return 0 if $self->can_process_this_file($file);
342 return undef;
343}
344
345sub end {
346 # potentially called at the end of each plugin pass
347 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
348
349 my ($self) = shift (@_);
350}
351
352sub deinit {
353 # called only once, after all plugin passes have been done
354
355 my ($self) = @_;
356}
357
358# default hashing type is to hash on the original file (or converted file)
359# override this to return hash_on_ga_xml for filetypes where hashing on the
360# file is no good eg video
361sub get_oid_hash_type {
362
363 my $self = shift (@_);
364
365 return "hash_on_file";
366}
367
368
369# this function should be overridden to return 1
370# in recursive plugins
371sub is_recursive {
372 my $self = shift (@_);
373
374 return 0;
375}
376
377sub get_default_block_exp {
378 my $self = shift (@_);
379
380 return "";
381}
382
383sub get_default_process_exp {
384 my $self = shift (@_);
385
386 return "";
387}
388
389
390# rename imported files and assoc files using URL encoding by default
391# as this will work for most plugins and give more legible filenames
392sub get_default_file_rename_method() {
393 my $self = shift (@_);
394 return "url";
395}
396
397# returns this plugin's active (possibly user-selected) file_rename_method
398sub get_file_rename_method() {
399 my $self = shift (@_);
400 my $rename_method = $self->{'file_rename_method'};
401 if($rename_method) {
402 return $rename_method;
403 } else {
404 return $self->get_default_file_rename_method();
405 }
406}
407
408# default implementation is to do nothing
409sub store_block_files {
410
411 my $self =shift (@_);
412 my ($filename_full_path, $block_hash) = @_;
413
414}
415
416# put files to block into hash
417sub use_block_expressions {
418
419 my $self =shift (@_);
420 my ($filename_full_path, $block_hash) = @_;
421
422 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
423
424 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
425 &util::block_filename($block_hash,$filename_full_path);
426 }
427
428}
429
430#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
431sub block_cover_image
432{
433 my $self =shift;
434 my ($filename, $block_hash) = @_;
435
436 $filename = &util::upgrade_if_dos_filename($filename);
437
438 if ($self->{'cover_image'}) {
439 my $coverfile = $filename;
440 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
441
442 #if there is no file extension, coverfile will be the same as filename
443 return if $coverfile eq $filename;
444
445 if (!&util::fd_exists($coverfile)) {
446 $coverfile =~ s/jpg$/JPG/;
447 }
448 if (&util::fd_exists($coverfile)) {
449 &util::block_filename($block_hash,$coverfile);
450 }
451 }
452
453 return;
454}
455
456
457# discover all the files that should be blocked by this plugin
458# check the args ...
459sub file_block_read {
460
461 my $self = shift (@_);
462 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
463 # Keep track of filenames with same root but different extensions
464 # Used to support -associate_ext and the more generalised
465 # -associate_tail_re
466 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
467
468 if (!-d $filename_full_path) {
469 $block_hash->{'all_files'}->{$file} = 1;
470 }
471
472 my $associate_tail_re = $self->{'associate_tail_re'};
473 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
474 my ($file_prefix,$file_ext)
475 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
476 if ((defined $file_prefix) && (defined $file_ext)) {
477 my $shared_fileroot = $block_hash->{'shared_fileroot'};
478 if (!defined $shared_fileroot->{$file_prefix}) {
479 my $file_prefix_rec = { 'tie_to' => undef,
480 'exts' => {} };
481 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
482 }
483
484 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
485
486 if ($self->can_process_this_file($filename_full_path)) {
487 # This is the document the others should be tied to
488 $file_prefix_rec->{'tie_to'} = $file_ext;
489 }
490 else {
491 if ($file_ext =~ m/$associate_tail_re$/) {
492 # this file should be associated to the main one
493 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
494 }
495 }
496
497 }
498 }
499
500 # check block expressions
501 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
502
503 # now check whether we are actually processing this
504 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
505 return undef; # can't recognise
506 }
507
508 # if we have a block_exp, then this overrides the normal 'smart' blocking
509 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
510
511 # block the cover image if there is one
512 if ($self->{'cover_image'}) {
513 $self->block_cover_image($filename_full_path, $block_hash);
514 }
515
516 return 1;
517}
518
519# plugins that rely on more than process_exp (eg XML plugins) can override this method
520sub can_process_this_file {
521 my $self = shift(@_);
522 my ($filename) = @_;
523
524 if (-d $filename && !$self->{'can_process_directories'}) {
525 return 0;
526 }
527
528 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
529 return 1;
530 }
531 return 0;
532
533}
534
535# Even if a plugin can extract metadata in its metadata_read pass,
536# make the default return 'undef' so processing of the file continues
537# down the pipeline, so other plugins can also have the opportunity to
538# locate metadata and set it up in the extrametakeys variables that
539# are passed around.
540
541sub can_process_this_file_for_metadata {
542 my $self = shift(@_);
543
544 return undef;
545}
546
547
548# just converts path as is to utf8.
549sub filepath_to_utf8 {
550 my $self = shift (@_);
551 my ($file, $file_encoding) = @_;
552 my $filemeta = $file;
553
554 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
555
556 # Whenever filename-encoding is set to any of the auto settings, we
557 # check if the filename is already in UTF8. If it is, then we're done.
558 if($filename_encoding =~ m/auto/) {
559 if(&unicode::check_is_utf8($filemeta))
560 {
561 $filename_encoding = "utf8";
562 return $filemeta;
563 }
564 }
565
566 # Auto setting, but filename is not utf8
567 if ($filename_encoding eq "auto")
568 {
569 # try textcat
570 $filename_encoding = $self->textcat_encoding($filemeta);
571
572 # check the locale next
573 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
574
575
576 # now try the encoding of the document, if available
577 if ($filename_encoding eq "undefined" && defined $file_encoding) {
578 $filename_encoding = $file_encoding;
579 }
580
581 }
582
583 elsif ($filename_encoding eq "auto-language-analysis")
584 {
585 $filename_encoding = $self->textcat_encoding($filemeta);
586
587 # now try the encoding of the document, if available
588 if ($filename_encoding eq "undefined" && defined $file_encoding) {
589 $filename_encoding = $file_encoding;
590 }
591 }
592
593 elsif ($filename_encoding eq "auto-filesystem-encoding")
594 {
595 # try locale
596 $filename_encoding = $self->locale_encoding();
597 }
598
599 elsif ($filename_encoding eq "auto-fl")
600 {
601 # filesystem-encoding (locale) then language-analysis (textcat)
602 $filename_encoding = $self->locale_encoding();
603
604 # try textcat
605 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
606
607 # else assume filename encoding is encoding of file content, if that's available
608 if ($filename_encoding eq "undefined" && defined $file_encoding) {
609 $filename_encoding = $file_encoding;
610 }
611 }
612
613 elsif ($filename_encoding eq "auto-lf")
614 {
615 # language-analysis (textcat) then filesystem-encoding (locale)
616 $filename_encoding = $self->textcat_encoding($filemeta);
617
618 # guess filename encoding from encoding of file content, if available
619 if ($filename_encoding eq "undefined" && defined $file_encoding) {
620 $filename_encoding = $file_encoding;
621 }
622
623 # try locale
624 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
625 }
626
627 # if still undefined, use utf8 as fallback
628 if ($filename_encoding eq "undefined") {
629 $filename_encoding = "utf8";
630 }
631
632 #print STDERR "**** UTF8 encoding the filename $filemeta ";
633
634 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
635 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
636 # be always utf8 (in which case the filename's encoding is also set as utf8 even
637 # though the filename need not be if it originates from another system)--in such
638 # cases attempt to make the filename utf8 to match.
639 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
640 &unicode::ensure_utf8(\$filemeta);
641 }
642
643 # convert non-unicode encodings to utf8
644 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
645 $filemeta = &unicode::unicode2utf8(
646 &unicode::convert2unicode($filename_encoding, \$filemeta)
647 );
648 }
649
650 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
651 return $filemeta;
652}
653
654# gets the filename with no path, converts to utf8, and then dm safes it.
655# filename_encoding set by user
656sub filename_to_utf8_metadata
657{
658 my $self = shift (@_);
659 my ($file, $file_encoding) = @_;
660
661 my $outhandle = $self->{'outhandle'};
662
663 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
664 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
665 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
666
667
668 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
669 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
670
671 return $filemeta;
672}
673
674sub locale_encoding {
675 my $self = shift(@_);
676
677 if (!defined $self->{'filesystem_encoding'}) {
678 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
679 }
680
681 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
682 return $self->{'filesystem_encoding'}; # can be the string "undefined"
683}
684
685sub textcat_encoding {
686 my $self = shift(@_);
687 my ($filemeta) = @_;
688
689 # analyse filenames without extensions and digits (and trimmed of
690 # surrounding whitespace), so that irrelevant chars don't confuse
691 # textcat
692 my $strictfilemeta = $filemeta;
693 $strictfilemeta =~ s/\.[^\.]+$//g;
694 $strictfilemeta =~ s/\d//g;
695 $strictfilemeta =~ s/^\s*//g;
696 $strictfilemeta =~ s/\s*$//g;
697
698 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
699 if(!defined $filename_encoding) {
700 $filename_encoding = "undefined";
701 }
702
703 return $filename_encoding; # can be the string "undefined"
704}
705
706# performs textcat
707sub encoding_from_language_analysis {
708 my $self = shift(@_);
709 my ($text) = @_;
710
711 my $outhandle = $self->{'outhandle'};
712 my $best_encoding = undef;
713
714 # get the language/encoding of the textstring using textcat
715 require textcat; # Only load the textcat module if it is required
716 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
717 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
718
719
720 if (scalar @$results < 0) {
721 return undef;
722 }
723
724 # We have some results, we choose the first
725 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
726
727 $best_encoding = $encoding;
728 if (!defined $best_encoding) {
729 return undef;
730 }
731
732 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
733 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
734 $best_encoding = 'utf8';
735 }
736
737
738 # check for equivalents where textcat doesn't have some encodings...
739 # eg MS versions of standard encodings
740 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
741 my $iso = $1; # which variant of the iso standard?
742 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
743 if ($text =~ /[\x80-\x9f]/) {
744 # Western Europe
745 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
746 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
747 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
748 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
749 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
750 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
751 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
752 }
753 }
754
755 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
756 !defined $encodings::encodings->{$best_encoding})
757 {
758 if ($self->{'verbosity'}) {
759 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
760 }
761 $best_encoding = undef;
762 }
763
764 return $best_encoding;
765}
766
767# uses locale
768sub get_filesystem_encoding
769{
770
771 my $self = shift(@_);
772
773 my $outhandle = $self->{'outhandle'};
774 my $filesystem_encoding = undef;
775
776 eval {
777 # Works for Windows as well, returning the DOS code page in use
778 use POSIX qw(locale_h);
779
780 # With only one parameter, setlocale retrieves the
781 # current value
782 my $current_locale = setlocale(LC_CTYPE);
783
784 my $char_encoding = undef;
785 if ($current_locale =~ m/\./) {
786 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
787 $char_encoding = lc($char_encoding);
788 }
789 else {
790 if ($current_locale =~ m/^(posix|c)$/i) {
791 $char_encoding = "ascii";
792 }
793 }
794
795 if (defined $char_encoding) {
796 if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
797 $char_encoding = "$1\_$2\_$3";
798 }
799
800 $char_encoding =~ s/-/_/g;
801 $char_encoding =~ s/^utf_8$/utf8/;
802
803 if ($char_encoding =~ m/^\d+$/) {
804 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
805 $char_encoding = "windows_$char_encoding";
806 }
807 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
808 $char_encoding = "dos_$char_encoding";
809 }
810 }
811
812 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
813 || (defined $encodings::encodings->{$char_encoding})) {
814 $filesystem_encoding = $char_encoding;
815 }
816 else {
817 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
818 }
819 }
820
821
822 };
823 if ($@) {
824 print $outhandle "$@\n";
825 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
826
827 }
828
829 return $filesystem_encoding;
830}
831
832
833
834sub deduce_filename_encoding
835{
836 my $self = shift (@_);
837 my ($file,$metadata,$plugin_filename_encoding) = @_;
838
839 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
840 my $deduced_filename_encoding = undef;
841
842 # Start by looking for manually assigned metadata
843 if (defined $gs_filename_encoding) {
844 if (ref ($gs_filename_encoding) eq "ARRAY") {
845 my $outhandle = $self->{'outhandle'};
846
847 $deduced_filename_encoding = $gs_filename_encoding->[0];
848
849 my $num_vals = scalar(@$gs_filename_encoding);
850 if ($num_vals>1) {
851 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
852 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
853 }
854 }
855 else {
856 $deduced_filename_encoding = $gs_filename_encoding;
857 }
858 }
859
860 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
861 # Look to see if plugin specifies this value
862
863 if (defined $plugin_filename_encoding) {
864 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
865 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
866 my $outhandle = $self->{'outhandle'};
867 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
868 print $outhandle " default to 'auto'\n";
869 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
870 }
871
872 if ($plugin_filename_encoding ne "auto") {
873 # We've been given a specific filenamne encoding
874 # => so use it!
875 $deduced_filename_encoding = $plugin_filename_encoding;
876 }
877 }
878 }
879
880 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
881
882 # Look to file system to provide a character encoding
883
884 # If Windows NTFS, then -- assuming we work with long file names got through
885 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
886
887 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
888 # Can do better than working with the DOS character encoding returned by locale
889 $deduced_filename_encoding = "unicode";
890 }
891 else {
892 # Unix of some form or other
893
894 # See if we can determine the file system encoding through locale
895 $deduced_filename_encoding = $self->locale_encoding();
896
897 # if locale shows us filesystem is utf8, check to see filename is consistent
898 # => if not, then we have an "alien" filename on our hands
899
900 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
901 if (!&unicode::check_is_utf8($file)) {
902 # "alien" filename, so revert
903 $deduced_filename_encoding = undef;
904 }
905 }
906 }
907 }
908
909# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
910# # Last chance, apply textcat to deduce filename encoding
911# $deduced_filename_encoding = $self->textcat_encoding($file);
912# }
913
914 if ($self->{'verbosity'}>3) {
915 my $outhandle = $self->{'outhandle'};
916
917 if (defined $deduced_filename_encoding) {
918 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
919 }
920 else {
921 print $outhandle " No filename encoding deduced\n";
922 }
923 }
924
925 return $deduced_filename_encoding;
926}
927
928
929
930
931# Notionally written to be called once for each document, it is however safe to
932# call multiple times (as in the case of ImagePlugin) which calls this later on
933# after the original image has potentially been converted to a *new* source image
934# format (e.g. TIFF to PNG)
935
936sub set_Source_metadata {
937 my $self = shift (@_);
938 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
939
940 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
941 # and (as a fallback) using %xx if not for non-ascii chars
942 # 2. Sets the url ref (SourceFile) to the URL encoded version
943 # of filename for generated files
944
945 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
946
947 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
948
949 my $octet_file = $raw_file;
950
951 # UTF-8 version of filename
952# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
953# print STDERR "**** Setting Source Metadata given: $octet_file\n";
954# }
955
956 # Deal with (on Windows) raw filenames that are in their
957 # abbreviated DOS form
958
959 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
960 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
961 if (-e $raw_filename) {
962 my $unicode_filename = Win32::GetLongPathName($raw_filename);
963
964 my $unused_full_uf;
965 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
966 }
967 }
968 }
969
970 my $url_encoded_filename;
971 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
972 # => Generate a pretty print version of filename that is mapped to Unicode
973
974 # Use filename_encoding to map raw filename to a Perl unicode-aware string
975 $url_encoded_filename = decode($filename_encoding,$octet_file);
976 }
977 else {
978 # otherwise generate %xx encoded version of filename for char > 127
979 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
980 }
981
982# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
983# print STDERR "****** saving Source as: $url_encoded_filename\n";
984# }
985
986 # In the case of converted files and (generalized) exploded documents, there
987 # will already be a source filename => store as OrigSource before overriding
988 my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
989 if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
990 $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
991 }
992
993 # Source is the UTF8 display name - not necessarily the name of the file on the system
994 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
995 # on Darwin want all display strings to be in composed form, then can search on that
996 $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
997 }
998 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
999
1000
1001 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
1002 # If using URL encoding, then SourceFile is the url-reference to url-encoded
1003 # renamed_raw_url: it's a url that refers to the actual file on the system
1004 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
1005
1006 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
1007 $renamed_raw_url);
1008
1009# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1010# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
1011# }
1012}
1013
1014# this should be called by all plugins to set the oid of the doc obj, rather
1015# than calling doc_obj->set_OID directly
1016sub add_OID {
1017 my $self = shift (@_);
1018 my ($doc_obj) = @_;
1019
1020 # don't add one if there is one already set
1021 return unless $doc_obj->get_OID() =~ /^NULL$/;
1022 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
1023
1024 # see if there is a plugin specific set_OID function
1025 if (defined ($self->can('set_OID'))) {
1026 $self->set_OID(@_); # pass through doc_obj and any extra arguments
1027 }
1028 else {
1029 # use the default set_OID() in doc.pm
1030 $doc_obj->set_OID();
1031 }
1032
1033}
1034
1035# The BasePlugin read_into_doc_obj() function. This function does all the
1036# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
1037# a file and sets up a slew of metadata all saved in doc_obj, which
1038# it then returns as part of a tuple (process_status,doc_obj)
1039#
1040# Much of this functionality used to reside in read, but it was broken
1041# down into a supporting routine to make the code more flexible.
1042#
1043# recursive plugins (e.g. RecPlug) and specialized plugins like those
1044# capable of processing many documents within a single file (e.g.
1045# GMLPlug) will normally want to implement their own version of
1046# read_into_doc_obj()
1047#
1048# Note that $base_dir might be "" and that $file might
1049# include directories
1050
1051# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
1052sub read_into_doc_obj {
1053 my $self = shift (@_);
1054 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1055
1056 my $outhandle = $self->{'outhandle'};
1057
1058 # should we move this to read? What about secondary plugins?
1059 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
1060 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
1061 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
1062 if $self->{'verbosity'} > 1;
1063
1064 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1065
1066 # create a new document
1067 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
1068 my $top_section = $doc_obj->get_top_section();
1069
1070 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
1071 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
1072
1073
1074 my $plugin_filename_encoding = $self->{'filename_encoding'};
1075 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
1076 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
1077
1078 # plugin specific stuff - what args do we need here??
1079 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
1080 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
1081 return -1;
1082 }
1083
1084 # include any metadata passed in from previous plugins
1085 # note that this metadata is associated with the top level section
1086 my $section = $doc_obj->get_top_section();
1087 # can we merge these two methods??
1088 $self->add_associated_files($doc_obj, $filename_full_path);
1089 $self->extra_metadata ($doc_obj, $section, $metadata);
1090 $self->auto_extract_metadata($doc_obj);
1091
1092 # if we haven't found any Title so far, assign one
1093 # this was shifted to here from inside read()
1094 $self->title_fallback($doc_obj,$section,$filename_no_path);
1095
1096 $self->add_OID($doc_obj);
1097
1098 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
1099 return (1,$doc_obj);
1100}
1101
1102sub post_process_doc_obj {
1103 my $self = shift (@_);
1104 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1105
1106 return 1;
1107}
1108
1109sub add_dummy_text {
1110 my $self = shift(@_);
1111 my ($doc_obj, $section) = @_;
1112
1113 # add NoText metadata so we can hide this dummy text in format statements
1114 $doc_obj->add_metadata($section, "NoText", "1");
1115 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
1116
1117}
1118
1119# does nothing. Can be overridden by subclass
1120sub auto_extract_metadata {
1121 my $self = shift(@_);
1122 my ($doc_obj) = @_;
1123}
1124
1125# adds cover image, associate_file options stuff. Should be called by sub class
1126# read_into_doc_obj
1127sub add_associated_files {
1128 my $self = shift(@_);
1129 # whatis filename??
1130 my ($doc_obj, $filename) = @_;
1131
1132 # add in the cover image
1133 if ($self->{'cover_image'}) {
1134 $self->associate_cover_image($doc_obj, $filename);
1135 }
1136 # store the original (used for eg TextPlugin to store the original for OAI)
1137 if ($self->{'store_original_file'}) {
1138 $self->associate_source_file($doc_obj, $filename);
1139 }
1140
1141
1142}
1143
1144# implement this if you are extracting metadata for other documents
1145sub metadata_read {
1146 my $self = shift (@_);
1147 my ($pluginfo, $base_dir, $file, $block_hash,
1148 $extrametakeys, $extrametadata, $extrametafile,
1149 $processor, $gli, $aux) = @_;
1150
1151 # can we process this file??
1152 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1153 return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
1154
1155 return 1; # we recognise the file, but don't actually do anything with it
1156}
1157
1158
1159# The BasePlugin read() function. This function calls read_into_doc_obj()
1160# to ensure all the right things to make general options work for a
1161# given plugin are done. It then calls the process() function which
1162# does all the work specific to a plugin (like the old read functions
1163# used to do). Most plugins should define their own process() function
1164# and let this read() function keep control.
1165#
1166# recursive plugins (e.g. RecPlug) and specialized plugins like those
1167# capable of processing many documents within a single file (e.g.
1168# GMLPlug) might want to implement their own version of read(), but
1169# more likely need to implement their own version of read_into_doc_obj()
1170#
1171# Return number of files processed, undef if can't recognise, -1 if can't
1172# process
1173
1174sub read {
1175 my $self = shift (@_);
1176 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1177
1178 # can we process this file??
1179 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
1180
1181 return undef unless $self->can_process_this_file($filename_full_path);
1182
1183 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1184
1185 if ((defined $process_status) && ($process_status == 1)) {
1186
1187 # process the document
1188 $processor->process($doc_obj);
1189
1190 $self->{'num_processed'} ++;
1191 undef $doc_obj;
1192 }
1193 # delete any temp files that we may have created
1194 $self->clean_up_after_doc_obj_processing();
1195
1196
1197 # if process_status == 1, then the file has been processed.
1198 return $process_status;
1199
1200}
1201
1202# returns undef if file is rejected by the plugin
1203sub process {
1204 my $self = shift (@_);
1205 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1206
1207 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n");
1208
1209 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1210 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1211
1212 die "\n";
1213
1214 return undef; # never gets here
1215}
1216
1217# overwrite this method to delete any temp files that we have created
1218sub clean_up_after_doc_obj_processing {
1219 my $self = shift(@_);
1220
1221}
1222
1223# write_file -- used by ConvertToPlug, for example in post processing
1224#
1225# where should this go, is here the best place??
1226sub utf8_write_file {
1227 my $self = shift (@_);
1228 my ($textref, $filename) = @_;
1229
1230 if (!open (FILE, ">:utf8", $filename)) {
1231 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1232 die "\n";
1233 }
1234 print FILE $$textref;
1235
1236 close FILE;
1237}
1238
1239
1240sub filename_based_title
1241{
1242 my $self = shift (@_);
1243 my ($file) = @_;
1244
1245 my $file_derived_title = $file;
1246 $file_derived_title =~ s/_/ /g;
1247 $file_derived_title =~ s/\.[^.]+$//;
1248
1249 return $file_derived_title;
1250}
1251
1252
1253sub title_fallback
1254{
1255 my $self = shift (@_);
1256 my ($doc_obj,$section,$file) = @_;
1257
1258 if (!defined $doc_obj->get_metadata_element ($section, "Title")
1259 || $doc_obj->get_metadata_element($section, "Title") eq "") {
1260
1261 my $source_file = $doc_obj->get_metadata_element($section, "Source");
1262 my $file_derived_title;
1263 if (defined $source_file) {
1264 $file_derived_title = $self->filename_based_title($source_file);
1265 }
1266 else {
1267 # pp = pretty print
1268 my $pp_file = (defined $source_file) ? $source_file : $file;
1269
1270 my $raw_title = $self->filename_based_title($file);
1271 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
1272 }
1273
1274
1275 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1276 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
1277 }
1278 else {
1279 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
1280 }
1281 }
1282
1283}
1284
1285# add any extra metadata that's been passed around from one
1286# plugin to another.
1287# extra_metadata uses add_utf8_metadata so it expects metadata values
1288# to already be in utf8
1289sub extra_metadata {
1290 my $self = shift (@_);
1291 my ($doc_obj, $cursection, $metadata) = @_;
1292
1293 my $associate_tail_re = $self->{'associate_tail_re'};
1294
1295 foreach my $field (keys(%$metadata)) {
1296 # $metadata->{$field} may be an array reference
1297 if ($field eq "gsdlassocfile_tobe") {
1298 # 'gsdlassocfile_tobe' is artificially introduced metadata
1299 # that is used to signal that certain additional files should
1300 # be tied to this document. Useful in situations where a
1301 # metadata pass in the plugin pipeline works out some files
1302 # need to be associated with a document, but the document hasn't
1303 # been formed yet.
1304 my $equiv_form = "";
1305 foreach my $gaf (@{$metadata->{$field}}) {
1306 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1307 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1308
1309 # we need to make sure the filename is valid utf-8 - we do
1310 # this by url or base64 encoding it
1311 # $tail_filename is the name that we store the file as
1312 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
1313 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1314 $doc_obj->associate_source_file($full_filename);
1315 # If the filename is url_encoded, we need to encode the % signs
1316 # in the filename, so that it works in a url
1317 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
1318 # work out extended tail extension (i.e. matching tail re)
1319
1320 my ($file_prefix,$file_extended_ext)
1321 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
1322 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1323 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1324
1325 # the greenstone 2 stuff
1326 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
1327 #my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1328 my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
1329
1330 my $srcicon = "_icon".$doc_ext."_";
1331 my $end_doclink = "</a>";
1332
1333 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1334
1335
1336 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
1337 # for metadata such as [mp3._edited] [mp3._full] ...
1338 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1339 }
1340
1341 # for multiple metadata such as [mp3.assoclink]
1342 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1343
1344 $equiv_form .= " $assoc_form";
1345
1346 # following are used for greenstone 3,
1347 $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink_gs3);
1348 $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
1349 $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
1350
1351 }
1352 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1353 }
1354 elsif ($field eq "gsdlzipfilename") {
1355 # special case for when files have come out of a zip. source_path
1356 # (used for archives dbs and keeping track for incremental import)
1357 # must be set to the zip file name
1358 my $zip_filename = $metadata->{$field};
1359 # overwrite the source_path
1360 $doc_obj->set_source_path($zip_filename);
1361 # and set the metadata
1362 $zip_filename = &util::filename_within_collection($zip_filename);
1363 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
1364 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
1365 }
1366 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1367 map {
1368 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1369 } @{$metadata->{$field}};
1370 } else {
1371 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1372 }
1373 }
1374}
1375
1376
1377sub compile_stats {
1378 my $self = shift(@_);
1379 my ($stats) = @_;
1380
1381 $stats->{'num_processed'} += $self->{'num_processed'};
1382 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1383 $stats->{'num_archives'} += $self->{'num_archives'};
1384
1385}
1386sub associate_source_file {
1387 my $self = shift(@_);
1388
1389 my ($doc_obj, $filename) = @_;
1390 my $cursection = $doc_obj->get_top_section();
1391 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1392
1393 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1394 # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
1395 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1396 $doc_obj->add_utf8_metadata ($cursection, "srclinkFile", $doc_obj->get_sourcefile());
1397}
1398
1399sub associate_cover_image {
1400 my $self = shift(@_);
1401 my ($doc_obj, $filename) = @_;
1402
1403 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1404
1405 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1406 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1407
1408 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1409 # don't stat() for existence e.g. for multiple document input files
1410 # (eg SplitPlug)
1411 return;
1412 }
1413
1414 my $top_section=$doc_obj->get_top_section();
1415
1416 if (&util::fd_exists($upgraded_filename)) {
1417 $doc_obj->associate_source_file($filename);
1418 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1419 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1420 } else {
1421 my $upper_filename = $filename;
1422 my $upgraded_upper_filename = $upgraded_filename;
1423
1424 $upper_filename =~ s/jpg$/JPG/;
1425 $upgraded_upper_filename =~ s/jpg$/JPG/;
1426
1427 if (&util::fd_exists($upgraded_upper_filename)) {
1428 $doc_obj->associate_source_file($upper_filename);
1429 $doc_obj->associate_file($upper_filename, "cover.jpg",
1430 "image/jpeg");
1431 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1432 } else {
1433 # file doesn't exist, so record the fact that it's missing so
1434 # we don't stat() again (stat is slow)
1435 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1436 }
1437 }
1438
1439}
1440
1441
1442# Overridden by exploding plugins (eg. ISISPlug)
1443sub clean_up_after_exploding
1444{
1445 my $self = shift(@_);
1446}
1447
1448
1449
14501;
Note: See TracBrowser for help on using the repository browser.