source: main/trunk/greenstone2/perllib/plugins/BaseImporter.pm@ 37048

Last change on this file since 37048 was 37048, checked in by davidb, 16 months ago

Useful support routine added that only sets the document field to say it has no text if the text field is empty. This routine helps plugins such as ImagePlugin that never used to have any text and would therefore set this field to 'no text'. Newer work such as GoogleVisionImagePlugin in Inherits from ImagePlugin and as a result of calling the Google Visions API can now find text in a image. This new support routine helps with setting dummy text on when needed, but if there is evidence of text that is stored in 'doc_obj' then it does not set dummy text; there is also a useful debugging statement added that prints out which *inheritend* plugin in checking the process expression -- commented out

  • Property svn:keywords set to Author Date Id Revision
File size: 36.4 KB
Line 
1###########################################################################
2#
3# BaseImporter.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BaseImporter;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34use Unicode::Normalize 'normalize';
35
36use encodings;
37use unicode;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41use util;
42use FileUtils;
43
44use CommonUtil;
45
46BEGIN {
47 @BaseImporter::ISA = ( 'CommonUtil' );
48}
49
50# the different methods that can be applied when renaming
51# imported documents and their associated files
52our $file_rename_method_list =
53 [ { 'name' => "url",
54 'desc' => "{BaseImporter.rename_method.url}" },
55 { 'name' => "base64",
56 'desc' => "{BaseImporter.rename_method.base64}" },
57 { 'name' => "none",
58 'desc' => "{BaseImporter.rename_method.none}",
59 'hiddengli' => "yes" } ];
60
61# here went encoding list stuff
62
63our $oidtype_list =
64 [ { 'name' => "auto",
65 'desc' => "{BaseImporter.OIDtype.auto}" },
66 { 'name' => "hash",
67 'desc' => "{import.OIDtype.hash}" },
68 { 'name' => "hash_on_ga_xml",
69 'desc' => "{import.OIDtype.hash_on_ga_xml}" },
70 { 'name' => "hash_on_full_filename",
71 'desc' => "{import.OIDtype.hash_on_full_filename}" },
72 { 'name' => "assigned",
73 'desc' => "{import.OIDtype.assigned}" },
74 { 'name' => "incremental",
75 'desc' => "{import.OIDtype.incremental}" },
76 { 'name' => "filename",
77 'desc' => "{import.OIDtype.filename}" },
78 { 'name' => "dirname",
79 'desc' => "{import.OIDtype.dirname}" },
80 { 'name' => "full_filename",
81 'desc' => "{import.OIDtype.full_filename}" } ];
82
83my $arguments =
84 [ { 'name' => "process_exp",
85 'desc' => "{BaseImporter.process_exp}",
86 'type' => "regexp",
87 'deft' => "",
88 'reqd' => "no" },
89 { 'name' => "store_original_file",
90 'desc' => "{BaseImporter.store_original_file}",
91 'type' => "flag",
92 'reqd' => "no" },
93 { 'name' => "associate_ext",
94 'desc' => "{BaseImporter.associate_ext}",
95 'type' => "string",
96 'reqd' => "no" },
97 { 'name' => "associate_tail_re",
98 'desc' => "{BaseImporter.associate_tail_re}",
99 'type' => "string",
100 'reqd' => "no" },
101 { 'name' => "OIDtype",
102 'desc' => "{import.OIDtype}",
103 'type' => "enum",
104 'list' => $oidtype_list,
105 # leave default empty so we can tell if its been set or not - if not set will use option from import.pl
106 'deft' => "auto",
107 'reqd' => "no" },
108 { 'name' => "OIDmetadata",
109 'desc' => "{import.OIDmetadata}",
110 'type' => "metadata",
111 'deft' => "dc.Identifier",
112 'reqd' => "no" },
113# { 'name' => "use_as_doc_identifier",
114# 'desc' => "{BaseImporter.use_as_doc_identifier}",
115# 'type' => "string",
116# 'reqd' => "no" ,
117# 'deft' => "" } ,
118 { 'name' => "no_cover_image",
119 'desc' => "{BaseImporter.no_cover_image}",
120 'type' => "flag",
121 'reqd' => "no" },
122 { 'name' => "file_rename_method",
123 'desc' => "{BaseImporter.file_rename_method}",
124 'type' => "enum",
125 'deft' => &get_default_file_rename_method(), # by default rename imported files and assoc files using this encoding
126 'list' => $file_rename_method_list,
127 'reqd' => "no"
128 }
129
130 # These options take a metadata field, and store its values as individual
131 #fields with true value.
132 #eg wmtb.TKLabel=tk.Clan,tk.Attribution =>tk.Clan=true, tk.Attribution=true.
133 #Implemented for tk lables, but then not needed as this is handled by
134 # xslt instead. Leaving this here in case useful in the future
135# { 'name' => "store_metadata_values_as_fields_for",
136# 'desc' => "For these (comma-separated list of) fields, take all values and store them as metadata fields in their own write, value=true ",
137# 'type' => "metadata",
138# 'reqd' => "no" },
139# { 'name' => "namespace_for_new_fields",
140# 'desc' => "add the specified namespace to the new fields (comma separated list, corresponding to the fields above",
141# 'type'=> "string",
142# 'reqd' => "no",
143# 'deft' => ""},
144# { 'name' => "include_section_level",
145# 'desc' => "do we look for this metadata at subsection level too?",
146# 'type' => "string",
147# 'deft' => "false"}
148 ];
149
150
151my $options = { 'name' => "BaseImporter",
152 'desc' => "{BaseImporter.desc}",
153 'abstract' => "yes",
154 'inherits' => "yes",
155 'args' => $arguments };
156
157sub new {
158
159 my ($class) = shift (@_);
160 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
161 push(@$pluginlist, $class);
162
163 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
164 push(@{$hashArgOptLists->{"OptList"}},$options);
165
166 my $self = new CommonUtil($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
167
168 if ($self->{'info_only'}) {
169 # don't worry about any options etc
170 return bless $self, $class;
171 }
172
173 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
174 $self->{'plugin_type'} = $plugin_name;
175
176 # remove ex. from OIDmetadata iff it's the only namespace prefix
177 $self->{'OIDmetadata'} =~ s/^ex\.([^.]+)$/$1/ if defined $self->{'OIDmetadata'};
178 $self->{'num_processed'} = 0;
179 $self->{'num_not_processed'} = 0;
180 $self->{'num_blocked'} = 0;
181 $self->{'num_archives'} = 0;
182 $self->{'cover_image'} = 1; # cover image is on by default
183 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
184 $self->{'can_process_directories'} = 0;
185 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
186
187 my $associate_ext = $self->{'associate_ext'};
188 if ((defined $associate_ext) && ($associate_ext ne "")) {
189
190 my $associate_tail_re = $self->{'associate_tail_re'};
191 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
192 my $outhandle = $self->{'outhandle'};
193 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
194 print $outhandle " defaulting to 'associate_tail_re'\n";
195 }
196 else {
197 my @exts = split(/,/,$associate_ext);
198
199 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
200 my $associate_tail_re = join("|",@exts_bracketed);
201 $self->{'associate_tail_re'} = $associate_tail_re;
202 }
203
204 delete $self->{'associate_ext'};
205 }
206
207 return bless $self, $class;
208
209}
210
211sub merge_inheritance
212{
213 my $self = {};
214 my @child_selfs = @_;
215
216 foreach my $child_self (@child_selfs) {
217 foreach my $key (keys %$child_self) {
218 if (defined $self->{$key}) {
219 if ($self->{$key} ne $child_self->{$key}) {
220# print STDERR "Warning: Conflicting value in multiple inheritance for '$key'\n";
221# print STDERR "Existing stored value = $self->{$key}\n";
222# print STDERR "New (child) value = $child_self->{$key}\n";
223# print STDERR "Keeping existing value\n";
224 # Existing value seems to be option specified in collect.cfg
225
226 ### $self->{$key} = $child_self->{$key};
227
228 }
229 else {
230## print STDERR "****Info: Value $self->{$key} for $key already defined through multiple inheritance as the same value\n";
231 }
232
233 }
234 else {
235 $self->{$key} = $child_self->{$key};
236 }
237 }
238 }
239
240 return $self;
241}
242
243# initialize BaseImporter options
244# if init() is overridden in a sub-class, remember to call BaseImporter::init()
245sub init {
246 my $self = shift (@_);
247 my ($verbosity, $outhandle, $failhandle) = @_;
248
249 $self->SUPER::init(@_);
250
251 # set process_exp and block_exp to defaults unless they were
252 # explicitly set
253
254 if ((!$self->is_recursive()) and
255 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
256
257 $self->{'process_exp'} = $self->get_default_process_exp ();
258 if ($self->{'process_exp'} eq "") {
259 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
260 }
261 }
262
263 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
264 $self->{'block_exp'} = $self->get_default_block_exp ();
265 }
266
267}
268
269sub begin {
270 my $self = shift (@_);
271 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
272
273 if ($self->{'OIDtype'} eq "auto") {
274 # hasn't been set in the plugin, use the processor values
275 $self->{'OIDtype'} = $processor->{'OIDtype'};
276 $self->{'OIDmetadata'} = $processor->{'OIDmetadata'};
277 }
278 if ($self->{'OIDtype'} eq "hash") {
279 # should we hash on the file or on the doc xml??
280 $self->{'OIDtype'} = $self->get_oid_hash_type();
281 if ($self->{'OIDtype'} !~ /^(hash_on_file|hash_on_ga_xml)$/) {
282 $self->{'OIDtype'} = "hash_on_file";
283 }
284 }
285}
286
287# This is called once if removeold is set with import.pl. Most plugins will do
288# nothing but if a plugin does any stuff outside of creating doc obj, then
289# it may need to clear something.
290sub remove_all {
291 my $self = shift (@_);
292 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
293}
294
295# This is called per document for docs that have been deleted from the
296# collection. Most plugins will do nothing
297# but if a plugin does any stuff outside of creating doc obj, then it may need
298# to clear something.
299sub remove_one {
300 my $self = shift (@_);
301
302 my ($file, $oids, $archivedir) = @_;
303 return 0 if $self->can_process_this_file($file);
304 return undef;
305}
306
307sub end {
308 # potentially called at the end of each plugin pass
309 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
310
311 my ($self) = shift (@_);
312}
313
314sub deinit {
315 # called only once, after all plugin passes have been done
316
317 my ($self) = @_;
318}
319
320# default hashing type is to hash on the original file (or converted file)
321# override this to return hash_on_ga_xml for filetypes where hashing on the
322# file is no good eg video
323sub get_oid_hash_type {
324
325 my $self = shift (@_);
326
327 return "hash_on_file";
328}
329
330
331# this function should be overridden to return 1
332# in recursive plugins
333sub is_recursive {
334 my $self = shift (@_);
335
336 return 0;
337}
338
339sub get_default_block_exp {
340 my $self = shift (@_);
341
342 return "";
343}
344
345sub get_default_process_exp {
346 my $self = shift (@_);
347
348 return "";
349}
350
351
352# rename imported files and assoc files using URL encoding by default
353# as this will work for most plugins and give more legible filenames
354sub get_default_file_rename_method() {
355 my $self = shift (@_);
356 return "url";
357}
358
359# returns this plugin's active (possibly user-selected) file_rename_method
360sub get_file_rename_method() {
361 my $self = shift (@_);
362 my $rename_method = $self->{'file_rename_method'};
363 if($rename_method) {
364 return $rename_method;
365 } else {
366 return $self->get_default_file_rename_method();
367 }
368}
369
370# default implementation is to do nothing
371sub store_block_files {
372
373 my $self =shift (@_);
374 my ($filename_full_path, $block_hash) = @_;
375
376}
377
378# put files to block into hash
379sub use_block_expressions {
380
381 my $self =shift (@_);
382 my ($filename_full_path, $block_hash) = @_;
383
384 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
385
386 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
387 $self->block_filename($block_hash,$filename_full_path);
388 }
389
390}
391
392#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
393sub block_cover_image
394{
395 my $self =shift;
396 my ($filename, $block_hash) = @_;
397
398 $filename = &util::upgrade_if_dos_filename($filename);
399
400 if ($self->{'cover_image'}) {
401 my $coverfile = $filename;
402 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
403
404 #if there is no file extension, coverfile will be the same as filename
405 return if $coverfile eq $filename;
406
407 if (!&FileUtils::fileExists($coverfile)) {
408 $coverfile =~ s/jpg$/JPG/;
409 }
410 if (&FileUtils::fileExists($coverfile)) {
411 $self->block_filename($block_hash,$coverfile);
412 }
413 }
414
415 return;
416}
417
418
419# discover all the files that should be blocked by this plugin
420# check the args ...
421sub file_block_read {
422
423 my $self = shift (@_);
424 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
425 # Keep track of filenames with same root but different extensions
426 # Used to support -associate_ext and the more generalised
427 # -associate_tail_re
428 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
429
430 if (!-d $filename_full_path) {
431 $block_hash->{'all_files'}->{$file} = 1;
432 }
433
434 my $associate_tail_re = $self->{'associate_tail_re'};
435 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
436 my ($file_prefix,$file_ext)
437 = &util::get_prefix_and_tail_by_regex($filename_full_path,$associate_tail_re);
438 if ((defined $file_prefix) && (defined $file_ext)) {
439 my $shared_fileroot = $block_hash->{'shared_fileroot'};
440 if (!defined $shared_fileroot->{$file_prefix}) {
441 my $file_prefix_rec = { 'tie_to' => undef,
442 'exts' => {} };
443 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
444 }
445
446 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
447
448 if ($self->can_process_this_file($filename_full_path) && $file_ext !~ m/.\./) {
449 # This is the document the others should be tied to
450 $file_prefix_rec->{'tie_to'} = $file_ext;
451 }
452 else {
453 if ($file_ext =~ m/$associate_tail_re$/) {
454 # this file should be associated to the main one
455 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
456 }
457 }
458
459 }
460 }
461
462 # check block expressions
463 $self->use_block_expressions($filename_full_path, $block_hash) unless $self->{'no_blocking'};
464
465 # now check whether we are actually processing this
466 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
467 return undef; # can't recognise
468 }
469
470 # if we have a block_exp, then this overrides the normal 'smart' blocking
471 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne "");
472
473 # block the cover image if there is one
474 if ($self->{'cover_image'}) {
475 $self->block_cover_image($filename_full_path, $block_hash);
476 }
477
478 return 1;
479}
480
481# plugins that rely on more than process_exp (eg XML plugins) can override this method
482sub can_process_this_file {
483 my $self = shift(@_);
484 my ($filename) = @_;
485
486 if (-d $filename && !$self->{'can_process_directories'}) {
487 return 0;
488 }
489
490 # print STDERR "**** BaseImport::can_process_this_file(): ", ref($self), " checking $filename =~ /$self->{'process_exp'}/\n";
491
492 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
493 return 1;
494 }
495 return 0;
496
497}
498
499# Even if a plugin can extract metadata in its metadata_read pass,
500# make the default return 'undef' so processing of the file continues
501# down the pipeline, so other plugins can also have the opportunity to
502# locate metadata and set it up in the extrametakeys variables that
503# are passed around.
504
505sub can_process_this_file_for_metadata {
506 my $self = shift(@_);
507
508 return undef;
509}
510
511
512
513# Notionally written to be called once for each document, it is however safe to
514# call multiple times (as in the case of ImagePlugin) which calls this later on
515# after the original image has potentially been converted to a *new* source image
516# format (e.g. TIFF to PNG)
517
518sub set_Source_metadata {
519 my $self = shift (@_);
520 my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;
521
522 # 1. Sets the filename (Source) for display encoded as Unicode if possible,
523 # and (as a fallback) using %xx if not for non-ascii chars
524 # 2. Sets the url ref (SourceFile) to the URL encoded version
525 # of filename for generated files
526
527 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
528
529 my $this_section = (defined $section)? $section : $doc_obj->get_top_section();
530
531 my $octet_file = $raw_file;
532
533 # UTF-8 version of filename
534# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
535# print STDERR "**** Setting Source Metadata given: $octet_file\n";
536# }
537
538 # Deal with (on Windows) raw filenames that are in their
539 # abbreviated DOS form
540
541 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
542 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
543 if (-e $raw_filename) {
544 my $unicode_filename = Win32::GetLongPathName($raw_filename);
545
546 my $unused_full_uf;
547 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
548 }
549 }
550 }
551
552 my $url_encoded_filename;
553 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
554 # => Generate a pretty print version of filename that is mapped to Unicode
555
556 # Use filename_encoding to map raw filename to a Perl unicode-aware string
557 $url_encoded_filename = decode($filename_encoding,$octet_file);
558 }
559 else {
560 # otherwise generate %xx encoded version of filename for char > 127
561 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
562 }
563
564# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
565# print STDERR "****** saving Source as: $url_encoded_filename\n";
566# }
567
568 # In the case of converted files and (generalized) exploded documents, there
569 # will already be a source filename => store as OrigSource before overriding
570 my $orig_source = $doc_obj->get_metadata_element ($this_section, "Source");
571 if ((defined $orig_source) && ($orig_source !~ m/^\s*$/)) {
572 $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
573 }
574
575 # Source is the UTF8 display name - not necessarily the name of the file on the system
576 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
577 # on Darwin want all display strings to be in composed form, then can search on that
578 $url_encoded_filename = normalize('C', $url_encoded_filename); # Normalisation Form 'C' (composition)
579 }
580 # set_utf8_metadata actually sets perl unicode aware strings. not utf8
581 $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);
582
583
584 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
585 # If using URL encoding, then SourceFile is the url-reference to url-encoded
586 # renamed_raw_url: it's a url that refers to the actual file on the system
587 # this call just replaces % with %25
588 my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
589
590 $doc_obj->set_utf8_metadata_element($this_section, "SourceFile",
591 $renamed_raw_url);
592
593# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
594# print STDERR "****** saving SourceFile as: $renamed_raw_url\n";
595# }
596}
597
598
599# this should be called by all plugins to set the oid of the doc obj, rather
600# than calling doc_obj->set_OID directly
601sub add_OID {
602 my $self = shift (@_);
603 my ($doc_obj, $force) = @_;
604
605 # don't add one if there is one already set, unless we are forced to do so
606 return unless ($doc_obj->get_OID() =~ /^NULL$/ || $force);
607 $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});
608
609 # see if there is a plugin specific set_OID function
610 if (defined ($self->can('set_OID'))) {
611 $self->set_OID(@_); # pass through doc_obj and any extra arguments
612 }
613 else {
614 # use the default set_OID() in doc.pm
615 $doc_obj->set_OID();
616 }
617
618}
619
620# The BaseImporter read_into_doc_obj() function. This function does all the
621# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
622# a file and sets up a slew of metadata all saved in doc_obj, which
623# it then returns as part of a tuple (process_status,doc_obj)
624#
625# Much of this functionality used to reside in read, but it was broken
626# down into a supporting routine to make the code more flexible.
627#
628# recursive plugins (e.g. RecPlug) and specialized plugins like those
629# capable of processing many documents within a single file (e.g.
630# GMLPlug) will normally want to implement their own version of
631# read_into_doc_obj()
632#
633# Note that $base_dir might be "" and that $file might
634# include directories
635
636# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
637sub read_into_doc_obj {
638 my $self = shift (@_);
639 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
640
641 my $outhandle = $self->{'outhandle'};
642
643 # should we move this to read? What about secondary plugins?
644 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
645 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
646 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
647 if $self->{'verbosity'} > 1;
648
649 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
650
651 # create a new document
652 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
653 my $top_section = $doc_obj->get_top_section();
654
655 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
656 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
657
658
659 my $plugin_filename_encoding = $self->{'filename_encoding'};
660 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
661 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$top_section);
662
663 # plugin specific stuff - what args do we need here??
664 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
665 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
666 return -1;
667 }
668
669 # include any metadata passed in from previous plugins
670 # note that this metadata is associated with the top level section
671 my $section = $doc_obj->get_top_section();
672 # can we merge these two methods??
673 $self->add_associated_files($doc_obj, $filename_full_path);
674 $self->extra_metadata ($doc_obj, $section, $metadata);
675 $self->auto_extract_metadata($doc_obj);
676
677 # to use tk label options commented out at top
678 #$self->convert_special_metadata($doc_obj);
679
680 # if we haven't found any Title so far, assign one
681 # this was shifted to here from inside read()
682 $self->title_fallback($doc_obj,$section,$filename_no_path);
683
684 $self->add_OID($doc_obj);
685
686 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
687 return (1,$doc_obj);
688}
689
690sub post_process_doc_obj {
691 my $self = shift (@_);
692 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
693
694 return 1;
695}
696
697sub add_dummy_text {
698 my $self = shift(@_);
699 my ($doc_obj, $section) = @_;
700
701 # add NoText metadata so we can hide this dummy text in format statements
702 $doc_obj->add_metadata($section, "NoText", "1");
703
704 # lookup_string with extra '1' arg returns perl internal unicode aware text, so we use add_utf8_text so no encoding is done on it.
705 $doc_obj->add_utf8_text($section, &gsprintf::lookup_string("{BaseImporter.dummy_text}",1));
706 #$doc_obj->add_text($section, &gsprintf::lookup_string("{BaseImporter.dummy_text}",1));
707}
708
709sub add_dummy_text_if_empty {
710 my $self = shift(@_);
711 my ($doc_obj, $section) = @_;
712
713 my $section_text_len = $doc_obj->get_text_length($section);
714
715 if ($section_text_len == 0) {
716 $self->add_dummy_text($doc_obj,$section);
717 }
718}
719
720# does nothing. Can be overridden by subclass
721sub auto_extract_metadata {
722 my $self = shift(@_);
723 my ($doc_obj) = @_;
724}
725
726# used with the tk label options commented out at the start
727sub convert_special_metadata {
728 my $self = shift(@_);
729 my ($doc_obj) = @_;
730 print STDERR "in convert special meta\n";
731 return unless defined $self->{'store_metadata_values_as_fields_for'};
732 print STDERR "we have fields to convert\n";
733
734 my @existing_meta_fields = split(',', $self->{'store_metadata_values_as_fields_for'});
735 my @new_ns = split(',', $self->{'namespace_for_new_fields'});
736 my $include_section_level = $self->{'include_section_level'};
737
738 foreach my $old_meta (@existing_meta_fields) {
739 my $ns = shift(@new_ns);
740 my $ns_prefix = "";
741 if (defined $ns && $ns =~/\S/) {
742 $ns_prefix = "$ns.";
743 }
744
745
746 my $section = $doc_obj->get_top_section();
747 while (defined $section) {
748 my @section_meta = @{$doc_obj->get_metadata($section, $old_meta)};
749 foreach my $meta (@section_meta) {
750 foreach my $submeta (split /,/, $meta) {
751 # add meta, ns+submeta, value = true
752 # remove whitepace
753 $submeta =~ s/\s+//g;
754 $doc_obj->add_utf8_metadata($section, "$ns_prefix$submeta", "true");
755 }
756 }
757
758 last if (!$include_section_level);
759 $section = $doc_obj->get_next_section($section);
760 }
761 }
762}
763
764# adds cover image, associate_file options stuff. Should be called by sub class
765# read_into_doc_obj
766sub add_associated_files {
767 my $self = shift(@_);
768 # whatis filename??
769 my ($doc_obj, $filename) = @_;
770
771 # add in the cover image
772 if ($self->{'cover_image'}) {
773 $self->associate_cover_image($doc_obj, $filename);
774 }
775 # store the original (used for eg TextPlugin to store the original for OAI)
776 if ($self->{'store_original_file'}) {
777 $self->associate_source_file($doc_obj, $filename);
778 }
779
780
781}
782
783# implement this if you are extracting metadata for other documents
784sub metadata_read {
785 my $self = shift (@_);
786 my ($pluginfo, $base_dir, $file, $block_hash,
787 $extrametakeys, $extrametadata, $extrametafile,
788 $processor, $gli, $aux) = @_;
789
790 # can we process this file??
791 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
792 return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
793
794 return 1; # we recognise the file, but don't actually do anything with it
795}
796
797
798# The BaseImporter read() function. This function calls read_into_doc_obj()
799# to ensure all the right things to make general options work for a
800# given plugin are done. It then calls the process() function which
801# does all the work specific to a plugin (like the old read functions
802# used to do). Most plugins should define their own process() function
803# and let this read() function keep control.
804#
805# recursive plugins (e.g. RecPlug) and specialized plugins like those
806# capable of processing many documents within a single file (e.g.
807# GMLPlug) might want to implement their own version of read(), but
808# more likely need to implement their own version of read_into_doc_obj()
809#
810# Return number of files processed, undef if can't recognise, -1 if can't
811# process
812
813sub read {
814 my $self = shift (@_);
815 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
816
817 # can we process this file??
818 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
819
820 return undef unless $self->can_process_this_file($filename_full_path);
821
822 #print STDERR "**** BEFORE READ INTO DOC OBJ: $file\n";
823 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
824 #print STDERR "**** AFTER READ INTO DOC OBJ: $file\n";
825
826 if ((defined $process_status) && ($process_status == 1)) {
827
828 # process the document
829 $processor->process($doc_obj);
830
831 $self->{'num_processed'} ++;
832 undef $doc_obj;
833 }
834 # delete any temp files that we may have created
835 $self->clean_up_after_doc_obj_processing();
836
837
838 # if process_status == 1, then the file has been processed.
839 return $process_status;
840
841}
842
843# returns undef if file is rejected by the plugin
844sub process {
845 my $self = shift (@_);
846 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
847
848 gsprintf(STDERR, "BaseImporter::process {common.must_be_implemented}\n");
849
850 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
851 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
852
853 die "\n";
854
855 return undef; # never gets here
856}
857
858# overwrite this method to delete any temp files that we have created
859sub clean_up_after_doc_obj_processing {
860 my $self = shift(@_);
861
862}
863
864
865
866sub filename_based_title
867{
868 my $self = shift (@_);
869 my ($file) = @_;
870
871 my $file_derived_title = $file;
872 $file_derived_title =~ s/_/ /g;
873 $file_derived_title =~ s/\.[^.]+$//;
874
875 return $file_derived_title;
876}
877
878
879sub title_fallback
880{
881 my $self = shift (@_);
882 my ($doc_obj,$section,$file) = @_;
883
884 if (!defined $doc_obj->get_metadata_element ($section, "Title")
885 || $doc_obj->get_metadata_element($section, "Title") eq "") {
886
887 my $source_file = $doc_obj->get_metadata_element($section, "Source");
888 my $file_derived_title;
889 if (defined $source_file) {
890 $file_derived_title = $self->filename_based_title($source_file);
891 }
892 else {
893 # pp = pretty print
894 my $pp_file = (defined $source_file) ? $source_file : $file;
895
896 my $raw_title = $self->filename_based_title($file);
897 my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
898 }
899
900
901 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
902 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
903 }
904 else {
905 $doc_obj->set_utf8_metadata_element ($section, "Title", $file_derived_title);
906 }
907 }
908
909}
910
911# add any extra metadata that's been passed around from one
912# plugin to another.
913# extra_metadata uses add_utf8_metadata so it expects metadata values
914# to already be in utf8
915sub extra_metadata {
916 my $self = shift (@_);
917 my ($doc_obj, $cursection, $metadata) = @_;
918
919 my $associate_tail_re = $self->{'associate_tail_re'};
920
921# Sort the extra metadata for diffcol so these meta appear in a consistent order
922# in doc.xml. Necessary for the ex.PDF.* and ex.File.* meta that's extracted in
923# the PDFBox collection, as the order of these varies between CentOS and Ubuntu.
924 foreach my $field (sort keys(%$metadata)) {
925# foreach my $field (keys(%$metadata)) {
926 # $metadata->{$field} may be an array reference
927 if ($field eq "gsdlassocfile_tobe") {
928 # 'gsdlassocfile_tobe' is artificially introduced metadata
929 # that is used to signal that certain additional files should
930 # be tied to this document. Useful in situations where a
931 # metadata pass in the plugin pipeline works out some files
932 # need to be associated with a document, but the document hasn't
933 # been formed yet.
934 my $equiv_form = "";
935 foreach my $gaf (@{$metadata->{$field}}) {
936 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
937 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
938
939 # we need to make sure the filename is valid utf-8 - we do
940 # this by url or base64 encoding it
941 # $tail_filename is the name that we store the file as
942 $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
943 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
944 $doc_obj->associate_source_file($full_filename);
945 # If the filename is url_encoded, we need to encode the % signs
946 # in the filename, so that it works in a url
947 my $url_tail_filename = &unicode::filename_to_url($tail_filename);
948 # work out extended tail extension (i.e. matching tail re)
949
950 my ($file_prefix,$file_extended_ext)
951 = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
952 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
953 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
954
955 # the greenstone 2 stuff
956 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
957 #my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
958 my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
959
960 my $srcicon = "_icon".$doc_ext."_";
961 my $end_doclink = "</a>";
962
963 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
964
965
966 if (defined $pre_doc_ext && $pre_doc_ext ne "") {
967 # for metadata such as [mp3._edited] [mp3._full] ...
968 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
969 }
970
971 # for multiple metadata such as [mp3.assoclink]
972 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
973
974 $equiv_form .= " $assoc_form";
975
976 # following are used for greenstone 3,
977 $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink_gs3);
978 $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
979 $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
980
981 }
982 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
983 }
984 elsif ($field eq "gsdlzipfilename") {
985 # special case for when files have come out of a zip. source_path
986 # (used for archives dbs and keeping track for incremental import)
987 # must be set to the zip file name
988 my $zip_filename = $metadata->{$field};
989 # overwrite the source_path
990 $doc_obj->set_source_path($zip_filename);
991 # and set the metadata
992 $zip_filename = &util::filename_within_collection($zip_filename);
993 $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
994 $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
995 }
996 elsif (ref ($metadata->{$field}) eq "ARRAY") {
997 if ($field =~ /(.+?)\/\/\/Section\/([\d.]*)/m){
998 my $field_new_name = $1;
999 my $specified_section = $2;
1000 map {
1001 $doc_obj->add_utf8_metadata ($specified_section, $field_new_name, $_);
1002 } @{$metadata->{$field}};
1003 } else {
1004 map {
1005 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1006 } @{$metadata->{$field}};
1007 }
1008
1009 } else {
1010 if ($field =~ /(.+?)\/\/\/Section\/([\d.]*)/m){
1011 my $field_new_name = $1;
1012 my $specified_section = $2;
1013 $doc_obj->add_utf8_metadata ($specified_section, $field_new_name, $metadata->{$field});
1014 } else {
1015 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1016 }
1017 }
1018 }
1019}
1020
1021
1022sub compile_stats {
1023 my $self = shift(@_);
1024 my ($stats) = @_;
1025
1026 $stats->{'num_processed'} += $self->{'num_processed'};
1027 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1028 $stats->{'num_archives'} += $self->{'num_archives'};
1029
1030}
1031sub associate_source_file {
1032 my $self = shift(@_);
1033
1034 my ($doc_obj, $filename) = @_;
1035 my $cursection = $doc_obj->get_top_section();
1036 my $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
1037
1038 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
1039 # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
1040 $doc_obj->add_utf8_metadata ($cursection, "srclink_file", $doc_obj->get_sourcefile());
1041 $doc_obj->add_utf8_metadata ($cursection, "srclinkFile", $doc_obj->get_sourcefile());
1042}
1043
1044sub associate_cover_image {
1045 my $self = shift(@_);
1046 my ($doc_obj, $filename) = @_;
1047
1048 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
1049
1050 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1051 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1052
1053 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
1054 # don't stat() for existence e.g. for multiple document input files
1055 # (eg SplitPlug)
1056 return;
1057 }
1058
1059 my $top_section=$doc_obj->get_top_section();
1060
1061 if (&FileUtils::fileExists($upgraded_filename)) {
1062 $doc_obj->associate_source_file($filename);
1063 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1064 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1065 } else {
1066 my $upper_filename = $filename;
1067 my $upgraded_upper_filename = $upgraded_filename;
1068
1069 $upper_filename =~ s/jpg$/JPG/;
1070 $upgraded_upper_filename =~ s/jpg$/JPG/;
1071
1072 if (&FileUtils::fileExists($upgraded_upper_filename)) {
1073 $doc_obj->associate_source_file($upper_filename);
1074 $doc_obj->associate_file($upper_filename, "cover.jpg",
1075 "image/jpeg");
1076 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1077 } else {
1078 # file doesn't exist, so record the fact that it's missing so
1079 # we don't stat() again (stat is slow)
1080 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
1081 }
1082 }
1083
1084}
1085
1086
1087# Overridden by exploding plugins (eg. ISISPlug)
1088sub clean_up_after_exploding
1089{
1090 my $self = shift(@_);
1091}
1092
1093
1094
10951;
Note: See TracBrowser for help on using the repository browser.