root/main/trunk/greenstone2/perllib/plugins/BaseImporter.pm @ 32501

Revision 32501, 33.6 KB (checked in by litvinovg, 22 months ago)

Workaround to assign metadata via the CSV metadata plugin. A "Section" column can be used in the CSV file to specify the section the metadata should be assigned to.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BaseImporter.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BaseImporter;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33use Encode;
34use Unicode::Normalize 'normalize';
35
36use encodings;
37use unicode;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41use util;
42use FileUtils;
43
44use CommonUtil;
45
46BEGIN {
47    @BaseImporter::ISA = ( 'CommonUtil' );
48}
49
50# the different methods that can be applied when renaming
51# imported documents and their associated files
# Choices offered by the 'file_rename_method' option: each entry names one
# way of renaming imported documents and their associated files.
our $file_rename_method_list = [
    {
        'name' => "url",
        'desc' => "{BaseImporter.rename_method.url}",
    },
    {
        'name' => "base64",
        'desc' => "{BaseImporter.rename_method.base64}",
    },
    {
        'name'      => "none",
        'desc'      => "{BaseImporter.rename_method.none}",
        'hiddengli' => "yes",
    },
];
60
61# here went encoding list stuff
62
# Choices offered by the 'OIDtype' option. "auto" is the default and is
# resolved to one of the other values in begin().
our $oidtype_list = [
    { 'name' => "auto",
      'desc' => "{BaseImporter.OIDtype.auto}" },
    { 'name' => "hash",
      'desc' => "{import.OIDtype.hash}" },
    { 'name' => "hash_on_ga_xml",
      'desc' => "{import.OIDtype.hash_on_ga_xml}" },
    { 'name' => "hash_on_full_filename",
      'desc' => "{import.OIDtype.hash_on_full_filename}" },
    { 'name' => "assigned",
      'desc' => "{import.OIDtype.assigned}" },
    { 'name' => "incremental",
      'desc' => "{import.OIDtype.incremental}" },
    { 'name' => "filename",
      'desc' => "{import.OIDtype.filename}" },
    { 'name' => "dirname",
      'desc' => "{import.OIDtype.dirname}" },
    { 'name' => "full_filename",
      'desc' => "{import.OIDtype.full_filename}" },
];
82
# Option descriptions contributed by BaseImporter; new() appends them to the
# shared "ArgList" so every import plugin accepts them.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => "",
        'reqd' => "no",
    },
    {
        'name' => "store_original_file",
        'desc' => "{BaseImporter.store_original_file}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "associate_ext",
        'desc' => "{BaseImporter.associate_ext}",
        'type' => "string",
        'reqd' => "no",
    },
    {
        'name' => "associate_tail_re",
        'desc' => "{BaseImporter.associate_tail_re}",
        'type' => "string",
        'reqd' => "no",
    },
    {
        'name' => "OIDtype",
        'desc' => "{import.OIDtype}",
        'type' => "enum",
        'list' => $oidtype_list,
        # "auto" means the option was not set on the plugin itself; begin()
        # then takes OIDtype/OIDmetadata from the processor instead
        'deft' => "auto",
        'reqd' => "no",
    },
    {
        'name' => "OIDmetadata",
        'desc' => "{import.OIDmetadata}",
        'type' => "metadata",
        'deft' => "dc.Identifier",
        'reqd' => "no",
    },
#    {
#        'name' => "use_as_doc_identifier",
#        'desc' => "{BaseImporter.use_as_doc_identifier}",
#        'type' => "string",
#        'reqd' => "no" ,
#        'deft' => "",
#    },
    {
        'name' => "no_cover_image",
        'desc' => "{BaseImporter.no_cover_image}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "file_rename_method",
        'desc' => "{BaseImporter.file_rename_method}",
        'type' => "enum",
        # by default rename imported files and assoc files using this encoding
        'deft' => &get_default_file_rename_method(),
        'list' => $file_rename_method_list,
        'reqd' => "no",
    },
];
131
132
# Plugin description record for BaseImporter; new() appends it to the shared
# "OptList".
my $options = {
    'name'     => "BaseImporter",
    'desc'     => "{BaseImporter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
138
sub new {
    # Constructor. Registers this class and its options in the shared lists,
    # builds the object through CommonUtil, then sets up the counters and
    # the cover image / associate_ext state used by the other methods.
    #
    #   $pluginlist      - array ref of plugin class names; $class is appended
    #   $inputargs       - passed through to the CommonUtil constructor
    #   $hashArgOptLists - hash ref holding the "ArgList" and "OptList" arrays
    #   $auxiliary       - passed through to the CommonUtil constructor
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists, $auxiliary) = @_;

    push @$pluginlist, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @$arguments;
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = CommonUtil->new($pluginlist, $inputargs, $hashArgOptLists, $auxiliary);

    # when only plugin information is wanted, don't worry about any options etc
    return bless $self, $class if $self->{'info_only'};

    $self->{'plugin_type'} = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;

    # remove ex. from OIDmetadata iff it's the only namespace prefix
    if (defined $self->{'OIDmetadata'}) {
        $self->{'OIDmetadata'} =~ s/^ex\.([^.]+)$/$1/;
    }

    foreach my $counter ('num_processed', 'num_not_processed', 'num_blocked', 'num_archives') {
        $self->{$counter} = 0;
    }

    # cover image is on by default
    $self->{'cover_image'} = $self->{'no_cover_image'} ? 0 : 1;
    $self->{'can_process_directories'} = 0;

    my $associate_ext = $self->{'associate_ext'};
    if (defined $associate_ext && $associate_ext ne "") {
        my $existing_tail_re = $self->{'associate_tail_re'};

        if (defined $existing_tail_re && $existing_tail_re ne "") {
            # both options given: keep associate_tail_re and say so
            my $outhandle = $self->{'outhandle'};
            print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
            print $outhandle "         defaulting to 'associate_tail_re'\n";
        }
        else {
            # turn the comma separated extension list into one alternation
            my @ext_patterns = map { "(?:\\.$_)" } split(/,/, $associate_ext);
            $self->{'associate_tail_re'} = join("|", @ext_patterns);
        }

        delete $self->{'associate_ext'};
    }

    return bless $self, $class;
}
192
sub merge_inheritance
{
    # Merges the hashes given as arguments into one new hash ref.
    # For each key the first defined value wins: a later hash only supplies
    # a key while the merged value for it is still undefined. (The value
    # stored first seems to be the option specified in collect.cfg.)
    my @child_selfs = @_;

    my %merged;
    for my $child (@child_selfs) {
        for my $key (keys %$child) {
            next if defined $merged{$key};
            $merged{$key} = $child->{$key};
        }
    }

    return \%merged;
}
224
225# initialize BaseImporter options
226# if init() is overridden in a sub-class, remember to call BaseImporter::init()
sub init {
    # Runs the parent class init() and then fills in process_exp and
    # block_exp from the plugin defaults when they were not explicitly set.
    my $self = shift;
    my ($verbosity, $outhandle, $failhandle) = @_;

    $self->SUPER::init(@_);

    # only non-recursive plugins get a default process_exp
    if (!$self->is_recursive()
        && (!defined $self->{'process_exp'} || $self->{'process_exp'} eq "")) {

        $self->{'process_exp'} = $self->get_default_process_exp();
        if ($self->{'process_exp'} eq "") {
            warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
        }
    }

    my $block_exp = $self->{'block_exp'};
    if (!defined $block_exp || $block_exp eq "") {
        $self->{'block_exp'} = $self->get_default_block_exp();
    }
}
250
sub begin {
    # Resolves the plugin's OIDtype: "auto" is replaced by the processor's
    # OIDtype/OIDmetadata, and a plain "hash" is narrowed to whatever
    # get_oid_hash_type() reports (falling back to "hash_on_file").
    my $self = shift;
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    if ($self->{'OIDtype'} eq "auto") {
        # hasn't been set in the plugin, use the processor values
        @{$self}{'OIDtype', 'OIDmetadata'} = @{$processor}{'OIDtype', 'OIDmetadata'};
    }

    if ($self->{'OIDtype'} eq "hash") {
        # should we hash on the file or on the doc xml??
        my $hash_type = $self->get_oid_hash_type();
        $self->{'OIDtype'} = $hash_type;
        if ($hash_type !~ /^(hash_on_file|hash_on_ga_xml)$/) {
            $self->{'OIDtype'} = "hash_on_file";
        }
    }
}
268
269# This is called once if removeold is set with import.pl. Most plugins will do
270# nothing but if a plugin does any stuff outside of creating doc obj, then
271# it may need to clear something.
sub remove_all {
    # Hook with an empty default implementation: nothing is cleared here.
    my $self = shift;
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
}
276
277# This is called per document for docs that have been deleted from the
278# collection. Most plugins will do nothing
279# but if a plugin does any stuff outside of creating doc obj, then it may need
280# to clear something.
sub remove_one {
    # Default per-document removal hook: does no clean-up itself.
    # Returns 0 when this plugin can process $file, undef otherwise.
    my $self = shift;
    my ($file, $oids, $archivedir) = @_;

    return $self->can_process_this_file($file) ? 0 : undef;
}
288
sub end {
    # Potentially called at the end of each plugin pass (import.pl only has
    # one plugin pass, but buildcol.pl has multiple ones).
    # The default implementation does nothing.
    my ($self) = shift;
}
295
sub deinit {
    # Called only once, after all plugin passes have been done.
    # The default implementation does nothing.
    my ($self) = @_;
}
301
302# default hashing type is to hash on the original file (or converted file)
303# override this to return hash_on_ga_xml for filetypes where hashing on the
304# file is no good eg video
sub get_oid_hash_type {
    # Hash type used by begin() when OIDtype is plain "hash".
    # The base class always answers "hash_on_file".
    my ($self) = @_;

    return "hash_on_file";
}
311
312
313# this function should be overridden to return 1
314# in recursive plugins
sub is_recursive {
    # The base class reports itself as non-recursive (0).
    my ($self) = @_;

    return 0;
}
320
sub get_default_block_exp {
    # Default block expression used by init(): empty in the base class.
    my ($self) = @_;

    return "";
}
326
sub get_default_process_exp {
    # Default process expression used by init(): empty in the base class.
    my ($self) = @_;

    return "";
}
332
333
334# rename imported files and assoc files using URL encoding by default
335# as this will work for most plugins and give more legible filenames
sub get_default_file_rename_method {
    # Returns the name of the default file rename method, "url".
    #
    # No prototype is declared: this sub is used both as a method
    # ($self->get_default_file_rename_method()) and as a plain function
    # (&get_default_file_rename_method() when the option list is built).
    # An empty "()" prototype would claim it takes no arguments and make
    # a direct call that passes an invocant fail to compile.
    my $self = shift (@_);
    return "url";
}
340
341# returns this plugin's active (possibly user-selected) file_rename_method
sub get_file_rename_method {
    # Returns the plugin's active file rename method: the value of the
    # 'file_rename_method' option when it is set to a true value, otherwise
    # the result of get_default_file_rename_method().
    #
    # No prototype is declared: the sub takes $self, so an empty "()"
    # prototype would be wrong and would break direct calls with an invocant.
    my $self = shift (@_);

    my $rename_method = $self->{'file_rename_method'};
    return $rename_method if $rename_method;

    return $self->get_default_file_rename_method();
}
351
352# default implementation is to do nothing
sub store_block_files {
    # Hook called by file_block_read(); the default blocks nothing.
    my $self = shift;
    my ($filename_full_path, $block_hash) = @_;
}
359
360# put files to block into hash
sub use_block_expressions {
    # Blocks $filename_full_path (via block_filename) when a non-empty
    # block_exp is set and the filename matches it.
    my $self = shift;
    my ($filename_full_path, $block_hash) = @_;

    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);

    my $block_exp = $self->{'block_exp'};
    if ($block_exp ne "" && $filename_full_path =~ /$block_exp/) {
        $self->block_filename($block_hash, $filename_full_path);
    }
}
373
374#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
sub block_cover_image
{
    # When cover images are enabled, looks for a file with the same name as
    # $filename but with extension .jpg (or, failing that, .JPG) and blocks
    # it if it exists.
    my $self = shift;
    my ($filename, $block_hash) = @_;

    $filename = &util::upgrade_if_dos_filename($filename);

    return unless $self->{'cover_image'};

    (my $coverfile = $filename) =~ s/\.[^\\\/\.]+$/\.jpg/;

    # if there is no file extension, coverfile will be the same as filename
    return if $coverfile eq $filename;

    # no lower case version on disk: try the upper case extension instead
    $coverfile =~ s/jpg$/JPG/ unless &FileUtils::fileExists($coverfile);

    if (&FileUtils::fileExists($coverfile)) {
        $self->block_filename($block_hash, $coverfile);
    }

    return;
}
399
400
401# discover all the files that should be blocked by this plugin
402# check the args ...
sub file_block_read {
    # Blocking pass for one file. Records the file in the block hash,
    # tracks files sharing a root name for -associate_ext /
    # -associate_tail_re, applies the block expression and, for files this
    # plugin processes, the plugin specific and cover image blocking.
    # Returns 1 when the file is one this plugin processes, undef otherwise.
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    unless (-d $filename_full_path) {
        $block_hash->{'all_files'}->{$file} = 1;
    }

    # Keep track of filenames with same root but different extensions
    my $associate_tail_re = $self->{'associate_tail_re'};
    if (defined $associate_tail_re && $associate_tail_re ne "") {
        my ($file_prefix, $file_ext)
            = &util::get_prefix_and_tail_by_regex($filename_full_path, $associate_tail_re);

        if (defined $file_prefix && defined $file_ext) {
            my $shared_fileroot = $block_hash->{'shared_fileroot'};

            # first file seen with this root: start an empty record for it
            if (!defined $shared_fileroot->{$file_prefix}) {
                $shared_fileroot->{$file_prefix} = { 'tie_to' => undef,
                                                     'exts'   => {} };
            }
            my $prefix_rec = $shared_fileroot->{$file_prefix};

            if ($self->can_process_this_file($filename_full_path) && $file_ext !~ m/.\./) {
                # This is the document the others should be tied to
                $prefix_rec->{'tie_to'} = $file_ext;
            }
            elsif ($file_ext =~ m/$associate_tail_re$/) {
                # this file should be associated to the main one
                $prefix_rec->{'exts'}->{$file_ext} = 1;
            }
        }
    }

    # check block expressions
    if (!$self->{'no_blocking'}) {
        $self->use_block_expressions($filename_full_path, $block_hash);
    }

    # now check whether we are actually processing this
    unless (-f $filename_full_path && $self->can_process_this_file($filename_full_path)) {
        return undef; # can't recognise
    }

    # if we have a block_exp, then this overrides the normal 'smart' blocking
    if (!$self->{'no_blocking'} && $self->{'block_exp'} eq "") {
        $self->store_block_files($filename_full_path, $block_hash);
    }

    # block the cover image if there is one
    $self->block_cover_image($filename_full_path, $block_hash) if $self->{'cover_image'};

    return 1;
}
462
463# plugins that rely on more than process_exp (eg XML plugins) can override this method
sub can_process_this_file {
    # Returns 1 when $filename matches a non-empty process_exp, 0 otherwise.
    # Directories are rejected unless 'can_process_directories' is set.
    my ($self, $filename) = @_;

    return 0 if -d $filename && !$self->{'can_process_directories'};

    my $process_exp = $self->{'process_exp'};
    return ($process_exp ne "" && $filename =~ /$process_exp/) ? 1 : 0;
}
478
479# Even if a plugin can extract metadata in its metadata_read pass,
480# make the default return 'undef' so processing of the file continues
481# down the pipeline, so other plugins can also have the opportunity to
482# locate metadata and set it up in the extrametakeys variables that
483# are passed around.
484
sub can_process_this_file_for_metadata {
    # Default answer for the metadata pass: undef, so the file keeps
    # travelling down the plugin pipeline.
    my ($self) = @_;

    return undef;
}
490
491
492
493# Notionally written to be called once for each document, it is however safe to
494# call multiple times (as in the case of ImagePlugin) which calls this later on
495# after the original image has potentially been converted to a *new* source image
496# format (e.g. TIFF to PNG)
497
sub set_Source_metadata {
    # Sets the Source and SourceFile metadata of a document section (and
    # OrigSource when a Source value is already present).
    #
    #   $doc_obj           - the document object to update
    #   $raw_filename      - filename of the source file
    #   $filename_encoding - encoding of the filename; anything defined other
    #                        than "ascii" is decoded into a Perl string
    #   $section           - section to set; defaults to the top section
    #
    # 1. Source is the filename for display, mapped to Unicode if possible,
    #    and (as a fallback) using %xx for non-ascii chars
    # 2. SourceFile is the url reference to the renamed file
    my $self = shift;
    my ($doc_obj, $raw_filename, $filename_encoding, $section) = @_;

    my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);

    my $this_section = defined $section ? $section : $doc_obj->get_top_section();

    my $octet_file = $raw_file;

    # Deal with (on Windows) raw filenames that are in their
    # abbreviated DOS form
    my $on_native_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin");
    if ($on_native_windows
        && defined $filename_encoding && $filename_encoding eq "unicode"
        && -e $raw_filename) {

        my $unicode_filename = Win32::GetLongPathName($raw_filename);

        my $unused_full_uf;
        ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
    }

    # Either a pretty print version of the filename mapped to Unicode (using
    # filename_encoding), or a %xx encoded version for char > 127
    my $url_encoded_filename = (defined $filename_encoding && $filename_encoding ne "ascii")
        ? decode($filename_encoding, $octet_file)
        : &unicode::raw_filename_to_url_encoded($octet_file);

    # In the case of converted files and (generalized) exploded documents, there
    # will already be a source filename => store as OrigSource before overriding
    my $orig_source = $doc_obj->get_metadata_element($this_section, "Source");
    if (defined $orig_source && $orig_source !~ m/^\s*$/) {
        $doc_obj->set_utf8_metadata_element($this_section, "OrigSource", $orig_source);
    }

    # Source is the display name - not necessarily the name of the file on the system
    if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
        # on Darwin want all display strings to be in composed form
        # (Normalisation Form 'C'), then can search on that
        $url_encoded_filename = normalize('C', $url_encoded_filename);
    }
    # set_utf8_metadata actually sets perl unicode aware strings. not utf8
    $doc_obj->set_utf8_metadata_element($this_section, "Source", $url_encoded_filename);

    # SourceFile is the url that refers to the actual (renamed) file on the
    # system; filename_to_url just replaces % with %25
    my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
    my $renamed_raw_url  = &unicode::filename_to_url($renamed_raw_file);

    $doc_obj->set_utf8_metadata_element($this_section, "SourceFile", $renamed_raw_url);
}
577
578# this should be called by all plugins to set the oid of the doc obj, rather
579# than calling doc_obj->set_OID directly
sub add_OID {
    # Sets the OID of $doc_obj. Does nothing when the document already has
    # an OID other than "NULL", unless $force is true. A plugin specific
    # set_OID() is used when the plugin defines one, otherwise the
    # document's own set_OID().
    my $self = shift;
    my ($doc_obj, $force) = @_;

    # don't add one if there is one already set, unless we are forced to do so
    my $oid_is_null = ($doc_obj->get_OID() =~ /^NULL$/);
    return if !$oid_is_null && !$force;

    $doc_obj->set_OIDtype($self->{'OIDtype'}, $self->{'OIDmetadata'});

    if (defined $self->can('set_OID')) {
        # plugin specific version: pass through doc_obj and any extra arguments
        $self->set_OID(@_);
    }
    else {
        # use the default set_OID() of the document object
        $doc_obj->set_OID();
    }
}
598
599# The BaseImporter read_into_doc_obj() function. This function does all the
600# right things to make general options work for a given plugin.  It
601# reads in a file and sets up a slew of metadata all saved in doc_obj, which
602# it then returns as part of a tuple (process_status,doc_obj)
603#
604# Much of this functionality used to reside in read, but it was broken
605# down into a supporting routine to make the code more flexible. 
606#
607# recursive plugins (e.g. RecPlug) and specialized plugins like those
608# capable of processing many documents within a single file (e.g.
609# GMLPlug) will normally want to implement their own version of
610# read_into_doc_obj()
611#
612# Note that $base_dir might be "" and that $file might
613# include directories
614
615# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
sub read_into_doc_obj {
    # Creates a document object for $file, lets the plugin's process() fill
    # it in, and then adds the general metadata (associated files, extra
    # metadata, fallback title, OID).
    # Returns (1, $doc_obj) on success and -1 when process() returns undef.
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    # should we move this to read? What about secondary plugins?
    my $pp_file = &util::prettyprint_file($base_dir, $file, $gli);
    if ($gli) {
        print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print $outhandle "$self->{'plugin_type'} processing $pp_file\n";
    }

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    # create a new document
    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    my $top_section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));

    my $filename_encoding
        = $self->deduce_filename_encoding($file, $metadata, $self->{'filename_encoding'});
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding, $top_section);

    # plugin specific stuff - what args do we need here??
    my $process_result = $self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
    if (!defined $process_result) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    my $section = $doc_obj->get_top_section();
    # can we merge these two methods??
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata($doc_obj, $section, $metadata);
    $self->auto_extract_metadata($doc_obj);

    # if we haven't found any Title so far, assign one
    $self->title_fallback($doc_obj, $section, $filename_no_path);

    $self->add_OID($doc_obj);

    $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);

    return (1, $doc_obj);
}
665
sub post_process_doc_obj {
    # Hook called at the end of read_into_doc_obj().
    # The default does nothing and returns 1.
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    return 1;
}
672
sub add_dummy_text {
    # Gives $section of $doc_obj the dummy text string plus a NoText
    # marker.
    my ($self, $doc_obj, $section) = @_;

    # add NoText metadata so we can hide this dummy text in format statements
    $doc_obj->add_metadata($section, "NoText", "1");

    # lookup_string with extra '1' arg returns perl internal unicode aware
    # text, so we use add_utf8_text so no encoding is done on it.
    my $dummy_text = &gsprintf::lookup_string("{BaseImporter.dummy_text}", 1);
    $doc_obj->add_utf8_text($section, $dummy_text);
}
686
687# does nothing. Can be overridden by subclass
sub auto_extract_metadata {
    # Hook called by read_into_doc_obj(); does nothing by default.
    my $self = shift;
    my ($doc_obj) = @_;
}
692
693# adds cover image, associate_file options stuff. Should be called by sub class
694# read_into_doc_obj
sub add_associated_files {
    # Associates the cover image (when 'cover_image' is on) and the original
    # source file (when 'store_original_file' is on) with $doc_obj.
    my ($self, $doc_obj, $filename) = @_;

    # add in the cover image
    $self->associate_cover_image($doc_obj, $filename) if $self->{'cover_image'};

    # store the original (used for eg TextPlugin to store the original for OAI)
    $self->associate_source_file($doc_obj, $filename) if $self->{'store_original_file'};
}
711
712# implement this if you are extracting metadata for other documents
sub metadata_read {
    # Metadata pass for one file. Returns undef when
    # can_process_this_file_for_metadata() rejects the file, and 1 when the
    # file is recognised (the base class then does nothing further with it).
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    if (!$self->can_process_this_file_for_metadata($filename_full_path)) {
        return undef;
    }

    return 1; # we recognise the file, but don't actually do anything with it
}
725
726
727# The BaseImporter read() function. This function calls read_into_doc_obj()
728# to ensure all the right things to make general options work for a
729# given plugin are done. It then calls the process() function which
730# does all the work specific to a plugin (like the old read functions
731# used to do). Most plugins should define their own process() function
732# and let this read() function keep control. 
733#
734# recursive plugins (e.g. RecPlug) and specialized plugins like those
735# capable of processing many documents within a single file (e.g.
736# GMLPlug) might want to implement their own version of read(), but
737# more likely need to implement their own version of read_into_doc_obj()
738#
739# Return number of files processed, undef if can't recognise, -1 if can't
740# process
741
sub read {
    # Processes one file: builds the document object through
    # read_into_doc_obj(), hands it to the processor and cleans up.
    # Returns undef when the file is not recognised, otherwise the process
    # status from read_into_doc_obj() (1 when the file has been processed).
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    if (!$self->can_process_this_file($filename_full_path)) {
        return undef;
    }

    my ($process_status, $doc_obj) = $self->read_into_doc_obj(@_);

    if (defined $process_status && $process_status == 1) {
        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'}++;
        undef $doc_obj;
    }

    # delete any temp files that we may have created
    $self->clean_up_after_doc_obj_processing();

    return $process_status;
}
771
772# returns undef if file is rejected by the plugin
sub process {
    # Placeholder that must be overridden: it reports that process() has
    # to be implemented, prints which caller reached it, and dies.
    my $self = shift;
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    gsprintf(STDERR, "BaseImporter::process {common.must_be_implemented}\n");

    # identify the code that ended up in this unimplemented method
    my @caller_frame = caller(1);
    my ($cpackage, $cfilename, $cline, $csubr, $chas_args, $cwantarray) = @caller_frame;
    print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";

    die "\n";

    return undef; # never gets here
}
786
787# overwrite this method to delete any temp files that we have created
sub clean_up_after_doc_obj_processing {
    # Hook called by read() after each document; the default does nothing.
    my $self = shift;
}
792
793
794
sub filename_based_title
{
    # Derives a title from a filename: underscores become spaces and the
    # last ".ext" suffix is removed.
    my ($self, $file) = @_;

    (my $title = $file) =~ tr/_/ /;
    $title =~ s/\.[^.]+$//;

    return $title;
}
806
807
sub title_fallback
{
    # Gives $section of $doc_obj a Title when it has none (undefined or
    # empty). The title is derived from the section's Source metadata when
    # that exists, otherwise from $file.
    #
    #   $doc_obj - the document object to update
    #   $section - the section whose Title is checked and set
    #   $file    - filename used when there is no Source metadata
    my $self = shift (@_);
    my ($doc_obj,$section,$file) = @_;

    my $existing_title = $doc_obj->get_metadata_element($section, "Title");
    return if defined $existing_title && $existing_title ne "";

    my $source_file = $doc_obj->get_metadata_element($section, "Source");
    my $file_derived_title;
    if (defined $source_file) {
        $file_derived_title = $self->filename_based_title($source_file);
    }
    else {
        # Assign to the variable declared above. Declaring a second "my"
        # here would leave the outer one undefined, so the Title would be
        # set to undef whenever there is no Source metadata.
        my $raw_title = $self->filename_based_title($file);
        $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
    }

    if (!defined $existing_title) {
        $doc_obj->add_utf8_metadata($section, "Title", $file_derived_title);
    }
    else {
        # an empty Title exists already: replace it rather than add another
        $doc_obj->set_utf8_metadata_element($section, "Title", $file_derived_title);
    }

    return;
}
839
# Add any extra metadata that has been passed around from one plugin to
# another onto $doc_obj.
#
# Arguments:
#   $doc_obj    - document object that receives the metadata
#   $cursection - section the metadata is attached to by default
#   $metadata   - hash ref mapping a field name to a single value or to an
#                 array ref of values
#
# All values are added with add_utf8_metadata, so they are expected to
# already be in utf8.
#
# Field names with special handling:
#   gsdlassocfile_tobe        - array ref of "filename:mimetype:" entries;
#                               each file is associated with the document and
#                               link/icon metadata is generated for it
#   gsdlzipfilename           - zip file the document came out of; also
#                               becomes the document's source path
#   <name>///Section/<number> - value is stored as <name> on section <number>
#                               instead of on $cursection
sub extra_metadata {
    my $self = shift (@_);
    my ($doc_obj, $cursection, $metadata) = @_;

    my $associate_tail_re = $self->{'associate_tail_re'};

    # Sort the extra metadata for diffcol so these meta appear in a consistent
    # order in doc.xml. Necessary for the ex.PDF.* and ex.File.* meta that's
    # extracted in the PDFBox collection, as the order of these varies between
    # CentOS and Ubuntu.
    foreach my $field (sort keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        if ($field eq "gsdlassocfile_tobe") {
            # 'gsdlassocfile_tobe' is artificially introduced metadata
            # that is used to signal that certain additional files should
            # be tied to this document.  Useful in situations where a
            # metadata pass in the plugin pipeline works out some files
            # need to be associated with a document, but the document hasn't
            # been formed yet.
            my $equiv_form = "";
            foreach my $gaf (@{$metadata->{$field}}) {
                my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);

                # An entry that is not of the form "filename:mimetype:" gives
                # us nothing to associate; skip it rather than associating an
                # undefined file.
                if (!defined $full_filename || $full_filename eq "") {
                    print STDERR "BaseImporter::extra_metadata: skipping malformed gsdlassocfile_tobe entry: "
                        . (defined $gaf ? $gaf : "") . "\n";
                    next;
                }

                my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
                # A filename without any directory part is its own tail
                $tail_filename = $full_filename unless defined $tail_filename;

                # we need to make sure the filename is valid utf-8 - we do
                # this by url or base64 encoding it
                # $tail_filename is the name that we store the file as
                $tail_filename = &util::rename_file($tail_filename, $self->{'file_rename_method'});
                $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
                $doc_obj->associate_source_file($full_filename);
                # If the filename is url_encoded, we need to encode the % signs
                # in the filename, so that it works in a url
                my $url_tail_filename = &unicode::filename_to_url($tail_filename);

                # work out extended tail extension (i.e. matching tail re);
                # the prefix half of the result is not needed here
                my (undef, $file_extended_ext)
                    = &util::get_prefix_and_tail_by_regex($tail_filename,$associate_tail_re);
                my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
                my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);

                # the greenstone 2 stuff
                my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):assocfilepath],[assocfilepath]}/$url_tail_filename\">";
                #my $start_doclink = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";
                my $start_doclink_gs3 = "<a href=\'_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$url_tail_filename\'>";

                my $srcicon = "_icon".$doc_ext."_";
                my $end_doclink = "</a>";

                my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";

                if (defined $pre_doc_ext && $pre_doc_ext ne "") {
                    # for metadata such as [mp3._edited] [mp3._full] ...
                    $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
                }

                # for multiple metadata such as [mp3.assoclink]
                $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);

                $equiv_form .= " $assoc_form";

                # following are used for greenstone 3,
                $doc_obj->add_utf8_metadata ($cursection, "equivDocLink", $start_doclink_gs3);
                $doc_obj->add_utf8_metadata ($cursection, "equivDocIcon", $srcicon);
                $doc_obj->add_utf8_metadata ($cursection, "/equivDocLink", $end_doclink);
            }
            $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
        }
        elsif ($field eq "gsdlzipfilename") {
            # special case for when files have come out of a zip. source_path
            # (used for archives dbs and keeping track for incremental import)
            # must be set to the zip file name
            my $zip_filename = $metadata->{$field};
            # overwrite the source_path
            $doc_obj->set_source_path($zip_filename);
            # and set the metadata
            $zip_filename = &util::filename_within_collection($zip_filename);
            $zip_filename = $doc_obj->encode_filename($zip_filename, $self->{'file_rename_method'});
            $doc_obj->add_utf8_metadata ($cursection, $field, $zip_filename);
        }
        else {
            # A field named "<name>///Section/<number>" targets the given
            # section instead of $cursection; work that out once for both
            # the multi-valued and the single-valued case.
            my ($target_section, $target_field) = ($cursection, $field);
            if ($field =~ m{(.+?)///Section/([\d.]*)}m) {
                ($target_field, $target_section) = ($1, $2);
            }

            if (ref ($metadata->{$field}) eq "ARRAY") {
                foreach my $value (@{$metadata->{$field}}) {
                    $doc_obj->add_utf8_metadata ($target_section, $target_field, $value);
                }
            }
            else {
                $doc_obj->add_utf8_metadata ($target_section, $target_field, $metadata->{$field});
            }
        }
    }
}
949
950
# Fold this plugin's processing counters into the shared $stats hash ref.
# The counters 'num_processed', 'num_not_processed' and 'num_archives' are
# read from $self and added onto the entries of the same name in $stats.
sub compile_stats {
    my ($self, $stats) = @_;

    foreach my $counter ('num_processed', 'num_not_processed', 'num_archives') {
        $stats->{$counter} += $self->{$counter};
    }

    # Evaluates to the updated archive total, as the final statement of the
    # previous implementation did.
    return $stats->{'num_archives'};
}
# Associate the original source file $filename with the top section of
# $doc_obj, stored under the name the document reports through
# get_assocfile_from_sourcefile(), and record the document's source file
# name as metadata on that section.
sub associate_source_file {
    my ($self, $doc_obj, $filename) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $stored_name = $doc_obj->get_assocfile_from_sourcefile();

    # no explicit mime type is supplied (undef)
    $doc_obj->associate_file($filename, $stored_name, undef, $top_section);

    # "srclink_file" is deprecated because of the "_" in the metadata name;
    # "srclinkFile" is the one to use. Both are still written.
    $doc_obj->add_utf8_metadata ($top_section, "srclink_file", $doc_obj->get_sourcefile());
    $doc_obj->add_utf8_metadata ($top_section, "srclinkFile", $doc_obj->get_sourcefile());
}
972
# Look for a cover image next to $filename (same name, extension replaced by
# ".jpg", then ".JPG") and, if one exists, associate it with $doc_obj as
# "cover.jpg" and set "hascover" metadata on the top section.
#
# Existence is tested on the name returned by util::upgrade_if_dos_filename,
# while the name derived from $filename as given is the one associated.
# A miss is remembered in $self->{'covers_missing_cache'} so that later calls
# for the same name return without touching the file system again.
sub associate_cover_image {
    my ($self, $doc_obj, $filename) = @_;

    my $upgraded_filename = &util::upgrade_if_dos_filename($filename);

    # swap the final extension of both spellings for ".jpg"
    foreach my $path ($filename, $upgraded_filename) {
        $path =~ s/\.[^\\\/\.]+$/\.jpg/;
    }

    # don't stat() for existence e.g. for multiple document input files
    # (eg SplitPlug)
    return if (exists $self->{'covers_missing_cache'}->{$upgraded_filename});

    my $top_section = $doc_obj->get_top_section();

    # upper-case extension variants, only consulted if the ".jpg" name is absent
    (my $upper_filename = $filename) =~ s/jpg$/JPG/;
    (my $upgraded_upper_filename = $upgraded_filename) =~ s/jpg$/JPG/;

    # attach the given image file to the document as its cover
    my $attach_cover = sub {
        my ($cover_file) = @_;
        $doc_obj->associate_source_file($cover_file);
        $doc_obj->associate_file($cover_file, "cover.jpg", "image/jpeg");
        $doc_obj->add_utf8_metadata($top_section, "hascover",  1);
    };

    if (&FileUtils::fileExists($upgraded_filename)) {
        $attach_cover->($filename);
    }
    elsif (&FileUtils::fileExists($upgraded_upper_filename)) {
        $attach_cover->($upper_filename);
    }
    else {
        # file doesn't exist, so record the fact that it's missing so
        # we don't stat() again (stat is slow)
        $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
    }
}
1014
1015
# Hook that does no work in this base class; exploding plugins
# (eg. ISISPlug) override it.
sub clean_up_after_exploding
{
    my ($self) = @_;

    # the invocant is what the previous one-line body evaluated to
    return $self;
}
1021
1022
1023
10241;
Note: See TracBrowser for help on using the browser.