source: gsdl/trunk/perllib/plugins/BasePlugin.pm@ 15868

Last change on this file since 15868 was 15868, checked in by kjdon, 16 years ago

plugin overhaul: BasPlug has been split into several base plugins: PrintInfo just does the printing for pluginfo.pl, and does the argument parsing in the constructor. All plugins and supporting extractors etc inherit directly or indirectly from this. AbstractPlugin adds a few methods to this, is used by Directory and ArchivesInf plugins. These are not really plugins so can we remove them? anyway, not sure if AbstractPlugin will live for very long. BasePlugin is a proper base plugin, has read and read_into_doc_obj methods. It does nothing with reading in the file or textcat stuff. Makes a basic doc obj and adds some metadata. It also handles all the blocking stuff, associate ext stuff etc. Binary plugins can implement the process method to do file specific stuff. AutoExtractMetadata inherits BasePlugin and adds automatic metadata extraction using hte new Extractor plugins. ReadTextFile is the equivalent in functionality to the old BasPlug - does lang and encoding extraction, and reading in the file. It inherits from AutoExtractMetadata. If your file type is binary and will have no text, then inherit from BasePlugin. If its binary but ends up with text (eg using convert_to) then inherit from AutoExtractMetadata. If your file is a text type file, then inherit from ReadTextFile.

  • Property svn:keywords set to Author Date Id Revision
File size: 27.1 KB
Line 
1###########################################################################
2#
3# BasePlugin.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasePlugin;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use File::Basename;
33
34use multiread;
35use encodings;
36use unicode;
37use doc;
38eval "require diagnostics"; # some perl distros (eg mac) don't have this
39use ghtml;
40use gsprintf 'gsprintf';
41
42use PrintInfo;
43
44BEGIN {
45 @BasePlugin::ISA = ( 'PrintInfo' );
46}
47
48our $encoding_list =
49 [ { 'name' => "ascii",
50 'desc' => "{ReadTextFile.input_encoding.ascii}" },
51 { 'name' => "utf8",
52 'desc' => "{ReadTextFile.input_encoding.utf8}" },
53 { 'name' => "unicode",
54 'desc' => "{ReadTextFile.input_encoding.unicode}" } ];
55
56my $e = $encodings::encodings;
57foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
58{
59 my $hashEncode =
60 {'name' => $enc,
61 'desc' => $e->{$enc}->{'name'}};
62
63 push(@{$encoding_list},$hashEncode);
64}
65
66our $encoding_plus_auto_list =
67 [ { 'name' => "auto",
68 'desc' => "{ReadTextFile.input_encoding.auto}" } ];
69
70push(@{$encoding_plus_auto_list},@{$encoding_list});
71
72my $arguments =
73 [ { 'name' => "process_exp",
74 'desc' => "{BasePlugin.process_exp}",
75 'type' => "regexp",
76 'deft' => "",
77 'reqd' => "no" },
78 { 'name' => "block_exp",
79 'desc' => "{BasePlugin.block_exp}",
80 'type' => "regexp",
81 'deft' => "",
82 'reqd' => "no" },
83 { 'name' => "smart_block",
84 'desc' => "{BasePlugin.smart_block}",
85 'type' => "flag",
86 'reqd' => "no" },
87 { 'name' => "associate_ext",
88 'desc' => "{BasePlugin.associate_ext}",
89 'type' => "string",
90 'reqd' => "no" },
91 { 'name' => "associate_tail_re",
92 'desc' => "{BasePlugin.associate_tail_re}",
93 'type' => "string",
94 'reqd' => "no" },
95 { 'name' => "use_as_doc_identifier",
96 'desc' => "{BasePlugin.use_as_doc_identifier}",
97 'type' => "string",
98 'reqd' => "no" ,
99 'deft' => "" } ,
100 { 'name' => "no_cover_image",
101 'desc' => "{BasePlugin.no_cover_image}",
102 'type' => "flag",
103 'reqd' => "no" },
104 { 'name' => "filename_encoding",
105 'desc' => "{BasePlugin.filename_encoding}",
106 'type' => "enum",
107 'deft' => "auto",
108 'list' => $encoding_plus_auto_list,
109 'reqd' => "no" }
110
111 ];
112
113
114my $options = { 'name' => "BasePlugin",
115 'desc' => "{BasePlugin.desc}",
116 'abstract' => "yes",
117 'inherits' => "no",
118 'args' => $arguments };
119
120
121sub new {
122
123 my ($class) = shift (@_);
124 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
125 push(@$pluginlist, $class);
126
127 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
128 push(@{$hashArgOptLists->{"OptList"}},$options);
129
130 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists);
131
132 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
133 $self->{'plugin_type'} = $plugin_name;
134
135 $self->{'num_processed'} = 0;
136 $self->{'num_not_processed'} = 0;
137 $self->{'num_blocked'} = 0;
138 $self->{'num_archives'} = 0;
139 $self->{'cover_image'} = 1; # cover image is on by default
140 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
141 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
142
143 my $associate_ext = $self->{'associate_ext'};
144 if ((defined $associate_ext) && ($associate_ext ne "")) {
145
146 my $associate_tail_re = $self->{'associate_tail_re'};
147 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
148 my $outhandle = $self->{'outhandle'};
149 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
150 print $outhandle " defaulting to 'associate_tail_re'\n";
151 }
152 else {
153 my @exts = split(/,/,$associate_ext);
154
155 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
156 my $associate_tail_re = join("|",@exts_bracketed);
157 $self->{'associate_tail_re'} = $associate_tail_re;
158 }
159
160 delete $self->{'associate_ext'};
161 }
162
163 $self->{'shared_fileroot'} = {};
164 $self->{'file_blocks'} = {};
165
166
167 return bless $self, $class;
168
169}
170
171# initialize BasePlugin options
172# if init() is overridden in a sub-class, remember to call BasePlugin::init()
173sub init {
174 my $self = shift (@_);
175 my ($verbosity, $outhandle, $failhandle) = @_;
176
177 # verbosity is passed through from the processor
178 $self->{'verbosity'} = $verbosity;
179
180 # as are the outhandle and failhandle
181 $self->{'outhandle'} = $outhandle if defined $outhandle;
182 $self->{'failhandle'} = $failhandle;
183
184 # set process_exp and block_exp to defaults unless they were
185 # explicitly set
186
187 if ((!$self->is_recursive()) and
188 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
189
190 $self->{'process_exp'} = $self->get_default_process_exp ();
191 if ($self->{'process_exp'} eq "") {
192 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
193 }
194 }
195
196 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
197 $self->{'block_exp'} = $self->get_default_block_exp ();
198 }
199
200}
201
202sub begin {
203 my $self = shift (@_);
204 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
205}
206
207sub end {
208 # potentially called at the end of each plugin pass
209 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
210
211 my ($self) = shift (@_);
212}
213
214sub deinit {
215 # called only once, after all plugin passes have been done
216
217 my ($self) = @_;
218}
219
220sub set_incremental {
221 my $self = shift(@_);
222 my ($incremental) = @_;
223
224 $self->{'incremental'} = $incremental;
225}
226
227# this function should be overridden to return 1
228# in recursive plugins
229sub is_recursive {
230 my $self = shift (@_);
231
232 return 0;
233}
234
235sub get_default_block_exp {
236 my $self = shift (@_);
237
238 return "";
239}
240
241sub get_default_process_exp {
242 my $self = shift (@_);
243
244 return "";
245}
246
247# default implementation is to do nothing.
248sub store_block_files
249{
250 my $self =shift (@_);
251 my ($filename) = @_;
252 return;
253}
254
255#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
256sub block_cover_image
257{
258 my $self =shift;
259 my $filename = shift;
260
261 if ($self->{'cover_image'}) {
262 my $coverfile = $filename;
263 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
264 if (!-e $coverfile) {
265 $coverfile =~ s/jpg$/JPG/;
266 }
267 if (-e $coverfile) {
268 $self->{'file_blocks'}->{$coverfile} = 1;
269 }
270 }
271
272 return;
273}
274
275sub root_ext_split
276{
277 my $self = shift (@_);
278 my ($filename,$tail_re) = @_;
279
280 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
281
282 if ((!defined $file_prefix) || (!defined $file_ext)) {
283 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
284 }
285
286 return ($file_prefix,$file_ext);
287}
288
289sub metadata_read {
290 my $self = shift (@_);
291 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
292 # Keep track of filenames with same root but different extensions
293 # Used to support -associate_ext and the more generalised
294 # -associate_tail_re
295
296 my $associate_tail_re = $self->{'associate_tail_re'};
297 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
298
299 my ($file_prefix,$file_ext)
300 = $self->root_ext_split($file,$associate_tail_re);
301
302 if ((defined $file_prefix) && (defined $file_ext)) {
303
304 my $shared_fileroot = $self->{'shared_fileroot'};
305 if (!defined $shared_fileroot->{$file_prefix}) {
306 my $file_prefix_rec = { 'tie_to' => undef,
307 'exts' => {} };
308 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
309 }
310
311 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
312
313 my $process_exp = $self->{'process_exp'};
314
315 if ($file =~ m/$process_exp/) {
316 # This is the document the others should be tied to
317 $file_prefix_rec->{'tie_to'} = $file_ext;
318 }
319 else {
320 if ($file_ext =~ m/$associate_tail_re$/) {
321 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
322 }
323 }
324
325 }
326 }
327
328 # now check whether we are actually processing this
329 my $filename = $file;
330 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
331 if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) {
332 return undef; # can't recognise
333 }
334
335 # do smart blocking if appropriate
336 if ($self->{'smart_block'}) {
337 $self->store_block_files($filename);
338 }
339 # block the cover image if there is one
340 if ($self->{'cover_image'}) {
341 $self->block_cover_image($filename);
342 }
343
344 return 1;
345}
346
347sub tie_to_filename
348{
349 my $self = shift (@_);
350
351 my ($file_ext,$file_prefix_rec) = @_;
352
353 if (defined $file_prefix_rec) {
354 my $tie_to = $file_prefix_rec->{'tie_to'};
355
356 if (defined $tie_to) {
357 if ($tie_to eq $file_ext) {
358 return 1;
359 }
360 }
361 }
362
363 return 0;
364}
365
366sub tie_to_assoc_file
367{
368 my $self = shift (@_);
369 my ($file_ext,$file_prefix_rec) = @_;
370
371 if (defined $file_prefix_rec) {
372 my $tie_to = $file_prefix_rec->{'tie_to'};
373 if (defined $tie_to) {
374
375 my $exts = $file_prefix_rec->{'exts'};
376
377 my $has_file_ext = $exts->{$file_ext};
378
379 if ($has_file_ext) {
380 return 1;
381 }
382 }
383 }
384
385 return 0;
386}
387
388
389sub associate_with
390{
391 my $self = shift (@_);
392 my ($file, $filename, $metadata) = @_;
393
394 my $associate_tail_re = $self->{'associate_tail_re'};
395 return 0 if (!$associate_tail_re);
396
397 # If file, see if matches with "tie_to" doc or is one of the
398 # associated filename extensions.
399
400 my ($file_prefix,$file_ext) = $self->root_ext_split($file,$associate_tail_re);
401
402 if ((defined $file_prefix) && (defined $file_ext)) {
403
404 my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
405
406 if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
407
408 # Set up gsdlassocfile_tobe
409
410 my $exts = $file_prefix_rec->{'exts'};
411
412 if (!defined $metadata->{'gsdlassocfile_tobe'}) {
413 $metadata->{'gsdlassocfile_tobe'} = [];
414 }
415
416 my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
417
418 my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/);
419 foreach my $e (keys %$exts) {
420 my $assoc_file = "$full_prefix$e";
421 print STDERR " $self->{'plugin_type'}: Associating $file_prefix$e with $file_prefix_rec->{'tie_to'} version\n";
422 my $mime_type = ""; # let system auto detect this
423 push(@$assoc_tobe,"$assoc_file:$mime_type:");
424 }
425
426 }
427 elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
428
429
430 # a form of smart block
431 return 1;
432 }
433 }
434
435 return 0;
436}
437
438sub get_full_filenames {
439 my $self = shift (@_);
440 my ($base_dir, $file) = @_;
441
442 my $filename_full_path = $file;
443 # add on directory if present
444 $filename_full_path = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
445 my $filename_no_path = $file;
446 # remove directory if present
447 $filename_no_path =~ s/^.*[\/\\]//;
448 return ($filename_full_path, $filename_no_path);
449}
450
451sub read_block {
452 my $self = shift (@_);
453
454 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
455
456
457 my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file);
458
459 if ($self->associate_with($file,$filename_full_path,$metadata)) {
460 # a form of smart block
461 $self->{'num_blocked'} ++;
462 return (0,undef); # blocked
463 }
464
465 my $smart_block = $self->{'smart_block'};
466 my $smart_block_BN = $self->{'smart_block_BN'};
467
468 if ($smart_block || $smart_block_BN) {
469 if (defined $self->{'file_blocks'}->{$filename_full_path} && $self->{'file_blocks'}->{$filename_full_path} == 1){
470 $self->{'num_blocked'} ++;
471 return (0,undef); # blocked
472 }
473 } else {
474 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
475 $self->{'num_blocked'} ++;
476 return (0,undef); # blocked
477 }
478 if ($self->{'cover_image'}) {
479 if (defined $self->{'file_blocks'}->{$filename_full_path} && $self->{'file_blocks'}->{$filename_full_path} == 1){
480 $self->{'num_blocked'} ++;
481 return (0,undef); # blocked
482 }
483 }
484 }
485
486 if ($filename_full_path !~ /$self->{'process_exp'}/ || !-f $filename_full_path) {
487 return (undef,undef); # can't recognise
488 }
489
490 ##why are we returning the full filename - do we need this??
491 return (1,$filename_full_path);
492}
493
494
495#filename_encoding set by user
496sub filename_to_utf8_metadata
497{
498 my $self = shift (@_);
499 my ($file, $file_encoding) = @_;
500
501 my $outhandle = $self->{'outhandle'};
502
503 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
504
505 my $filename_encoding = $self->{'filename_encoding'};
506 if ($filename_encoding eq "auto") {
507 # we check the locale first
508 if (!defined $self->{'filesystem_encoding'}) {
509 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
510 $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'};
511 }
512 if ($self->{'filesystem_encoding'} ne "undefined") {
513 $filename_encoding = $self->{'filesystem_encoding'};
514 } else {
515 # try the encoding of the document, if available
516 if (defined $file_encoding) {
517 $filename_encoding = $file_encoding;
518 } else {
519 # use utf8
520 $filename_encoding = "utf8";
521 }
522 }
523
524 }
525
526 if ($filename_encoding !~ /(?:ascii|utf8|unicode)/) {
527 $filemeta = unicode::unicode2utf8(
528 unicode::convert2unicode($filename_encoding, \$filemeta)
529 );
530 }
531 my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
532
533 return $dmsafe_filemeta;
534
535}
536
537
538sub get_filesystem_encoding {
539
540 my $self = shift(@_);
541
542 my $outhandle = $self->{'outhandle'};
543 my $filesystem_encoding = undef;
544
545 eval {
546 use POSIX qw(locale_h);
547
548 # With only one parameter, setlocale retrieves the
549 # current value
550 my $current_locale = setlocale(LC_CTYPE);
551
552 if ($current_locale =~ m/^.*\.(.*?)$/) {
553 my $char_encoding = lc($1);
554 if ($char_encoding =~ m/^(iso)(8859)(\d{1,2})$/) {
555 $char_encoding = "$1\_$2\_$3";
556 }
557
558 $char_encoding =~ s/-/_/g;
559 $char_encoding =~ s/^utf_8$/utf8/;
560
561 if ($char_encoding =~ m/^\d+$/) {
562 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
563 $char_encoding = "windows_$char_encoding";
564 }
565 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
566 $char_encoding = "dos_$char_encoding";
567 }
568 }
569
570 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
571 || (defined $encodings::encodings->{$char_encoding})) {
572 $filesystem_encoding = $char_encoding;
573 }
574 else {
575 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
576 }
577 }
578
579
580 };
581 if ($@) {
582 print $outhandle "$@\n";
583 print $outhandle "Warning: Unable to establish locale. Will assume filesytem is UTF-8\n";
584
585 }
586 return $filesystem_encoding;
587}
588
589# is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
590sub set_Source_metadata {
591 my $self = shift (@_);
592 my ($doc_obj, $filename_no_path, $file_encoding) = @_;
593
594 my $top_section = $doc_obj->get_top_section();
595
596 # the original encoding filename
597 $doc_obj->set_metadata_element($top_section, "Source", $filename_no_path);
598 # UTF-8 version of filename
599 my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
600 $doc_obj->set_utf8_metadata_element($top_section, "SourceUTF8", $filemeta);
601
602}
603
604sub add_OID {
605
606 my $self = shift (@_);
607 my ($doc_obj) = @_;
608
609 # See if a metadata field is specified as the field
610 if ((defined $self->{'use_as_doc_identifier'}) && ($self->{'use_as_doc_identifier'} ne "")) {
611 my $metadata_doc_id = $self->{'use_as_doc_identifier'};
612
613 # Consider "tidying" up metadata_doc_id to be something
614 # suitable in a URL
615 # Could even support a user specified plugin RE for this.
616
617 my $top_section = $doc_obj->get_top_section();
618 my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id);
619## print STDERR "**** oid = $oid\n";
620 $doc_obj->set_OID($oid);
621 }
622 # See if there is a plugin-specific set_OID function...
623 elsif (defined ($self->can('set_OID'))) {
624 # it will need $doc_obj to set the Identifier metadata...
625 $self->set_OID(@_); # pass through any extra arguments supplied
626 } else {
627 # use the default set_OID() in doc.pm
628 $doc_obj->set_OID();
629 }
630}
631
632
633
634# The BasePlugin read_into_doc_obj() function. This function does all the
635# right things to make general options work for a given plugin. It doesn't do anything with the file other than setting reads in
636# a file and sets up a slew of metadata all saved in doc_obj, which
637# it then returns as part of a tuple (process_status,doc_obj)
638#
639# Much of this functionality used to reside in read, but it was broken
640# down into a supporting routine to make the code more flexible.
641#
642# recursive plugins (e.g. RecPlug) and specialized plugins like those
643# capable of processing many documents within a single file (e.g.
644# GMLPlug) will normally want to implement their own version of
645# read_into_doc_obj()
646#
647# Note that $base_dir might be "" and that $file might
648# include directories
649
650# currently blocking has been done before it gets here - does this affect secondary plugin stuff??
651sub read_into_doc_obj {
652 my $self = shift (@_);
653 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
654
655 my $outhandle = $self->{'outhandle'};
656
657 # should we move this to read? What about secondary plugins?
658 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
659 print $outhandle "$self->{'plugin_type'} processing $file\n"
660 if $self->{'verbosity'} > 1;
661
662 my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file);
663 # create a new document
664 my $doc_obj = new doc ($filename_full_path, "indexed_doc");
665 my $top_section = $doc_obj->get_top_section();
666
667 # this should look at the plugin option too...
668 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
669 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
670 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
671
672 $self->Set_Source_metadata($doc_obj, $filename_no_path);
673
674 # plugin specific stuff - what args do we need here??
675 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
676 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
677 return -1;
678 }
679
680 # include any metadata passed in from previous plugins
681 # note that this metadata is associated with the top level section
682 my $section = $doc_obj->get_top_section();
683 # can we merge these two methods??
684 $self->add_associated_files($doc_obj, $filename_full_path);
685 $self->extra_metadata ($doc_obj, $section, $metadata);
686 $self->auto_extract_metadata($doc_obj);
687
688 # if we haven't found any Title so far, assign one
689 # this was shifted to here from inside read()
690 $self->title_fallback($doc_obj,$section,$filename_no_path);
691
692 $self->add_OID($doc_obj);
693
694 return (1,$doc_obj);
695}
696
697sub add_dummy_text {
698 my $self = shift(@_);
699 my ($doc_obj, $section) = @_;
700
701 # add NoText metadata so we can hide this dummy text in format statements
702 $doc_obj->add_metadata($section, "NoText", "1");
703 $doc_obj->add_text($section, &gsprintf::lookup_string("{BasePlugin.dummy_text}",1));
704
705}
706
707# does nothing. Can be overridden by subclass
708sub auto_extract_metadata {
709 my $self = shift(@_);
710 my ($doc_obj) = @_;
711}
712
713# adds cover image, associate_file options stuff. Should be called by sub class
714# read_into_doc_obj
715sub add_associated_files {
716 my $self = shift(@_);
717 # whatis filename??
718 my ($doc_obj, $filename) = @_;
719
720 # add in the cover image
721 if ($self->{'cover_image'}) {
722 $self->associate_cover_image($doc_obj, $filename);
723 }
724
725
726}
727
728# The BasePlugin read() function. This function calls read_into_doc_obj()
729# to ensure all the right things to make general options work for a
730# given plugin are done. It then calls the process() function which
731# does all the work specific to a plugin (like the old read functions
732# used to do). Most plugins should define their own process() function
733# and let this read() function keep control.
734#
735# recursive plugins (e.g. RecPlug) and specialized plugins like those
736# capable of processing many documents within a single file (e.g.
737# GMLPlug) might want to implement their own version of read(), but
738# more likely need to implement their own version of read_into_doc_obj()
739#
740# Return number of files processed, undef if can't recognise, -1 if can't
741# process
742
743sub read {
744 my $self = shift (@_);
745 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
746
747 # check that we are not blocked
748 my ($block_status,$filename) = $self->read_block(@_);
749 return $block_status if ((!defined $block_status) || ($block_status==0));
750
751 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
752
753 if ((defined $process_status) && ($process_status == 1)) {
754
755 # process the document
756 $processor->process($doc_obj);
757
758 $self->{'num_processed'} ++;
759 undef $doc_obj;
760 }
761 # delete any temp files that we may have created
762 $self->clean_up_after_doc_obj_processing();
763
764 # if process_status == 1, then the file has been processed.
765 return $process_status;
766
767}
768
769# returns undef if file is rejected by the plugin
770sub process {
771 my $self = shift (@_);
772 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
773
774 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n") && die "\n";
775 # die "BasePlugin::process function must be implemented in sub-class\n";
776
777 return undef; # never gets here
778}
779
780# overwrite this method to delete any temp files that we have created
781sub clean_up_after_doc_obj_processing {
782 my $self = shift(@_);
783
784}
785# write_file -- used by ConvertToPlug, for example in post processing
786#
787# where should this go, is here the best place??
788sub utf8_write_file {
789 my $self = shift (@_);
790 my ($textref, $filename) = @_;
791
792 if (!open (FILE, ">$filename")) {
793 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
794 die "\n";
795 }
796 print FILE $$textref;
797
798 close FILE;
799}
800
801
802sub filename_based_title
803{
804 my $self = shift (@_);
805 my ($file) = @_;
806
807 my $file_derived_title = $file;
808 $file_derived_title =~ s/_/ /g;
809 $file_derived_title =~ s/\..*?$//;
810
811 return $file_derived_title;
812}
813
814
815sub title_fallback
816{
817 my $self = shift (@_);
818 my ($doc_obj,$section,$file) = @_;
819
820 if (!defined $doc_obj->get_metadata_element ($section, "Title") or $doc_obj->get_metadata_element($section, "Title") eq "") {
821
822 my $file_derived_title = $self->filename_to_metadata($self->filename_based_title($file));
823 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
824 $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
825 }
826 else {
827 $doc_obj->set_utf8_metadata ($section, "Title", $file_derived_title);
828 }
829 }
830
831}
832
833# add any extra metadata that's been passed around from one
834# plugin to another.
835# extra_metadata uses add_utf8_metadata so it expects metadata values
836# to already be in utf8
837sub extra_metadata {
838 my $self = shift (@_);
839 my ($doc_obj, $cursection, $metadata) = @_;
840
841 my $associate_tail_re = $self->{'associate_tail_re'};
842
843 foreach my $field (keys(%$metadata)) {
844 # $metadata->{$field} may be an array reference
845 if ($field eq "gsdlassocfile_tobe") {
846 # 'gsdlassocfile_tobe' is artificially introduced metadata
847 # that is used to signal that certain additional files should
848 # be tied to this document. Useful in situations where a
849 # metadata pass in the plugin pipeline works out some files
850 # need to be associated with a document, but the document hasn't
851 # been formed yet.
852
853 my $equiv_form = "";
854 foreach my $gaf (@{$metadata->{$field}}) {
855 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
856 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
857 my $filename = $full_filename;
858
859 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
860
861 # work out extended tail extension (i.e. matching tail re)
862
863 my ($file_prefix,$file_extended_ext)
864 = $self->root_ext_split($tail_filename,$associate_tail_re);
865 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
866
867 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
868 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
869 my $srcicon = "_icon".$doc_ext."_";
870 my $end_doclink = "</a>";
871
872 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
873
874 if (defined $pre_doc_ext) {
875 # for metadata such as [mp3._edited] [mp3._full] ...
876 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
877 }
878
879 # for multiple metadata such as [mp3.assoclink]
880 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
881
882 $equiv_form .= " $assoc_form";
883 }
884 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
885 }
886 elsif (ref ($metadata->{$field}) eq "ARRAY") {
887 map {
888 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
889 } @{$metadata->{$field}};
890 } else {
891 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
892 }
893 }
894}
895
896
897sub compile_stats {
898 my $self = shift(@_);
899 my ($stats) = @_;
900
901 $stats->{'num_processed'} += $self->{'num_processed'};
902 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
903 $stats->{'num_archives'} += $self->{'num_archives'};
904
905}
906
907sub associate_cover_image {
908 my $self = shift;
909 my ($doc_obj, $filename) = @_;
910
911 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
912 if (exists $self->{'covers_missing_cache'}->{$filename}) {
913 # don't stat() for existence eg for multiple document input files
914 # (eg SplitPlug)
915 return;
916 }
917
918 my $top_section=$doc_obj->get_top_section();
919
920 if (-e $filename) {
921 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
922 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
923 } else {
924 my $upper_filename = $filename;
925 $upper_filename =~ s/jpg$/JPG/;
926 if (-e $upper_filename) {
927 $doc_obj->associate_file($upper_filename, "cover.jpg",
928 "image/jpeg");
929 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
930 } else {
931 # file doesn't exist, so record the fact that it's missing so
932 # we don't stat() again (stat is slow)
933 $self->{'covers_missing_cache'}->{$filename} = 1;
934 }
935 }
936
937}
938
939
940# Overridden by exploding plugins (eg. ISISPlug)
941sub clean_up_after_exploding
942{
943 my $self = shift(@_);
944}
945
946
9471;
Note: See TracBrowser for help on using the repository browser.