source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 10155

Last change on this file since 10155 was 10155, checked in by davidb, 19 years ago

deinit subroutine added that balances out init routine. 'init' called only
once when pipeline is set up. deinit now called when *every* pass using
the pipeline has finished. Note this is different to the 'begin' and 'end'
subroutines that can potentially be called just before a new round of
file processing with the pipeline (note: presently only buildcol.pl makes
multiple passes using the pipeline, importing only uses one pass).

  • Property svn:keywords set to Author Date Id Revision
File size: 39.3 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28BEGIN {
29 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30}
31
32eval {require bytes};
33
34# suppress the annoying "subroutine redefined" warning that various
35# plugins cause under perl 5.6
36$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38use strict; no strict 'subs';
39
40use File::Basename;
41
42use Kea;
43use parsargv;
44use multiread;
45use encodings;
46use cnseg;
47use acronym;
48use textcat;
49use doc;
50eval "require diagnostics"; # some perl distros (eg mac) don't have this
51use DateExtract;
52use ghtml;
53use gsprintf 'gsprintf';
54use printusage;
55
56use GISBasPlug;
57
58@BasPlug::ISA = ( GISBasPlug );
59
60my $unicode_list =
61 [ { 'name' => "auto",
62 'desc' => "{BasPlug.input_encoding.auto}" },
63 { 'name' => "ascii",
64 'desc' => "{BasPlug.input_encoding.ascii}" },
65 { 'name' => "utf8",
66 'desc' => "{BasPlug.input_encoding.utf8}" },
67 { 'name' => "unicode",
68 'desc' => "{BasPlug.input_encoding.unicode}" } ];
69
70my $arguments =
71 [ { 'name' => "process_exp",
72 'desc' => "{BasPlug.process_exp}",
73 'type' => "regexp",
74 'deft' => "",
75 'reqd' => "no" },
76 { 'name' => "block_exp",
77 'desc' => "{BasPlug.block_exp}",
78 'type' => "regexp",
79 'deft' => "",
80 'reqd' => "no" },
81 { 'name' => "smart_block",
82 'desc' => "{BasPlug.smart_block}",
83 'type' => "flag",
84 'reqd' => "no" },
85 { 'name' => "associate_ext",
86 'desc' => "{BasPlug.associate_ext}",
87 'type' => "string",
88 'reqd' => "no" },
89 { 'name' => "input_encoding",
90 'desc' => "{BasPlug.input_encoding}",
91 'type' => "enum",
92 'list' => $unicode_list,
93 'reqd' => "no" ,
94 'deft' => "auto" } ,
95 { 'name' => "default_encoding",
96 'desc' => "{BasPlug.default_encoding}",
97 'type' => "enum",
98 'list' => $unicode_list,
99 'reqd' => "no",
100 'deft' => "utf8" },
101 { 'name' => "extract_language",
102 'desc' => "{BasPlug.extract_language}",
103 'type' => "flag",
104 'reqd' => "no" },
105 { 'name' => "default_language",
106 'desc' => "{BasPlug.default_language}",
107 'type' => "language",
108 'deft' => "en",
109 'reqd' => "no" },
110 { 'name' => "extract_acronyms",
111 'desc' => "{BasPlug.extract_acronyms}",
112 'type' => "flag",
113 'reqd' => "no" },
114 { 'name' => "markup_acronyms",
115 'desc' => "{BasPlug.markup_acronyms}",
116 'type' => "flag",
117 'reqd' => "no" },
118 { 'name' => "extract_keyphrases",
119 'desc' => "{BasPlug.extract_keyphrases}",
120 'type' => "flag",
121 'reqd' => "no" },
122 { 'name' => "extract_keyphrase_options",
123 'desc' => "{BasPlug.extract_keyphrase_options}",
124 'type' => "string",
125 'deft' => "",
126 'reqd' => "no" },
127 { 'name' => "first",
128 'desc' => "{BasPlug.first}",
129 'type' => "string",
130 'reqd' => "no" },
131 { 'name' => "extract_email",
132 'desc' => "{BasPlug.extract_email}",
133 'type' => "flag",
134 'reqd' => "no" },
135 { 'name' => "extract_historical_years",
136 'desc' => "{BasPlug.extract_historical_years}",
137 'type' => "flag",
138 'reqd' => "no" },
139 { 'name' => "maximum_year",
140 'desc' => "{BasPlug.maximum_year}",
141 'type' => "int",
142 'deft' => (localtime)[5]+1900,
143 'reqd' => "no"},
144 { 'name' => "maximum_century",
145 'desc' => "{BasPlug.maximum_century}",
146 'type' => "string",
147 'deft' => "",
148 'reqd' => "no" },
149 { 'name' => "no_bibliography",
150 'desc' => "{BasPlug.no_bibliography}",
151 'type' => "flag",
152 'reqd' => "no"},
153 { 'name' => "no_cover_image",
154 'desc' => "{BasPlug.no_cover_image}",
155 'type' => "flag",
156 'reqd' => "no" } ];
157
158my $gis_arguments =
159 [ { 'name' => "extract_placenames",
160 'desc' => "{GISBasPlug.extract_placenames}",
161 'type' => "flag",
162 'reqd' => "no" },
163 { 'name' => "gazetteer",
164 'desc' => "{GISBasPlug.gazetteer}",
165 'type' => "string",
166 'reqd' => "no" },
167 { 'name' => "place_list",
168 'desc' => "{GISBasPlug.place_list}",
169 'type' => "flag",
170 'reqd' => "no" } ];
171
172
173my $options = { 'name' => "BasPlug",
174 'desc' => "{BasPlug.desc}",
175 'abstract' => "yes",
176 'inherits' => "no",
177 'args' => $arguments };
178
179
180
181
182sub get_arguments
183{
184 my $self = shift(@_);
185 my $optionlistref = $self->{'option_list'};
186 my @optionlist = @$optionlistref;
187 my $pluginoptions = pop(@$optionlistref);
188 my $pluginarguments = $pluginoptions->{'args'};
189 return $pluginarguments;
190}
191
192
193sub print_xml_usage
194{
195 my $self = shift(@_);
196
197 # XML output is always in UTF-8
198 gsprintf::output_strings_in_UTF8;
199
200 PrintUsage::print_xml_header();
201 $self->print_xml();
202}
203
204
205sub print_xml
206{
207 my $self = shift(@_);
208
209 my $optionlistref = $self->{'option_list'};
210 my @optionlist = @$optionlistref;
211 my $pluginoptions = pop(@$optionlistref);
212 return if (!defined($pluginoptions));
213
214 gsprintf(STDERR, "<PlugInfo>\n");
215 gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
216 my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
217 $desc =~ s/</&amp;lt;/g; # doubly escaped
218 $desc =~ s/>/&amp;gt;/g;
219
220 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
221 gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
222 gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
223 gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
224 gsprintf(STDERR, " <Arguments>\n");
225 if (defined($pluginoptions->{'args'})) {
226 &PrintUsage::print_options_xml($pluginoptions->{'args'});
227 }
228
229 # Recurse up the plugin hierarchy
230 $self->print_xml();
231
232 gsprintf(STDERR, " </Arguments>\n");
233 gsprintf(STDERR, "</PlugInfo>\n");
234}
235
236
237sub print_txt_usage
238{
239 my $self = shift(@_);
240
241 # Print the usage message for a plugin (recursively)
242 my $descoffset = $self->determine_description_offset(0);
243 $self->print_plugin_usage($descoffset, 1);
244}
245
246
247sub determine_description_offset
248{
249 my $self = shift(@_);
250 my $maxoffset = shift(@_);
251
252 my $optionlistref = $self->{'option_list'};
253 my @optionlist = @$optionlistref;
254 my $pluginoptions = pop(@$optionlistref);
255 return $maxoffset if (!defined($pluginoptions));
256
257 # Find the length of the longest option string of this plugin
258 my $pluginargs = $pluginoptions->{'args'};
259 if (defined($pluginargs)) {
260 my $longest = &PrintUsage::find_longest_option_string($pluginargs);
261 if ($longest > $maxoffset) {
262 $maxoffset = $longest;
263 }
264 }
265
266 # Recurse up the plugin hierarchy
267 $maxoffset = $self->determine_description_offset($maxoffset);
268 $self->{'option_list'} = \@optionlist;
269 return $maxoffset;
270}
271
272
273sub print_plugin_usage
274{
275 my $self = shift(@_);
276 my $descoffset = shift(@_);
277 my $isleafclass = shift(@_);
278
279 my $optionlistref = $self->{'option_list'};
280 my @optionlist = @$optionlistref;
281 my $pluginoptions = pop(@$optionlistref);
282 return if (!defined($pluginoptions));
283
284 my $pluginname = $pluginoptions->{'name'};
285 my $pluginargs = $pluginoptions->{'args'};
286 my $plugindesc = $pluginoptions->{'desc'};
287
288 # Produce the usage information using the data structure above
289 if ($isleafclass) {
290 if (defined($plugindesc)) {
291 gsprintf(STDERR, "$plugindesc\n\n");
292 }
293 gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
294 }
295
296 # Display the plugin options, if there are some
297 if (defined($pluginargs)) {
298 # Calculate the column offset of the option descriptions
299 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
300
301 if ($isleafclass) {
302 gsprintf(STDERR, " {common.specific_options}:\n");
303 }
304 else {
305 gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
306 }
307
308 # Display the plugin options
309 &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
310 }
311
312 # Recurse up the plugin hierarchy
313 $self->print_plugin_usage($descoffset, 0);
314 $self->{'option_list'} = \@optionlist;
315}
316
317
318sub new {
319 my $class = shift (@_);
320 my $plugin_name = shift (@_);
321 my $self = {};
322 $self->{'plugin_type'} = "BasPlug";
323
324 if (GISBasPlug::has_mapdata()) {
325 push(@$arguments,@$gis_arguments);
326 }
327
328 my $enc = "^(";
329 map {$enc .= "$_|";} keys %$encodings::encodings;
330 my $denc = $enc . "ascii|utf8|unicode)\$";
331 $enc .= "ascii|utf8|unicode|auto)\$";
332
333 $self->{'outhandle'} = STDERR;
334 my $year = (localtime)[5]+1900;
335
336 $self->{'textcat'} = new textcat();
337
338 $self->{'num_processed'} = 0;
339 $self->{'num_not_processed'} = 0;
340 $self->{'num_blocked'} = 0;
341 $self->{'num_archives'} = 0;
342 $self->{'cover_image'} = 1; # cover image is on by default
343
344 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
345 $self->{'option_list'} = [ $options ];
346
347 my $no_cover_image = 0;
348 # general options available to all plugins
349 if (!parsargv::parse(\@_,
350 q^process_exp/.*/^, \$self->{'process_exp'},
351 q^block_exp/.*/^, \$self->{'block_exp'},
352 q^associate_ext/.*/^, \$self->{'associate_ext'},
353 q^extract_language^, \$self->{'extract_language'},
354 q^extract_acronyms^, \$self->{'extract_acronyms'},
355 q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
356 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
357 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
358 qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
359 q^extract_email^, \$self->{'extract_email'},
360 q^extract_placenames^, \$self->{'extract_placenames'},
361 q^gazetteer/.*/^, \$self->{'gazetteer'},
362 q^place_list^, \$self->{'place_list'},
363 q^markup_acronyms^, \$self->{'markup_acronyms'},
364 q^default_language/.{2}/en^, \$self->{'default_language'},
365 q^first/.*/^, \$self->{'first'},
366 q^extract_historical_years^, \$self->{'date_extract'},
367 qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
368 q^no_bibliography^, \$self->{'no_biblio'},
369 qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
370 q^no_cover_image^, \$no_cover_image,
371 q^separate_cjk^, \$self->{'separate_cjk'},
372 q^smart_block^, \$self->{'smart_block'},
373 q^smart_block_BN^, \$self->{'smart_block_BN'},
374 "allow_extra_options")) {
375
376 gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
377 bless $self, $class;
378 $self->print_txt_usage(""); # Use default resource bundle
379 die "\n";
380 }
381
382 my $associate_ext = $self->{'associate_ext'};
383 if ((defined $associate_ext) && ($associate_ext ne "")) {
384 my @exts = split(/,/,$associate_ext);
385
386 my %associate_ext_lookup = ();
387 foreach my $e (@exts) {
388 $associate_ext_lookup{$e} = 1;
389 }
390
391 $self->{'associate_ext_lookup'} = \%associate_ext_lookup;
392 }
393
394 $self->{'shared_fileroot'} = {};
395 $self->{'file_blocks'} = {};
396
397 $self->{'cover_image'} = 0 if ($no_cover_image);
398
399 if ($self->{'extract_placenames'}) {
400
401 my $outhandle = $self->{'outhandle'};
402
403 my $places_ref
404 = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
405
406 if (!defined $places_ref) {
407 print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
408 print $outhandle " No placename extraction will take place.\n";
409 $self->{'extract_placenames'} = undef;
410 }
411 else {
412 $self->{'places'} = $places_ref;
413 }
414 }
415 return bless $self, $class;
416}
417
418# initialize BasPlug options
419# if init() is overridden in a sub-class, remember to call BasPlug::init()
420sub init {
421 my $self = shift (@_);
422 my ($verbosity, $outhandle, $failhandle) = @_;
423
424 # verbosity is passed through from the processor
425 $self->{'verbosity'} = $verbosity;
426
427 # as are the outhandle and failhandle
428 $self->{'outhandle'} = $outhandle if defined $outhandle;
429 $self->{'failhandle'} = $failhandle;
430
431 # set process_exp and block_exp to defaults unless they were
432 # explicitly set
433
434 if ((!$self->is_recursive()) and
435 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
436
437 $self->{'process_exp'} = $self->get_default_process_exp ();
438 if ($self->{'process_exp'} eq "") {
439 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
440 }
441 }
442
443 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
444 $self->{'block_exp'} = $self->get_default_block_exp ();
445 }
446}
447
448sub begin {
449 my $self = shift (@_);
450 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
451 $self->initialise_extractors();
452}
453
454sub end {
455 # potentially called at the end of each plugin pass
456 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
457
458 my ($self) = @_;
459 $self->finalise_extractors();
460}
461
462sub deinit {
463 # called only once, after all plugin passes have been done
464
465 my ($self) = @_;
466}
467
468# this function should be overridden to return 1
469# in recursive plugins
470sub is_recursive {
471 my $self = shift (@_);
472
473 return 0;
474}
475
476sub get_default_block_exp {
477 my $self = shift (@_);
478
479 return "";
480}
481
482sub get_default_process_exp {
483 my $self = shift (@_);
484
485 return "";
486}
487
488# default implementation is to do nothing.
489sub store_block_files
490{
491 my $self =shift (@_);
492 my ($filename) = @_;
493 return;
494}
495
496#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
497sub block_cover_image
498{
499 my $self =shift (@_);
500 my ($filename) = @_;
501 if ($self->{'cover_image'}) {
502 my $coverfile = $filename;
503 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
504 if (!-e $coverfile) {
505 $coverfile =~ s/jpg$/JPG/;
506 }
507 if (-e $coverfile) {
508 $self->{'file_blocks'}->{$coverfile} = 1;
509 }
510 }
511
512 return;
513}
514
515sub metadata_read {
516 my $self = shift (@_);
517 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
518 # Keep track of filenames with same root but different extensions
519 # Used to support -associate_ext
520
521 my $associate_ext = $self->{'associate_ext'};
522 if ((defined $associate_ext) && ($associate_ext ne "")) {
523
524 my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
525 if ((defined $file_prefix) && (defined $file_ext)) {
526
527 my $shared_fileroot = $self->{'shared_fileroot'};
528 if (!defined $shared_fileroot->{$file_prefix}) {
529 my $file_prefix_rec = { 'tie_to' => undef, 'exts' => {} };
530 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
531 }
532
533 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
534
535 my $process_exp = $self->{'process_exp'};
536
537 if ($file =~ m/$self->{'process_exp'}/) {
538 # This is the document the others should be tied to
539 $file_prefix_rec->{'tie_to'} = $file_ext;
540 }
541 else {
542 if (defined $self->{'associate_ext_lookup'}->{$file_ext}) {
543 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
544 }
545 }
546 }
547 }
548
549 # now check whether we are actually processing this
550 my $filename = $file;
551 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
552 if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) {
553 return undef; # can't recognise
554 }
555
556 # do smart blocking if appropriate
557 if (defined $self->{'smart_block'}) {
558 $self->block_cover_image($filename);
559 $self->store_block_files($filename);
560 }
561
562 return 1;
563}
564
565sub tie_to_filename
566{
567 my $self = shift (@_);
568
569 my ($file_ext,$file_prefix_rec) = @_;
570
571 if (defined $file_prefix_rec) {
572 my $tie_to = $file_prefix_rec->{'tie_to'};
573
574 if (defined $tie_to) {
575 if ($tie_to eq $file_ext) {
576 return 1;
577 }
578 }
579 }
580
581 return 0;
582}
583
584sub tie_to_assoc_file
585{
586 my $self = shift (@_);
587 my ($file_ext,$file_prefix_rec) = @_;
588
589 if (defined $file_prefix_rec) {
590 my $tie_to = $file_prefix_rec->{'tie_to'};
591 if (defined $tie_to) {
592
593 my $exts = $file_prefix_rec->{'exts'};
594
595 my $has_file_ext = $exts->{$file_ext};
596
597 if ($has_file_ext) {
598 return 1;
599 }
600 }
601 }
602
603 return 0;
604}
605
606
607sub associate_with
608{
609 my $self = shift (@_);
610 my ($file, $filename, $metadata) = @_;
611
612 my $associate_ext = $self->{'associate_ext'};
613
614
615 return 0 if (!$associate_ext);
616
617 # If file, see if matches with "tie_to" doc or is one of the
618 # associated filename extensions.
619
620 my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
621 if ((defined $file_prefix) && (defined $file_ext)) {
622
623 my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
624
625 if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
626
627 # Set up gsdlassocfile_tobe
628
629 my $exts = $file_prefix_rec->{'exts'};
630
631 if (!defined $metadata->{'gsdlassocfile_tobe'}) {
632 $metadata->{'gsdlassocfile_tobe'} = [];
633 }
634
635 my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
636
637 my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/);
638 foreach my $e (keys %$exts) {
639 my $assoc_file = "$full_prefix.$e";
640 my $mime_type = ""; # let system auto detect this
641 push(@$assoc_tobe,"$assoc_file:$mime_type:");
642 }
643 }
644 elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
645 # a form of smart block
646
647 return 1;
648 }
649 }
650
651 return 0;
652}
653
654
655# The BasPlug read() function. This function does all the right things
656# to make general options work for a given plugin. It calls the process()
657# function which does all the work specific to a plugin (like the old
658# read functions used to do). Most plugins should define their own
659# process() function and let this read() function keep control.
660#
661# recursive plugins (e.g. RecPlug) and specialized plugins like those
662# capable of processing many documents within a single file (e.g.
663# GMLPlug) should normally implement their own version of read()
664#
665# Return number of files processed, undef if can't recognise, -1 if can't
666# process
667# Note that $base_dir might be "" and that $file might
668# include directories
669
670sub read {
671 my $self = shift (@_);
672
673 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
674
675 if ($self->is_recursive()) {
676 gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
677 }
678
679 my $outhandle = $self->{'outhandle'};
680 my $smart_block = $self->{'smart_block'};
681 my $smart_block_BN = $self->{'smart_block_BN'};
682
683 my $filename = $file;
684 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
685
686 if ($self->associate_with($file,$filename,$metadata)) {
687 # a form of smart block
688 $self->{'num_blocked'} ++;
689 return 0; # blocked
690 }
691
692 if ($smart_block || $smart_block_BN) {
693
694 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
695 $self->{'num_blocked'} ++;
696 return 0; # blocked
697 }
698 } elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
699 $self->{'num_blocked'} ++;
700 return 0; # blocked
701 }
702
703 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
704 return undef; # can't recognise
705 }
706 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
707
708 # Do encoding stuff
709 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
710
711 # create a new document
712 my $doc_obj = new doc ($filename, "indexed_doc");
713 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
714 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
715 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
716 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
717 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
718
719 my ($filemeta) = $file =~ /([^\\\/]+)$/;
720 # how do we know what encoding the filename is in?
721 $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
722 if ($self->{'cover_image'}) {
723 $self->associate_cover_image($doc_obj, $filename);
724 }
725
726 # read in file ($text will be in utf8)
727 my $text = "";
728 $self->read_file ($filename, $encoding, $language, \$text);
729
730 if (!length ($text)) {
731 my $plugin_name = ref ($self);
732 if ($gli) {
733 print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
734 }
735 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
736
737 my $failhandle = $self->{'failhandle'};
738 gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
739 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
740 $self->{'num_not_processed'} ++;
741
742 return 0; # what should we return here?? error but don't want to pass it on
743 }
744
745 # include any metadata passed in from previous plugins
746 # note that this metadata is associated with the top level section
747
748 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
749
750 # do plugin specific processing of doc_obj
751 unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
752 $text = '';
753 undef $text;
754 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
755 return -1;
756 }
757 $text='';
758 undef $text;
759
760 # do any automatic metadata extraction
761 $self->auto_extract_metadata ($doc_obj);
762
763 # add an OID
764 # see if there is a plugin-specific set_OID function...
765 if (defined ($self->can('set_OID'))) {
766 # it will need $doc_obj to set the Identifier metadata...
767 $self->set_OID($doc_obj);
768 } else {
769 # use the default set_OID() in doc.pm
770 $doc_obj->set_OID();
771 }
772
773 # process the document
774 $processor->process($doc_obj);
775
776 if(defined($self->{'places_filename'})){
777 &util::rm($self->{'places_filename'});
778 $self->{'places_filename'} = undef;
779 }
780
781 $self->{'num_processed'} ++;
782 undef $doc_obj;
783 return 1; # processed the file
784}
785
786# returns undef if file is rejected by the plugin
787sub process {
788 my $self = shift (@_);
789 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
790
791 gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
792 # die "Basplug::process function must be implemented in sub-class\n";
793
794 return undef; # never gets here
795}
796
797# uses the multiread package to read in the entire file pointed to
798# by filename and loads the resulting text into $$textref. Input text
799# may be in any of the encodings handled by multiread, output text
800# will be in utf8
801sub read_file {
802 my $self = shift (@_);
803 my ($filename, $encoding, $language, $textref) = @_;
804
805 if (!-r $filename)
806 {
807 my $outhandle = $self->{'outhandle'};
808 gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
809 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
810 return;
811 }
812
813 $$textref = "";
814
815 if (!open (FILE, $filename)) {
816 gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
817 die "\n";
818 }
819
820 if ($encoding eq "ascii") {
821 undef $/;
822 $$textref = <FILE>;
823 $/ = "\n";
824 } else {
825 my $reader = new multiread();
826 $reader->set_handle ('BasPlug::FILE');
827 $reader->set_encoding ($encoding);
828 $reader->read_file ($textref);
829
830 #Now segments chinese if the separate_cjk option is set
831 if ($self->{'separate_cjk'}) {
832 # segment the Chinese words
833 $$textref = &cnseg::segment($$textref);
834 }
835 }
836
837 close FILE;
838}
839
840sub filename_based_title
841{
842 my $self = shift (@_);
843 my ($file) = @_;
844
845 my $file_derived_title = $file;
846 $file_derived_title =~ s/_/ /g;
847 $file_derived_title =~ s/\..*?$//;
848
849 return $file_derived_title;
850}
851
852
853sub title_fallback
854{
855 my $self = shift (@_);
856 my ($doc_obj,$section,$file) = @_;
857
858 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
859
860 my $file_derived_title = $self->filename_based_title($file);
861 $doc_obj->add_metadata ($section, "Title", $file_derived_title);
862 }
863}
864
865sub textcat_get_language_encoding {
866 my $self = shift (@_);
867 my ($filename) = @_;
868
869 my ($language, $encoding, $extracted_encoding);
870 if ($self->{'input_encoding'} eq "auto") {
871 # use textcat to automatically work out the input encoding and language
872 ($language, $encoding) = $self->get_language_encoding ($filename);
873 } elsif ($self->{'extract_language'}) {
874 # use textcat to get language metadata
875 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
876 $encoding = $self->{'input_encoding'};
877 # don't print this message for english... english in utf8 is identical
878 # to english in iso-8859-1 (except for some punctuation). We don't have
879 # a language model for en_utf8, so textcat always says iso-8859-1!
880 if ($extracted_encoding ne $encoding && $language ne "en"
881 && $self->{'verbosity'}) {
882 my $plugin_name = ref ($self);
883 my $outhandle = $self->{'outhandle'};
884 gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
885 # print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
886 # print $outhandle "appears to be encoded as $extracted_encoding.\n";
887 }
888 } else {
889 $language = $self->{'default_language'};
890 $encoding = $self->{'input_encoding'};
891 }
892 return ($language, $encoding);
893}
894
895# Uses textcat to work out the encoding and language of the text in
896# $filename. All html tags are removed before processing.
897# returns an array containing "language" and "encoding"
898sub get_language_encoding {
899 my $self = shift (@_);
900 my ($filename) = @_;
901 my $outhandle = $self->{'outhandle'};
902 my $unicode_format = "";
903 # read in file
904 open (FILE, $filename) || (gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
905 undef $/;
906 my $text = <FILE>;
907 $/ = "\n";
908 close FILE;
909
910 # check if first few bytes have a Byte Order Marker
911 my $bom=substr($text,0,2); # check 16bit unicode
912 if ($bom eq "\xff\xfe") { # little endian 16bit unicode
913 $unicode_format="unicode";
914 } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
915 $unicode_format="unicode";
916 } else {
917 $bom=substr($text,0,3); # check utf-8
918 if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
919 $unicode_format="utf8";
920# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
921# $unicode_format="utf8";
922 }
923 }
924
925
926 # remove <title>stuff</title> -- as titles tend often to be in English
927 # for foreign language documents
928 $text =~ s/<title>(.|\n)*?<\/title>//i;
929
930 # remove all HTML tags
931 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
932 if (ref($self) eq 'HTMLPlug' ||
933 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
934 $text =~ s/<[^>]*>//sg;
935 }
936
937 # get the language/encoding
938 my $results = $self->{'textcat'}->classify(\$text);
939
940 # if textcat returns 3 or less possibilities we'll use the
941 # first one in the list - otherwise use the defaults
942 if (scalar @$results > 3) {
943 my $best_encoding="";
944 if ($unicode_format) { # in case the first had a BOM
945 $best_encoding=$unicode_format;
946 } else {
947 my %guessed_encodings = ();
948 foreach my $result (@$results) {
949 $result =~ /([^\-]+)$/;
950 my $enc=$1;
951 if (!defined($guessed_encodings{$enc})) {
952 $guessed_encodings{$enc}=0;
953 }
954 $guessed_encodings{$enc}++;
955 }
956
957 $guessed_encodings{""}=-1; # for default best_encoding of ""
958 foreach my $enc (keys %guessed_encodings) {
959 if ($guessed_encodings{$enc} >
960 $guessed_encodings{$best_encoding}){
961 $best_encoding=$enc;
962 }
963 }
964 }
965
966 if ($self->{'input_encoding'} ne 'auto') {
967 if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
968 gsprintf($outhandle,
969 "BasPlug: {BasPlug.could_not_extract_language}\n",
970 $filename, $self->{'default_language'});
971 }
972 return ($self->{'default_language'}, $self->{'input_encoding'});
973
974 } else {
975 if ($self->{'verbosity'}>2) {
976 gsprintf($outhandle,
977 "BasPlug: {BasPlug.could_not_extract_language}\n",
978 $filename, $self->{'default_language'});
979 }
980 return ($self->{'default_language'}, $best_encoding);
981 }
982 }
983
984 # format language/encoding
985 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
986 if (!defined $language) {
987 if ($self->{'verbosity'}>2) {
988 gsprintf($outhandle,
989 "BasPlug: {BasPlug.could_not_extract_language}\n",
990 $filename, $self->{'default_language'});
991 }
992 $language = $self->{'default_language'};
993 }
994 if (!defined $encoding) {
995 if ($self->{'verbosity'}>2) {
996 gsprintf($outhandle,
997 "BasPlug: {BasPlug.could_not_extract_encoding}\n",
998 $filename, $self->{'default_encoding'});
999 }
1000 $encoding = $self->{'default_encoding'};
1001 }
1002
1003
1004 # check for equivalents where textcat doesn't have some encodings...
1005 # eg MS versions of standard encodings
1006 if ($encoding =~ /^iso_8859_(\d+)/) {
1007 my $iso = $1; # which variant of the iso standard?
1008 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
1009 if ($text =~ /[\x80-\x9f]/) {
1010 # Western Europe
1011 if ($iso == 1 or $iso == 15) { $encoding = 'windows_1252' }
1012 elsif ($iso == 2) { $encoding = 'windows_1250' } # Central Europe
1013 elsif ($iso == 5) { $encoding = 'windows_1251' } # Cyrillic
1014 elsif ($iso == 6) { $encoding = 'windows_1256' } # Arabic
1015 elsif ($iso == 7) { $encoding = 'windows_1253' } # Greek
1016 elsif ($iso == 8) { $encoding = 'windows_1255' } # Hebrew
1017 elsif ($iso == 9) { $encoding = 'windows_1254' } # Turkish
1018 }
1019 }
1020
1021 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
1022 !defined $encodings::encodings->{$encoding}) {
1023 if ($self->{'verbosity'}) {
1024 gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1025 $filename, $encoding, $self->{'default_encoding'});
1026 }
1027 $encoding = $self->{'default_encoding'};
1028 }
1029
1030 return ($language, $encoding);
1031}
1032
1033# add any extra metadata that's been passed around from one
1034# plugin to another.
1035# extra_metadata uses add_utf8_metadata so it expects metadata values
1036# to already be in utf8
1037sub extra_metadata {
1038 my $self = shift (@_);
1039 my ($doc_obj, $cursection, $metadata) = @_;
1040
1041 foreach my $field (keys(%$metadata)) {
1042 # $metadata->{$field} may be an array reference
1043 if ($field eq "gsdlassocfile_tobe") {
1044 # 'gsdlassocfile_tobe' is artificially introduced metadata
1045 # that is used to signal that certain additional files should
1046 # be tied to this document. Useful in situations where a
1047 # metadata pass in the plugin pipeline works out some files
1048 # need to be associated with a document, but the document hasn't
1049 # been formed yet.
1050
1051 my $equiv_form = "";
1052 foreach my $gaf (@{$metadata->{$field}}) {
1053 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1054 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1055 my $filename = $full_filename;
1056
1057 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1058
1059 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1060 my $start_doclink = "<a href=\"_httpcollection_/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1061 my $srcicon = "_icon".$doc_ext."_";
1062 my $end_doclink = "</a>";
1063
1064 $equiv_form .= " $start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1065 }
1066 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1067 }
1068 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1069 map {
1070 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1071 } @{$metadata->{$field}};
1072 } else {
1073 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1074 }
1075 }
1076}
1077
1078# initialise metadata extractors
1079sub initialise_extractors {
1080 my $self = shift (@_);
1081
1082 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1083 &acronym::initialise_acronyms();
1084 }
1085}
1086
1087# finalise metadata extractors
1088sub finalise_extractors {
1089 my $self = shift (@_);
1090
1091 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1092 &acronym::finalise_acronyms();
1093 }
1094}
1095
1096# FIRSTNNN: extract the first NNN characters as metadata
1097sub extract_first_NNNN_characters {
1098 my $self = shift (@_);
1099 my ($textref, $doc_obj, $thissection) = @_;
1100
1101 foreach my $size (split /,/, $self->{'first'}) {
1102 my $tmptext = $$textref;
1103 $tmptext =~ s/^\s+//;
1104 $tmptext =~ s/\s+$//;
1105 $tmptext =~ s/\s+/ /gs;
1106 $tmptext = substr ($tmptext, 0, $size);
1107 $tmptext =~ s/\s\S*$/&#8230;/;
1108 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1109 }
1110}
1111
1112sub extract_email {
1113 my $self = shift (@_);
1114 my ($textref, $doc_obj, $thissection) = @_;
1115 my $outhandle = $self->{'outhandle'};
1116
1117 gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1118 if ($self->{'verbosity'} > 2);
1119
1120 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
1121 @email = sort @email;
1122
1123 my @email2 = ();
1124 foreach my $address (@email) {
1125 if (!(join(" ",@email2) =~ m/$address/ )) {
1126 push @email2, $address;
1127 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1128 gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1129 if ($self->{'verbosity'} > 3);
1130 }
1131 }
1132 gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1133 if ($self->{'verbosity'} > 2);
1134}
1135
1136# extract metadata
1137sub auto_extract_metadata {
1138
1139 my $self = shift (@_);
1140 my ($doc_obj) = @_;
1141
1142 if ($self->{'extract_email'}) {
1143 my $thissection = $doc_obj->get_top_section();
1144 while (defined $thissection) {
1145 my $text = $doc_obj->get_text($thissection);
1146 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1147 $thissection = $doc_obj->get_next_section ($thissection);
1148 }
1149 }
1150 if ($self->{'extract_placenames'}) {
1151 my $thissection = $doc_obj->get_top_section();
1152 while (defined $thissection) {
1153 my $text = $doc_obj->get_text($thissection);
1154 $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1155 $thissection = $doc_obj->get_next_section ($thissection);
1156 }
1157 }
1158
1159 # adding kea keyphrases
1160 if ($self->{'kea'}) {
1161
1162 my $thissection = $doc_obj->get_top_section();
1163 my $text = "";
1164 my $list;
1165
1166 #loop through sections to gather whole doc
1167 while (defined $thissection) {
1168 my $sectiontext = $doc_obj->get_text($thissection);
1169 $text = $text.$sectiontext;
1170 $thissection = $doc_obj->get_next_section ($thissection);
1171 }
1172
1173 if ($self->{'kea_options'}) {
1174 #if kea options flag is set, call Kea with specified options
1175 $list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
1176 } else {
1177 #otherwise call Kea with no options
1178 $list = &Kea::extract_KeyPhrases ($text);
1179 }
1180 if ($list){
1181 # if a list of kea keyphrases was returned (ie not empty)
1182 if ($self->{'verbosity'}) {
1183 gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1184 }
1185
1186 #add metadata to top section
1187 $thissection = $doc_obj->get_top_section();
1188
1189 # add all key phrases as one metadata
1190 $doc_obj->add_metadata($thissection, "Keyphrases", $list);
1191
1192 # add individual key phrases as multiple metadata
1193 foreach my $keyphrase (split(',', $list)) {
1194 $keyphrase =~ s/^\s+|\s+$//g;
1195 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1196 }
1197 }
1198 } #end of kea
1199
1200 if ($self->{'first'}) {
1201 my $thissection = $doc_obj->get_top_section();
1202 while (defined $thissection) {
1203 my $text = $doc_obj->get_text($thissection);
1204 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1205 $thissection = $doc_obj->get_next_section ($thissection);
1206 }
1207 }
1208
1209 if ($self->{'extract_acronyms'}) {
1210 my $thissection = $doc_obj->get_top_section();
1211 while (defined $thissection) {
1212 my $text = $doc_obj->get_text($thissection);
1213 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1214 $thissection = $doc_obj->get_next_section ($thissection);
1215 }
1216 }
1217
1218 if ($self->{'markup_acronyms'}) {
1219 my $thissection = $doc_obj->get_top_section();
1220 while (defined $thissection) {
1221 my $text = $doc_obj->get_text($thissection);
1222 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1223 $doc_obj->delete_text($thissection);
1224 $doc_obj->add_text($thissection, $text);
1225 $thissection = $doc_obj->get_next_section ($thissection);
1226 }
1227 }
1228
1229 if($self->{'date_extract'}) {
1230 my $thissection = $doc_obj->get_top_section();
1231 while (defined $thissection) {
1232
1233 my $text = $doc_obj->get_text($thissection);
1234 &DateExtract::get_date_metadata($text, $doc_obj,
1235 $thissection,
1236 $self->{'no_biblio'},
1237 $self->{'max_year'},
1238 $self->{'max_century'});
1239 $thissection = $doc_obj->get_next_section ($thissection);
1240 }
1241 }
1242}
1243
1244# extract acronyms from a section in a document. progress is
1245# reported to outhandle based on the verbosity. both the Acronym
1246# and the AcronymKWIC metadata items are created.
1247
1248sub extract_acronyms {
1249 my $self = shift (@_);
1250 my ($textref, $doc_obj, $thissection) = @_;
1251 my $outhandle = $self->{'outhandle'};
1252
1253 # print $outhandle " extracting acronyms ...\n"
1254 gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1255 if ($self->{'verbosity'} > 2);
1256
1257 my $acro_array = &acronym::acronyms($textref);
1258
1259 foreach my $acro (@$acro_array) {
1260
1261 #check that this is the first time ...
1262 my $seen_before = "false";
1263 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1264 foreach my $thisAcro (@$previous_data) {
1265 if ($thisAcro eq $acro->to_string()) {
1266 $seen_before = "true";
1267 if ($self->{'verbosity'} >= 4) {
1268 gsprintf($outhandle, " {BasPlug.already_seen} " .
1269 $acro->to_string() . "\n");
1270 }
1271 }
1272 }
1273
1274 if ($seen_before eq "false") {
1275 #write it to the file ...
1276 $acro->write_to_file();
1277
1278 #do the normal acronym
1279 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1280 gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1281 if ($self->{'verbosity'} > 3);
1282 }
1283 }
1284
1285 gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1286 if ($self->{'verbosity'} > 2);
1287}
1288
1289sub markup_acronyms {
1290 my $self = shift (@_);
1291 my ($text, $doc_obj, $thissection) = @_;
1292 my $outhandle = $self->{'outhandle'};
1293
1294 gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1295 if ($self->{'verbosity'} > 2);
1296
1297 #self is passed in to check for verbosity ...
1298 $text = &acronym::markup_acronyms($text, $self);
1299
1300 gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1301 if ($self->{'verbosity'} > 2);
1302
1303 return $text;
1304}
1305
1306sub compile_stats {
1307 my $self = shift(@_);
1308 my ($stats) = @_;
1309
1310 $stats->{'num_processed'} += $self->{'num_processed'};
1311 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1312 $stats->{'num_archives'} += $self->{'num_archives'};
1313
1314}
1315
1316sub associate_cover_image {
1317 my $self = shift(@_);
1318 my ($doc_obj, $filename) = @_;
1319
1320 my $top_section=$doc_obj->get_top_section();
1321
1322 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1323 if (-e $filename) {
1324 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1325 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1326 } else {
1327 $filename =~ s/jpg$/JPG/;
1328 if (-e $filename) {
1329 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1330 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1331 }
1332 }
1333}
1334
13351;
Note: See TracBrowser for help on using the repository browser.