source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 10276

Last change on this file since 10276 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 39.8 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28BEGIN {
29 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30}
31
32eval {require bytes};
33
34# suppress the annoying "subroutine redefined" warning that various
35# plugins cause under perl 5.6
36$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38use strict;
39no strict 'subs';
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
42use File::Basename;
43
44use Kea;
45use parsargv;
46use multiread;
47use encodings;
48use cnseg;
49use acronym;
50use textcat;
51use doc;
52eval "require diagnostics"; # some perl distros (eg mac) don't have this
53use DateExtract;
54use ghtml;
55use gsprintf 'gsprintf';
56use printusage;
57#$%^
58use parse2;
59
60
61use GISBasPlug;
62
63@BasPlug::ISA = ( GISBasPlug );
64
65my $unicode_list =
66 [ { 'name' => "ascii",
67 'desc' => "{BasPlug.input_encoding.ascii}" },
68 { 'name' => "utf8",
69 'desc' => "{BasPlug.input_encoding.utf8}" },
70 { 'name' => "unicode",
71 'desc' => "{BasPlug.input_encoding.unicode}" } ];
72
73my $auto_unicode_list =
74 [ { 'name' => "auto",
75 'desc' => "{BasPlug.input_encoding.auto}" } ];
76
77my $arguments =
78 [ { 'name' => "process_exp",
79 'desc' => "{BasPlug.process_exp}",
80 'type' => "regexp",
81 'deft' => "",
82 'reqd' => "no" },
83 { 'name' => "block_exp",
84 'desc' => "{BasPlug.block_exp}",
85 'type' => "regexp",
86 'deft' => "",
87 'reqd' => "no" },
88 { 'name' => "smart_block",
89 'desc' => "{BasPlug.smart_block}",
90 'type' => "flag",
91 'reqd' => "no" },
92 { 'name' => "associate_ext",
93 'desc' => "{BasPlug.associate_ext}",
94 'type' => "string",
95 'reqd' => "no" },
96 { 'name' => "input_encoding",
97 'desc' => "{BasPlug.input_encoding}",
98 'type' => "enum",
99 'list' => $auto_unicode_list,
100 'reqd' => "no" ,
101 'deft' => "auto" } ,
102 { 'name' => "default_encoding",
103 'desc' => "{BasPlug.default_encoding}",
104 'type' => "enum",
105 'list' => $unicode_list,
106 'reqd' => "no",
107 'deft' => "utf8" },
108 { 'name' => "extract_language",
109 'desc' => "{BasPlug.extract_language}",
110 'type' => "flag",
111 'reqd' => "no" },
112 { 'name' => "default_language",
113 'desc' => "{BasPlug.default_language}",
114 'type' => "language",
115 'deft' => "en",
116 'char_length' => "2",
117 'reqd' => "no" },
118 { 'name' => "extract_acronyms",
119 'desc' => "{BasPlug.extract_acronyms}",
120 'type' => "flag",
121 'reqd' => "no" },
122 { 'name' => "markup_acronyms",
123 'desc' => "{BasPlug.markup_acronyms}",
124 'type' => "flag",
125 'reqd' => "no" },
126 { 'name' => "extract_keyphrases",
127 'desc' => "{BasPlug.extract_keyphrases}",
128 'type' => "flag",
129 'reqd' => "no" },
130 { 'name' => "extract_keyphrase_options",
131 'desc' => "{BasPlug.extract_keyphrase_options}",
132 'type' => "string",
133 'deft' => "",
134 'reqd' => "no" },
135 { 'name' => "first",
136 'desc' => "{BasPlug.first}",
137 'type' => "string",
138 'reqd' => "no" },
139 { 'name' => "extract_email",
140 'desc' => "{BasPlug.extract_email}",
141 'type' => "flag",
142 'reqd' => "no" },
143 { 'name' => "extract_historical_years",
144 'desc' => "{BasPlug.extract_historical_years}",
145 'type' => "flag",
146 'reqd' => "no" },
147 { 'name' => "maximum_year",
148 'desc' => "{BasPlug.maximum_year}",
149 'type' => "int",
150 'deft' => (localtime)[5]+1900,
151 'char_length' => "4",
152 #'range' => "2,100",
153 'reqd' => "no"},
154 { 'name' => "maximum_century",
155 'desc' => "{BasPlug.maximum_century}",
156 'type' => "string",
157 'deft' => "-1",
158 'reqd' => "no" },
159 { 'name' => "no_bibliography",
160 'desc' => "{BasPlug.no_bibliography}",
161 'type' => "flag",
162 'reqd' => "no"},
163 { 'name' => "no_cover_image",
164 'desc' => "{BasPlug.no_cover_image}",
165 'type' => "flag",
166 'reqd' => "no" },
167 { 'name' => "extract_keyphrases",
168 'desc' => "{BasPlug.extract_keyphrases}",
169 'type' => "flag",
170 'reqd' => "no",
171 'hiddengli' => "yes" },
172 { 'name' => "extract_keyphrase_options",
173 'desc' => "{BasPlug.extract_keyphrase_options}",
174 'type' => "string",
175 'reqd' => "no",
176 'hiddengli' => "yes" },
177 { 'name' => "separate_cjk",
178 'desc' => "{BasPlug.separate_cjk}",
179 'type' => "flag",
180 'reqd' => "no",
181 'hiddengli' => "yes" },
182 { 'name' => "smart_block",
183 'desc' => "{BasPlug.smart_block}",
184 'type' => "flag",
185 'reqd' => "no",
186 'hiddengli' => "yes" },
187 { 'name' => "new_extract_email",
188 'desc' => "",
189 'type' => "flag",
190 'reqd' => "no",
191 'hiddengli' => "yes" } ];
192
193my $gis_arguments =
194 [ { 'name' => "extract_placenames",
195 'desc' => "{GISBasPlug.extract_placenames}",
196 'type' => "flag",
197 'reqd' => "no" },
198 { 'name' => "gazetteer",
199 'desc' => "{GISBasPlug.gazetteer}",
200 'type' => "string",
201 'reqd' => "no" },
202 { 'name' => "place_list",
203 'desc' => "{GISBasPlug.place_list}",
204 'type' => "flag",
205 'reqd' => "no" } ];
206
207
208my $options = { 'name' => "BasPlug",
209 'desc' => "{BasPlug.desc}",
210 'abstract' => "yes",
211 'inherits' => "no",
212 'args' => $arguments };
213
214
215
216
217sub get_arguments
218{
219 my $self = shift(@_);
220 my $optionlistref = $self->{'option_list'};
221 my @optionlist = @$optionlistref;
222 my $pluginoptions = pop(@$optionlistref);
223 my $pluginarguments = $pluginoptions->{'args'};
224 return $pluginarguments;
225}
226
227
228sub print_xml_usage
229{
230 my $self = shift(@_);
231
232 # XML output is always in UTF-8
233 gsprintf::output_strings_in_UTF8;
234
235 PrintUsage::print_xml_header();
236 $self->print_xml();
237}
238
239
240sub print_xml
241{
242 my $self = shift(@_);
243
244 my $optionlistref = $self->{'option_list'};
245 my @optionlist = @$optionlistref;
246 my $pluginoptions = shift(@$optionlistref);
247 return if (!defined($pluginoptions));
248
249 gsprintf(STDERR, "<PlugInfo>\n");
250 gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
251 my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
252 $desc =~ s/</&amp;lt;/g; # doubly escaped
253 $desc =~ s/>/&amp;gt;/g;
254
255 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
256 gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
257 gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
258 gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
259 gsprintf(STDERR, " <Arguments>\n");
260 if (defined($pluginoptions->{'args'})) {
261 &PrintUsage::print_options_xml($pluginoptions->{'args'});
262 }
263
264 # Recurse up the plugin hierarchy
265 $self->print_xml();
266
267 gsprintf(STDERR, " </Arguments>\n");
268 gsprintf(STDERR, "</PlugInfo>\n");
269}
270
271
272sub print_txt_usage
273{
274 my $self = shift(@_);
275 # Print the usage message for a plugin (recursively)
276 my $descoffset = $self->determine_description_offset(0);
277 $self->print_plugin_usage($descoffset, 1);
278}
279
280
281sub determine_description_offset
282{
283 my $self = shift(@_);
284 my $maxoffset = shift(@_);
285
286 my $optionlistref = $self->{'option_list'};
287 my @optionlist = @$optionlistref;
288 my $pluginoptions = shift(@$optionlistref);
289 return $maxoffset if (!defined($pluginoptions));
290
291 # Find the length of the longest option string of this plugin
292 my $pluginargs = $pluginoptions->{'args'};
293 if (defined($pluginargs)) {
294 my $longest = &PrintUsage::find_longest_option_string($pluginargs);
295 if ($longest > $maxoffset) {
296 $maxoffset = $longest;
297 }
298 }
299
300 # Recurse up the plugin hierarchy
301 $maxoffset = $self->determine_description_offset($maxoffset);
302 $self->{'option_list'} = \@optionlist;
303 return $maxoffset;
304}
305
306
307sub print_plugin_usage
308{
309 my $self = shift(@_);
310 my $descoffset = shift(@_);
311 my $isleafclass = shift(@_);
312
313 my $optionlistref = $self->{'option_list'};
314 my @optionlist = @$optionlistref;
315 my $pluginoptions = shift(@$optionlistref);
316 return if (!defined($pluginoptions));
317
318 my $pluginname = $pluginoptions->{'name'};
319 my $pluginargs = $pluginoptions->{'args'};
320 my $plugindesc = $pluginoptions->{'desc'};
321
322 # Produce the usage information using the data structure above
323 if ($isleafclass) {
324 if (defined($plugindesc)) {
325 gsprintf(STDERR, "$plugindesc\n\n");
326 }
327 gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
328 }
329
330 # Display the plugin options, if there are some
331 if (defined($pluginargs)) {
332 # Calculate the column offset of the option descriptions
333 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
334
335 if ($isleafclass) {
336 gsprintf(STDERR, " {common.specific_options}:\n");
337 }
338 else {
339 gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
340 }
341
342 # Display the plugin options
343 &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
344 }
345
346 # Recurse up the plugin hierarchy
347 $self->print_plugin_usage($descoffset, 0);
348 $self->{'option_list'} = \@optionlist;
349}
350
351
352sub new {
353 # Set Encodings to the list!!
354
355 my $e = $encodings::encodings;
356 foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
357 {
358 my $hashEncode =
359 {'name' => $enc,
360 'desc' => $e->{$enc}->{'name'}};
361
362 push(@{$unicode_list},$hashEncode);
363 }
364
365 push(@{$auto_unicode_list},@{$unicode_list});
366
367 # Start the BasPlug Constructor
368 my $class = shift (@_);
369 my ($pluginlist,$args,$hashArgOptLists) = @_;
370 push(@$pluginlist, $class);
371 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
372
373 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
374 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
375
376 if (GISBasPlug::has_mapdata()) {
377 push(@$arguments,@$gis_arguments);
378 }
379
380 my $self = {};
381 if(!parse2::parse($args,$hashArgOptLists->{"ArgList"},$self))
382 {
383 my $classTempClass = bless $self, $class;
384 &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
385 $classTempClass->print_txt_usage(""); # Use default resource bundle
386 die "\n";
387 }
388
389 # else parsing was successful.
390
391 $self->{'plugin_type'} = $plugin_name;
392 $self->{'outhandle'} = STDERR;
393 $self->{'textcat'} = new textcat();
394 $self->{'num_processed'} = 0;
395 $self->{'num_not_processed'} = 0;
396 $self->{'num_blocked'} = 0;
397 $self->{'num_archives'} = 0;
398 $self->{'cover_image'} = 1; # cover image is on by default
399 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
400 $self->{'file_blocks'} = {};
401 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
402
403 my $associate_ext = $self->{'associate_ext'};
404 if ((defined $associate_ext) && ($associate_ext ne "")) {
405 my @exts = split(/,/,$associate_ext);
406
407 my %associate_ext_lookup = ();
408 foreach my $e (@exts) {
409 $associate_ext_lookup{$e} = 1;
410 }
411
412 $self->{'associate_ext_lookup'} = \%associate_ext_lookup;
413 }
414
415 $self->{'shared_fileroot'} = {};
416 $self->{'file_blocks'} = {};
417
418 if ($self->{'extract_placenames'}) {
419
420 my $outhandle = $self->{'outhandle'};
421
422 my $places_ref
423 = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
424
425 if (!defined $places_ref) {
426 print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
427 print $outhandle " No placename extraction will take place.\n";
428 $self->{'extract_placenames'} = undef;
429 }
430 else {
431 $self->{'places'} = $places_ref;
432 }
433 }
434 return bless $self, $class;
435
436}
437
438# initialize BasPlug options
439# if init() is overridden in a sub-class, remember to call BasPlug::init()
440sub init {
441 my $self = shift (@_);
442 my ($verbosity, $outhandle, $failhandle) = @_;
443
444 # verbosity is passed through from the processor
445 $self->{'verbosity'} = $verbosity;
446
447 # as are the outhandle and failhandle
448 $self->{'outhandle'} = $outhandle if defined $outhandle;
449 $self->{'failhandle'} = $failhandle;
450
451 # set process_exp and block_exp to defaults unless they were
452 # explicitly set
453
454 if ((!$self->is_recursive()) and
455 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
456
457 $self->{'process_exp'} = $self->get_default_process_exp ();
458 if ($self->{'process_exp'} eq "") {
459 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
460 }
461 }
462
463 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
464 $self->{'block_exp'} = $self->get_default_block_exp ();
465 }
466}
467
468sub begin {
469 my $self = shift (@_);
470 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
471 $self->initialise_extractors();
472}
473
474sub end {
475 # potentially called at the end of each plugin pass
476 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
477
478 my ($self) = @_;
479 $self->finalise_extractors();
480}
481
482sub deinit {
483 # called only once, after all plugin passes have been done
484
485 my ($self) = @_;
486}
487
488# this function should be overridden to return 1
489# in recursive plugins
490sub is_recursive {
491 my $self = shift (@_);
492
493 return 0;
494}
495
496sub get_default_block_exp {
497 my $self = shift (@_);
498
499 return "";
500}
501
502sub get_default_process_exp {
503 my $self = shift (@_);
504
505 return "";
506}
507
508# default implementation is to do nothing.
509sub store_block_files
510{
511 my $self =shift (@_);
512 my ($filename) = @_;
513 return;
514}
515
516#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
517sub block_cover_image
518{
519 my $self =shift (@_);
520 my ($filename) = @_;
521 if ($self->{'cover_image'}) {
522 my $coverfile = $filename;
523 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
524 if (!-e $coverfile) {
525 $coverfile =~ s/jpg$/JPG/;
526 }
527 if (-e $coverfile) {
528 $self->{'file_blocks'}->{$coverfile} = 1;
529 }
530 }
531
532 return;
533}
534
535sub metadata_read {
536 my $self = shift (@_);
537 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
538 # Keep track of filenames with same root but different extensions
539 # Used to support -associate_ext
540
541 my $associate_ext = $self->{'associate_ext'};
542 if ((defined $associate_ext) && ($associate_ext ne "")) {
543
544 my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
545 if ((defined $file_prefix) && (defined $file_ext)) {
546
547 my $shared_fileroot = $self->{'shared_fileroot'};
548 if (!defined $shared_fileroot->{$file_prefix}) {
549 my $file_prefix_rec = { 'tie_to' => undef, 'exts' => {} };
550 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
551 }
552
553 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
554
555 my $process_exp = $self->{'process_exp'};
556
557 if ($file =~ m/$self->{'process_exp'}/) {
558 # This is the document the others should be tied to
559 $file_prefix_rec->{'tie_to'} = $file_ext;
560 }
561 else {
562 if (defined $self->{'associate_ext_lookup'}->{$file_ext}) {
563 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
564 }
565 }
566 }
567 }
568
569 # now check whether we are actually processing this
570 my $filename = $file;
571 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
572 if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) {
573 return undef; # can't recognise
574 }
575
576 # do smart blocking if appropriate
577 if (defined $self->{'smart_block'}) {
578 $self->block_cover_image($filename);
579 $self->store_block_files($filename);
580 }
581
582 return 1;
583}
584
585sub tie_to_filename
586{
587 my $self = shift (@_);
588
589 my ($file_ext,$file_prefix_rec) = @_;
590
591 if (defined $file_prefix_rec) {
592 my $tie_to = $file_prefix_rec->{'tie_to'};
593
594 if (defined $tie_to) {
595 if ($tie_to eq $file_ext) {
596 return 1;
597 }
598 }
599 }
600
601 return 0;
602}
603
604sub tie_to_assoc_file
605{
606 my $self = shift (@_);
607 my ($file_ext,$file_prefix_rec) = @_;
608
609 if (defined $file_prefix_rec) {
610 my $tie_to = $file_prefix_rec->{'tie_to'};
611 if (defined $tie_to) {
612
613 my $exts = $file_prefix_rec->{'exts'};
614
615 my $has_file_ext = $exts->{$file_ext};
616
617 if ($has_file_ext) {
618 return 1;
619 }
620 }
621 }
622
623 return 0;
624}
625
626
627sub associate_with
628{
629 my $self = shift (@_);
630 my ($file, $filename, $metadata) = @_;
631
632 my $associate_ext = $self->{'associate_ext'};
633
634
635 return 0 if (!$associate_ext);
636
637 # If file, see if matches with "tie_to" doc or is one of the
638 # associated filename extensions.
639
640 my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
641 if ((defined $file_prefix) && (defined $file_ext)) {
642
643 my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
644
645 if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
646
647 # Set up gsdlassocfile_tobe
648
649 my $exts = $file_prefix_rec->{'exts'};
650
651 if (!defined $metadata->{'gsdlassocfile_tobe'}) {
652 $metadata->{'gsdlassocfile_tobe'} = [];
653 }
654
655 my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
656
657 my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/);
658 foreach my $e (keys %$exts) {
659 my $assoc_file = "$full_prefix.$e";
660 my $mime_type = ""; # let system auto detect this
661 push(@$assoc_tobe,"$assoc_file:$mime_type:");
662 }
663 }
664 elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
665 # a form of smart block
666
667 return 1;
668 }
669 }
670
671 return 0;
672}
673
674
675# The BasPlug read() function. This function does all the right things
676# to make general options work for a given plugin. It calls the process()
677# function which does all the work specific to a plugin (like the old
678# read functions used to do). Most plugins should define their own
679# process() function and let this read() function keep control.
680#
681# recursive plugins (e.g. RecPlug) and specialized plugins like those
682# capable of processing many documents within a single file (e.g.
683# GMLPlug) should normally implement their own version of read()
684#
685# Return number of files processed, undef if can't recognise, -1 if can't
686# process
687# Note that $base_dir might be "" and that $file might
688# include directories
689
690sub read {
691 my $self = shift (@_);
692
693 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
694
695 if ($self->is_recursive()) {
696 gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
697 }
698
699 my $outhandle = $self->{'outhandle'};
700 my $smart_block = $self->{'smart_block'};
701 my $smart_block_BN = $self->{'smart_block_BN'};
702
703 my $filename = $file;
704 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
705
706 if ($self->associate_with($file,$filename,$metadata)) {
707 # a form of smart block
708 $self->{'num_blocked'} ++;
709 return 0; # blocked
710 }
711
712 if ($smart_block || $smart_block_BN) {
713
714 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
715 $self->{'num_blocked'} ++;
716 return 0; # blocked
717 }
718 } elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
719 $self->{'num_blocked'} ++;
720 return 0; # blocked
721 }
722
723 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
724 return undef; # can't recognise
725 }
726 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
727
728 # Do encoding stuff
729 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
730
731 # create a new document
732 my $doc_obj = new doc ($filename, "indexed_doc");
733 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
734 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
735 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
736 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
737 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
738
739 my ($filemeta) = $file =~ /([^\\\/]+)$/;
740 # how do we know what encoding the filename is in?
741 $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
742 if ($self->{'cover_image'}) {
743 $self->associate_cover_image($doc_obj, $filename);
744 }
745
746 # read in file ($text will be in utf8)
747 my $text = "";
748 $self->read_file ($filename, $encoding, $language, \$text);
749
750 if (!length ($text)) {
751 my $plugin_name = ref ($self);
752 if ($gli) {
753 print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
754 }
755 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
756
757 my $failhandle = $self->{'failhandle'};
758 gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
759 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
760 $self->{'num_not_processed'} ++;
761
762 return 0; # what should we return here?? error but don't want to pass it on
763 }
764
765 # include any metadata passed in from previous plugins
766 # note that this metadata is associated with the top level section
767
768 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
769
770 # do plugin specific processing of doc_obj
771 unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
772 $text = '';
773 undef $text;
774 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
775 return -1;
776 }
777 $text='';
778 undef $text;
779
780 # do any automatic metadata extraction
781 $self->auto_extract_metadata ($doc_obj);
782
783 # add an OID
784 # see if there is a plugin-specific set_OID function...
785 if (defined ($self->can('set_OID'))) {
786 # it will need $doc_obj to set the Identifier metadata...
787 $self->set_OID($doc_obj);
788 } else {
789 # use the default set_OID() in doc.pm
790 $doc_obj->set_OID();
791 }
792
793 # process the document
794 $processor->process($doc_obj);
795
796 if(defined($self->{'places_filename'})){
797 &util::rm($self->{'places_filename'});
798 $self->{'places_filename'} = undef;
799 }
800
801 $self->{'num_processed'} ++;
802 undef $doc_obj;
803 return 1; # processed the file
804}
805
806# returns undef if file is rejected by the plugin
807sub process {
808 my $self = shift (@_);
809 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
810
811 gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
812 # die "Basplug::process function must be implemented in sub-class\n";
813
814 return undef; # never gets here
815}
816
817# uses the multiread package to read in the entire file pointed to
818# by filename and loads the resulting text into $$textref. Input text
819# may be in any of the encodings handled by multiread, output text
820# will be in utf8
821sub read_file {
822 my $self = shift (@_);
823 my ($filename, $encoding, $language, $textref) = @_;
824
825 if (!-r $filename)
826 {
827 my $outhandle = $self->{'outhandle'};
828 gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
829 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
830 return;
831 }
832
833 $$textref = "";
834
835 if (!open (FILE, $filename)) {
836 gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
837 die "\n";
838 }
839
840 if ($encoding eq "ascii") {
841 undef $/;
842 $$textref = <FILE>;
843 $/ = "\n";
844 } else {
845 my $reader = new multiread();
846 $reader->set_handle ('BasPlug::FILE');
847 $reader->set_encoding ($encoding);
848 $reader->read_file ($textref);
849
850 #Now segments chinese if the separate_cjk option is set
851 if ($self->{'separate_cjk'}) {
852 # segment the Chinese words
853 $$textref = &cnseg::segment($$textref);
854 }
855 }
856
857 close FILE;
858}
859
860sub filename_based_title
861{
862 my $self = shift (@_);
863 my ($file) = @_;
864
865 my $file_derived_title = $file;
866 $file_derived_title =~ s/_/ /g;
867 $file_derived_title =~ s/\..*?$//;
868
869 return $file_derived_title;
870}
871
872
873sub title_fallback
874{
875 my $self = shift (@_);
876 my ($doc_obj,$section,$file) = @_;
877
878 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
879
880 my $file_derived_title = $self->filename_based_title($file);
881 $doc_obj->add_metadata ($section, "Title", $file_derived_title);
882 }
883}
884
885sub textcat_get_language_encoding {
886 my $self = shift (@_);
887 my ($filename) = @_;
888
889 my ($language, $encoding, $extracted_encoding);
890 if ($self->{'input_encoding'} eq "auto") {
891 # use textcat to automatically work out the input encoding and language
892 ($language, $encoding) = $self->get_language_encoding ($filename);
893 } elsif ($self->{'extract_language'}) {
894 # use textcat to get language metadata
895 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
896 $encoding = $self->{'input_encoding'};
897 # don't print this message for english... english in utf8 is identical
898 # to english in iso-8859-1 (except for some punctuation). We don't have
899 # a language model for en_utf8, so textcat always says iso-8859-1!
900 if ($extracted_encoding ne $encoding && $language ne "en"
901 && $self->{'verbosity'}) {
902 my $plugin_name = ref ($self);
903 my $outhandle = $self->{'outhandle'};
904 gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
905 # print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
906 # print $outhandle "appears to be encoded as $extracted_encoding.\n";
907 }
908 } else {
909 $language = $self->{'default_language'};
910 $encoding = $self->{'input_encoding'};
911 }
912 return ($language, $encoding);
913}
914
915# Uses textcat to work out the encoding and language of the text in
916# $filename. All html tags are removed before processing.
917# returns an array containing "language" and "encoding"
918sub get_language_encoding {
919 my $self = shift (@_);
920 my ($filename) = @_;
921 my $outhandle = $self->{'outhandle'};
922 my $unicode_format = "";
923 # read in file
924 open (FILE, $filename) || (gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
925 undef $/;
926 my $text = <FILE>;
927 $/ = "\n";
928 close FILE;
929
930 # check if first few bytes have a Byte Order Marker
931 my $bom=substr($text,0,2); # check 16bit unicode
932 if ($bom eq "\xff\xfe") { # little endian 16bit unicode
933 $unicode_format="unicode";
934 } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
935 $unicode_format="unicode";
936 } else {
937 $bom=substr($text,0,3); # check utf-8
938 if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
939 $unicode_format="utf8";
940# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
941# $unicode_format="utf8";
942 }
943 }
944
945
946 # remove <title>stuff</title> -- as titles tend often to be in English
947 # for foreign language documents
948 $text =~ s/<title>(.|\n)*?<\/title>//i;
949
950 # remove all HTML tags
951 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
952 if (ref($self) eq 'HTMLPlug' ||
953 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
954 $text =~ s/<[^>]*>//sg;
955 }
956
957 # get the language/encoding
958 my $results = $self->{'textcat'}->classify(\$text);
959
960 # if textcat returns 3 or less possibilities we'll use the
961 # first one in the list - otherwise use the defaults
962 if (scalar @$results > 3) {
963 my $best_encoding="";
964 if ($unicode_format) { # in case the first had a BOM
965 $best_encoding=$unicode_format;
966 } else {
967 my %guessed_encodings = ();
968 foreach my $result (@$results) {
969 $result =~ /([^\-]+)$/;
970 my $enc=$1;
971 if (!defined($guessed_encodings{$enc})) {
972 $guessed_encodings{$enc}=0;
973 }
974 $guessed_encodings{$enc}++;
975 }
976
977 $guessed_encodings{""}=-1; # for default best_encoding of ""
978 foreach my $enc (keys %guessed_encodings) {
979 if ($guessed_encodings{$enc} >
980 $guessed_encodings{$best_encoding}){
981 $best_encoding=$enc;
982 }
983 }
984 }
985
986 if ($self->{'input_encoding'} ne 'auto') {
987 if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
988 gsprintf($outhandle,
989 "BasPlug: {BasPlug.could_not_extract_language}\n",
990 $filename, $self->{'default_language'});
991 }
992 return ($self->{'default_language'}, $self->{'input_encoding'});
993
994 } else {
995 if ($self->{'verbosity'}>2) {
996 gsprintf($outhandle,
997 "BasPlug: {BasPlug.could_not_extract_language}\n",
998 $filename, $self->{'default_language'});
999 }
1000 return ($self->{'default_language'}, $best_encoding);
1001 }
1002 }
1003
1004 # format language/encoding
1005 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
1006 if (!defined $language) {
1007 if ($self->{'verbosity'}>2) {
1008 gsprintf($outhandle,
1009 "BasPlug: {BasPlug.could_not_extract_language}\n",
1010 $filename, $self->{'default_language'});
1011 }
1012 $language = $self->{'default_language'};
1013 }
1014 if (!defined $encoding) {
1015 if ($self->{'verbosity'}>2) {
1016 gsprintf($outhandle,
1017 "BasPlug: {BasPlug.could_not_extract_encoding}\n",
1018 $filename, $self->{'default_encoding'});
1019 }
1020 $encoding = $self->{'default_encoding'};
1021 }
1022
1023
1024 # check for equivalents where textcat doesn't have some encodings...
1025 # eg MS versions of standard encodings
1026 if ($encoding =~ /^iso_8859_(\d+)/) {
1027 my $iso = $1; # which variant of the iso standard?
1028 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
1029 if ($text =~ /[\x80-\x9f]/) {
1030 # Western Europe
1031 if ($iso == 1 or $iso == 15) { $encoding = 'windows_1252' }
1032 elsif ($iso == 2) { $encoding = 'windows_1250' } # Central Europe
1033 elsif ($iso == 5) { $encoding = 'windows_1251' } # Cyrillic
1034 elsif ($iso == 6) { $encoding = 'windows_1256' } # Arabic
1035 elsif ($iso == 7) { $encoding = 'windows_1253' } # Greek
1036 elsif ($iso == 8) { $encoding = 'windows_1255' } # Hebrew
1037 elsif ($iso == 9) { $encoding = 'windows_1254' } # Turkish
1038 }
1039 }
1040
1041 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
1042 !defined $encodings::encodings->{$encoding}) {
1043 if ($self->{'verbosity'}) {
1044 gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1045 $filename, $encoding, $self->{'default_encoding'});
1046 }
1047 $encoding = $self->{'default_encoding'};
1048 }
1049
1050 return ($language, $encoding);
1051}
1052
1053# add any extra metadata that's been passed around from one
1054# plugin to another.
1055# extra_metadata uses add_utf8_metadata so it expects metadata values
1056# to already be in utf8
1057sub extra_metadata {
1058 my $self = shift (@_);
1059 my ($doc_obj, $cursection, $metadata) = @_;
1060
1061 foreach my $field (keys(%$metadata)) {
1062 # $metadata->{$field} may be an array reference
1063 if ($field eq "gsdlassocfile_tobe") {
1064 # 'gsdlassocfile_tobe' is artificially introduced metadata
1065 # that is used to signal that certain additional files should
1066 # be tied to this document. Useful in situations where a
1067 # metadata pass in the plugin pipeline works out some files
1068 # need to be associated with a document, but the document hasn't
1069 # been formed yet.
1070
1071 my $equiv_form = "";
1072 foreach my $gaf (@{$metadata->{$field}}) {
1073 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1074 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1075 my $filename = $full_filename;
1076
1077 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1078
1079 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1080 my $start_doclink = "<a href=\"_httpcollection_/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1081 my $srcicon = "_icon".$doc_ext."_";
1082 my $end_doclink = "</a>";
1083
1084 $equiv_form .= " $start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1085 }
1086 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1087 }
1088 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1089 map {
1090 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1091 } @{$metadata->{$field}};
1092 } else {
1093 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1094 }
1095 }
1096}
1097
1098# initialise metadata extractors
1099sub initialise_extractors {
1100 my $self = shift (@_);
1101
1102 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1103 &acronym::initialise_acronyms();
1104 }
1105}
1106
1107# finalise metadata extractors
1108sub finalise_extractors {
1109 my $self = shift (@_);
1110
1111 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1112 &acronym::finalise_acronyms();
1113 }
1114}
1115
1116# FIRSTNNN: extract the first NNN characters as metadata
1117sub extract_first_NNNN_characters {
1118 my $self = shift (@_);
1119 my ($textref, $doc_obj, $thissection) = @_;
1120
1121 foreach my $size (split /,/, $self->{'first'}) {
1122 my $tmptext = $$textref;
1123 $tmptext =~ s/^\s+//;
1124 $tmptext =~ s/\s+$//;
1125 $tmptext =~ s/\s+/ /gs;
1126 $tmptext = substr ($tmptext, 0, $size);
1127 $tmptext =~ s/\s\S*$/&#8230;/;
1128 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1129 }
1130}
1131
1132sub extract_email {
1133 my $self = shift (@_);
1134 my ($textref, $doc_obj, $thissection) = @_;
1135 my $outhandle = $self->{'outhandle'};
1136
1137 gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1138 if ($self->{'verbosity'} > 2);
1139
1140 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
1141 @email = sort @email;
1142
1143# if($self->{"new_extract_email"} == 0)
1144# {
1145# my @email2 = ();
1146# foreach my $address (@email)
1147# {
1148# if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))
1149# {
1150# push @email2, $address;
1151# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1152# # print $outhandle " extracting $address\n"
1153# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1154# if ($self->{'verbosity'} > 3);
1155# }
1156# }
1157# }
1158# else
1159# {
1160 my $hashExistMail = {};
1161 foreach my $address (@email) {
1162 if (!(defined $hashExistMail->{$address}))
1163 {
1164 $hashExistMail->{$address} = 1;
1165 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1166 gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1167 if ($self->{'verbosity'} > 3);
1168 }
1169 }
1170 gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1171 if ($self->{'verbosity'} > 2);
1172}
1173
1174# extract metadata
1175sub auto_extract_metadata {
1176
1177 my $self = shift (@_);
1178 my ($doc_obj) = @_;
1179
1180 if ($self->{'extract_email'}) {
1181 my $thissection = $doc_obj->get_top_section();
1182 while (defined $thissection) {
1183 my $text = $doc_obj->get_text($thissection);
1184 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1185 $thissection = $doc_obj->get_next_section ($thissection);
1186 }
1187 }
1188 if ($self->{'extract_placenames'}) {
1189 my $thissection = $doc_obj->get_top_section();
1190 while (defined $thissection) {
1191 my $text = $doc_obj->get_text($thissection);
1192 $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1193 $thissection = $doc_obj->get_next_section ($thissection);
1194 }
1195 }
1196
1197 #adding kea keyphrases
1198 if ($self->{'extract_keyphrases'}) {
1199
1200 my $thissection = $doc_obj->get_top_section();
1201 my $text = "";
1202 my $list;
1203
1204 #loop through sections to gather whole doc
1205 while (defined $thissection) {
1206 my $sectiontext = $doc_obj->get_text($thissection);
1207 $text = $text.$sectiontext;
1208 $thissection = $doc_obj->get_next_section ($thissection);
1209 }
1210
1211 if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
1212 $list = &Kea::extract_KeyPhrases ($text, $self->{'extract_keyphrase_options'});
1213 } else { #otherwise call Kea with no options
1214 $list = &Kea::extract_KeyPhrases ($text);
1215 }
1216
1217 if ($list){
1218 # if a list of kea keyphrases was returned (ie not empty)
1219 if ($self->{'verbosity'}) {
1220 gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1221 }
1222
1223 #add metadata to top section
1224 $thissection = $doc_obj->get_top_section();
1225
1226 # add all key phrases as one metadata
1227 $doc_obj->add_metadata($thissection, "Keyphrases", $list);
1228
1229 # add individual key phrases as multiple metadata
1230 foreach my $keyphrase (split(',', $list)) {
1231 $keyphrase =~ s/^\s+|\s+$//g;
1232 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1233 }
1234 }
1235 } #end of kea
1236
1237 if ($self->{'first'}) {
1238 my $thissection = $doc_obj->get_top_section();
1239 while (defined $thissection) {
1240 my $text = $doc_obj->get_text($thissection);
1241 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1242 $thissection = $doc_obj->get_next_section ($thissection);
1243 }
1244 }
1245
1246 if ($self->{'extract_acronyms'}) {
1247 my $thissection = $doc_obj->get_top_section();
1248 while (defined $thissection) {
1249 my $text = $doc_obj->get_text($thissection);
1250 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1251 $thissection = $doc_obj->get_next_section ($thissection);
1252 }
1253 }
1254
1255 if ($self->{'markup_acronyms'}) {
1256 my $thissection = $doc_obj->get_top_section();
1257 while (defined $thissection) {
1258 my $text = $doc_obj->get_text($thissection);
1259 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1260 $doc_obj->delete_text($thissection);
1261 $doc_obj->add_text($thissection, $text);
1262 $thissection = $doc_obj->get_next_section ($thissection);
1263 }
1264 }
1265
1266 if($self->{'extract_historical_years'}) {
1267 my $thissection = $doc_obj->get_top_section();
1268 while (defined $thissection) {
1269
1270 my $text = $doc_obj->get_text($thissection);
1271 &DateExtract::get_date_metadata($text, $doc_obj,
1272 $thissection,
1273 $self->{'no_bibliography'},
1274 $self->{'maximum_year'},
1275 $self->{'maximum_century'});
1276 $thissection = $doc_obj->get_next_section ($thissection);
1277 }
1278 }
1279}
1280
1281# extract acronyms from a section in a document. progress is
1282# reported to outhandle based on the verbosity. both the Acronym
1283# and the AcronymKWIC metadata items are created.
1284
1285sub extract_acronyms {
1286 my $self = shift (@_);
1287 my ($textref, $doc_obj, $thissection) = @_;
1288 my $outhandle = $self->{'outhandle'};
1289
1290 # print $outhandle " extracting acronyms ...\n"
1291 gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1292 if ($self->{'verbosity'} > 2);
1293
1294 my $acro_array = &acronym::acronyms($textref);
1295
1296 foreach my $acro (@$acro_array) {
1297
1298 #check that this is the first time ...
1299 my $seen_before = "false";
1300 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1301 foreach my $thisAcro (@$previous_data) {
1302 if ($thisAcro eq $acro->to_string()) {
1303 $seen_before = "true";
1304 if ($self->{'verbosity'} >= 4) {
1305 gsprintf($outhandle, " {BasPlug.already_seen} " .
1306 $acro->to_string() . "\n");
1307 }
1308 }
1309 }
1310
1311 if ($seen_before eq "false") {
1312 #write it to the file ...
1313 $acro->write_to_file();
1314
1315 #do the normal acronym
1316 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1317 gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1318 if ($self->{'verbosity'} > 3);
1319 }
1320 }
1321
1322 gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1323 if ($self->{'verbosity'} > 2);
1324}
1325
1326sub markup_acronyms {
1327 my $self = shift (@_);
1328 my ($text, $doc_obj, $thissection) = @_;
1329 my $outhandle = $self->{'outhandle'};
1330
1331 gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1332 if ($self->{'verbosity'} > 2);
1333
1334 #self is passed in to check for verbosity ...
1335 $text = &acronym::markup_acronyms($text, $self);
1336
1337 gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1338 if ($self->{'verbosity'} > 2);
1339
1340 return $text;
1341}
1342
1343sub compile_stats {
1344 my $self = shift(@_);
1345 my ($stats) = @_;
1346
1347 $stats->{'num_processed'} += $self->{'num_processed'};
1348 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1349 $stats->{'num_archives'} += $self->{'num_archives'};
1350
1351}
1352
1353sub associate_cover_image {
1354 my $self = shift(@_);
1355 my ($doc_obj, $filename) = @_;
1356
1357 my $top_section=$doc_obj->get_top_section();
1358
1359 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1360 if (-e $filename) {
1361 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1362 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1363 } else {
1364 $filename =~ s/jpg$/JPG/;
1365 if (-e $filename) {
1366 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1367 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1368 }
1369 }
1370}
1371
13721;
Note: See TracBrowser for help on using the repository browser.