source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 10537

Last change on this file since 10537 was 10478, checked in by kjdon, 19 years ago

arcPlug now knows about keepold, and if its not set, it wont try to do incremental building

  • Property svn:keywords set to Author Date Id Revision
File size: 42.0 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28BEGIN {
29 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30}
31
32eval {require bytes};
33
34# suppress the annoying "subroutine redefined" warning that various
35# plugins cause under perl 5.6
36$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38use strict;
39no strict 'subs';
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
42use File::Basename;
43
44use Kea;
45use multiread;
46use encodings;
47use cnseg;
48use acronym;
49use textcat;
50use doc;
51eval "require diagnostics"; # some perl distros (eg mac) don't have this
52use DateExtract;
53use ghtml;
54use gsprintf 'gsprintf';
55use printusage;
56use parse2;
57
58
59use GISBasPlug;
60
61@BasPlug::ISA = ( GISBasPlug );
62
63my $unicode_list =
64 [ { 'name' => "ascii",
65 'desc' => "{BasPlug.input_encoding.ascii}" },
66 { 'name' => "utf8",
67 'desc' => "{BasPlug.input_encoding.utf8}" },
68 { 'name' => "unicode",
69 'desc' => "{BasPlug.input_encoding.unicode}" } ];
70
71my $auto_unicode_list =
72 [ { 'name' => "auto",
73 'desc' => "{BasPlug.input_encoding.auto}" } ];
74
75my $arguments =
76 [ { 'name' => "process_exp",
77 'desc' => "{BasPlug.process_exp}",
78 'type' => "regexp",
79 'deft' => "",
80 'reqd' => "no" },
81 { 'name' => "block_exp",
82 'desc' => "{BasPlug.block_exp}",
83 'type' => "regexp",
84 'deft' => "",
85 'reqd' => "no" },
86 { 'name' => "smart_block",
87 'desc' => "{BasPlug.smart_block}",
88 'type' => "flag",
89 'reqd' => "no" },
90 { 'name' => "associate_ext",
91 'desc' => "{BasPlug.associate_ext}",
92 'type' => "string",
93 'reqd' => "no" },
94 { 'name' => "input_encoding",
95 'desc' => "{BasPlug.input_encoding}",
96 'type' => "enum",
97 'list' => $auto_unicode_list,
98 'reqd' => "no" ,
99 'deft' => "auto" } ,
100 { 'name' => "default_encoding",
101 'desc' => "{BasPlug.default_encoding}",
102 'type' => "enum",
103 'list' => $unicode_list,
104 'reqd' => "no",
105 'deft' => "utf8" },
106 { 'name' => "extract_language",
107 'desc' => "{BasPlug.extract_language}",
108 'type' => "flag",
109 'reqd' => "no" },
110 { 'name' => "default_language",
111 'desc' => "{BasPlug.default_language}",
112 'type' => "string",
113 'deft' => "en",
114 'reqd' => "no" },
115 { 'name' => "extract_acronyms",
116 'desc' => "{BasPlug.extract_acronyms}",
117 'type' => "flag",
118 'reqd' => "no" },
119 { 'name' => "markup_acronyms",
120 'desc' => "{BasPlug.markup_acronyms}",
121 'type' => "flag",
122 'reqd' => "no" },
123 { 'name' => "extract_keyphrases",
124 'desc' => "{BasPlug.extract_keyphrases}",
125 'type' => "flag",
126 'reqd' => "no" },
127 { 'name' => "extract_keyphrase_options",
128 'desc' => "{BasPlug.extract_keyphrase_options}",
129 'type' => "string",
130 'deft' => "",
131 'reqd' => "no" },
132 { 'name' => "first",
133 'desc' => "{BasPlug.first}",
134 'type' => "string",
135 'reqd' => "no" },
136 { 'name' => "extract_email",
137 'desc' => "{BasPlug.extract_email}",
138 'type' => "flag",
139 'reqd' => "no" },
140 { 'name' => "extract_historical_years",
141 'desc' => "{BasPlug.extract_historical_years}",
142 'type' => "flag",
143 'reqd' => "no" },
144 { 'name' => "maximum_year",
145 'desc' => "{BasPlug.maximum_year}",
146 'type' => "int",
147 'deft' => (localtime)[5]+1900,
148 'char_length' => "4",
149 #'range' => "2,100",
150 'reqd' => "no"},
151 { 'name' => "maximum_century",
152 'desc' => "{BasPlug.maximum_century}",
153 'type' => "string",
154 'deft' => "-1",
155 'reqd' => "no" },
156 { 'name' => "no_bibliography",
157 'desc' => "{BasPlug.no_bibliography}",
158 'type' => "flag",
159 'reqd' => "no"},
160 { 'name' => "no_cover_image",
161 'desc' => "{BasPlug.no_cover_image}",
162 'type' => "flag",
163 'reqd' => "no" },
164 { 'name' => "extract_keyphrases",
165 'desc' => "{BasPlug.extract_keyphrases}",
166 'type' => "flag",
167 'reqd' => "no",
168 'hiddengli' => "yes" },
169 { 'name' => "extract_keyphrase_options",
170 'desc' => "{BasPlug.extract_keyphrase_options}",
171 'type' => "string",
172 'reqd' => "no",
173 'hiddengli' => "yes" },
174 { 'name' => "separate_cjk",
175 'desc' => "{BasPlug.separate_cjk}",
176 'type' => "flag",
177 'reqd' => "no",
178 'hiddengli' => "yes" },
179 { 'name' => "smart_block",
180 'desc' => "{BasPlug.smart_block}",
181 'type' => "flag",
182 'reqd' => "no",
183 'hiddengli' => "yes" },
184 { 'name' => "new_extract_email",
185 'desc' => "",
186 'type' => "flag",
187 'reqd' => "no",
188 'hiddengli' => "yes" } ];
189
190my $gis_arguments =
191 [ { 'name' => "extract_placenames",
192 'desc' => "{GISBasPlug.extract_placenames}",
193 'type' => "flag",
194 'reqd' => "no" },
195 { 'name' => "gazetteer",
196 'desc' => "{GISBasPlug.gazetteer}",
197 'type' => "string",
198 'reqd' => "no" },
199 { 'name' => "place_list",
200 'desc' => "{GISBasPlug.place_list}",
201 'type' => "flag",
202 'reqd' => "no" } ];
203
204
205my $options = { 'name' => "BasPlug",
206 'desc' => "{BasPlug.desc}",
207 'abstract' => "yes",
208 'inherits' => "no",
209 'args' => $arguments };
210
211
212sub set_keepold {
213 my $self = shift(@_);
214 my ($keepold) = @_;
215
216 $self->{'keepold'} = $keepold;
217}
218
219sub get_arguments
220{
221 my $self = shift(@_);
222 my $optionlistref = $self->{'option_list'};
223 my @optionlist = @$optionlistref;
224 my $pluginoptions = pop(@$optionlistref);
225 my $pluginarguments = $pluginoptions->{'args'};
226 return $pluginarguments;
227}
228
229
230sub print_xml_usage
231{
232 my $self = shift(@_);
233
234 # XML output is always in UTF-8
235 gsprintf::output_strings_in_UTF8;
236
237 PrintUsage::print_xml_header();
238 $self->print_xml();
239}
240
241
242sub print_xml
243{
244 my $self = shift(@_);
245
246 my $optionlistref = $self->{'option_list'};
247 my @optionlist = @$optionlistref;
248 my $pluginoptions = shift(@$optionlistref);
249 return if (!defined($pluginoptions));
250
251 gsprintf(STDERR, "<PlugInfo>\n");
252 gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
253 my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
254 $desc =~ s/</&amp;lt;/g; # doubly escaped
255 $desc =~ s/>/&amp;gt;/g;
256
257 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
258 gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
259 gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
260 gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
261 gsprintf(STDERR, " <Arguments>\n");
262 if (defined($pluginoptions->{'args'})) {
263 &PrintUsage::print_options_xml($pluginoptions->{'args'});
264 }
265
266 # Recurse up the plugin hierarchy
267 $self->print_xml();
268
269 gsprintf(STDERR, " </Arguments>\n");
270 gsprintf(STDERR, "</PlugInfo>\n");
271}
272
273
274sub print_txt_usage
275{
276 my $self = shift(@_);
277 # Print the usage message for a plugin (recursively)
278 my $descoffset = $self->determine_description_offset(0);
279 $self->print_plugin_usage($descoffset, 1);
280}
281
282
283sub determine_description_offset
284{
285 my $self = shift(@_);
286 my $maxoffset = shift(@_);
287
288 my $optionlistref = $self->{'option_list'};
289 my @optionlist = @$optionlistref;
290 my $pluginoptions = shift(@$optionlistref);
291 return $maxoffset if (!defined($pluginoptions));
292
293 # Find the length of the longest option string of this plugin
294 my $pluginargs = $pluginoptions->{'args'};
295 if (defined($pluginargs)) {
296 my $longest = &PrintUsage::find_longest_option_string($pluginargs);
297 if ($longest > $maxoffset) {
298 $maxoffset = $longest;
299 }
300 }
301
302 # Recurse up the plugin hierarchy
303 $maxoffset = $self->determine_description_offset($maxoffset);
304 $self->{'option_list'} = \@optionlist;
305 return $maxoffset;
306}
307
308
309sub print_plugin_usage
310{
311 my $self = shift(@_);
312 my $descoffset = shift(@_);
313 my $isleafclass = shift(@_);
314
315 my $optionlistref = $self->{'option_list'};
316 my @optionlist = @$optionlistref;
317 my $pluginoptions = shift(@$optionlistref);
318 return if (!defined($pluginoptions));
319
320 my $pluginname = $pluginoptions->{'name'};
321 my $pluginargs = $pluginoptions->{'args'};
322 my $plugindesc = $pluginoptions->{'desc'};
323
324 # Produce the usage information using the data structure above
325 if ($isleafclass) {
326 if (defined($plugindesc)) {
327 gsprintf(STDERR, "$plugindesc\n\n");
328 }
329 gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
330 }
331
332 # Display the plugin options, if there are some
333 if (defined($pluginargs)) {
334 # Calculate the column offset of the option descriptions
335 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
336
337 if ($isleafclass) {
338 gsprintf(STDERR, " {common.specific_options}:\n");
339 }
340 else {
341 gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
342 }
343
344 # Display the plugin options
345 &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
346 }
347
348 # Recurse up the plugin hierarchy
349 $self->print_plugin_usage($descoffset, 0);
350 $self->{'option_list'} = \@optionlist;
351}
352
353
354sub new {
355 # Set Encodings to the list!!
356
357 my $e = $encodings::encodings;
358 foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
359 {
360 my $hashEncode =
361 {'name' => $enc,
362 'desc' => $e->{$enc}->{'name'}};
363
364 push(@{$unicode_list},$hashEncode);
365 }
366
367 push(@{$auto_unicode_list},@{$unicode_list});
368
369 # Start the BasPlug Constructor
370 my $class = shift (@_);
371 my ($pluginlist,$args,$hashArgOptLists) = @_;
372 push(@$pluginlist, $class);
373 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
374
375 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
376 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
377
378 if (GISBasPlug::has_mapdata()) {
379 push(@$arguments,@$gis_arguments);
380 }
381
382 my $self = {};
383 if(!parse2::parse($args,$hashArgOptLists->{"ArgList"},$self))
384 {
385 my $classTempClass = bless $self, $class;
386 &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
387 $classTempClass->print_txt_usage(""); # Use default resource bundle
388 die "\n";
389 }
390
391
392 # else parsing was successful.
393
394 $self->{'plugin_type'} = $plugin_name;
395 $self->{'outhandle'} = STDERR;
396 $self->{'textcat'} = new textcat();
397 $self->{'num_processed'} = 0;
398 $self->{'num_not_processed'} = 0;
399 $self->{'num_blocked'} = 0;
400 $self->{'num_archives'} = 0;
401 $self->{'cover_image'} = 1; # cover image is on by default
402 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
403 $self->{'file_blocks'} = {};
404 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
405
406 my $associate_ext = $self->{'associate_ext'};
407 if ((defined $associate_ext) && ($associate_ext ne "")) {
408 my @exts = split(/,/,$associate_ext);
409
410 my %associate_ext_lookup = ();
411 foreach my $e (@exts) {
412 $associate_ext_lookup{$e} = 1;
413 }
414
415 $self->{'associate_ext_lookup'} = \%associate_ext_lookup;
416 }
417
418 $self->{'shared_fileroot'} = {};
419 $self->{'file_blocks'} = {};
420
421 if ($self->{'extract_placenames'}) {
422
423 my $outhandle = $self->{'outhandle'};
424
425 my $places_ref
426 = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
427
428 if (!defined $places_ref) {
429 print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
430 print $outhandle " No placename extraction will take place.\n";
431 $self->{'extract_placenames'} = undef;
432 }
433 else {
434 $self->{'places'} = $places_ref;
435 }
436 }
437 return bless $self, $class;
438
439}
440
441# initialize BasPlug options
442# if init() is overridden in a sub-class, remember to call BasPlug::init()
443sub init {
444 my $self = shift (@_);
445 my ($verbosity, $outhandle, $failhandle) = @_;
446
447 # verbosity is passed through from the processor
448 $self->{'verbosity'} = $verbosity;
449
450 # as are the outhandle and failhandle
451 $self->{'outhandle'} = $outhandle if defined $outhandle;
452 $self->{'failhandle'} = $failhandle;
453
454 # set process_exp and block_exp to defaults unless they were
455 # explicitly set
456
457 if ((!$self->is_recursive()) and
458 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
459
460 $self->{'process_exp'} = $self->get_default_process_exp ();
461 if ($self->{'process_exp'} eq "") {
462 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
463 }
464 }
465
466 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
467 $self->{'block_exp'} = $self->get_default_block_exp ();
468 }
469}
470
471sub begin {
472 my $self = shift (@_);
473 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
474 $self->initialise_extractors();
475}
476
477sub end {
478 # potentially called at the end of each plugin pass
479 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
480
481 my ($self) = @_;
482 $self->finalise_extractors();
483}
484
485sub deinit {
486 # called only once, after all plugin passes have been done
487
488 my ($self) = @_;
489}
490
491# this function should be overridden to return 1
492# in recursive plugins
493sub is_recursive {
494 my $self = shift (@_);
495
496 return 0;
497}
498
499sub get_default_block_exp {
500 my $self = shift (@_);
501
502 return "";
503}
504
505sub get_default_process_exp {
506 my $self = shift (@_);
507
508 return "";
509}
510
511# default implementation is to do nothing.
512sub store_block_files
513{
514 my $self =shift (@_);
515 my ($filename) = @_;
516 return;
517}
518
519#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
520sub block_cover_image
521{
522 my $self =shift (@_);
523 my ($filename) = @_;
524 if ($self->{'cover_image'}) {
525 my $coverfile = $filename;
526 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
527 if (!-e $coverfile) {
528 $coverfile =~ s/jpg$/JPG/;
529 }
530 if (-e $coverfile) {
531 $self->{'file_blocks'}->{$coverfile} = 1;
532 }
533 }
534
535 return;
536}
537
538sub metadata_read {
539 my $self = shift (@_);
540 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
541 # Keep track of filenames with same root but different extensions
542 # Used to support -associate_ext
543
544 my $associate_ext = $self->{'associate_ext'};
545 if ((defined $associate_ext) && ($associate_ext ne "")) {
546
547 my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
548 if ((defined $file_prefix) && (defined $file_ext)) {
549
550 my $shared_fileroot = $self->{'shared_fileroot'};
551 if (!defined $shared_fileroot->{$file_prefix}) {
552 my $file_prefix_rec = { 'tie_to' => undef, 'exts' => {} };
553 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
554 }
555
556 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
557
558 my $process_exp = $self->{'process_exp'};
559
560 if ($file =~ m/$self->{'process_exp'}/) {
561 # This is the document the others should be tied to
562 $file_prefix_rec->{'tie_to'} = $file_ext;
563 }
564 else {
565 if (defined $self->{'associate_ext_lookup'}->{$file_ext}) {
566 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
567 }
568 }
569 }
570 }
571
572 # now check whether we are actually processing this
573 my $filename = $file;
574 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
575 if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) {
576 return undef; # can't recognise
577 }
578
579 # do smart blocking if appropriate
580 if (defined $self->{'smart_block'}) {
581 $self->block_cover_image($filename);
582 $self->store_block_files($filename);
583 }
584
585 return 1;
586}
587
588sub tie_to_filename
589{
590 my $self = shift (@_);
591
592 my ($file_ext,$file_prefix_rec) = @_;
593
594 if (defined $file_prefix_rec) {
595 my $tie_to = $file_prefix_rec->{'tie_to'};
596
597 if (defined $tie_to) {
598 if ($tie_to eq $file_ext) {
599 return 1;
600 }
601 }
602 }
603
604 return 0;
605}
606
607sub tie_to_assoc_file
608{
609 my $self = shift (@_);
610 my ($file_ext,$file_prefix_rec) = @_;
611
612 if (defined $file_prefix_rec) {
613 my $tie_to = $file_prefix_rec->{'tie_to'};
614 if (defined $tie_to) {
615
616 my $exts = $file_prefix_rec->{'exts'};
617
618 my $has_file_ext = $exts->{$file_ext};
619
620 if ($has_file_ext) {
621 return 1;
622 }
623 }
624 }
625
626 return 0;
627}
628
629
630sub associate_with
631{
632 my $self = shift (@_);
633 my ($file, $filename, $metadata) = @_;
634
635 my $associate_ext = $self->{'associate_ext'};
636
637
638 return 0 if (!$associate_ext);
639
640 # If file, see if matches with "tie_to" doc or is one of the
641 # associated filename extensions.
642
643 my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
644 if ((defined $file_prefix) && (defined $file_ext)) {
645
646 my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
647
648 if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
649
650 # Set up gsdlassocfile_tobe
651
652 my $exts = $file_prefix_rec->{'exts'};
653
654 if (!defined $metadata->{'gsdlassocfile_tobe'}) {
655 $metadata->{'gsdlassocfile_tobe'} = [];
656 }
657
658 my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
659
660 my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/);
661 foreach my $e (keys %$exts) {
662 my $assoc_file = "$full_prefix.$e";
663 my $mime_type = ""; # let system auto detect this
664 push(@$assoc_tobe,"$assoc_file:$mime_type:");
665 }
666 }
667 elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
668 # a form of smart block
669
670 return 1;
671 }
672 }
673
674 return 0;
675}
676
677
678sub read_block {
679 my $self = shift (@_);
680
681 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
682
683
684 my $filename = $file;
685 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
686
687 if ($self->associate_with($file,$filename,$metadata)) {
688 # a form of smart block
689 $self->{'num_blocked'} ++;
690 return (0,undef); # blocked
691 }
692
693 my $smart_block = $self->{'smart_block'};
694 my $smart_block_BN = $self->{'smart_block_BN'};
695
696 if ($smart_block || $smart_block_BN) {
697 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
698 $self->{'num_blocked'} ++;
699 return (0,undef); # blocked
700 }
701 } elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
702 $self->{'num_blocked'} ++;
703 return (0,undef); # blocked
704 }
705
706 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
707 return (undef,undef); # can't recognise
708 }
709
710 return (1,$filename);
711}
712
713sub read_tidy_file {
714
715 my $self = shift (@_);
716
717 my ($file) = @_;
718
719 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
720
721 return $file;
722}
723
724
725
726# The BasPlug read_into_doc_obj() function. This function does all the
727# right things to make general options work for a given plugin. It reads in
728# a file and sets up a slew of metadata all saved in doc_obj, which
729# it then returns as part of a tuple (process_status,doc_obj)
730#
731# Much of this functionality used to reside in read, but it was broken
732# down into a supporting routine to make the code more flexible.
733#
734# recursive plugins (e.g. RecPlug) and specialized plugins like those
735# capable of processing many documents within a single file (e.g.
736# GMLPlug) will normally want to implement their own version of
737# read_into_doc_obj()
738#
739# Note that $base_dir might be "" and that $file might
740# include directories
741sub read_into_doc_obj {
742 my $self = shift (@_);
743 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
744
745 if ($self->is_recursive()) {
746 gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
747 }
748
749 my $outhandle = $self->{'outhandle'};
750
751 my ($block_status,$filename) = $self->read_block(@_);
752 return $block_status if ((!defined $block_status) || ($block_status==0));
753 $file = $self->read_tidy_file($file);
754
755 # Do encoding stuff
756 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
757
758 # create a new document
759 my $doc_obj = new doc ($filename, "indexed_doc");
760 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
761 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
762 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
763 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
764 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
765
766 my ($filemeta) = $file =~ /([^\\\/]+)$/;
767 # how do we know what encoding the filename is in?
768 $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
769 if ($self->{'cover_image'}) {
770 $self->associate_cover_image($doc_obj, $filename);
771 }
772
773 # read in file ($text will be in utf8)
774 my $text = "";
775 $self->read_file ($filename, $encoding, $language, \$text);
776
777 if (!length ($text)) {
778 my $plugin_name = ref ($self);
779 if ($gli) {
780 print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
781 }
782 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
783
784 my $failhandle = $self->{'failhandle'};
785 gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
786 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
787 $self->{'num_not_processed'} ++;
788
789 return (0,undef); # what should we return here?? error but don't want to pass it on
790 }
791
792 # include any metadata passed in from previous plugins
793 # note that this metadata is associated with the top level section
794
795 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
796
797 # do plugin specific processing of doc_obj
798 unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
799 $text = '';
800 undef $text;
801 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
802 return (-1,undef);
803 }
804 $text='';
805 undef $text;
806
807 # do any automatic metadata extraction
808 $self->auto_extract_metadata ($doc_obj);
809
810 # add an OID
811 # see if there is a plugin-specific set_OID function...
812 if (defined ($self->can('set_OID'))) {
813 # it will need $doc_obj to set the Identifier metadata...
814 $self->set_OID($doc_obj);
815 } else {
816 # use the default set_OID() in doc.pm
817 $doc_obj->set_OID();
818 }
819
820 return (1,$doc_obj);
821}
822
823
824# The BasPlug read() function. This function calls read_into_doc_obj()
825# to ensure all the right things to make general options work for a
826# given plugin are done. It then calls the process() function which
827# does all the work specific to a plugin (like the old read functions
828# used to do). Most plugins should define their own process() function
829# and let this read() function keep control.
830#
831# recursive plugins (e.g. RecPlug) and specialized plugins like those
832# capable of processing many documents within a single file (e.g.
833# GMLPlug) might want to implement their own version of read(), but
834# more likely need to implement their own version of read_into_doc_obj()
835#
836# Return number of files processed, undef if can't recognise, -1 if can't
837# process
838
839sub read {
840 my $self = shift (@_);
841 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
842
843 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
844
845 if ((defined $process_status) && ($process_status == 1)) {
846 # process the document
847 $processor->process($doc_obj);
848
849 if(defined($self->{'places_filename'})){
850 &util::rm($self->{'places_filename'});
851 $self->{'places_filename'} = undef;
852 }
853
854 $self->{'num_processed'} ++;
855 undef $doc_obj;
856 }
857
858 # if process_status == 1, then the file has been processed.
859 return $process_status;
860
861}
862
863# returns undef if file is rejected by the plugin
864sub process {
865 my $self = shift (@_);
866 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
867
868 gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
869 # die "Basplug::process function must be implemented in sub-class\n";
870
871 return undef; # never gets here
872}
873
874# uses the multiread package to read in the entire file pointed to
875# by filename and loads the resulting text into $$textref. Input text
876# may be in any of the encodings handled by multiread, output text
877# will be in utf8
878sub read_file {
879 my $self = shift (@_);
880 my ($filename, $encoding, $language, $textref) = @_;
881
882 if (!-r $filename)
883 {
884 my $outhandle = $self->{'outhandle'};
885 gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
886 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
887 return;
888 }
889 $$textref = "";
890 if (!open (FILE, $filename)) {
891 gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
892 die "\n";
893 }
894
895 if ($encoding eq "ascii") {
896 undef $/;
897 $$textref = <FILE>;
898 $/ = "\n";
899 } else {
900 my $reader = new multiread();
901 $reader->set_handle ('BasPlug::FILE');
902 $reader->set_encoding ($encoding);
903 $reader->read_file ($textref);
904 #Now segments chinese if the separate_cjk option is set
905 if ($self->{'separate_cjk'}) {
906 # segment the Chinese words
907 $$textref = &cnseg::segment($$textref);
908 }
909 }
910 close FILE;
911}
912
913# write_file -- used by ConvertToPlug, for example in post processing
914#
915sub utf8_write_file {
916 my $self = shift (@_);
917 my ($textref, $filename) = @_;
918
919 if (!open (FILE, ">$filename")) {
920 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
921 die "\n";
922 }
923 print FILE $$textref;
924
925 close FILE;
926}
927
928
929sub filename_based_title
930{
931 my $self = shift (@_);
932 my ($file) = @_;
933
934 my $file_derived_title = $file;
935 $file_derived_title =~ s/_/ /g;
936 $file_derived_title =~ s/\..*?$//;
937
938 return $file_derived_title;
939}
940
941
942sub title_fallback
943{
944 my $self = shift (@_);
945 my ($doc_obj,$section,$file) = @_;
946
947 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
948
949 my $file_derived_title = $self->filename_based_title($file);
950 $doc_obj->add_metadata ($section, "Title", $file_derived_title);
951 }
952}
953
954sub textcat_get_language_encoding {
955 my $self = shift (@_);
956 my ($filename) = @_;
957
958
959 my ($language, $encoding, $extracted_encoding);
960 if ($self->{'input_encoding'} eq "auto") {
961 # use textcat to automatically work out the input encoding and language
962 ($language, $encoding) = $self->get_language_encoding ($filename);
963 } elsif ($self->{'extract_language'}) {
964 # use textcat to get language metadata
965 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
966 $encoding = $self->{'input_encoding'};
967 # don't print this message for english... english in utf8 is identical
968 # to english in iso-8859-1 (except for some punctuation). We don't have
969 # a language model for en_utf8, so textcat always says iso-8859-1!
970 if ($extracted_encoding ne $encoding && $language ne "en"
971 && $self->{'verbosity'}) {
972 my $plugin_name = ref ($self);
973 my $outhandle = $self->{'outhandle'};
974 gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
975 # print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
976 # print $outhandle "appears to be encoded as $extracted_encoding.\n";
977 }
978 } else {
979 $language = $self->{'default_language'};
980 $encoding = $self->{'input_encoding'};
981 }
982
983 return ($language, $encoding);
984}
985
986# Uses textcat to work out the encoding and language of the text in
987# $filename. All html tags are removed before processing.
988# returns an array containing "language" and "encoding"
989sub get_language_encoding {
990 my $self = shift (@_);
991 my ($filename) = @_;
992 my $outhandle = $self->{'outhandle'};
993 my $unicode_format = "";
994 # read in file
995 open (FILE, $filename) || (gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
996 undef $/;
997 my $text = <FILE>;
998 $/ = "\n";
999 close FILE;
1000
1001 # check if first few bytes have a Byte Order Marker
1002 my $bom=substr($text,0,2); # check 16bit unicode
1003 if ($bom eq "\xff\xfe") { # little endian 16bit unicode
1004 $unicode_format="unicode";
1005 } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
1006 $unicode_format="unicode";
1007 } else {
1008 $bom=substr($text,0,3); # check utf-8
1009 if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
1010 $unicode_format="utf8";
1011# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
1012# $unicode_format="utf8";
1013 }
1014 }
1015
1016 # VB scripting generated Word to HTML file
1017 if ($text =~ /charset=(windows.*)[\"]/ig){
1018 my $vbhtml_encoding = $1;
1019 $vbhtml_encoding =~ s/-+/_/g;
1020 $self->{'input_encoding'} = $vbhtml_encoding;
1021 }
1022
1023 # remove <title>stuff</title> -- as titles tend often to be in English
1024 # for foreign language documents
1025 $text =~ s/<title>(.|\n)*?<\/title>//i;
1026
1027 # remove all HTML tags
1028 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
1029 if (ref($self) eq 'HTMLPlug' ||
1030 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
1031 $text =~ s/<[^>]*>//sg;
1032 }
1033
1034 # get the language/encoding
1035 my $results = $self->{'textcat'}->classify(\$text);
1036
1037 # if textcat returns 3 or less possibilities we'll use the
1038 # first one in the list - otherwise use the defaults
1039 if (scalar @$results > 3) {
1040 my $best_encoding="";
1041 if ($unicode_format) { # in case the first had a BOM
1042 $best_encoding=$unicode_format;
1043 } else {
1044 my %guessed_encodings = ();
1045 foreach my $result (@$results) {
1046 $result =~ /([^\-]+)$/;
1047 my $enc=$1;
1048 if (!defined($guessed_encodings{$enc})) {
1049 $guessed_encodings{$enc}=0;
1050 }
1051 $guessed_encodings{$enc}++;
1052 }
1053
1054 $guessed_encodings{""}=-1; # for default best_encoding of ""
1055 foreach my $enc (keys %guessed_encodings) {
1056 if ($guessed_encodings{$enc} >
1057 $guessed_encodings{$best_encoding}){
1058 $best_encoding=$enc;
1059 }
1060 }
1061 }
1062
1063 if ($self->{'input_encoding'} ne 'auto') {
1064 if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
1065 gsprintf($outhandle,
1066 "BasPlug: {BasPlug.could_not_extract_language}\n",
1067 $filename, $self->{'default_language'});
1068 }
1069 return ($self->{'default_language'}, $self->{'input_encoding'});
1070
1071 } else {
1072 if ($self->{'verbosity'}>2) {
1073 gsprintf($outhandle,
1074 "BasPlug: {BasPlug.could_not_extract_language}\n",
1075 $filename, $self->{'default_language'});
1076 }
1077 return ($self->{'default_language'}, $best_encoding);
1078 }
1079 }
1080
1081 # format language/encoding
1082 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
1083 if (!defined $language) {
1084 if ($self->{'verbosity'}>2) {
1085 gsprintf($outhandle,
1086 "BasPlug: {BasPlug.could_not_extract_language}\n",
1087 $filename, $self->{'default_language'});
1088 }
1089 $language = $self->{'default_language'};
1090 }
1091 if (!defined $encoding) {
1092 if ($self->{'verbosity'}>2) {
1093 gsprintf($outhandle,
1094 "BasPlug: {BasPlug.could_not_extract_encoding}\n",
1095 $filename, $self->{'default_encoding'});
1096 }
1097 $encoding = $self->{'default_encoding'};
1098 }
1099
1100
1101 # check for equivalents where textcat doesn't have some encodings...
1102 # eg MS versions of standard encodings
1103 if ($encoding =~ /^iso_8859_(\d+)/) {
1104 my $iso = $1; # which variant of the iso standard?
1105 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
1106 if ($text =~ /[\x80-\x9f]/) {
1107 # Western Europe
1108 if ($iso == 1 or $iso == 15) { $encoding = 'windows_1252' }
1109 elsif ($iso == 2) { $encoding = 'windows_1250' } # Central Europe
1110 elsif ($iso == 5) { $encoding = 'windows_1251' } # Cyrillic
1111 elsif ($iso == 6) { $encoding = 'windows_1256' } # Arabic
1112 elsif ($iso == 7) { $encoding = 'windows_1253' } # Greek
1113 elsif ($iso == 8) { $encoding = 'windows_1255' } # Hebrew
1114 elsif ($iso == 9) { $encoding = 'windows_1254' } # Turkish
1115 }
1116 }
1117
1118 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
1119 !defined $encodings::encodings->{$encoding}) {
1120 if ($self->{'verbosity'}) {
1121 gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1122 $filename, $encoding, $self->{'default_encoding'});
1123 }
1124 $encoding = $self->{'default_encoding'};
1125 }
1126
1127 return ($language, $encoding);
1128}
1129
1130# add any extra metadata that's been passed around from one
1131# plugin to another.
1132# extra_metadata uses add_utf8_metadata so it expects metadata values
1133# to already be in utf8
1134sub extra_metadata {
1135 my $self = shift (@_);
1136 my ($doc_obj, $cursection, $metadata) = @_;
1137
1138 foreach my $field (keys(%$metadata)) {
1139 # $metadata->{$field} may be an array reference
1140 if ($field eq "gsdlassocfile_tobe") {
1141 # 'gsdlassocfile_tobe' is artificially introduced metadata
1142 # that is used to signal that certain additional files should
1143 # be tied to this document. Useful in situations where a
1144 # metadata pass in the plugin pipeline works out some files
1145 # need to be associated with a document, but the document hasn't
1146 # been formed yet.
1147
1148 my $equiv_form = "";
1149 foreach my $gaf (@{$metadata->{$field}}) {
1150 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1151 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1152 my $filename = $full_filename;
1153
1154 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1155
1156 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1157 my $start_doclink = "<a href=\"_httpcollection_/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1158 my $srcicon = "_icon".$doc_ext."_";
1159 my $end_doclink = "</a>";
1160
1161 $equiv_form .= " $start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1162 }
1163 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1164 }
1165 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1166 map {
1167 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1168 } @{$metadata->{$field}};
1169 } else {
1170 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1171 }
1172 }
1173}
1174
1175# initialise metadata extractors
1176sub initialise_extractors {
1177 my $self = shift (@_);
1178
1179 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1180 &acronym::initialise_acronyms();
1181 }
1182}
1183
1184# finalise metadata extractors
1185sub finalise_extractors {
1186 my $self = shift (@_);
1187
1188 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1189 &acronym::finalise_acronyms();
1190 }
1191}
1192
1193# FIRSTNNN: extract the first NNN characters as metadata
1194sub extract_first_NNNN_characters {
1195 my $self = shift (@_);
1196 my ($textref, $doc_obj, $thissection) = @_;
1197
1198 foreach my $size (split /,/, $self->{'first'}) {
1199 my $tmptext = $$textref;
1200 $tmptext =~ s/^\s+//;
1201 $tmptext =~ s/\s+$//;
1202 $tmptext =~ s/\s+/ /gs;
1203 $tmptext = substr ($tmptext, 0, $size);
1204 $tmptext =~ s/\s\S*$/&#8230;/;
1205 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1206 }
1207}
1208
1209sub extract_email {
1210 my $self = shift (@_);
1211 my ($textref, $doc_obj, $thissection) = @_;
1212 my $outhandle = $self->{'outhandle'};
1213
1214 gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1215 if ($self->{'verbosity'} > 2);
1216
1217 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
1218 @email = sort @email;
1219
1220# if($self->{"new_extract_email"} == 0)
1221# {
1222# my @email2 = ();
1223# foreach my $address (@email)
1224# {
1225# if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))
1226# {
1227# push @email2, $address;
1228# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1229# # print $outhandle " extracting $address\n"
1230# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1231# if ($self->{'verbosity'} > 3);
1232# }
1233# }
1234# }
1235# else
1236# {
1237 my $hashExistMail = {};
1238 foreach my $address (@email) {
1239 if (!(defined $hashExistMail->{$address}))
1240 {
1241 $hashExistMail->{$address} = 1;
1242 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1243 gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1244 if ($self->{'verbosity'} > 3);
1245 }
1246 }
1247 gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1248 if ($self->{'verbosity'} > 2);
1249}
1250
1251# extract metadata
1252sub auto_extract_metadata {
1253
1254 my $self = shift (@_);
1255 my ($doc_obj) = @_;
1256
1257 if ($self->{'extract_email'}) {
1258 my $thissection = $doc_obj->get_top_section();
1259 while (defined $thissection) {
1260 my $text = $doc_obj->get_text($thissection);
1261 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1262 $thissection = $doc_obj->get_next_section ($thissection);
1263 }
1264 }
1265 if ($self->{'extract_placenames'}) {
1266 my $thissection = $doc_obj->get_top_section();
1267 while (defined $thissection) {
1268 my $text = $doc_obj->get_text($thissection);
1269 $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1270 $thissection = $doc_obj->get_next_section ($thissection);
1271 }
1272 }
1273
1274 #adding kea keyphrases
1275 if ($self->{'extract_keyphrases'}) {
1276
1277 my $thissection = $doc_obj->get_top_section();
1278 my $text = "";
1279 my $list;
1280
1281 #loop through sections to gather whole doc
1282 while (defined $thissection) {
1283 my $sectiontext = $doc_obj->get_text($thissection);
1284 $text = $text.$sectiontext;
1285 $thissection = $doc_obj->get_next_section ($thissection);
1286 }
1287
1288 if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
1289 $list = &Kea::extract_KeyPhrases ($text, $self->{'extract_keyphrase_options'});
1290 } else { #otherwise call Kea with no options
1291 $list = &Kea::extract_KeyPhrases ($text);
1292 }
1293
1294 if ($list){
1295 # if a list of kea keyphrases was returned (ie not empty)
1296 if ($self->{'verbosity'}) {
1297 gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1298 }
1299
1300 #add metadata to top section
1301 $thissection = $doc_obj->get_top_section();
1302
1303 # add all key phrases as one metadata
1304 $doc_obj->add_metadata($thissection, "Keyphrases", $list);
1305
1306 # add individual key phrases as multiple metadata
1307 foreach my $keyphrase (split(',', $list)) {
1308 $keyphrase =~ s/^\s+|\s+$//g;
1309 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1310 }
1311 }
1312 } #end of kea
1313
1314 if ($self->{'first'}) {
1315 my $thissection = $doc_obj->get_top_section();
1316 while (defined $thissection) {
1317 my $text = $doc_obj->get_text($thissection);
1318 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1319 $thissection = $doc_obj->get_next_section ($thissection);
1320 }
1321 }
1322
1323 if ($self->{'extract_acronyms'}) {
1324 my $thissection = $doc_obj->get_top_section();
1325 while (defined $thissection) {
1326 my $text = $doc_obj->get_text($thissection);
1327 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1328 $thissection = $doc_obj->get_next_section ($thissection);
1329 }
1330 }
1331
1332 if ($self->{'markup_acronyms'}) {
1333 my $thissection = $doc_obj->get_top_section();
1334 while (defined $thissection) {
1335 my $text = $doc_obj->get_text($thissection);
1336 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1337 $doc_obj->delete_text($thissection);
1338 $doc_obj->add_text($thissection, $text);
1339 $thissection = $doc_obj->get_next_section ($thissection);
1340 }
1341 }
1342
1343 if($self->{'extract_historical_years'}) {
1344 my $thissection = $doc_obj->get_top_section();
1345 while (defined $thissection) {
1346
1347 my $text = $doc_obj->get_text($thissection);
1348 &DateExtract::get_date_metadata($text, $doc_obj,
1349 $thissection,
1350 $self->{'no_bibliography'},
1351 $self->{'maximum_year'},
1352 $self->{'maximum_century'});
1353 $thissection = $doc_obj->get_next_section ($thissection);
1354 }
1355 }
1356}
1357
1358# extract acronyms from a section in a document. progress is
1359# reported to outhandle based on the verbosity. both the Acronym
1360# and the AcronymKWIC metadata items are created.
1361
1362sub extract_acronyms {
1363 my $self = shift (@_);
1364 my ($textref, $doc_obj, $thissection) = @_;
1365 my $outhandle = $self->{'outhandle'};
1366
1367 # print $outhandle " extracting acronyms ...\n"
1368 gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1369 if ($self->{'verbosity'} > 2);
1370
1371 my $acro_array = &acronym::acronyms($textref);
1372
1373 foreach my $acro (@$acro_array) {
1374
1375 #check that this is the first time ...
1376 my $seen_before = "false";
1377 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1378 foreach my $thisAcro (@$previous_data) {
1379 if ($thisAcro eq $acro->to_string()) {
1380 $seen_before = "true";
1381 if ($self->{'verbosity'} >= 4) {
1382 gsprintf($outhandle, " {BasPlug.already_seen} " .
1383 $acro->to_string() . "\n");
1384 }
1385 }
1386 }
1387
1388 if ($seen_before eq "false") {
1389 #write it to the file ...
1390 $acro->write_to_file();
1391
1392 #do the normal acronym
1393 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1394 gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1395 if ($self->{'verbosity'} > 3);
1396 }
1397 }
1398
1399 gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1400 if ($self->{'verbosity'} > 2);
1401}
1402
1403sub markup_acronyms {
1404 my $self = shift (@_);
1405 my ($text, $doc_obj, $thissection) = @_;
1406 my $outhandle = $self->{'outhandle'};
1407
1408 gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1409 if ($self->{'verbosity'} > 2);
1410
1411 #self is passed in to check for verbosity ...
1412 $text = &acronym::markup_acronyms($text, $self);
1413
1414 gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1415 if ($self->{'verbosity'} > 2);
1416
1417 return $text;
1418}
1419
1420sub compile_stats {
1421 my $self = shift(@_);
1422 my ($stats) = @_;
1423
1424 $stats->{'num_processed'} += $self->{'num_processed'};
1425 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1426 $stats->{'num_archives'} += $self->{'num_archives'};
1427
1428}
1429
1430sub associate_cover_image {
1431 my $self = shift(@_);
1432 my ($doc_obj, $filename) = @_;
1433
1434 my $top_section=$doc_obj->get_top_section();
1435
1436 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1437 if (-e $filename) {
1438 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1439 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1440 } else {
1441 $filename =~ s/jpg$/JPG/;
1442 if (-e $filename) {
1443 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1444 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1445 }
1446 }
1447}
1448
14491;
Note: See TracBrowser for help on using the repository browser.