source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 2785

Last change on this file since 2785 was 2785, checked in by sjboddie, 23 years ago

The build process now creates a summary of how many files were included,
which were rejected, etc. A link to a page containing this summary is
provided from the final page of the collector (once the collection is built
successfully) and from the default "about this collection" text for
collections built by the collector.

Also did a little bit of tidying in a couple of places

  • Property svn:keywords set to Author Date Id Revision
File size: 24.4 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28# suppress the annoying "subroutine redefined" warning that various
29# plugins cause under perl 5.6
30$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
31
32use Kea;
33use parsargv;
34use multiread;
35use encodings;
36use cnseg;
37use acronym;
38use textcat;
39use doc;
40use diagnostics;
41use DateExtract;
42use ghtml;
43
44sub print_general_usage {
45 my ($plugin_name) = @_;
46
47 print STDERR "\n usage: plugin $plugin_name [options]\n\n";
48
49 print STDERR " -process_exp A perl regular expression to match against filenames.\n";
50 print STDERR " Matching filenames will be processed by this plugin.\n";
51 print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
52 print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
53 print STDERR " .htm or .html (case-insensitive).\n\n";
54
55 print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
56 print STDERR " being passed to any later plugins in the list. This has no\n";
57 print STDERR " real effect other than to prevent lots of warning messages\n";
58 print STDERR " about input files you don't care about. Each plugin might\n";
59 print STDERR " have a default block_exp. e.g. by default HTMLPlug blocks\n";
60 print STDERR " any files with .gif, .jpg, .jpeg, .png or .css\n";
61 print STDERR " file extensions.\n\n";
62
63
64 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
65 print STDERR " converted from these encodings and stored internally as\n";
66 print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n";
67 print STDERR " are:\n";
68
69 print STDERR " auto: Use text categorization algorithm to automatically\n";
70 print STDERR " identify the encoding of each source document. This\n";
71 print STDERR " will be slower than explicitly setting the encoding\n";
72 print STDERR " but will work where more than one encoding is used\n";
73 print STDERR " within the same collection.\n";
74
75 print STDERR " ascii: Plain 7 bit ascii. This may be a bit faster than\n";
76 print STDERR " using iso_8859_1. Beware of using this on a collection\n";
77 print STDERR " of documents that may contain characters outside the\n";
78 print STDERR " plain 7 bit ascii set though (e.g. German or French\n";
79 print STDERR " documents containing accents), use iso_8859_1 instead.\n";
80
81 print STDERR " utf8: either utf8 or unicode -- automatically detected\n";
82 print STDERR " unicode: just unicode\n";
83
84 my $e = $encodings::encodings;
85 foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
86 print STDERR " $enc: $e->{$enc}->{'name'}\n";
87 }
88 print STDERR "\n";
89 print STDERR " -default_encoding Use this encoding if -input_encoding is set to 'auto' and\n";
90 print STDERR " the text categorization algorithm fails to extract the\n";
91 print STDERR " encoding or extracts an encoding unsupported by Greenstone.\n";
92 print STDERR " The default is iso_8859_1.\n\n";
93
94 print STDERR " -extract_language Identify the language of each document and set 'Language'\n";
95 print STDERR " metadata. Note that this will be done automatically if\n";
96 print STDERR " -input_encoding is 'auto'.\n\n";
97 print STDERR " -default_language If Greenstone fails to work out what language a document is\n";
98 print STDERR " the 'Language' metadata element will be set to this value.\n";
99 print STDERR " The default is 'en' (ISO 639 language symbols are used:\n";
100 print STDERR " en = English). Note that if -input_encoding is not set to\n";
101 print STDERR " 'auto' and -extract_language is not set, all documents will\n";
102 print STDERR " have their 'Language' metadata set to this value.\n\n";
103
104 print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n";
105
106 print STDERR " -markup_acronyms Add acronym metadata into document text\n\n";
107
108 print STDERR " -first Comma separated list of first sizes to extract from the\n";
109 print STDERR " text into a metadata field. The field is called 'FirstNNN'.\n\n";
110
111 print STDERR " -extract_email Extract email addresses as metadata\n\n";
112
113 print STDERR " -extract_date Extract dates pertaining to the content of documents about\n";
114 print STDERR " history\n";
115 print STDERR " -maximum_date The maximum historical date to be used as metadata (in a\n";
116 print STDERR " Common Era date, such as 1950)\n";
117 print STDERR " -maximum_century The maximum named century to be extracted as historical\n";
118 print STDERR " metadata (e.g. 14 will extract all references up to the\n";
119 print STDERR " 14th century)\n";
120 print STDERR " -no_bibliography Do not try and block bibliographic dates when extracting\n";
121 print STDERR " historical dates.\n\n";
122}
123
124# print_usage should be overridden for any sub-classes having
125# their own plugin specific options
126sub print_usage {
127 print STDERR "\nThis plugin has no plugin specific options\n\n";
128}
129
130sub new {
131 my $class = shift (@_);
132 my $plugin_name = shift (@_);
133 my $self = {};
134
135 my $enc = "^(";
136 map {$enc .= "$_|";} keys %$encodings::encodings;
137 my $denc = $enc . "ascii|utf8|unicode)\$";
138 $enc .= "ascii|utf8|unicode|auto)\$";
139
140 $self->{'outhandle'} = STDERR;
141 my $year = (localtime)[5]+1900;
142
143 $self->{'textcat'} = new textcat();
144
145 $self->{'num_processed'} = 0;
146 $self->{'num_not_processed'} = 0;
147 $self->{'num_blocked'} = 0;
148 $self->{'num_archives'} = 0;
149
150 # general options available to all plugins
151 if (!parsargv::parse(\@_,
152 q^process_exp/.*/^, \$self->{'process_exp'},
153 q^block_exp/.*/^, \$self->{'block_exp'},
154 q^extract_acronyms^, \$self->{'extract_acronyms'},
155 q^extract_keyphrases^, \$self->{'kea'}, #with extra options
156 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options
157 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
158 qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'},
159 q^extract_email^, \$self->{'extract_email'},
160 q^markup_acronyms^, \$self->{'markup_acronyms'},
161 q^default_language/.{2}/en^, \$self->{'default_language'},
162 q^first/.*/^, \$self->{'first'},
163 q^extract_date^, \$self->{'date_extract'},
164 qq^maximum_date/\\d{4}/$year^, \$self->{'max_year'},
165 q^no_bibliography^, \$self->{'no_biblio'},
166 qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
167 "allow_extra_options")) {
168
169 print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
170 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
171 &print_general_usage($plugin_name);
172 die "\n";
173 }
174
175 return bless $self, $class;
176}
177
178# initialize BasPlug options
179# if init() is overridden in a sub-class, remember to call BasPlug::init()
180sub init {
181 my $self = shift (@_);
182 my ($verbosity, $outhandle, $failhandle) = @_;
183
184 # verbosity is passed through from the processor
185 $self->{'verbosity'} = $verbosity;
186
187 # as are the outhandle and failhandle
188 $self->{'outhandle'} = $outhandle if defined $outhandle;
189 $self->{'failhandle'} = $failhandle;
190
191 # set process_exp and block_exp to defaults unless they were
192 # explicitly set
193
194 if ((!$self->is_recursive()) and
195 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
196
197 $self->{'process_exp'} = $self->get_default_process_exp ();
198 if ($self->{'process_exp'} eq "") {
199 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
200 }
201 }
202
203 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
204 $self->{'block_exp'} = $self->get_default_block_exp ();
205 }
206}
207
208sub begin {
209 my $self = shift (@_);
210 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
211 $self->initialise_extractors();
212}
213
214sub end {
215 my ($self) = @_;
216 $self->finalise_extractors();
217}
218
219# this function should be overridden to return 1
220# in recursive plugins
221sub is_recursive {
222 my $self = shift (@_);
223
224 return 0;
225}
226
227sub get_default_block_exp {
228 my $self = shift (@_);
229
230 return "";
231}
232
233sub get_default_process_exp {
234 my $self = shift (@_);
235
236 return "";
237}
238
239# The BasPlug read() function. This function does all the right things
240# to make general options work for a given plugin. It calls the process()
241# function which does all the work specific to a plugin (like the old
242# read functions used to do). Most plugins should define their own
243# process() function and let this read() function keep control.
244#
245# recursive plugins (e.g. RecPlug) and specialized plugins like those
246# capable of processing many documents within a single file (e.g.
247# GMLPlug) should normally implement their own version of read()
248#
249# Return number of files processed, undef if can't process
250# Note that $base_dir might be "" and that $file might
251# include directories
252
253sub read {
254 my $self = shift (@_);
255
256 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
257
258 if ($self->is_recursive()) {
259 die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
260 }
261
262 my $outhandle = $self->{'outhandle'};
263
264 my $filename = &util::filename_cat($base_dir, $file);
265 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
266 $self->{'num_blocked'} ++;
267 return 0;
268 }
269 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
270 return undef;
271 }
272 my $plugin_name = ref ($self);
273 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
274
275 my ($language, $encoding);
276 if ($self->{'input_encoding'} eq "auto") {
277 # use textcat to automatically work out the input encoding and language
278 ($language, $encoding) = $self->get_language_encoding ($filename);
279
280 } elsif ($self->{'extract_language'}) {
281 # use textcat to get language metadata
282 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
283 $encoding = $self->{'input_encoding'};
284
285 if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
286 print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
287 print $outhandle "appears to be encoded as $extracted_encoding.\n";
288 }
289
290 } else {
291 $language = $self->{'default_language'};
292 $encoding = $self->{'input_encoding'};
293 }
294
295 # create a new document
296 my $doc_obj = new doc ($filename, "indexed_doc");
297 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
298 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
299 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
300 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($file));
301
302 # read in file ($text will be in utf8)
303 my $text = "";
304 $self->read_file ($filename, $encoding, $language, \$text);
305
306 if (!length ($text)) {
307 print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
308
309 my $failhandle = $self->{'failhandle'};
310 print $failhandle "$file: " . ref($self) . ": file contains no text\n";
311 $self->{'num_not_processed'} ++;
312
313 return 0;
314 }
315
316 # include any metadata passed in from previous plugins
317 # note that this metadata is associated with the top level section
318 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
319
320 # do plugin specific processing of doc_obj
321 return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
322
323 # do any automatic metadata extraction
324 $self->auto_extract_metadata ($doc_obj);
325
326 # add an OID
327 $doc_obj->set_OID();
328
329 # process the document
330 $processor->process($doc_obj);
331
332 $self->{'num_processed'} ++;
333
334 return 1; # processed the file
335}
336
337# returns undef if file is rejected by the plugin
338sub process {
339 my $self = shift (@_);
340 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
341
342 die "Basplug::process function must be implemented in sub-class\n";
343
344 return undef; # never gets here
345}
346
347# uses the multiread package to read in the entire file pointed to
348# by filename and loads the resulting text into $$textref. Input text
349# may be in any of the encodings handled by multiread, output text
350# will be in utf8
351sub read_file {
352 my $self = shift (@_);
353 my ($filename, $encoding, $language, $textref) = @_;
354
355 if (!-r $filename)
356 {
357 my $outhandle = $self->{'outhandle'};
358 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
359 return;
360 }
361
362 $$textref = "";
363
364 open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
365
366 if ($encoding eq "ascii") {
367 undef $/;
368 $$textref = <FILE>;
369 $/ = "\n";
370 } else {
371 my $reader = new multiread();
372 $reader->set_handle ('BasPlug::FILE');
373 $reader->set_encoding ($encoding);
374 $reader->read_file ($textref);
375
376 if ($language eq "zh") {
377 # segment the Chinese words
378 $$textref = &cnseg::segment($$textref);
379 }
380 }
381
382 close FILE;
383}
384
385# Uses textcat to work out the encoding and language of the text in
386# $filename. All html tags are removed before processing.
387# returns an array containing "language" and "encoding"
388sub get_language_encoding {
389 my $self = shift (@_);
390 my ($filename) = @_;
391 my $outhandle = $self->{'outhandle'};
392
393 # read in file
394 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
395 undef $/;
396 my $text = <FILE>;
397 $/ = "\n";
398 close FILE;
399
400 # remove <title>stuff</title> -- as titles tend often to be in English
401 # for foreign language documents
402 $text =~ s/<title>.*?<\/title>//i;
403
404 # remove all HTML tags
405 $text =~ s/<[^>]*>//sg;
406
407 # get the language/encoding
408 my $results = $self->{'textcat'}->classify(\$text);
409
410 # if textcat returns 3 or less possibilities we'll use the
411 # first one in the list - otherwise use the defaults
412 if (scalar @$results > 3) {
413
414 if ($self->{'input_encoding'} ne 'auto') {
415 if ($self->{'extract_language'} && $self->{'verbosity'}) {
416 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
417 print $outhandle "defaulting to $self->{'default_language'}\n";
418 }
419 return ($self->{'default_language'}, $self->{'input_encoding'});
420
421 } else {
422 if ($self->{'verbosity'}) {
423 print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
424 print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
425 }
426 return ($self->{'default_language'}, $self->{'default_encoding'});
427 }
428 }
429
430 # format language/encoding
431 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
432 if (!defined $language) {
433 if ($self->{'verbosity'}) {
434 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
435 print $outhandle "defaulting to $self->{'default_language'}\n";
436 }
437 $language = $self->{'default_language'};
438 }
439 if (!defined $encoding) {
440 if ($self->{'verbosity'}) {
441 print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
442 print $outhandle "defaulting to $self->{'default_encoding'}\n";
443 }
444 $encoding = $self->{'default_encoding'};
445 }
446
447 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
448 !defined $encodings::encodings->{$encoding}) {
449 if ($self->{'verbosity'}) {
450 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
451 print $outhandle "using $self->{'default_encoding'}\n";
452 }
453 $encoding = $self->{'default_encoding'};
454 }
455
456 return ($language, $encoding);
457}
458
459# add any extra metadata that's been passed around from one
460# plugin to another.
461# extra_metadata uses add_utf8_metadata so it expects metadata values
462# to already be in utf8
463sub extra_metadata {
464 my $self = shift (@_);
465 my ($doc_obj, $cursection, $metadata) = @_;
466
467 foreach my $field (keys(%$metadata)) {
468 # $metadata->{$field} may be an array reference
469 if (ref ($metadata->{$field}) eq "ARRAY") {
470 map {
471 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
472 } @{$metadata->{$field}};
473 } else {
474 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
475 }
476 }
477}
478
479# initialise metadata extractors
480sub initialise_extractors {
481 my $self = shift (@_);
482
483 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
484 &acronym::initialise_acronyms();
485 }
486}
487
488# finalise metadata extractors
489sub finalise_extractors {
490 my $self = shift (@_);
491
492 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
493 &acronym::finalise_acronyms();
494 }
495}
496
497# FIRSTNNN: extract the first NNN characters as metadata
498sub extract_first_NNNN_characters {
499 my $self = shift (@_);
500 my ($textref, $doc_obj, $thissection) = @_;
501
502 foreach my $size (split /,/, $self->{'first'}) {
503 my $tmptext = $$textref;
504 $tmptext =~ s/^\s+//;
505 $tmptext =~ s/\s+$//;
506 $tmptext =~ s/\s+/ /gs;
507 $tmptext = substr ($tmptext, 0, $size);
508 $tmptext =~ s/\s\S*$/&#8230;/;
509 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
510 }
511}
512
513sub extract_email {
514 my $self = shift (@_);
515 my ($textref, $doc_obj, $thissection) = @_;
516 my $outhandle = $self->{'outhandle'};
517
518 print $outhandle " extracting email addresses ...\n"
519 if ($self->{'verbosity'} > 2);
520
521 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
522 @email = sort @email;
523
524 my @email2 = ();
525 foreach my $address (@email) {
526 if (!(join(" ",@email2) =~ m/$address/ )) {
527 push @email2, $address;
528 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
529 print $outhandle " extracting $address\n"
530 if ($self->{'verbosity'} > 3);
531 }
532 }
533 print $outhandle " done extracting email addresses.\n"
534 if ($self->{'verbosity'} > 2);
535
536}
537
538# extract metadata
539sub auto_extract_metadata {
540
541
542 my $self = shift (@_);
543 my ($doc_obj) = @_;
544
545 if ($self->{'extract_email'}) {
546 my $thissection = $doc_obj->get_top_section();
547 while (defined $thissection) {
548 my $text = $doc_obj->get_text($thissection);
549 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
550 $thissection = $doc_obj->get_next_section ($thissection);
551 }
552 }
553
554
555#adding kea keyphrases
556 if ($self->{'kea'}) {
557
558 my $thissection = $doc_obj->get_top_section();
559 my $text = "";
560 my @list;
561
562 while (defined $thissection) { #loop through sections to gather whole doc
563 my $sectiontext = $doc_obj->get_text($thissection);
564 $text = $text.$sectiontext;
565 $thissection = $doc_obj->get_next_section ($thissection);
566 }
567
568 if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
569 @list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
570 } else { #otherwise call Kea with no options
571 @list = &Kea::extract_KeyPhrases ($text);
572 }
573
574 if(@list){ #if a list of kea keyphrases was returned (ie not empty)
575 my $keyphrases = $list[0]; #first arg is keyphrase list
576 my $stems = $list[1]; #second arg is stemmed keyphrase list
577 print STDERR "keyphrases: $keyphrases\n";
578 print STDERR "stems: $stems\n";
579 $thissection = $doc_obj->get_top_section(); #add metadata to top section
580 $doc_obj->add_metadata($thissection, "kea", $keyphrases);
581 $doc_obj->add_metadata($thissection, "stems", $stems);
582 }
583 } #end of kea
584
585 if ($self->{'first'}) {
586 my $thissection = $doc_obj->get_top_section();
587 while (defined $thissection) {
588 my $text = $doc_obj->get_text($thissection);
589 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
590 $thissection = $doc_obj->get_next_section ($thissection);
591 }
592 }
593
594 if ($self->{'extract_acronyms'}) {
595 my $thissection = $doc_obj->get_top_section();
596 while (defined $thissection) {
597 my $text = $doc_obj->get_text($thissection);
598 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
599 $thissection = $doc_obj->get_next_section ($thissection);
600 }
601 }
602
603 if ($self->{'markup_acronyms'}) {
604 my $thissection = $doc_obj->get_top_section();
605 while (defined $thissection) {
606 my $text = $doc_obj->get_text($thissection);
607 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
608 $doc_obj->delete_text($thissection);
609 $doc_obj->add_text($thissection, $text);
610 $thissection = $doc_obj->get_next_section ($thissection);
611 }
612 }
613
614 if($self->{'date_extract'}) {
615 my $thissection = $doc_obj->get_top_section();
616 while (defined $thissection) {
617
618 my $text = $doc_obj->get_text($thissection);
619 &DateExtract::get_date_metadata($text, $doc_obj,
620 $thissection,
621 $self->{'no_biblio'},
622 $self->{'max_year'},
623 $self->{'max_century'});
624 $thissection = $doc_obj->get_next_section ($thissection);
625 }
626 }
627}
628
629# extract acronyms from a section in a document. progress is
630# reported to outhandle based on the verbosity. both the Acronym
631# and the AcronymKWIC metadata items are created.
632
633sub extract_acronyms {
634 my $self = shift (@_);
635 my ($textref, $doc_obj, $thissection) = @_;
636 my $outhandle = $self->{'outhandle'};
637
638 print $outhandle " extracting acronyms ...\n"
639 if ($self->{'verbosity'} > 2);
640
641 my $acro_array = &acronym::acronyms($textref);
642
643 foreach my $acro (@$acro_array) {
644
645 #check that this is the first time ...
646 my $seen_before = "false";
647 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
648 foreach my $thisAcro (@$previous_data) {
649 if ($thisAcro eq $acro->to_string()) {
650 $seen_before = "true";
651 print $outhandle " already seen ". $acro->to_string() . "\n"
652 if ($self->{'verbosity'} >= 4);
653 }
654 }
655
656 if ($seen_before eq "false") {
657 #write it to the file ...
658 $acro->write_to_file();
659
660 #do the normal acronym
661 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
662 print $outhandle " adding ". $acro->to_string() . "\n"
663 if ($self->{'verbosity'} > 3);
664
665 }
666 }
667 print $outhandle " done extracting acronyms. \n"
668 if ($self->{'verbosity'} > 2);
669}
670
671sub markup_acronyms {
672 my $self = shift (@_);
673 my ($text, $doc_obj, $thissection) = @_;
674 my $outhandle = $self->{'outhandle'};
675
676 print $outhandle " marking up acronyms ...\n"
677 if ($self->{'verbosity'} > 2);
678
679 #self is passed in to check for verbosity ...
680 $text = &acronym::markup_acronyms($text, $self);
681
682 print $outhandle " done marking up acronyms. \n"
683 if ($self->{'verbosity'} > 2);
684
685 return $text;
686}
687
688sub compile_stats {
689 my $self = shift(@_);
690 my ($stats) = @_;
691
692 $stats->{'num_processed'} += $self->{'num_processed'};
693 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
694
695}
696
697
6981;
Note: See TracBrowser for help on using the repository browser.