source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 1844

Last change on this file since 1844 was 1844, checked in by sjboddie, 23 years ago

Added an 'auto' argument to BasPlug's '-input_encoding' option ('auto' is
now the default instead of 'ascii'). When -input_encoding is 'auto', textcat
is used to work out the language and encoding of each document prior to
processing it. This allows for documents within the same collection to be
in different encodings and all be imported correctly (as long as they're
in an encoding that's supported - notable exceptions at the moment are
Big5 Chinese and any kind of Japanese).
Doing things this way means each document is read in twice at import time,
no doubt slowing things down considerably. You can therefore still set
-input_encoding explicitly if you know that all your documents are a
particular encoding.

  • Property svn:keywords set to Author Date Id Revision
File size: 23.3 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28use parsargv;
29use multiread;
30use cnseg;
31use acronym;
32use textcat;
33use doc;
34use diagnostics;
35use DateExtract;
36use iso639;
37
# Encodings that textcat may report and that this plugin accepts.
# If textcat returns an encoding that is not a key of this hash, a warning
# is printed and the default encoding is used instead.
# Only the keys are significant; every value is the empty string.
%supported_encodings = map { $_ => "" } qw(
    ascii
    iso_8859_1 windows_1252
    iso_8859_2 windows_1250
    iso_8859_3
    iso_8859_4
    iso_8859_5 windows_1251 koi8_r koi8_u
    iso_8859_6 windows_1256
    iso_8859_7 windows_1253
    iso_8859_8 windows_1255
    iso_8859_9 windows_1254
    gb
);
62
# Print to STDERR the usage text for the general options that every
# plugin accepts.
#
# $plugin_name - name of the plugin, shown in the first line of the text
sub print_general_usage {
    my ($plugin_name) = @_;

    my @usage = (
        "\n usage: plugin $plugin_name [options]\n\n",

        # -process_exp
        " -process_exp A perl regular expression to match against filenames.\n",
        " Matching filenames will be processed by this plugin.\n",
        " Each plugin has its own default process_exp. e.g HTMLPlug\n",
        " defaults to '(?i)\.html?\$' i.e. all documents ending in\n",
        " .htm or .html (case-insensitive).\n\n",

        # -block_exp
        " -block_exp Files matching this regular expression will be blocked from\n",
        " being passed to any further plugins in the list. This has no\n",
        " real effect other than to prevent lots of warning messages\n",
        " about input files you don't care about. Each plugin may or may\n",
        " not have a default block_exp. e.g. by default HTMLPlug blocks\n",
        " any files with .gif, .jpg, .jpeg, .png, .rtf or .css\n",
        " file extensions.\n\n",

        # -input_encoding and its accepted values
        " -input_encoding The encoding of the source documents. Documents will be\n",
        " converted from these encodings and stored internally as\n",
        " utf8. The default input_encoding is 'auto'. Accepted values\n",
        " are:\n",

        " auto: Use text categorization algorithm to automatically\n",
        " identify the encoding of each source document. This\n",
        " will be slower than explicitly setting the encoding\n",
        " but will work where more than one encoding is used\n",
        " within the same collection.\n",

        " ascii: Plain 7 bit ascii. This may be a little faster than\n",
        " using iso_8859_1. Beware of using 'ascii' on a collection\n",
        " of documents that may contain characters outside of plain\n",
        " 7 bit ascii though (e.g. German or French documents\n",
        " containing accents), use iso_8859_1 instead.\n",

        " utf8: either utf8 or unicode -- automatically detected\n",
        " unicode: just unicode\n",

        " iso_8859_1: Latin1 (western european languages)\n",
        " windows_1252: Windows codepage 1252 (WinLatin1)\n",

        " iso_8859_2: Latin2 (central and eastern european languages)\n",
        " windows_1250: Windows codepage 1250 (WinLatin2)\n",

        " iso_8859_3: Latin3\n",

        " iso_8859_4: Latin4\n",

        " iso_8859_5: Cyrillic\n",
        " windows_1251: Windows codepage 1251 (WinCyrillic)\n",
        " koi8_r: Cyrillic - Russian\n",
        " koi8_u: Cyrillic - Ukrainian\n",

        " iso_8859_6: Arabic\n",
        " windows_1256: Windows codepage 1256 (WinArabic)\n",

        " iso_8859_7: Greek\n",
        " windows_1253: Windows codepage 1253 (WinGreek)\n",

        " iso_8859_8: Hebrew\n",
        " windows_1255: Windows codepage 1255 (WinHebrew)\n",

        " iso_8859_9: Latin5\n",
        " windows_1254: Windows codepage 1254 (WinTurkish)\n",

        " gb: GB or GBK simplified Chinese\n\n",

        # -default_encoding
        " -default_encoding If -input_encoding is set to 'auto' and the text categorization\n",
        " algorithm fails to extract the encoding or extracts an encoding\n",
        " that is not supported by Greenstone, this encoding will be used\n",
        " instead. The default is iso_8859_1\n\n",

        # language options
        " -extract_language Identify the language of each document and set 'Language' metadata. Note\n",
        " that this will be done automatically if -input_encoding is 'auto'.\n",
        " -default_language If Greenstone fails to work out what language a document is the\n",
        " 'Language' metadata element will be set to this value. The default\n",
        " is 'en' (ISO 639 language symbols should be used - en = English).\n",
        " Note that if -input_encoding is not set to 'auto' and -extract_language\n",
        " is not set, all documents will have their 'Language' metadata set to\n",
        " this value.\n\n",

        # metadata extraction options
        " -extract_acronyms Extract acronyms from within text and set as metadata\n\n",

        " -markup_acronyms Add acronym metadata into document text\n\n",

        " -first Comma seperated list of first sizes to extract from the text\n",
        " into a metadata field. The fields are called 'FirstNNN'.\n\n",

        " -extract_email Extract email addresses as metadata\n\n",

        " -extract_date Extract dates pertaining to the content of documents about history\n\n",
    );

    print STDERR join ("", @usage);
}
157
# Print the plugin-specific usage text to STDERR.
# Sub-classes that have their own plugin-specific options should
# override print_usage; this base version just says there are none.
sub print_usage {
    my $message = "\nThis plugin has no plugin specific options\n\n";
    print STDERR $message;
}
164
# Constructor. Parses the general options that are available to every
# plugin and stores them in the new object.
#
# Arguments:
#   $class       - class to bless the new object into
#   $plugin_name - plugin name, used in the error message and usage text
#   remaining    - the option list handed to parsargv::parse
#
# Returns the blessed plugin object. Dies (after printing the general
# usage text to STDERR) if parsargv::parse rejects the options.
sub new {
    my $class = shift (@_);
    my $plugin_name = shift (@_);
    my $self = {};

    # Build the patterns used to validate -input_encoding and
    # -default_encoding. Joining the names avoids a leading empty
    # alternative ("^(|ascii|...)"), which would let an empty encoding
    # name pass validation. Sorting keeps the pattern deterministic.
    my $encodings = join ("|", sort keys %supported_encodings);
    my $denc = "^($encodings|utf8|unicode)\$";
    my $enc = "^($encodings|utf8|unicode|auto)\$";

    $self->{'outhandle'} = STDERR;
    my $year = (localtime)[5]+1900;

    # general options available to all plugins
    if (!parsargv::parse(\@_,
                         q^process_exp/.*/^, \$self->{'process_exp'},
                         q^block_exp/.*/^, \$self->{'block_exp'},
                         qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
                         qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'},
                         q^extract_acronyms^, \$self->{'extract_acronyms'},
                         q^extract_email^, \$self->{'extract_email'},
                         q^markup_acronyms^, \$self->{'markup_acronyms'},
                         q^extract_language^, \$self->{'extract_language'},
                         q^default_language/.{2}/en^, \$self->{'default_language'},
                         q^first/.*/^, \$self->{'first'},
                         q^extract_date^, \$self->{'date_extract'},
                         qq^maximum_date/\\d{4}/$year^, \$self->{'max_year'},
                         q^no_bibliography^, \$self->{'no_biblio'},
                         qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
                         "allow_extra_options")) {

        print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
        print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
        print_general_usage($plugin_name);
        die "\n";
    }

    return bless $self, $class;
}
204
# Initialise the BasPlug options.
# If init() is overridden in a sub-class, remember to call BasPlug::init().
#
# Arguments:
#   $verbosity - verbosity level, passed through from the processor
#   $outhandle - optional handle for messages; when it is undefined the
#                handle already stored in the object is kept
sub init {
    my ($self, $verbosity, $outhandle) = @_;

    $self->{'verbosity'} = $verbosity;
    if (defined $outhandle) {
        $self->{'outhandle'} = $outhandle;
    }

    # A non-recursive plugin without an explicit process_exp falls back
    # to its default one.
    if (!$self->is_recursive()) {
        my $process_exp = $self->{'process_exp'};
        if ((!defined $process_exp) || ($process_exp eq "")) {
            $self->{'process_exp'} = $self->get_default_process_exp ();
            warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n"
                if $self->{'process_exp'} eq "";
        }
    }

    # The same fallback applies to block_exp, for every plugin.
    my $block_exp = $self->{'block_exp'};
    if ((!defined $block_exp) || ($block_exp eq "")) {
        $self->{'block_exp'} = $self->get_default_block_exp ();
    }
}
233
# Called with ($pluginfo, $base_dir, $processor, $maxdocs); none of these
# are used here. Sets up the metadata extractors.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    return $self->initialise_extractors();
}
239
# Shuts down the metadata extractors.
sub end {
    my $self = shift (@_);

    return $self->finalise_extractors();
}
244
# Returns 0: the base plugin is not recursive.
# Recursive plugins should override this function to return 1.
sub is_recursive {
    my ($self) = @_;

    return 0;
}
252
# Returns the default block_exp, which for the base plugin is the empty
# string. init() uses this value when no block_exp was set explicitly.
sub get_default_block_exp {
    my ($self) = @_;

    return "";
}
258
# Returns the default process_exp, which for the base plugin is the empty
# string. init() uses this value for non-recursive plugins when no
# process_exp was set explicitly.
sub get_default_process_exp {
    my ($self) = @_;

    return "";
}
264
# The BasPlug read() function. This function does all the right things
# to make general options work for a given plugin. It calls the process()
# function which does all the work specific to a plugin (like the old
# read functions used to do). Most plugins should define their own
# process() function and let this read() function keep control.
#
# recursive plugins (e.g. RecPlug) and specialized plugins like those
# capable of processing many documents within a single file (e.g.
# GMLPlug) should normally implement their own version of read()
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs)
# Note that $base_dir might be "" and that $file might include directories.
#
# Returns:
#   1     - the file was processed and handed to $processor
#   0     - the file matched block_exp, or contained no text
#   undef - the file does not match process_exp, is not a plain file,
#           or process() rejected it
# Dies if called on a recursive plugin.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    if ($self->is_recursive()) {
        die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
    }

    my $outhandle = $self->{'outhandle'};

    my $filename = &util::filename_cat($base_dir, $file);
    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    my $plugin_name = ref ($self);
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    my ($language, $encoding);
    if ($self->{'input_encoding'} eq "auto") {
        # use textcat to automatically work out the input encoding and language
        ($language, $encoding) = $self->get_language_encoding ($filename);

    } elsif ($self->{'extract_language'}) {
        # use textcat to get language metadata
        my $extracted_encoding;
        ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
        $encoding = $self->{'input_encoding'};

        # Encoding names are strings and must be compared with 'ne'; a
        # numeric '!=' treats both names as 0, so the warning never fires.
        if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
            print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
            print $outhandle "appears to be encoded as $extracted_encoding.\n";
        }

    } else {
        $language = $self->{'default_language'};
        $encoding = $self->{'input_encoding'};
    }

    # create a new document
    # (explicit method call: this package defines its own new(), which
    # makes the indirect "new doc (...)" form ambiguous)
    my $doc_obj = doc->new ($filename, "indexed_doc");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    $doc_obj->set_source_encoding ($encoding);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, \$text);

    if (!length ($text)) {
        print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process($doc_obj);

    return 1; # processed the file
}
350
# Plugin-specific processing of one document; called by read() with
# ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj).
# A sub-class implementation returns undef if the file is rejected by
# the plugin. This base version always dies, because every sub-class
# must supply its own process().
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    die "Basplug::process function must be implemented in sub-class\n";
}
360
# uses the multiread package to read in the entire file pointed to
# by filename and loads the resulting text into $$textref. Input text
# may be in any of the encodings handled by multiread, output text
# will be in utf8
#
# Arguments:
#   $filename - path of the file to read
#   $encoding - encoding name; "ascii" is read directly without multiread,
#               "gb" text is additionally segmented with cnseg
#   $textref  - reference to the scalar that receives the text
#
# If the file is not readable, a message is printed (when verbosity is
# set) and $$textref is left untouched. Dies if the file cannot be opened.
sub read_file {
    my $self = shift (@_);
    my ($filename, $encoding, $textref) = @_;

    if (!-r $filename) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
        return;
    }

    $$textref = "";

    # Three-argument open: the filename is used exactly as given, so
    # leading/trailing whitespace or mode characters in it are not
    # misinterpreted. The bareword handle is kept because multiread is
    # given the handle by name below.
    open (FILE, '<', $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";

    if ($encoding eq "ascii") {
        # slurp the whole file; 'local' restores $/ on leaving the block
        local $/ = undef;
        $$textref = <FILE>;
        $$textref = "" unless defined $$textref;
    } else {
        my $reader = multiread->new();
        $reader->set_handle ('BasPlug::FILE');
        $reader->set_encoding ($encoding);
        $reader->read_file ($textref);

        if ($encoding eq "gb") {
            # segment the Chinese words
            $$textref = &cnseg::segment($$textref);
        }
    }

    close FILE;
}
398
# Uses textcat to work out the encoding and language of the text in
# $filename. All html tags are removed before processing.
# returns an array containing "language" and "encoding"
#
# When textcat does not return exactly one result:
#   - if input_encoding is not 'auto', (default_language, input_encoding)
#     is returned;
#   - otherwise (default_language, default_encoding) is returned.
# An encoding that is not a key of %supported_encodings is replaced by
# default_encoding. Dies if the file cannot be opened or if the language
# reported by textcat has no entry in %iso639::toiso639.
sub get_language_encoding {
    my $self = shift (@_);
    my ($filename) = @_;
    my $outhandle = $self->{'outhandle'};

    # read in file (three-argument open so the filename is taken
    # literally; $/ is localised so the caller's setting is preserved)
    open (my $fh, '<', $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
    my $text = do { local $/ = undef; <$fh> };
    close $fh;
    $text = "" unless defined $text;

    # remove all HTML tags
    $text =~ s/<[^>]*>//sg;

    # get the language/encoding
    my @results = textcat::classify($text);

    if (scalar @results != 1) {
        if ($self->{'input_encoding'} ne 'auto') {
            if ($self->{'extract_language'} && $self->{'verbosity'}) {
                print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
                print $outhandle "defaulting to $self->{'default_language'}\n";
            }
            return ($self->{'default_language'}, $self->{'input_encoding'});

        } else {
            if ($self->{'verbosity'}) {
                print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
                print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
            }
            return ($self->{'default_language'}, $self->{'default_encoding'});
        }
    }

    # format language/encoding: the result has the form "language" or
    # "language-encoding"
    my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/;
    $language = $iso639::toiso639{lc($language)};
    die "Invalid language\n" if !defined $language;

    if (!defined $encoding) {
        # if textcat returned no encoding info it is assumed to be iso_8859_1
        $encoding = "iso_8859_1";
    } else {
        # convert to the format we expect
        $encoding =~ s/windows/windows_/;
        $encoding =~ s/iso8859/iso_8859/;
        $encoding =~ s/^gb.*$/gb/;
    }

    if (!defined $supported_encodings{$encoding}) {
        if ($self->{'verbosity'}) {
            print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
            print $outhandle "using $self->{'default_encoding'}\n";
        }
        $encoding = $self->{'default_encoding'};
    }

    return ($language, $encoding);
}
466
# Add to section $cursection of $doc_obj any extra metadata that has been
# passed around from one plugin to another.
# Each value in the $metadata hash is either a single value or a reference
# to an array of values. extra_metadata uses add_utf8_metadata, so values
# are expected to be in utf8 already.
sub extra_metadata {
    my ($self, $doc_obj, $cursection, $metadata) = @_;

    foreach my $field (keys %$metadata) {
        if (ref ($metadata->{$field}) eq "ARRAY") {
            # several values for the same field
            foreach my $value (@{$metadata->{$field}}) {
                $doc_obj->add_utf8_metadata ($cursection, $field, $value);
            }
            next;
        }
        $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
    }
}
486
# Initialise the metadata extractors. The acronym extractor is the only
# one set up here, and only when extract_acronyms or markup_acronyms is on.
sub initialise_extractors {
    my ($self) = @_;

    acronym::initialise_acronyms()
        if $self->{'extract_acronyms'} || $self->{'markup_acronyms'};
}
495
# Finalise the metadata extractors. The acronym extractor is the only
# one shut down here, and only when extract_acronyms or markup_acronyms
# is on.
sub finalise_extractors {
    my ($self) = @_;

    acronym::finalise_acronyms()
        if $self->{'extract_acronyms'} || $self->{'markup_acronyms'};
}
504
# FIRSTNNN: extract the first NNN characters as metadata
#
# For every size in the comma separated 'first' option, the text is
# whitespace-normalised, cut to that many characters and stored in the
# section as 'First<size>' metadata.
#
# Arguments:
#   $textref     - reference to the section text
#   $doc_obj     - document object that receives the metadata
#   $thissection - section the metadata is added to
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    # the whitespace clean-up does not depend on the size, so do it once
    my $cleantext = $$textref;
    $cleantext =~ s/^\s+//;
    $cleantext =~ s/\s+$//;
    $cleantext =~ s/\s+/ /gs;

    foreach my $size (split /,/, $self->{'first'}) {
        # tolerate "100, 200" and skip anything that is not a number
        $size =~ s/\s+//g;
        next unless $size =~ /^\d+$/;

        my $tmptext = $cleantext;
        if (length ($tmptext) > $size) {
            # only text that was actually cut loses its (possibly
            # partial) last word and gains an ellipsis
            $tmptext = substr ($tmptext, 0, $size);
            $tmptext =~ s/\s\S*$/&#8230;/;
        }
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
520
# Extract the email addresses found in a section's text and add each
# distinct address once, in sorted order, as 'emailAddress' metadata.
# Progress is reported to outhandle based on the verbosity.
#
# Arguments:
#   $textref     - reference to the section text
#   $doc_obj     - document object that receives the metadata
#   $thissection - section the metadata is added to
sub extract_email {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    print $outhandle " extracting email addresses ...\n"
        if ($self->{'verbosity'} > 2);

    my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|[a-z][a-z]))/g);

    # Exact-match duplicate detection. Matching each address as a regex
    # against the addresses seen so far misfires on metacharacters
    # ('.', '+') and wrongly drops any address that is a substring of
    # an earlier one.
    my %seen = ();
    foreach my $address (sort @email) {
        next if $seen{$address}++;

        $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
        print $outhandle " extracting $address\n"
            if ($self->{'verbosity'} > 3);
    }

    print $outhandle " done extracting email addresses.\n"
        if ($self->{'verbosity'} > 2);
}
545
# extract metadata
#
# Runs every automatic metadata extractor that is switched on in the
# plugin options over each section of $doc_obj. The extractors run one
# after another, each over all sections: email addresses, FirstNNN,
# acronym extraction, acronym markup, dates and language.
sub auto_extract_metadata {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    if ($self->{'extract_email'}) {
        _for_each_section ($doc_obj, sub {
            my ($thissection, $text) = @_;
            $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
        });
    }

    if ($self->{'first'}) {
        _for_each_section ($doc_obj, sub {
            my ($thissection, $text) = @_;
            $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
        });
    }

    if ($self->{'extract_acronyms'}) {
        _for_each_section ($doc_obj, sub {
            my ($thissection, $text) = @_;
            $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
        });
    }

    if ($self->{'markup_acronyms'}) {
        # the marked-up text replaces the section's original text
        _for_each_section ($doc_obj, sub {
            my ($thissection, $text) = @_;
            $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
            $doc_obj->delete_text ($thissection);
            $doc_obj->add_text ($thissection, $text);
        });
    }

    if ($self->{'date_extract'}) {
        _for_each_section ($doc_obj, sub {
            my ($thissection, $text) = @_;
            &DateExtract::get_date_metadata($text, $doc_obj,
                                            $thissection,
                                            $self->{'no_biblio'},
                                            $self->{'max_year'},
                                            $self->{'max_century'});
        });
    }

    # Per-section language extraction runs only when the plugin class
    # provides an extract_language method; calling it unconditionally
    # dies when no such method exists.
    if ($self->{'extract_language'} && $self->can ('extract_language')) {
        _for_each_section ($doc_obj, sub {
            my ($thissection, $text) = @_;
            $self->extract_language (\$text, $doc_obj, $thissection) if $text =~ /./;
        });
    }
}

# Private helper: call $callback->($section, $text) for every section of
# $doc_obj, starting at the top section and following get_next_section
# until it returns undef.
sub _for_each_section {
    my ($doc_obj, $callback) = @_;

    my $thissection = $doc_obj->get_top_section();
    while (defined $thissection) {
        my $text = $doc_obj->get_text ($thissection);
        $callback->($thissection, $text);
        $thissection = $doc_obj->get_next_section ($thissection);
    }
}
612
# Extract acronyms from one section of a document and record each new
# one as "Acronym" metadata on that section. Every acronym that is added
# is also written out through its write_to_file method. Progress is
# reported to outhandle based on the verbosity.
#
# Arguments:
#   $textref     - reference to the section text
#   $doc_obj     - document object that receives the metadata
#   $thissection - section the metadata is added to
sub extract_acronyms {
    my ($self, $textref, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    print $outhandle " extracting acronyms ...\n" if ($verbosity > 2);

    my $found = &acronym::acronyms($textref);

    foreach my $acro (@$found) {
        my $label = $acro->to_string();

        # has this acronym already been recorded for the section?
        my $already_recorded = 0;
        my $recorded = $doc_obj->get_metadata($thissection, "Acronym");
        foreach my $existing (@$recorded) {
            next unless $existing eq $label;
            $already_recorded = 1;
            print $outhandle " already seen ". $label . "\n" if ($verbosity >= 4);
        }
        next if $already_recorded;

        # first occurrence: write it to the file ...
        $acro->write_to_file();

        # ... and add the normal acronym metadata
        $doc_obj->add_utf8_metadata($thissection, "Acronym", $label);
        print $outhandle " adding ". $label . "\n" if ($verbosity > 3);
    }

    print $outhandle " done extracting acronyms. \n" if ($verbosity > 2);
}
654
# Pass $text through acronym::markup_acronyms and return the result.
# Progress is reported to outhandle based on the verbosity. $doc_obj and
# $thissection are accepted but not used here.
sub markup_acronyms {
    my ($self, $text, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};
    my $chatty = ($self->{'verbosity'} > 2);

    print $outhandle " marking up acronyms ...\n" if $chatty;

    # self is passed in to check for verbosity ...
    my $marked_up = &acronym::markup_acronyms($text, $self);

    print $outhandle " done marking up acronyms. \n" if $chatty;

    return $marked_up;
}
671
6721;
Note: See TracBrowser for help on using the repository browser.