source: gsdl/trunk/perllib/plugins/AcronymExtractor.pm@ 15867

Last change on this file since 15867 was 15867, checked in by kjdon, 16 years ago

plugin overhaul: automatic metadata extraction moved out of BasPlug into several extractor plugins (Keyphrase, Date, Acronym, EmailAddress Extractors). These are used by the AutoExtractMetadata plugin to add this functionality to BasePlugin (using multiple inheritance)

  • Property svn:executable set to *
File size: 3.9 KB
Line 
1package AcronymExtractor;
2
3use acronym;
4use PrintInfo;
5
# Set up inheritance at compile time: AcronymExtractor derives from PrintInfo.
BEGIN {
    @AcronymExtractor::ISA = qw(PrintInfo);
}
9
# Flags understood by this extractor. Both are optional switches; the
# 'desc' values are placeholder keys (presumably resolved to readable
# text elsewhere -- the lookup is not visible in this file).
my $arguments = [
    {
        'name' => 'extract_acronyms',
        'desc' => '{AcronymExtractor.extract_acronyms}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'markup_acronyms',
        'desc' => '{AcronymExtractor.markup_acronyms}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => 'AcronymExtractor',
    'desc'     => '{AcronymExtractor.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
25
26
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to PrintInfo's constructor
#   $hashArgOptLists - hash ref; this module's argument specs are appended
#                      to its "ArgList" entry and its option record to its
#                      "OptList" entry
#
# Returns the object built by PrintInfo's constructor, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Use an explicit class-method call rather than the indirect object
    # form ("new PrintInfo(...)"): this package defines its own new(), and
    # the indirect form relies on parser heuristics to decide whether "new"
    # is a method name or a function call.
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
40
41
# Initialise the acronym library, but only if this instance has acronym
# extraction and/or acronym markup switched on.
sub initialise_acronym_extractor {
    my ($self) = @_;

    my $acronyms_wanted = $self->{'extract_acronyms'} || $self->{'markup_acronyms'};
    if ($acronyms_wanted) {
        &acronym::initialise_acronyms();
    }
}
50
# Finalise the acronym library, mirroring initialise_acronym_extractor:
# the call is made only if extraction and/or markup was switched on.
sub finalise_acronym_extractor {
    my ($self) = @_;

    my $acronyms_wanted = $self->{'extract_acronyms'} || $self->{'markup_acronyms'};
    if ($acronyms_wanted) {
        &acronym::finalise_acronyms();
    }
}
59
# Run the enabled acronym passes over every section of a document.
#
# Pass 1 (flag 'extract_acronyms'): each section whose text matches /./
# is handed, by reference, to $self->extract_acronyms.
# Pass 2 (flag 'markup_acronyms'): each section's text is replaced with
# whatever $self->markup_acronyms returns for it.
#
# Parameters:
#   $doc_obj - document object; must provide get_top_section,
#              get_next_section, get_text, delete_text and add_text
sub extract_acronym_metadata {
    my ($self, $doc_obj) = @_;

    if ($self->{'extract_acronyms'}) {
        for (my $section = $doc_obj->get_top_section();
             defined $section;
             $section = $doc_obj->get_next_section($section)) {
            my $section_text = $doc_obj->get_text($section);
            # skip sections with nothing to scan
            next unless $section_text =~ /./;
            $self->extract_acronyms(\$section_text, $doc_obj, $section);
        }
    }

    if ($self->{'markup_acronyms'}) {
        for (my $section = $doc_obj->get_top_section();
             defined $section;
             $section = $doc_obj->get_next_section($section)) {
            my $marked_up = $self->markup_acronyms(
                $doc_obj->get_text($section), $doc_obj, $section);
            # swap the section's text for the marked-up version
            $doc_obj->delete_text($section);
            $doc_obj->add_text($section, $marked_up);
        }
    }

}
88
89
90
# Extract acronyms from one section of a document and record each distinct
# one as "Acronym" metadata on that section. Progress is reported to
# $self->{'outhandle'} according to $self->{'verbosity'}.
#
# Parameters:
#   $textref     - reference to the section text; handed to acronym::acronyms
#   $doc_obj     - document object; must provide get_metadata and
#                  add_utf8_metadata
#   $thissection - identifier of the section being processed
#
# An acronym whose string form is already stored as "Acronym" metadata on
# the section is skipped; every other acronym is first written out via its
# write_to_file method and then added as metadata.
sub extract_acronyms {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # gsprintf is never imported into this package, so it has to be called
    # by its fully qualified name; the unqualified call would die with
    # "Undefined subroutine" as soon as verbosity is high enough to log.
    # NOTE(review): assumes gsprintf.pm has already been loaded by another
    # module (e.g. PrintInfo) -- confirm.
    &gsprintf::gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
        if ($verbosity > 2);

    my $acro_array = &acronym::acronyms($textref);

    foreach my $acro (@$acro_array) {
        # stringify once; the value is needed for comparison, storage and logging
        my $acro_string = $acro->to_string();

        # check that this is the first time we have seen this acronym
        my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
        my $seen_before = grep { $_ eq $acro_string } @$previous_data;

        if ($seen_before) {
            &gsprintf::gsprintf($outhandle, " {BasPlug.already_seen} " .
                                $acro_string . "\n")
                if ($verbosity >= 4);
            next;
        }

        # write it to the file ...
        $acro->write_to_file();

        # ... and record it as section metadata
        $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro_string);
        &gsprintf::gsprintf($outhandle, " {BasPlug.adding} " . $acro_string . "\n")
            if ($verbosity > 3);
    }

    &gsprintf::gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
        if ($verbosity > 2);
}
135
# Return a marked-up copy of a section's text, as produced by
# acronym::markup_acronyms. Progress is reported to $self->{'outhandle'}
# when $self->{'verbosity'} is greater than 2.
#
# Parameters:
#   $text        - section text to mark up
#   $doc_obj     - document object (accepted but not used here)
#   $thissection - section identifier (accepted but not used here)
sub markup_acronyms {
    my $self = shift (@_);
    my ($text, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    # gsprintf is never imported into this package, so it has to be called
    # by its fully qualified name; the unqualified call would die with
    # "Undefined subroutine" as soon as verbosity is high enough to log.
    # NOTE(review): assumes gsprintf.pm has already been loaded by another
    # module (e.g. PrintInfo) -- confirm.
    &gsprintf::gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
        if ($self->{'verbosity'} > 2);

    # self is passed in to check for verbosity ...
    $text = &acronym::markup_acronyms($text, $self);

    &gsprintf::gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
        if ($self->{'verbosity'} > 2);

    return $text;
}
152
1531;
Note: See TracBrowser for help on using the repository browser.