source: main/trunk/greenstone2/perllib/plugins/AcronymExtractor.pm@ 25797

Last change on this file since 25797 was 25797, checked in by kjdon, 12 years ago

need to define gsprintf in order to use it

  • Property svn:executable set to *
File size: 5.1 KB
Line 
1###########################################################################
2#
3# AcronymExtractor - helper plugin that extracts acronyms from text
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2008 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package AcronymExtractor;
28
29use acronym;
30use PrintInfo;
31use strict;
32
33use gsprintf 'gsprintf';
34
# Establish inheritance at compile time: AcronymExtractor is a PrintInfo.
BEGIN {
    @AcronymExtractor::ISA = qw(PrintInfo);
}
38
# Command-line style arguments contributed by this helper. Both are
# optional flags. The 'desc' values are brace-wrapped keys rather than
# literal descriptions (presumably looked up in a resource bundle
# elsewhere -- confirm against the code that consumes them).
my $arguments = [
    {
        'name' => 'extract_acronyms',
        'desc' => '{AcronymExtractor.extract_acronyms}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'markup_acronyms',
        'desc' => '{AcronymExtractor.markup_acronyms}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Descriptor for this helper itself; 'args' points at the list above so
# that the constructor can register both together.
my $options = {
    'name'     => 'AcronymExtractor',
    'desc'     => '{AcronymExtractor.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
54
55
# Constructor.
#
# Parameters:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through untouched to PrintInfo's constructor
#   $hashArgOptLists - hash ref; this helper's argument definitions are
#                      appended to its "ArgList" entry and its option
#                      descriptor to its "OptList" entry
#
# Returns the object built by PrintInfo, re-blessed into $class.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->method form instead of the indirect "new PrintInfo(...)":
    # this package defines its own sub called new, which is exactly the case
    # where the indirect form forces Perl to guess between a method call and
    # a function call. The trailing 1 is handed to PrintInfo unchanged; its
    # meaning is defined by PrintInfo's constructor.
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless $self, $class;
}
69
70
# Set up the acronym library before processing starts. Nothing happens
# unless at least one of the extract_acronyms / markup_acronyms flags is
# set on the object.
sub initialise_acronym_extractor {
    my ($self) = @_;

    my $acronyms_wanted = $self->{'extract_acronyms'} || $self->{'markup_acronyms'};

    &acronym::initialise_acronyms() if $acronyms_wanted;
}
79
# Tear down the acronym library once processing is over. Mirrors the
# initialise step: it only acts when extract_acronyms or markup_acronyms
# is set on the object.
sub finalise_acronym_extractor {
    my ($self) = @_;

    my $acronyms_wanted = $self->{'extract_acronyms'} || $self->{'markup_acronyms'};

    &acronym::finalise_acronyms() if $acronyms_wanted;
}
88
# Run the enabled acronym passes over every section of a document.
#
# Parameters:
#   $doc_obj - document object; it must provide get_top_section,
#              get_next_section, get_text, delete_text and add_text
#
# When extract_acronyms is set, each section whose text contains at least
# one non-newline character is handed to $self->extract_acronyms.
# When markup_acronyms is set, each section's text is replaced by the
# result of $self->markup_acronyms. The extraction pass over the whole
# document completes before the markup pass begins.
sub extract_acronym_metadata {
    my ($self, $doc_obj) = @_;

    if ($self->{'extract_acronyms'}) {
        for (my $section = $doc_obj->get_top_section();
             defined $section;
             $section = $doc_obj->get_next_section($section)) {

            my $section_text = $doc_obj->get_text($section);

            # Sections with nothing but newlines (or nothing at all) are skipped.
            next unless $section_text =~ /./;

            $self->extract_acronyms(\$section_text, $doc_obj, $section);
        }
    }

    if ($self->{'markup_acronyms'}) {
        for (my $section = $doc_obj->get_top_section();
             defined $section;
             $section = $doc_obj->get_next_section($section)) {

            my $section_text = $doc_obj->get_text($section);
            my $marked_up = $self->markup_acronyms($section_text, $doc_obj, $section);

            # Swap the stored text for the marked-up version.
            $doc_obj->delete_text($section);
            $doc_obj->add_text($section, $marked_up);
        }
    }
}
117
118
119
# Extract acronyms from one section of a document and record each new one
# as "Acronym" metadata on that section.
#
# Parameters:
#   $textref     - reference to the section text, handed to acronym::acronyms
#   $doc_obj     - document object (get_metadata / add_utf8_metadata are used)
#   $thissection - identifier of the section being processed
#
# Progress messages go to the object's outhandle: start/finish at
# verbosity > 2, each added acronym at verbosity > 3, and each acronym
# that was already recorded at verbosity >= 4.
sub extract_acronyms {
    my ($self, $textref, $doc_obj, $thissection) = @_;
    my $out = $self->{'outhandle'};

    if ($self->{'verbosity'} > 2) {
        gsprintf($out, " {AcronymExtractor.extracting_acronyms}...\n");
    }

    my $found = &acronym::acronyms($textref);

    for my $candidate (@$found) {

        # The section's metadata is re-read for every candidate so that
        # acronyms added earlier in this same run are taken into account.
        my $is_new   = 1;
        my $recorded = $doc_obj->get_metadata($thissection, "Acronym");

        for my $existing (@$recorded) {
            next unless $existing eq $candidate->to_string();

            $is_new = 0;
            gsprintf($out, " {AcronymExtractor.already_seen} " .
                     $candidate->to_string() . "\n")
                if $self->{'verbosity'} >= 4;
        }

        next unless $is_new;

        # First sighting: let the acronym write itself out, then attach
        # it to the section as metadata.
        $candidate->write_to_file();
        $doc_obj->add_utf8_metadata($thissection, "Acronym", $candidate->to_string());

        if ($self->{'verbosity'} > 3) {
            gsprintf($out, " {AcronymExtractor.adding} " . $candidate->to_string() . "\n");
        }
    }

    if ($self->{'verbosity'} > 2) {
        gsprintf($out, " {AcronymExtractor.done_acronym_extract}\n");
    }
}
164
# Return a marked-up copy of a section's text.
#
# Parameters:
#   $text        - the section text
#   $doc_obj     - document object (accepted but not used here)
#   $thissection - section identifier (accepted but not used here)
#
# The real work is delegated to acronym::markup_acronyms, which also
# receives $self so that it can consult the verbosity setting. Start and
# finish messages are written to the outhandle at verbosity > 2.
sub markup_acronyms {
    my ($self, $text, $doc_obj, $thissection) = @_;
    my $out = $self->{'outhandle'};

    if ($self->{'verbosity'} > 2) {
        gsprintf($out, " {AcronymExtractor.marking_up_acronyms}...\n");
    }

    my $marked_up = &acronym::markup_acronyms($text, $self);

    if ($self->{'verbosity'} > 2) {
        gsprintf($out, " {AcronymExtractor.done_acronym_markup}\n");
    }

    return $marked_up;
}
181
1821;
Note: See TracBrowser for help on using the repository browser.