source: trunk/gsdl/perllib/acronym.pm@ 1317

Last change on this file since 1317 was 1244, checked in by sjboddie, 24 years ago

Caught up most general plugins (that's the ones in gsdlhome/perllib/plugins)
with changes to BasPlug so that they can all now use the new general plugin
options. Those I didn't do were FoxPlug (as it's not actually used anywhere
and I don't know what it does) and WebPlug (as it's kind of a work in
progress and doesn't really work anyway). All plugins will still work
(including all the collection specific ones that are laying around), some
of them just won't have access to the general options.
I also wrote a short perl script (pluginfo.pl) that prints out all the
options available to a given plugin.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.5 KB
Line 
1###########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# class to hold acronyms
27
28use strict;
29#use diagnostics;
30
31package acronym;
32#use Class::Struct;
33
34
35sub new {
36 my $self = [
37 "", # 0 acronym
38 [], # 1 definition
39 [], # 2 stop_words
40 0, # 3 letters_for_far
41
42 ];
43 bless $self;
44}
45
46#struct (
47# # core items
48# acronym => '$', # the acronym (a string)
49# definition => '@', # the acronyms defintion (an array of strings)
50# stop_words => '@', # an array of 1 (stop word) and 0 (non stop word)
51#
52# # items related to the context in which the acronym was mined
53# word_count => '$', # the index of the acronym within the text
54# definition_offset => '$', # the distance between the acronym and the definition.
55#
56# #temporary items used during the mining of the acronym
57# letters_for_far => '$', # how many letters have we found so far ?
58#
59# #calculated items
60# #...
61#
62# );
63
64#sub definition {
65# my $self = shift (@_);
66# my @def = @$self->[2];
67# print "definition::\@de+f = " . @def . "\n";
68# print "definition::\@_ = " . @_ . "\n";
69# if (@_) {
70# push @def, @_;
71# @$self->[2] = @def;
72# }
73# return @def;
74#}
75
76
77
78sub clone {
79 my $self = shift (@_);
80
81 my $copy = new acronym;
82
83 $copy->[0] = $self->[0];
84 push @{$copy->[1]}, @{$self->[1]};
85 push @{$copy->[2]}, @{$self->[2]};
86 $copy->[3] = $self->[3];
87 $copy->[4] = $self->[4];
88 $copy->[5] = $self->[5];
89 $copy->[6] = $self->[6];
90
91 return $copy;
92}
93
94#sub initialise {
95# my $self = shift (@_);
96#
97# # initialise the struct from the parameters ...
98# my($acro, $wc, $def) = @_;
99# $self->acronym($acro);
100# $self->word_count($wc);
101# $self->definition_offset($def);
102#
103# $self->letters_for_far(0);
104#}
105
106#print out the kwic for the acronym
107sub to_string_kwic {
108 my $self = shift (@_);
109
110 # the list of all possible combinations
111 my @list = ();
112
113 my $result = "";
114
115 my $j = 0;
116 my @array = @{$self->[1]};
117 while ($j <= $#array)
118 {
119
120 $result = "";
121
122 # do the definition
123 my $i = 0;
124 while ($i <= $#array)
125 {
126 my $current = ($i + $j) % ($#array+1);
127 $result = $result . $array[$current] . " ";
128 $i++;
129 }
130 $result = $result . "(" . $self->[0] . ")";
131
132 push @list, $result;
133 $j++;
134 }
135 return @list;
136}
137
138#this is the one used when building the collection ...
139sub to_string {
140 my $self = shift (@_);
141
142 my $result = $self->[0] . " ";
143
144 # do the definition
145 my @array = @{$self->[1]};
146 my $i = 0;
147 while ($i <= $#array)
148 {
149 my $resultnext = $result . $array[$i] . " ";
150 $result = $resultnext;
151 $i++;
152 }
153 return $result;
154}
155
156# called when the acronym is complete and after altering any stats to compute stats etc.
157sub stablise {
158
159
160}
161
162sub acronyms {
163 my $processed_text = shift @_;
164 $$processed_text =~ s/[^A-Za-z]/ /g;
165 $$processed_text =~ s/\s+/ /g;
166
167 return &acronyms_from_clean_text($processed_text)
168}
169
170sub acronyms_from_clean_text {
171 my ($processed_text) = @_;
172 my @acro_list = ();
173
174 my @text = split / /, $$processed_text;
175
176# my $i = 0;
177# while ($i <= $#text)
178# {
179# print $text->[$i] . "\n";
180# $i++;
181# }
182
183 my $first = 0;
184 my $last = $#text +1;
185 my $acro_counter = $first;
186
187 while ($acro_counter < $last)
188 {
189 my $word = $text[$acro_counter];
190
191 # the tests on the following line are VERY important to the performance of this algorithm
192 # be VERY careful when relaxing them...
193 if (length $word >= 3 && (uc($word) eq $word))
194 {
195 my $def_counter = 0;
196 while ($def_counter <= $last)
197 {
198 my $letter_counter = 0;
199 my $match = 1;
200 while ($letter_counter < length($word))
201 {
202 if ($def_counter+$letter_counter >= $last)
203 {
204 $match = 0;
205 last;
206 }
207 my $def_word = $text[$def_counter+$letter_counter];
208
209 #throw it out if it's recursing...
210 if (uc($word) eq uc($def_word))
211 {
212 $match = 0;
213 last
214 }
215 if (substr($word, $letter_counter, 1) ne substr($def_word, 0, 1))
216 {
217 $match = 0;
218 last;
219 }
220 $letter_counter++;
221 }
222 # this line should perhaps be more sophisticated ... it encodes what we consider
223 # to be a valid acronym
224 if ($match == 1 && $letter_counter > 0 && (abs($def_counter - $acro_counter)< 50))
225 {
226 my $acro = new acronym();
227 $acro->[0] = $word;
228 push @{$acro->[1]}, @text[$def_counter .. $def_counter + $letter_counter - 1 ];
229 $acro->[3] = $letter_counter;
230# my @tmp = ( $acro );
231 push @acro_list, ( $acro );
232# print $acro->to_string(). "\n";
233 $match = 0;
234 }
235 $def_counter++;
236 }
237 }
238 $acro_counter++;
239 }
240
241 return \@acro_list;
242}
243
244
245
246sub test {
247
248# my $blarg = new acronym();
249# my $simple;
250# $simple = 10;
251# $blarg->initialise($simple, $simple, $simple);
252# my $blarg2 = $blarg->clone();
253# print $blarg->to_string();
254# print $blarg2;
255# print "\n";
256#
257 my $tla = new acronym();
258 $tla->[0] = "TLA";
259
260 my @array = ("Three", "Letter", "Acronym");
261# my $i = 0;
262# while ($i <= $#array)
263# {
264# print @array[$i] . "\n";
265# $i++;
266# }
267
268 print "\n";
269 push @{$tla->[1]}, ("Three" );
270 push @{$tla->[1]}, ("Letter" );
271 push @{$tla->[1]}, ("Letter" );
272 push @{$tla->[1]}, ("Acronym" );
273 print $tla->to_string(). "\n";
274 print "\n";
275 print "\n";
276 my $tla2 = $tla->clone();
277 push @{$tla2->[1]}, ("One");
278 push @{$tla2->[1]}, ("Two");
279 $tla2->[0] = "ALT";
280 print $tla->to_string(). "\n";
281 print $tla2->to_string(). "\n";
282
283 print "\n";
284 print "\n";
285
286}
287
288#uncomment this line to test this package
289#&test();
290
2911;
Note: See TracBrowser for help on using the repository browser.