source: trunk/gsdl/perllib/acronym.pm@ 1361

Last change on this file since 1361 was 1361, checked in by say1, 24 years ago

rewrote recursively to handle stop words and more cases

  • Property svn:keywords set to Author Date Id Revision
File size: 10.0 KB
Line 
1##########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# class to handle acronyms
26###########################################################################
27
28use strict;
29use diagnostics;
30
31package acronym;
32
33###########################################################################
34# global variables
35###########################################################################
36# valiables to control the recall/precision tradeoff
37
38#the maximum range to look for acronyms
39my $max_offset = 30;
40#acronyms must be upper case
41my $upper_case = 1;
42#acronym case must match
43my $case_match = 1;
44#minimum acronym length
45my $min_def_length = 3;
46#minimum acronym length
47my $min_acro_length = 3;
48#minimum acronym length saving
49my $min_length_saving = 3;
50#allow recusive acronyms
51my $allow_recursive = "";
52
53my @stop_words = split / /, "A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND";
54#my @stop_words = split / /, "OF AT THE IN TO AND";
55
56#the text split into an array, one word per element
57my @split_text = ();
58my @acronym_list = ();
59
60
61###########################################################################
62# member functions
63###########################################################################
64
65
66sub new {
67 my $self = [
68 "", # 0 acronym
69 [], # 1 definition
70 ];
71 bless $self;
72}
73
74
75
76sub clone {
77 my $self = shift (@_);
78
79 my $copy = new acronym;
80
81 $copy->[0] = $self->[0];
82 push @{$copy->[1]}, @{$self->[1]};
83 bless $copy;
84
85 return $copy;
86}
87
88#return the acronym
89sub to_acronym {
90 my $self = shift (@_);
91 my @array = @{$self->[1]};
92
93 return $self->[0];
94}
95
96#return the number of words in the acronym definition
97sub words_in_acronym_definition {
98 my $self = shift (@_);
99 my @array = @{$self->[1]};
100
101 return $#array + 1;
102}
103
104#return the number of letters in the acronym definition
105sub letters_in_acronym_definition {
106 my $self = shift (@_);
107
108 return length($self->to_def_string());
109}
110
111#return the number of letters in the acronym definition
112sub letters_in_acronym {
113 my $self = shift (@_);
114
115 return length($self->to_acronym());
116}
117
118#return the acronym definition
119sub to_def_string {
120 my $self = shift (@_);
121
122 my $result = "";
123
124 # do the definition
125 my @array = @{$self->[1]};
126 my $i = 0;
127 while ($i <= $#array)
128 {
129 $result = $result . $array[$i];
130
131 if ($i+1 <= $#array)
132 {
133 $result = $result . " ";
134 }
135 $i++;
136 }
137 return $result;
138}
139
140
141#print out the kwic for the acronym
142sub to_string_kwic {
143 my $self = shift (@_);
144
145 # the list of all possible combinations
146 my @list = ();
147
148 my $result = "";
149
150 my $j = 0;
151 my @array = @{$self->[1]};
152 while ($j <= $#array)
153 {
154
155 # do the definition
156 my $i = 0;
157
158 #add the key word
159 $result = "<td halign=left>" . $array[$j] . "</td><td halign=right>";
160
161 #add the proceeding words
162 while ($i < $j)
163 {
164 $result = $result . $array[$i] . " ";
165 $i++;
166 }
167 #add the key word
168 $result = $result . "</td><td halign=left>" . $array[$j] .
169 "</td><td halign=left>";
170
171 #add the trailing words
172 $i++;
173 while ($i <= $#array )
174 {
175 $result = $result . $array[$i] . " ";
176 $i++;
177 }
178
179 #add the actual acronym
180
181 $result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
182
183 push @list, $result;
184 $j++;
185 }
186 return @list;
187}
188
189#this is the one used when building the collection ...
190sub to_string {
191 my $self = shift (@_);
192
193 my $result = $self->[0] . " ";
194
195 # do the definition
196 my @array = @{$self->[1]};
197 my $i = 0;
198 while ($i <= $#array)
199 {
200 $result = $result . $array[$i];
201 if ($i+1 <= $#array)
202 {
203 $result = $result . " ";
204 }
205 $i++;
206 }
207 return $result;
208}
209
210sub check {
211 my $self = shift (@_);
212
213 if (length($self->to_acronym()) < $min_acro_length)
214 {
215# print "acronym " . $self->to_string() . " rejected (too short I)\n";
216 return 0;
217 }
218 if ($self->words_in_acronym_definition() < $min_def_length)
219 {
220# print "acronym " . $self->to_string() . " rejected (too short II)\n";
221 return 0;
222 }
223 if ($min_length_saving * $self->letters_in_acronym() >
224 $self->letters_in_acronym_definition())
225 {
226# print "acronym " . $self->to_string() . " rejected (too short III)\n";
227# print "" . $min_length_saving .
228 "|" . $self->letters_in_acronym() .
229 "|" . $self->letters_in_acronym_definition() . "\n";
230 return 0;
231 }
232# print "acronym " . $self->to_string() . " not rejected\n";
233 return 1;
234}
235
236###########################################################################
237# static functions
238###########################################################################
239
240sub recurse {
241 my ($acro_offset, #offset of word we're finding acronyms for
242 $text_offset,
243 $letter_offset,
244 @def_so_far) = @_;
245
246 my $word = $split_text[$text_offset];
247 my $acro = $split_text[$acro_offset];
248 $word = "" if !defined $word;
249 $acro = "" if !defined $acro;
250
251# print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
252# $letter_offset . ", " . @def_so_far . ")\n";
253
254 #check for termination ...
255 if ($letter_offset >= length($acro))
256 {
257 my $acronym = new acronym();
258 $acronym->[0] = $acro;
259 push @{$acronym->[1]}, @def_so_far;
260 if ($acronym->check())
261 {
262 push @acronym_list, ( $acronym );
263 }
264# print "acronym created\n";
265 return;
266 }
267 #check for recursion
268 if (!$allow_recursive)
269 {
270 if ($word eq $acro)
271 {
272# print "recursion detected\n";
273 return;
274 }
275 }
276
277 #skip a stop-word
278 my $i = 0;
279 if ($letter_offset != 0)
280 {
281 while ($i <= $#stop_words)
282 {
283 if ($stop_words[$i] eq uc($word))
284 {
285# print "::found stop word::" . $stop_words[$i] . "\n";
286 &recurse($acro_offset,
287 $text_offset+1,
288 $letter_offset,
289 @def_so_far, $word);
290 }
291 $i++;
292 }
293 }
294 $i = 1;
295 #using the first $i letters ...
296 while ($letter_offset+$i <= length($acro) )
297 {
298# print "". ((substr $word, 0, $i) . " " .
299# (substr $acro, $letter_offset, $i) . "\n");
300 if (((!$case_match) &&
301 (uc(substr $word, 0, $i) eq
302 uc(substr $acro, $letter_offset, $i)))
303 ||
304 (($case_match) &&
305 ((substr $word, 0, $i) eq
306 (substr $acro, $letter_offset, $i))))
307 {
308# print "::match::\n";
309# print "" . ((substr $word, 0, $i) . " " .
310# (substr $acro, $letter_offset, $i) . "\n");
311 &recurse($acro_offset,
312 $text_offset+1,
313 $letter_offset+$i,
314 @def_so_far, $word);
315 } else {
316 return;
317 }
318 $i++;
319 }
320 return;
321}
322
323
324sub acronyms {
325 #clean up the text
326 my $processed_text = shift @_;
327 $$processed_text =~ s/[^A-Za-z]/ /g;
328 $$processed_text =~ s/\s+/ /g;
329
330 #clear some global variables
331 @split_text = ();
332 @acronym_list = ();
333
334 return &acronyms_from_clean_text($processed_text);
335}
336
337sub acronyms_from_clean_text {
338 my ($processed_text) = @_;
339
340 @split_text = split / /, $$processed_text;
341
342# my $i = 0;
343# while ($i <= $#split_text)
344# {
345# print $split_text[$i] . "\n";
346# $i++;
347# }
348
349 my $first = 0;
350 my $last = $#split_text +1;
351 my $acro_counter = $first;
352
353 while ($acro_counter < $last)
354 {
355 my $word = $split_text[$acro_counter];
356
357 if ((!$upper_case) ||
358 (uc($word) eq $word))
359 {
360
361 if (length $word >= $min_acro_length)
362 {
363 my $def_counter = 0;
364 if ($acro_counter - $max_offset > 0)
365 {
366 $def_counter = $acro_counter - $max_offset;
367 }
368 my $local_last = $acro_counter + $max_offset;
369 if ($local_last > $last)
370 {
371 $local_last = $last;
372 }
373 while ($def_counter <= $local_last)
374 {
375 &recurse($acro_counter,$def_counter,0,());
376 $def_counter++;
377 }
378 }
379 }
380 $acro_counter++;
381 }
382
383 return \@acronym_list;
384}
385
386
387
388sub test {
389
390# my $blarg = new acronym();
391# my $simple;
392# $simple = 10;
393# $blarg->initialise($simple, $simple, $simple);
394# my $blarg2 = $blarg->clone();
395# print $blarg->to_string();
396# print $blarg2;
397# print "\n";
398#
399 my $tla = new acronym();
400 $tla->[0] = "TLA";
401
402 my @array = ("Three", "Letter", "Acronym");
403# my $i = 0;
404# while ($i <= $#array)
405# {
406# print @array[$i] . "\n";
407# $i++;
408# }
409
410 print "\n";
411 push @{$tla->[1]}, ("Three" );
412 push @{$tla->[1]}, ("Letter" );
413 push @{$tla->[1]}, ("Acronym" );
414 print $tla->to_string(). "\n";
415 print "\n";
416 print "\n";
417 my $tla2 = $tla->clone();
418 push @{$tla2->[1]}, ("One");
419 push @{$tla2->[1]}, ("Two");
420 $tla2->[0] = "ALT";
421 print $tla->to_string(). "\n";
422 print $tla2->to_string(). "\n";
423
424 print "\n";
425 print "\n";
426
427 print "Testing recursion ...\n";
428 my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
429
430 foreach my $acro (@$acros)
431 {
432 if ($acro->check)
433 {
434 print "accepted: " .$acro->to_string() . "\n";
435# print "|" . $acro->to_acronym() . "|" . $acro->to_def_string() .
436# "|" . $acro->words_in_acronym_definition() .
437# "|" . $acro->letters_in_acronym_definition() .
438# "|" . $acro->letters_in_acronym() . "|\n";
439 } else {
440# print "found but rejected: " .$acro->to_string() . "\n";
441 }
442 }
443}
444
445#uncomment this line to test this package
446#&test();
447
4481;
449
450
Note: See TracBrowser for help on using the repository browser.