source: trunk/gsdl/perllib/acronym.pm@ 1336

Last change on this file since 1336 was 1336, checked in by say1, 24 years ago

fixed acronym extraction so it is now runs in time linear to the document length (was l2)

  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1###########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# class to hold acronyms
27
28use strict;
29#use diagnostics;
30
31package acronym;
32#use Class::Struct;
33
34
35sub new {
36 my $self = [
37 "", # 0 acronym
38 [], # 1 definition
39 [], # 2 stop_words
40 0, # 3 letters_for_far
41
42 ];
43 bless $self;
44}
45
46
47
48sub clone {
49 my $self = shift (@_);
50
51 my $copy = new acronym;
52
53 $copy->[0] = $self->[0];
54 push @{$copy->[1]}, @{$self->[1]};
55 push @{$copy->[2]}, @{$self->[2]};
56 $copy->[3] = $self->[3];
57 $copy->[4] = $self->[4];
58 $copy->[5] = $self->[5];
59 $copy->[6] = $self->[6];
60
61 return $copy;
62}
63
64
65#print out the kwic for the acronym
66sub to_string_kwic {
67 my $self = shift (@_);
68
69 # the list of all possible combinations
70 my @list = ();
71
72 my $result = "";
73
74 my $j = 0;
75 my @array = @{$self->[1]};
76 while ($j <= $#array)
77 {
78
79 $result = "";
80
81 # do the definition
82 my $i = 0;
83 while ($i <= $#array)
84 {
85 my $current = ($i + $j) % ($#array+1);
86 $result = $result . $array[$current] . " ";
87 $i++;
88 }
89 $result = $result . "(" . $self->[0] . ")";
90
91 push @list, $result;
92 $j++;
93 }
94 return @list;
95}
96
97#this is the one used when building the collection ...
98sub to_string {
99 my $self = shift (@_);
100
101 my $result = $self->[0] . " ";
102
103 # do the definition
104 my @array = @{$self->[1]};
105 my $i = 0;
106 while ($i <= $#array)
107 {
108 my $resultnext = $result . $array[$i] . " ";
109 $result = $resultnext;
110 $i++;
111 }
112 return $result;
113}
114
115sub acronyms {
116 my $processed_text = shift @_;
117 $$processed_text =~ s/[^A-Za-z]/ /g;
118 $$processed_text =~ s/\s+/ /g;
119
120 return &acronyms_from_clean_text($processed_text)
121}
122
123sub acronyms_from_clean_text {
124 my ($processed_text) = @_;
125 my @acro_list = ();
126 my $max_offset = 50;
127
128 my @text = split / /, $$processed_text;
129
130# my $i = 0;
131# while ($i <= $#text)
132# {
133# print $text->[$i] . "\n";
134# $i++;
135# }
136
137 my $first = 0;
138 my $last = $#text +1;
139 my $acro_counter = $first;
140
141 while ($acro_counter < $last)
142 {
143 my $word = $text[$acro_counter];
144
145 # the tests on the following line are VERY important
146 # to the performance of this algorithm be VERY careful
147 # when relaxing them...
148 if (length $word >= 3 && (uc($word) eq $word))
149 {
150 my $def_counter = 0;
151 if ($acro_counter - $max_offset > 0)
152 {
153 $def_counter = $acro_counter - $max_offset;
154 }
155 my $local_last = $acro_counter + $max_offset;
156 if ($local_last > $last)
157 {
158 $local_last = $last;
159 }
160 while ($def_counter <= $local_last)
161 {
162 my $letter_counter = 0;
163 my $match = 1;
164 while ($letter_counter < length($word))
165 {
166 if ($def_counter+$letter_counter >= $local_last)
167 {
168 $match = 0;
169 last;
170 }
171 my $def_word = $text[$def_counter+$letter_counter];
172
173 #throw it out if it's recursing...
174 if (uc($word) eq uc($def_word))
175 {
176 $match = 0;
177 last
178 }
179 if (substr($word, $letter_counter, 1) ne substr($def_word, 0, 1))
180 {
181 $match = 0;
182 last;
183 }
184 $letter_counter++;
185 }
186 # this line should perhaps be more sophisticated ...
187 # it encodes what we consider to be a valid acronym
188 if ($match == 1 && $letter_counter > 0 &&
189 (abs($def_counter - $acro_counter)< $max_offset))
190 {
191 my $acro = new acronym();
192 $acro->[0] = $word;
193 push @{$acro->[1]}, @text[$def_counter .. $def_counter + $letter_counter - 1 ];
194 $acro->[3] = $letter_counter;
195# my @tmp = ( $acro );
196 push @acro_list, ( $acro );
197# print $acro->to_string(). "\n";
198 $match = 0;
199 }
200 $def_counter++;
201 }
202 }
203 $acro_counter++;
204 }
205
206 return \@acro_list;
207}
208
209
210
211sub test {
212
213# my $blarg = new acronym();
214# my $simple;
215# $simple = 10;
216# $blarg->initialise($simple, $simple, $simple);
217# my $blarg2 = $blarg->clone();
218# print $blarg->to_string();
219# print $blarg2;
220# print "\n";
221#
222 my $tla = new acronym();
223 $tla->[0] = "TLA";
224
225 my @array = ("Three", "Letter", "Acronym");
226# my $i = 0;
227# while ($i <= $#array)
228# {
229# print @array[$i] . "\n";
230# $i++;
231# }
232
233 print "\n";
234 push @{$tla->[1]}, ("Three" );
235 push @{$tla->[1]}, ("Letter" );
236 push @{$tla->[1]}, ("Letter" );
237 push @{$tla->[1]}, ("Acronym" );
238 print $tla->to_string(). "\n";
239 print "\n";
240 print "\n";
241 my $tla2 = $tla->clone();
242 push @{$tla2->[1]}, ("One");
243 push @{$tla2->[1]}, ("Two");
244 $tla2->[0] = "ALT";
245 print $tla->to_string(). "\n";
246 print $tla2->to_string(). "\n";
247
248 print "\n";
249 print "\n";
250
251}
252
253#uncomment this line to test this package
254#&test();
255
2561;
Note: See TracBrowser for help on using the repository browser.