source: branches/New_Config_Format-branch/gsdl/perllib/acronym.pm@ 1278

Last change on this file since 1278 was 1278, checked in by (none), 24 years ago

This commit was manufactured by cvs2svn to create branch
'New_Config_Format-branch'.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.5 KB
Line 
1###########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# class to hold acronyms
27
28use strict;
29#use diagnostics;
30
31package acronym;
32#use Class::Struct;
33
34
35sub new {
36 my $self = [
37 "", # 0 acronym
38 [], # 1 definition
39 [], # 2 stop_words
40 0, # 3 letters_for_far
41
42 ];
43 bless $self;
44}
45
46#struct (
47# # core items
48# acronym => '$', # the acronym (a string)
49# definition => '@', # the acronyms defintion (an array of strings)
50# stop_words => '@', # an array of 1 (stop word) and 0 (non stop word)
51#
52# # items related to the context in which the acronym was mined
53# word_count => '$', # the index of the acronym within the text
54# definition_offset => '$', # the distance between the acronym and the definition.
55#
56# #temporary items used during the mining of the acronym
57# letters_for_far => '$', # how many letters have we found so far ?
58#
59# #calculated items
60# #...
61#
62# );
63
64#sub definition {
65# my $self = shift (@_);
66# my @def = @$self->[2];
67# print "definition::\@de+f = " . @def . "\n";
68# print "definition::\@_ = " . @_ . "\n";
69# if (@_) {
70# push @def, @_;
71# @$self->[2] = @def;
72# }
73# return @def;
74#}
75
76
77
78sub clone {
79 my $self = shift (@_);
80
81 my $copy = new acronym;
82
83 $copy->[0] = $self->[0];
84 push @{$copy->[1]}, @{$self->[1]};
85 push @{$copy->[2]}, @{$self->[2]};
86 $copy->[3] = $self->[3];
87 $copy->[4] = $self->[4];
88 $copy->[5] = $self->[5];
89 $copy->[6] = $self->[6];
90
91 return $copy;
92}
93
94#sub initialise {
95# my $self = shift (@_);
96#
97# # initialise the struct from the parameters ...
98# my($acro, $wc, $def) = @_;
99# $self->acronym($acro);
100# $self->word_count($wc);
101# $self->definition_offset($def);
102#
103# $self->letters_for_far(0);
104#}
105
106#print out the kwic for the acronym
107sub to_string_kwic {
108 my $self = shift (@_);
109
110 # the list of all possible combinations
111 my @list = ();
112
113 my $result = "";
114
115 my $j = 0;
116 my @array = @{$self->[1]};
117 while ($j <= $#array)
118 {
119
120 $result = "";
121
122 # do the definition
123 my $i = 0;
124 while ($i <= $#array)
125 {
126 my $current = ($i + $j) % ($#array+1);
127 $result = $result . $array[$current] . " ";
128 $i++;
129 }
130 $result = $result . "(" . $self->[0] . ")";
131
132 push @list, $result;
133 $j++;
134 }
135 return @list;
136}
137
138#this is the one used when building the collection ...
139sub to_string {
140 my $self = shift (@_);
141
142 my $result = $self->[0] . " ";
143
144 # do the definition
145 my @array = @{$self->[1]};
146 my $i = 0;
147 while ($i <= $#array)
148 {
149 my $resultnext = $result . $array[$i] . " ";
150 $result = $resultnext;
151 $i++;
152 }
153 return $result;
154}
155
156# called when the acronym is complete and after altering any stats to compute stats etc.
157sub stablise {
158
159
160}
161
162sub acronyms {
163 my $processed_text = shift @_;
164 $$processed_text =~ s/[^A-Za-z]/ /g;
165 $$processed_text =~ s/\s+/ /g;
166
167 return &acronyms_from_clean_text($processed_text)
168}
169
170sub acronyms_from_clean_text {
171 my ($processed_text) = @_;
172 my @acro_list = ();
173
174 my @text = split / /, $$processed_text;
175
176# my $i = 0;
177# while ($i <= $#text)
178# {
179# print $text->[$i] . "\n";
180# $i++;
181# }
182
183 my $first = 0;
184 my $last = $#text +1;
185 my $acro_counter = $first;
186
187 while ($acro_counter < $last)
188 {
189 my $word = $text[$acro_counter];
190
191 # the tests on the following line are VERY important to the performance of this algorithm
192 # be VERY careful when relaxing them...
193 if (length $word >= 3 && (uc($word) eq $word))
194 {
195 my $def_counter = 0;
196 while ($def_counter <= $last)
197 {
198 my $letter_counter = 0;
199 my $match = 1;
200 while ($letter_counter < length($word))
201 {
202 if ($def_counter+$letter_counter >= $last)
203 {
204 $match = 0;
205 last;
206 }
207 my $def_word = $text[$def_counter+$letter_counter];
208
209 #throw it out if it's recursing...
210 if (uc($word) eq uc($def_word))
211 {
212 $match = 0;
213 last
214 }
215 if (substr($word, $letter_counter, 1) ne substr($def_word, 0, 1))
216 {
217 $match = 0;
218 last;
219 }
220 $letter_counter++;
221 }
222 # this line should perhaps be more sophisticated ... it encodes what we consider
223 # to be a valid acronym
224 if ($match == 1 && $letter_counter > 0 && (abs($def_counter - $acro_counter)< 50))
225 {
226 my $acro = new acronym();
227 $acro->[0] = $word;
228 push @{$acro->[1]}, @text[$def_counter .. $def_counter + $letter_counter - 1 ];
229 $acro->[3] = $letter_counter;
230# my @tmp = ( $acro );
231 push @acro_list, ( $acro );
232# print $acro->to_string(). "\n";
233 $match = 0;
234 }
235 $def_counter++;
236 }
237 }
238 $acro_counter++;
239 }
240
241 return \@acro_list;
242}
243
244
245
246sub test {
247
248# my $blarg = new acronym();
249# my $simple;
250# $simple = 10;
251# $blarg->initialise($simple, $simple, $simple);
252# my $blarg2 = $blarg->clone();
253# print $blarg->to_string();
254# print $blarg2;
255# print "\n";
256#
257 my $tla = new acronym();
258 $tla->[0] = "TLA";
259
260 my @array = ("Three", "Letter", "Acronym");
261# my $i = 0;
262# while ($i <= $#array)
263# {
264# print @array[$i] . "\n";
265# $i++;
266# }
267
268 print "\n";
269 push @{$tla->[1]}, ("Three" );
270 push @{$tla->[1]}, ("Letter" );
271 push @{$tla->[1]}, ("Letter" );
272 push @{$tla->[1]}, ("Acronym" );
273 print $tla->to_string(). "\n";
274 print "\n";
275 print "\n";
276 my $tla2 = $tla->clone();
277 push @{$tla2->[1]}, ("One");
278 push @{$tla2->[1]}, ("Two");
279 $tla2->[0] = "ALT";
280 print $tla->to_string(). "\n";
281 print $tla2->to_string(). "\n";
282
283 print "\n";
284 print "\n";
285
286}
287
288#uncomment this line to test this package
289#&test();
290
2911;
Note: See TracBrowser for help on using the repository browser.