Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: branches/New_Config_Format-branch/gsdl/perllib/acronym.pm@ 1278

Last change on this file since 1278 was 1278, checked in by (none), 24 years ago
This commit was manufactured by cvs2svn to create branch 'New_Config_Format-branch'.
Property svn:keywords set to `Author Date Id Revision`
File size: 6.5 KB

Line
1	###########################################################################
2	#
3	# acronym.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# class to hold acronyms
27
28	use strict;
29	#use diagnostics;
30
31	package acronym;
32	#use Class::Struct;
33
34
35	sub new {
36	my $self = [
37	"", # 0 acronym
38	[], # 1 definition
39	[], # 2 stop_words
40	0, # 3 letters_for_far
41
42	];
43	bless $self;
44	}
45
46	#struct (
47	# # core items
48	# acronym => '$', # the acronym (a string)
49	# definition => '@', # the acronyms defintion (an array of strings)
50	# stop_words => '@', # an array of 1 (stop word) and 0 (non stop word)
51	#
52	# # items related to the context in which the acronym was mined
53	# word_count => '$', # the index of the acronym within the text
54	# definition_offset => '$', # the distance between the acronym and the definition.
55	#
56	# #temporary items used during the mining of the acronym
57	# letters_for_far => '$', # how many letters have we found so far ?
58	#
59	# #calculated items
60	# #...
61	#
62	# );
63
64	#sub definition {
65	# my $self = shift (@_);
66	# my @def = @$self->[2];
67	# print "definition::\@de+f = " . @def . "\n";
68	# print "definition::\@_ = " . @_ . "\n";
69	# if (@_) {
70	# push @def, @_;
71	# @$self->[2] = @def;
72	# }
73	# return @def;
74	#}
75
76
77
78	sub clone {
79	my $self = shift (@_);
80
81	my $copy = new acronym;
82
83	$copy->[0] = $self->[0];
84	push @{$copy->[1]}, @{$self->[1]};
85	push @{$copy->[2]}, @{$self->[2]};
86	$copy->[3] = $self->[3];
87	$copy->[4] = $self->[4];
88	$copy->[5] = $self->[5];
89	$copy->[6] = $self->[6];
90
91	return $copy;
92	}
93
94	#sub initialise {
95	# my $self = shift (@_);
96	#
97	# # initialise the struct from the parameters ...
98	# my($acro, $wc, $def) = @_;
99	# $self->acronym($acro);
100	# $self->word_count($wc);
101	# $self->definition_offset($def);
102	#
103	# $self->letters_for_far(0);
104	#}
105
106	#print out the kwic for the acronym
107	sub to_string_kwic {
108	my $self = shift (@_);
109
110	# the list of all possible combinations
111	my @list = ();
112
113	my $result = "";
114
115	my $j = 0;
116	my @array = @{$self->[1]};
117	while ($j <= $#array)
118	{
119
120	$result = "";
121
122	# do the definition
123	my $i = 0;
124	while ($i <= $#array)
125	{
126	my $current = ($i + $j) % ($#array+1);
127	$result = $result . $array[$current] . " ";
128	$i++;
129	}
130	$result = $result . "(" . $self->[0] . ")";
131
132	push @list, $result;
133	$j++;
134	}
135	return @list;
136	}
137
138	#this is the one used when building the collection ...
139	sub to_string {
140	my $self = shift (@_);
141
142	my $result = $self->[0] . " ";
143
144	# do the definition
145	my @array = @{$self->[1]};
146	my $i = 0;
147	while ($i <= $#array)
148	{
149	my $resultnext = $result . $array[$i] . " ";
150	$result = $resultnext;
151	$i++;
152	}
153	return $result;
154	}
155
156	# called when the acronym is complete and after altering any stats to compute stats etc.
157	sub stablise {
158
159
160	}
161
162	sub acronyms {
163	my $processed_text = shift @_;
164	$$processed_text =~ s/[^A-Za-z]/ /g;
165	$$processed_text =~ s/\s+/ /g;
166
167	return &acronyms_from_clean_text($processed_text)
168	}
169
170	sub acronyms_from_clean_text {
171	my ($processed_text) = @_;
172	my @acro_list = ();
173
174	my @text = split / /, $$processed_text;
175
176	# my $i = 0;
177	# while ($i <= $#text)
178	# {
179	# print $text->[$i] . "\n";
180	# $i++;
181	# }
182
183	my $first = 0;
184	my $last = $#text +1;
185	my $acro_counter = $first;
186
187	while ($acro_counter < $last)
188	{
189	my $word = $text[$acro_counter];
190
191	# the tests on the following line are VERY important to the performance of this algorithm
192	# be VERY careful when relaxing them...
193	if (length $word >= 3 && (uc($word) eq $word))
194	{
195	my $def_counter = 0;
196	while ($def_counter <= $last)
197	{
198	my $letter_counter = 0;
199	my $match = 1;
200	while ($letter_counter < length($word))
201	{
202	if ($def_counter+$letter_counter >= $last)
203	{
204	$match = 0;
205	last;
206	}
207	my $def_word = $text[$def_counter+$letter_counter];
208
209	#throw it out if it's recursing...
210	if (uc($word) eq uc($def_word))
211	{
212	$match = 0;
213	last
214	}
215	if (substr($word, $letter_counter, 1) ne substr($def_word, 0, 1))
216	{
217	$match = 0;
218	last;
219	}
220	$letter_counter++;
221	}
222	# this line should perhaps be more sophisticated ... it encodes what we consider
223	# to be a valid acronym
224	if ($match == 1 && $letter_counter > 0 && (abs($def_counter - $acro_counter)< 50))
225	{
226	my $acro = new acronym();
227	$acro->[0] = $word;
228	push @{$acro->[1]}, @text[$def_counter .. $def_counter + $letter_counter - 1 ];
229	$acro->[3] = $letter_counter;
230	# my @tmp = ( $acro );
231	push @acro_list, ( $acro );
232	# print $acro->to_string(). "\n";
233	$match = 0;
234	}
235	$def_counter++;
236	}
237	}
238	$acro_counter++;
239	}
240
241	return \@acro_list;
242	}
243
244
245
246	sub test {
247
248	# my $blarg = new acronym();
249	# my $simple;
250	# $simple = 10;
251	# $blarg->initialise($simple, $simple, $simple);
252	# my $blarg2 = $blarg->clone();
253	# print $blarg->to_string();
254	# print $blarg2;
255	# print "\n";
256	#
257	my $tla = new acronym();
258	$tla->[0] = "TLA";
259
260	my @array = ("Three", "Letter", "Acronym");
261	# my $i = 0;
262	# while ($i <= $#array)
263	# {
264	# print @array[$i] . "\n";
265	# $i++;
266	# }
267
268	print "\n";
269	push @{$tla->[1]}, ("Three" );
270	push @{$tla->[1]}, ("Letter" );
271	push @{$tla->[1]}, ("Letter" );
272	push @{$tla->[1]}, ("Acronym" );
273	print $tla->to_string(). "\n";
274	print "\n";
275	print "\n";
276	my $tla2 = $tla->clone();
277	push @{$tla2->[1]}, ("One");
278	push @{$tla2->[1]}, ("Two");
279	$tla2->[0] = "ALT";
280	print $tla->to_string(). "\n";
281	print $tla2->to_string(). "\n";
282
283	print "\n";
284	print "\n";
285
286	}
287
288	#uncomment this line to test this package
289	#&test();
290
291	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: