Context Navigation

source: trunk/gsdl/perllib/acronym.pm@ 1393

Last change on this file since 1393 was 1393, checked in by say1, 24 years ago
acronym markup functionality
Property svn:keywords set to `Author Date Id Revision`
File size: 16.8 KB

Line
1	##########################################################################
2	#
3	# acronym.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	# class to handle acronyms
26	###########################################################################
27
28	use strict;
29	use diagnostics;
30
31	package acronym;
32
33	###########################################################################
34	# global variables
35	###########################################################################
36	# valiables to control the recall/precision tradeoff
37
38	#the maximum range to look for acronyms
39	my $max_offset = 30;
40	#acronyms must be upper case
41	my $upper_case = 1;
42	#acronym case must match
43	my $case_match = 1;
44	#minimum acronym length
45	my $min_def_length = 3;
46	#minimum acronym length
47	my $min_acro_length = 3;
48	#minimum acronym length saving
49	my $min_length_saving = 4;
50	#allow recusive acronyms
51	my $allow_recursive = "";
52
53	my @stop_words = split / /, "OF AT THE IN TO AND";
54
55	#the text split into an array, one word per element
56	my @split_text = ();
57	my @acronym_list = ();
58
59	#the file to collate acronyms into
60	my $acronym_accumulate_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_definitions.pm";
61	my $acronym_options_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_options.pm";
62
63	my %acronyms_found_in_collection = ();
64	my %acronyms_banned_from_collection = ();
65
66	my $writing_acronyms = 1;
67	my $accumulate_acronyms = 1;
68	my $markup_accumulate_acronyms = 1;
69	my $markup_local_acronyms = 1;
70
71
72
73	###########################################################################
74	# file saving / loading stuff
75	###########################################################################
76
77	sub init_acronyms {
78
79	my $file_text = "";
80	if (open ACRONYM_HANDLE, "<$acronym_options_file")
81	{
82	$file_text = do { local $/; <ACRONYM_HANDLE> };
83	}
84	if ($file_text eq "")
85	{
86	print STDERR "failed to open $acronym_options_file\n";
87	open ACRONYM_HANDLE, ">$acronym_options_file\n";
88	print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
89	print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
90	print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
91	print ACRONYM_HANDLE "#valid here.\n\n";
92	print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
93	print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
94	print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
95	print ACRONYM_HANDLE "#explain them fully \n\n";
96	print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
97	print ACRONYM_HANDLE "\$max_offset = 30;\n\n";
98	print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true)\n";
99	print ACRONYM_HANDLE "\$upper_case = 1;\n\n";
100	print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true)\n";
101	print ACRONYM_HANDLE "\$case_match = 1;\n\n";
102	print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
103	print ACRONYM_HANDLE "\$min_def_length = 3;\n\n";
104	print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
105	print ACRONYM_HANDLE "\$min_acro_length = 3;\n\n";
106	print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
107	print ACRONYM_HANDLE "\$min_length_saving = 4;\n\n";
108	print ACRONYM_HANDLE "#allow recusive acronyms (0 = false, 1 = true)\n";
109	print ACRONYM_HANDLE "\$allow_recursive = 0;\n\n";
110	print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
111	print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
112	print ACRONYM_HANDLE "#\@stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
113	print ACRONYM_HANDLE "\@stop_words = split / /, \"OF AT THE IN TO AND\";\n";
114	print ACRONYM_HANDLE "\n";
115	print ACRONYM_HANDLE "#the file to collate acronyms into\n";
116	print ACRONYM_HANDLE "\$acronym_accumulate_file = \"$ENV{'GSDLCOLLECTDIR'}\" . \"/etc/acronym_definitions.pm\";\n";
117	print ACRONYM_HANDLE "\n";
118	print ACRONYM_HANDLE "\$accumulate_acronyms = 1;\n\n";
119	print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
120	print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
121	print ACRONYM_HANDLE "# \n";
122	print ACRONYM_HANDLE "# \n";
123	print ACRONYM_HANDLE "# \n";
124	}
125	eval $file_text;
126	#promotes warnings/errors from evaluated file to current context
127	warn $@ if $@;
128
129
130
131	&read_all_acronyms_from_file();
132	# rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
133	# int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
134	if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
135	{
136	print ACRONYM_HANDLE "#This is an automatically generated file.\n";
137	print ACRONYM_HANDLE "#\n";
138	print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
139	print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
140	print ACRONYM_HANDLE "#read-only. \n";
141	print ACRONYM_HANDLE "#\n";
142	print ACRONYM_HANDLE "#start of acronyms...\n";
143	$writing_acronyms = 1;
144	} else {
145	warn "failed to open $acronym_accumulate_file for writing\n";
146	$writing_acronyms = 0;
147	}
148	}
149
150	#close the list of accumulated acronyms
151	END {
152	if ($writing_acronyms)
153	{
154	print ACRONYM_HANDLE "#end of acronyms.\n";
155	close ACRONYM_HANDLE;
156	}
157	}
158
159	#eval a file of accumulated acronyms
160	sub read_all_acronyms_from_file {
161
162	my $file_text = "";
163	if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
164	{
165	$file_text = do { local $/; <ACRONYM_HANDLE> };
166	} else {
167	print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
168	}
169	eval $file_text;
170	#promotes warnings/errors from evaluated file to current context
171	warn $@ if $@;
172	}
173
174	#called from within the file of accumulated acronyms to indicate a good acronym
175	sub add {
176	my $self = shift (@_);
177	if (defined ($acronyms_found_in_collection{$self->[0]}))
178	{
179	my $def = $self->to_def_string();
180	if ($acronyms_found_in_collection{$self->[0]} =~ m/(^\|\\|)$def(\\|\|$)/)
181	{
182	return;
183	}
184	$acronyms_found_in_collection{$self->[0]} =
185	$acronyms_found_in_collection{$self->[0]} . "\|" . $self->to_def_string();
186	} else {
187	$acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
188	}
189	}
190
191	#called from within the file of accumulated acronyms to indicate a bad acronym
192	sub ban {
193	my $self = shift (@_);
194
195	if (!defined $acronyms_banned_from_collection{$self->[0]})
196	{
197	$acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
198	} else {
199	$acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "\|" . $self->to_def_string();
200	}
201	}
202
203
204	#write a good acronym to the accumulated acronyms file
205	sub write_to_file {
206	my $self = shift (@_);
207	if ($writing_acronyms)
208	{
209	print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
210	$self->to_def_string() .
211	"\")->add();\n";
212	}
213	}
214
215
216	###########################################################################
217	# mark functionality
218	###########################################################################
219
220	#small routine to sort by length
221	sub sort_by_length {
222	length($b) <=> length($a) or $a cmp $b
223	}
224
225	sub markup_acronyms {
226	my $text = shift (@_);
227	my $verbosity_obj = shift (@_);
228	if (defined $text)
229	{
230	for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
231	{
232	$text =~ s/^((?:[^\<\n]\|(?:\<[^\>\n]\>)))$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
233	printf STDERR " " . $acro . ","
234	if ($verbosity_obj->{'verbosity'} >= 2);
235	}
236	}
237	return $text;
238	}
239
240
241
242	###########################################################################
243	# member functions
244	###########################################################################
245
246
247	sub new {
248	my $trash = shift (@_);
249	my $acro = shift (@_);
250	my $def = shift (@_);
251
252	my $self = [
253	"", # 0 acronym
254	[], # 1 definition
255	];
256
257	$self->[0] = $acro if defined $acro;
258	push @{$self->[1]}, split / /, $def if defined $def;
259
260	bless $self;
261	}
262
263	sub clone {
264	my $self = shift (@_);
265
266	my $copy = new acronym;
267
268	$copy->[0] = $self->[0];
269	push @{$copy->[1]}, @{$self->[1]};
270	bless $copy;
271
272	return $copy;
273	}
274
275	#return the acronym
276	sub to_acronym {
277	my $self = shift (@_);
278	my @array = @{$self->[1]};
279
280	return $self->[0];
281	}
282
283	#return the number of words in the acronym definition
284	sub words_in_acronym_definition {
285	my $self = shift (@_);
286	my @array = @{$self->[1]};
287
288	return $#array + 1;
289	}
290
291	#return the number of letters in the acronym definition
292	sub letters_in_acronym_definition {
293	my $self = shift (@_);
294
295	return length($self->to_def_string());
296	}
297
298	#return the number of letters in the acronym definition
299	sub letters_in_acronym {
300	my $self = shift (@_);
301
302	return length($self->to_acronym());
303	}
304
305	#return the acronym definition
306	sub to_def_string {
307	my $self = shift (@_);
308
309	my $result = "";
310
311	# do the definition
312	my @array = @{$self->[1]};
313	my $i = 0;
314	while ($i <= $#array)
315	{
316	$result = $result . $array[$i];
317
318	if ($i+1 <= $#array)
319	{
320	$result = $result . " ";
321	}
322	$i++;
323	}
324	return $result;
325	}
326
327
328	#print out the kwic for the acronym
329	sub to_string_kwic {
330	my $self = shift (@_);
331
332	# the list of all possible combinations
333	my @list = ();
334
335	my $result = "";
336
337	my $j = 0;
338	my @array = @{$self->[1]};
339	while ($j <= $#array)
340	{
341
342	# do the definition
343	my $i = 0;
344
345	#add the key word
346	$result = "<td halign=left>" . $array[$j] . "</td><td halign=right>";
347
348	#add the proceeding words
349	while ($i < $j)
350	{
351	$result = $result . $array[$i] . " ";
352	$i++;
353	}
354	#add the key word
355	$result = $result . "</td><td halign=left>" . $array[$j] .
356	"</td><td halign=left>";
357
358	#add the trailing words
359	$i++;
360	while ($i <= $#array )
361	{
362	$result = $result . $array[$i] . " ";
363	$i++;
364	}
365
366	#add the actual acronym
367
368	$result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
369
370	push @list, $result;
371	$j++;
372	}
373	return @list;
374	}
375
376	#this is the one used when building the collection ...
377	sub to_string {
378	my $self = shift (@_);
379
380	my $result = $self->[0] . " ";
381
382	# do the definition
383	my @array = @{$self->[1]};
384	my $i = 0;
385	while ($i <= $#array)
386	{
387	$result = $result . $array[$i];
388	if ($i+1 <= $#array)
389	{
390	$result = $result . " ";
391	}
392	$i++;
393	}
394	return $result;
395	}
396
397	sub check {
398	my $self = shift (@_);
399
400	if (length($self->to_acronym()) < $min_acro_length)
401	{
402	# print "acronym " . $self->to_string() . " rejected (too short I)\n";
403	return 0;
404	}
405	if ($self->words_in_acronym_definition() < $min_def_length)
406	{
407	# print "acronym " . $self->to_string() . " rejected (too short II)\n";
408	return 0;
409	}
410	if ($min_length_saving * $self->letters_in_acronym() >
411	$self->letters_in_acronym_definition())
412	{
413	# print "acronym " . $self->to_string() . " rejected (too short III)\n";
414	# print "" . $min_length_saving .
415	# "\|" . $self->letters_in_acronym() .
416	# "\|" . $self->letters_in_acronym_definition() . "\n";
417	return 0;
418	}
419	# print "acronym " . $self->to_string() . " not rejected\n";
420	return 1;
421	}
422
423	###########################################################################
424	# static functions
425	###########################################################################
426
427	sub recurse {
428	my ($acro_offset, #offset of word we're finding acronyms for
429	$text_offset,
430	$letter_offset,
431	@def_so_far) = @_;
432
433	my $word = $split_text[$text_offset];
434	my $acro = $split_text[$acro_offset];
435	$word = "" if !defined $word;
436	$acro = "" if !defined $acro;
437
438	# print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
439	# $letter_offset . ", " . @def_so_far . ")\n";
440
441	#check for termination ...
442	if ($letter_offset >= length($acro))
443	{
444	my $acronym = new acronym();
445	$acronym->[0] = $acro;
446	push @{$acronym->[1]}, @def_so_far;
447	if ($acronym->check())
448	{
449	push @acronym_list, ( $acronym );
450	}
451	# print "acronym created\n";
452	return;
453	}
454	#check for recursion
455	if (!$allow_recursive)
456	{
457	if ($word eq $acro)
458	{
459	# print "recursion detected\n";
460	return;
461	}
462	}
463
464	#skip a stop-word
465	my $i = 0;
466	if ($letter_offset != 0)
467	{
468	while ($i <= $#stop_words)
469	{
470	if ($stop_words[$i] eq uc($word))
471	{
472	# print "::found stop word::" . $stop_words[$i] . "\n";
473	&recurse($acro_offset,
474	$text_offset+1,
475	$letter_offset,
476	@def_so_far, $word);
477	}
478	$i++;
479	}
480	}
481	$i = 1;
482	#using the first $i letters ...
483	while ($letter_offset+$i <= length($acro) )
484	{
485	# print "". ((substr $word, 0, $i) . " " .
486	# (substr $acro, $letter_offset, $i) . "\n");
487	if (((!$case_match) &&
488	(uc(substr $word, 0, $i) eq
489	uc(substr $acro, $letter_offset, $i)))
490	\|\|
491	(($case_match) &&
492	((substr $word, 0, $i) eq
493	(substr $acro, $letter_offset, $i))))
494	{
495	# print "::match::\n";
496	# print "" . ((substr $word, 0, $i) . " " .
497	# (substr $acro, $letter_offset, $i) . "\n");
498	&recurse($acro_offset,
499	$text_offset+1,
500	$letter_offset+$i,
501	@def_so_far, $word);
502	} else {
503	return;
504	}
505	$i++;
506	}
507	return;
508	}
509
510
511	sub acronyms {
512	#clean up the text
513	my $processed_text = shift @_;
514	$$processed_text =~ s/[^A-Za-z]/ /g;
515	$$processed_text =~ s/\s+/ /g;
516
517	#clear some global variables
518	@split_text = ();
519	@acronym_list = ();
520
521	return &acronyms_from_clean_text($processed_text);
522	}
523
524	sub acronyms_from_clean_text {
525	my ($processed_text) = @_;
526
527	@split_text = split / /, $$processed_text;
528
529	# my $i = 0;
530	# while ($i <= $#split_text)
531	# {
532	# print $split_text[$i] . "\n";
533	# $i++;
534	# }
535
536	my $first = 0;
537	my $last = $#split_text +1;
538	my $acro_counter = $first;
539
540	while ($acro_counter < $last)
541	{
542	my $word = $split_text[$acro_counter];
543
544	if ((!$upper_case) \|\|
545	(uc($word) eq $word))
546	{
547
548	if (length $word >= $min_acro_length)
549	{
550	my $def_counter = 0;
551	if ($acro_counter - $max_offset > 0)
552	{
553	$def_counter = $acro_counter - $max_offset;
554	}
555	my $local_last = $acro_counter + $max_offset;
556	if ($local_last > $last)
557	{
558	$local_last = $last;
559	}
560	while ($def_counter <= $local_last)
561	{
562	&recurse($acro_counter,$def_counter,0,());
563	$def_counter++;
564	}
565	}
566	}
567	$acro_counter++;
568	}
569
570	return \@acronym_list;
571	}
572
573
574
575	sub test {
576
577	# my $blarg = new acronym();
578	# my $simple;
579	# $simple = 10;
580	# $blarg->initialise($simple, $simple, $simple);
581	# my $blarg2 = $blarg->clone();
582	# print $blarg->to_string();
583	# print $blarg2;
584	# print "\n";
585	#
586	my $tla = new acronym();
587	$tla->[0] = "TLA";
588
589	my @array = ("Three", "Letter", "Acronym");
590	# my $i = 0;
591	# while ($i <= $#array)
592	# {
593	# print @array[$i] . "\n";
594	# $i++;
595	# }
596
597	print "\n";
598	push @{$tla->[1]}, ("Three" );
599	push @{$tla->[1]}, ("Letter" );
600	push @{$tla->[1]}, ("Acronym" );
601	print $tla->to_string(). "\n";
602	print "\n";
603	print "\n";
604	my $tla2 = $tla->clone();
605	push @{$tla2->[1]}, ("One");
606	push @{$tla2->[1]}, ("Two");
607	$tla2->[0] = "ALT";
608	print $tla->to_string(). "\n";
609	print $tla2->to_string(). "\n";
610
611	print "\n";
612	print "\n";
613
614	print "Testing recursion ...\n";
615	my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
616
617	foreach my $acro (@$acros)
618	{
619	if ($acro->check)
620	{
621	print "accepted: " .$acro->to_string() . "\n";
622	# print "\|" . $acro->to_acronym() . "\|" . $acro->to_def_string() .
623	# "\|" . $acro->words_in_acronym_definition() .
624	# "\|" . $acro->letters_in_acronym_definition() .
625	# "\|" . $acro->letters_in_acronym() . "\|\n";
626	} else {
627	# print "found but rejected: " .$acro->to_string() . "\n";
628	}
629	}
630	}
631
632	#uncomment this line to test this package
633	#&test();
634
635	&init_acronyms();
636
637	1;
638
639

Note: See TracBrowser for help on using the repository browser.

Download in other formats: