source: trunk/gsdl/perllib/acronym.pm@ 2018

Last change on this file since 2018 was 1405, checked in by say1, 24 years ago

fixed acronym bugs

  • Property svn:keywords set to Author Date Id Revision
File size: 19.0 KB
Line 
1##########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# class to handle acronyms
26###########################################################################
27
28use strict;
29use diagnostics;
30
31package acronym;
32
33###########################################################################
34# global variables
35###########################################################################
36# valiables to control the recall/precision tradeoff
37
38#the maximum range to look for acronyms
39my $max_offset = 30;
40#acronyms must be upper case
41my $upper_case = 1;
42#acronym case must match
43my $case_match = 1;
44#minimum acronym length
45my $min_def_length = 3;
46#minimum acronym length
47my $min_acro_length = 3;
48#minimum acronym length saving
49my $min_length_saving = 4;
50#allow recusive acronyms
51my $allow_recursive = 0;
52#let definitions be all capitals
53my $allow_all_caps = 0;
54
55my @stop_words = split / /, "OF AT THE IN TO AND";
56
57#the text split into an array, one word per element
58my @split_text = ();
59my @acronym_list = ();
60
61#the file to collate acronyms into
62my $acronym_accumulate_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_definitions.pm";
63my $acronym_options_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_options.pm";
64
65my %acronyms_found_in_collection = ();
66my %acronyms_banned_from_collection = ();
67
68my $writing_acronyms = 1;
69my $accumulate_acronyms = 1;
70my $markup_accumulate_acronyms = 1;
71my $markup_local_acronyms = 1;
72
73
74
75###########################################################################
76# file saving / loading stuff
77###########################################################################
78
79sub initialise_acronyms {
80
81 my $local_max_offset = $max_offset;
82 my $local_upper_case = $upper_case;
83 my $local_case_match = $case_match ;
84 my $local_min_def_length = $min_def_length;
85 my $local_min_acro_length = $min_acro_length;
86 my $local_min_length_saving = $min_length_saving;
87 my $local_allow_recursive = $allow_recursive;
88 my $local_allow_all_caps = $allow_all_caps;
89 my @local_stop_words = @stop_words;
90 my $local_acronym_accumulate_file = $acronym_accumulate_file;
91
92
93 my $file_text = "";
94 if (open ACRONYM_HANDLE, "<$acronym_options_file")
95 {
96 $file_text = do { local $/; <ACRONYM_HANDLE> };
97 }
98 if ($file_text eq "")
99 {
100 print STDERR "failed to open $acronym_options_file\n";
101 open ACRONYM_HANDLE, ">$acronym_options_file\n";
102 print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
103 print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
104 print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
105 print ACRONYM_HANDLE "#valid here.\n\n";
106 print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
107 print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
108 print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
109 print ACRONYM_HANDLE "#explain them fully \n\n";
110 print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
111 print ACRONYM_HANDLE "\$local_max_offset = $max_offset;\n\n";
112 print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true (high precision))\n";
113 print ACRONYM_HANDLE "\$local_upper_case = $upper_case;\n\n";
114 print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true (high precision))\n";
115 print ACRONYM_HANDLE "\$local_case_match = $case_match;\n\n";
116 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
117 print ACRONYM_HANDLE "\$local_min_def_length = $min_def_length;\n\n";
118 print ACRONYM_HANDLE "#let definitions be all capitals\n";
119 print ACRONYM_HANDLE "\$local_allow_all_caps = $allow_all_caps;\n\n";
120 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
121 print ACRONYM_HANDLE "\$local_min_acro_length = 3;\n\n";
122 print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
123 print ACRONYM_HANDLE "\$local_min_length_saving = 4;\n\n";
124 print ACRONYM_HANDLE "#allow recusive acronyms (0 = false (high precision), 1 = true)\n";
125 print ACRONYM_HANDLE "\$local_allow_recursive = 0;\n\n";
126 print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
127 print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
128 print ACRONYM_HANDLE "#\@local_stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
129 print ACRONYM_HANDLE "\@local_stop_words = split / /, \"OF AT THE IN TO AND\";\n";
130 print ACRONYM_HANDLE "\n";
131 print ACRONYM_HANDLE "#the file to collate acronyms into\n";
132 print ACRONYM_HANDLE "\$local_acronym_accumulate_file = \$ENV{'GSDLCOLLECTDIR'} . \"/etc/acronym_definitions.pm\";\n";
133 print ACRONYM_HANDLE "\n";
134 print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
135 print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
136 print ACRONYM_HANDLE "# \n";
137 print ACRONYM_HANDLE "# \n";
138 print ACRONYM_HANDLE "# \n";
139 print STDERR "written new options file to $acronym_options_file...\n";
140 } else {
141 print STDERR "read file $acronym_options_file...\n";
142 eval $file_text ;
143 warn $@ if $@;
144 print STDERR "evaluated file $acronym_options_file...\n";
145 }
146
147 $max_offset = $local_max_offset;
148 $upper_case = $local_upper_case;
149 $case_match = $local_case_match ;
150 $min_def_length = $local_min_def_length;
151 $min_acro_length = $local_min_acro_length;
152 $min_length_saving = $local_min_length_saving;
153 $allow_recursive = $local_allow_recursive;
154 $allow_all_caps = $local_allow_all_caps;
155 @stop_words = @local_stop_words;
156
157 $local_acronym_accumulate_file = $local_acronym_accumulate_file;
158
159
160 &read_all_acronyms_from_file();
161# rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
162# int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
163 if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
164 {
165 print ACRONYM_HANDLE "#This is an automatically generated file.\n";
166 print ACRONYM_HANDLE "#\n";
167 print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
168 print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
169 print ACRONYM_HANDLE "#read-only. \n";
170 print ACRONYM_HANDLE "#\n";
171 print ACRONYM_HANDLE "#start of acronyms...\n";
172 $writing_acronyms = 1;
173 } else {
174 warn "failed to open $acronym_accumulate_file for writing\n";
175 $writing_acronyms = 0;
176 }
177}
178
179#close the list of accumulated acronyms
180sub finalise_acronyms {
181 if ($writing_acronyms)
182 {
183 print ACRONYM_HANDLE "#end of acronyms.\n";
184 close ACRONYM_HANDLE;
185 }
186}
187
188#eval a file of accumulated acronyms
189sub read_all_acronyms_from_file {
190
191 my $file_text = " ";
192 if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
193 {
194 $file_text = do { local $/; <ACRONYM_HANDLE> };
195 } else {
196 print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
197 }
198 eval $file_text;
199 #promotes warnings/errors from evaluated file to current context
200 warn $@ if $@;
201}
202
203#called from within the file of accumulated acronyms to indicate a good acronym
204sub add {
205 my $self = shift (@_);
206 if (defined ($acronyms_found_in_collection{$self->[0]}))
207 {
208 my $def = $self->to_def_string();
209 if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/)
210 {
211 return;
212 }
213 $acronyms_found_in_collection{$self->[0]} =
214 $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string();
215 } else {
216 $acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
217 }
218}
219
220#called from within the file of accumulated acronyms to indicate a bad acronym
221sub ban {
222 my $self = shift (@_);
223
224 if (!defined $acronyms_banned_from_collection{$self->[0]})
225 {
226 $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
227 } else {
228 $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string();
229 }
230}
231
232
233#write a good acronym to the accumulated acronyms file
234sub write_to_file {
235 my $self = shift (@_);
236 if ($writing_acronyms)
237 {
238 print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
239 $self->to_def_string() .
240 "\")->add();\n";
241 }
242}
243
244
245###########################################################################
246# mark functionality
247###########################################################################
248
249#small routine to sort by length
250sub sort_by_length {
251 length($b) <=> length($a) or $a cmp $b
252}
253
254sub markup_acronyms {
255 my $text = shift (@_);
256 my $verbosity_obj = shift (@_);
257 if (defined $text)
258 {
259 for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
260 {
261 $text =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
262 printf STDERR " " . $acro . ","
263 if ($verbosity_obj->{'verbosity'} >= 2);
264 }
265 }
266 return $text;
267}
268
269
270
271###########################################################################
272# member functions
273###########################################################################
274
275
276sub new {
277 my $trash = shift (@_);
278 my $acro = shift (@_);
279 my $def = shift (@_);
280
281 my $self = [
282 "", # 0 acronym
283 [], # 1 definition
284 ];
285
286 $self->[0] = $acro if defined $acro;
287 push @{$self->[1]}, split / /, $def if defined $def;
288
289 bless $self;
290}
291
292sub clone {
293 my $self = shift (@_);
294
295 my $copy = new acronym;
296
297 $copy->[0] = $self->[0];
298 push @{$copy->[1]}, @{$self->[1]};
299 bless $copy;
300
301 return $copy;
302}
303
304#return the acronym
305sub to_acronym {
306 my $self = shift (@_);
307 my @array = @{$self->[1]};
308
309 return $self->[0];
310}
311
312#return the number of words in the acronym definition
313sub words_in_acronym_definition {
314 my $self = shift (@_);
315 my @array = @{$self->[1]};
316
317 return $#array + 1;
318}
319
320#return the number of letters in the acronym definition
321sub letters_in_acronym_definition {
322 my $self = shift (@_);
323
324 return length($self->to_def_string());
325}
326
327#return the number of letters in the acronym definition
328sub letters_in_acronym {
329 my $self = shift (@_);
330
331 return length($self->to_acronym());
332}
333
334#return the acronym definition
335sub to_def_string {
336 my $self = shift (@_);
337
338 my $result = "";
339
340 # do the definition
341 my @array = @{$self->[1]};
342 my $i = 0;
343 while ($i <= $#array)
344 {
345 $result = $result . $array[$i];
346
347 if ($i+1 <= $#array)
348 {
349 $result = $result . " ";
350 }
351 $i++;
352 }
353 return $result;
354}
355
356
357#print out the kwic for the acronym
358sub to_string_kwic {
359 my $self = shift (@_);
360
361 # the list of all possible combinations
362 my @list = ();
363
364 my $result = "";
365
366 my $j = 0;
367 my @array = @{$self->[1]};
368 while ($j <= $#array)
369 {
370
371 # do the definition
372 my $i = 0;
373
374 #add the key word
375 $result = "<td halign=left>" . $array[$j] . "</td><td halign=right>";
376
377 #add the proceeding words
378 while ($i < $j)
379 {
380 $result = $result . $array[$i] . " ";
381 $i++;
382 }
383 #add the key word
384 $result = $result . "</td><td halign=left>" . $array[$j] .
385 "</td><td halign=left>";
386
387 #add the trailing words
388 $i++;
389 while ($i <= $#array )
390 {
391 $result = $result . $array[$i] . " ";
392 $i++;
393 }
394
395 #add the actual acronym
396
397 $result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
398
399 push @list, $result;
400 $j++;
401 }
402 return @list;
403}
404
405#this is the one used when building the collection ...
406sub to_string {
407 my $self = shift (@_);
408
409 my $result = $self->[0] . " ";
410
411 # do the definition
412 my @array = @{$self->[1]};
413 my $i = 0;
414 while ($i <= $#array)
415 {
416 $result = $result . $array[$i];
417 if ($i+1 <= $#array)
418 {
419 $result = $result . " ";
420 }
421 $i++;
422 }
423 return $result;
424}
425
426sub check {
427 my $self = shift (@_);
428
429 if (length($self->to_acronym()) < $min_acro_length)
430 {
431# print "acronym " . $self->to_string() . " rejected (too short I)\n";
432 return 0;
433 }
434 if ($self->words_in_acronym_definition() < $min_def_length)
435 {
436# print "acronym " . $self->to_string() . " rejected (too short II)\n";
437 return 0;
438 }
439 if ($min_length_saving * $self->letters_in_acronym() >
440 $self->letters_in_acronym_definition())
441 {
442# print "acronym " . $self->to_string() . " rejected (too short III)\n";
443# print "" . $min_length_saving .
444# "|" . $self->letters_in_acronym() .
445# "|" . $self->letters_in_acronym_definition() . "\n";
446 return 0;
447 }
448 if (!$allow_all_caps &&
449 $self->to_def_string() eq uc($self->to_def_string()))
450 {
451# print "acronym " . $self->to_string() . " rejected (all upper)\n";
452 return 0;
453 }
454 if (!$allow_all_caps)
455 {
456 my $upper_count = 0;
457 my $lower_count = 0;
458 my @letters = $self->to_def_string();
459 for my $letter (split //, $self->to_def_string())
460 {
461 if ($letter eq uc($letter))
462 {
463 $upper_count++;
464 } else {
465 $lower_count++;
466 }
467 }
468 return 0 if ($upper_count > $lower_count);
469 }
470 if (!$allow_recursive && $self->to_def_string() =~ /$self->to_acronym()/i )
471 {
472 return 0;
473 }
474# print "acronym " . $self->to_string() . " not rejected\n";
475 return 1;
476}
477
478###########################################################################
479# static functions
480###########################################################################
481
482sub recurse {
483 my ($acro_offset, #offset of word we're finding acronyms for
484 $text_offset,
485 $letter_offset,
486 @def_so_far) = @_;
487
488 my $word = $split_text[$text_offset];
489 my $acro = $split_text[$acro_offset];
490 $word = "" if !defined $word;
491 $acro = "" if !defined $acro;
492
493# print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
494# $letter_offset . ", " . @def_so_far . ")\n";
495
496 #check for termination ...
497 if ($letter_offset >= length($acro))
498 {
499 my $acronym = new acronym();
500 $acronym->[0] = $acro;
501 push @{$acronym->[1]}, @def_so_far;
502 if ($acronym->check())
503 {
504 push @acronym_list, ( $acronym );
505 }
506# print "acronym created\n";
507 return;
508 }
509 #check for recursion
510 if (!$allow_recursive)
511 {
512 if ($word eq $acro)
513 {
514# print "recursion detected\n";
515 return;
516 }
517 }
518
519 #skip a stop-word
520 my $i = 0;
521 if ($letter_offset != 0)
522 {
523 while ($i <= $#stop_words)
524 {
525 if ($stop_words[$i] eq uc($word))
526 {
527# print "::found stop word::" . $stop_words[$i] . "\n";
528 &recurse($acro_offset,
529 $text_offset+1,
530 $letter_offset,
531 @def_so_far, $word);
532 }
533 $i++;
534 }
535 }
536 $i = 1;
537 #using the first $i letters ...
538 while ($letter_offset+$i <= length($acro) )
539 {
540# print "". ((substr $word, 0, $i) . " " .
541# (substr $acro, $letter_offset, $i) . "\n");
542 if (((!$case_match) &&
543 (uc(substr $word, 0, $i) eq
544 uc(substr $acro, $letter_offset, $i)))
545 ||
546 (($case_match) &&
547 ((substr $word, 0, $i) eq
548 (substr $acro, $letter_offset, $i))))
549 {
550# print "::match::\n";
551# print "" . ((substr $word, 0, $i) . " " .
552# (substr $acro, $letter_offset, $i) . "\n");
553 &recurse($acro_offset,
554 $text_offset+1,
555 $letter_offset+$i,
556 @def_so_far, $word);
557 } else {
558 return;
559 }
560 $i++;
561 }
562 return;
563}
564
565#the main
566sub acronyms {
567 #clean up the text
568 my $processed_text = shift @_;
569 $$processed_text =~ s/<[^>]*>/ /g;
570 $$processed_text =~ s/\W/ /g;
571 $$processed_text =~ s/[0-9_]/ /g;
572 $$processed_text =~ s/\s+/ /g;
573 $$processed_text =~ s/(\n|\>)References.*/ /i;
574 $$processed_text =~ s/(\n|\>)Bibliography.*/ /i;
575 $$processed_text =~ s/(\n|\>)(Cited Works?).*/ /i;
576 $$processed_text =~ s/(\n|\>)(Works? Cited).*/ /i;
577
578 #clear some global variables
579 @split_text = ();
580 @acronym_list = ();
581
582 return &acronyms_from_clean_text($processed_text);
583}
584
585sub acronyms_from_clean_text {
586 my ($processed_text) = @_;
587
588 @split_text = split / /, $$processed_text;
589
590# my $i = 0;
591# while ($i <= $#split_text)
592# {
593# print $split_text[$i] . "\n";
594# $i++;
595# }
596
597 my $first = 0;
598 my $last = $#split_text +1;
599 my $acro_counter = $first;
600
601 while ($acro_counter < $last)
602 {
603 my $word = $split_text[$acro_counter];
604
605 if ((!$upper_case) ||
606 (uc($word) eq $word))
607 {
608
609 if (length $word >= $min_acro_length)
610 {
611 my $def_counter = 0;
612 if ($acro_counter - $max_offset > 0)
613 {
614 $def_counter = $acro_counter - $max_offset;
615 }
616 my $local_last = $acro_counter + $max_offset;
617 if ($local_last > $last)
618 {
619 $local_last = $last;
620 }
621 while ($def_counter <= $local_last)
622 {
623 &recurse($acro_counter,$def_counter,0,());
624 $def_counter++;
625 }
626 }
627 }
628 $acro_counter++;
629 }
630
631 return \@acronym_list;
632}
633
634
635
636sub test {
637
638# my $blarg = new acronym();
639# my $simple;
640# $simple = 10;
641# $blarg->initialise($simple, $simple, $simple);
642# my $blarg2 = $blarg->clone();
643# print $blarg->to_string();
644# print $blarg2;
645# print "\n";
646#
647 my $tla = new acronym();
648 $tla->[0] = "TLA";
649
650 my @array = ("Three", "Letter", "Acronym");
651# my $i = 0;
652# while ($i <= $#array)
653# {
654# print @array[$i] . "\n";
655# $i++;
656# }
657
658 print "\n";
659 push @{$tla->[1]}, ("Three" );
660 push @{$tla->[1]}, ("Letter" );
661 push @{$tla->[1]}, ("Acronym" );
662 print $tla->to_string(). "\n";
663 print "\n";
664 print "\n";
665 my $tla2 = $tla->clone();
666 push @{$tla2->[1]}, ("One");
667 push @{$tla2->[1]}, ("Two");
668 $tla2->[0] = "ALT";
669 print $tla->to_string(). "\n";
670 print $tla2->to_string(). "\n";
671
672 print "\n";
673 print "\n";
674
675 print "Testing recursion ...\n";
676 my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
677
678 foreach my $acro (@$acros)
679 {
680 if ($acro->check)
681 {
682 print "accepted: " .$acro->to_string() . "\n";
683# print "|" . $acro->to_acronym() . "|" . $acro->to_def_string() .
684# "|" . $acro->words_in_acronym_definition() .
685# "|" . $acro->letters_in_acronym_definition() .
686# "|" . $acro->letters_in_acronym() . "|\n";
687 } else {
688# print "found but rejected: " .$acro->to_string() . "\n";
689 }
690 }
691}
692
693#uncomment this line to test this package
694#&test();
695
6961;
697
698
Note: See TracBrowser for help on using the repository browser.