source: trunk/gsdl/perllib/acronym.pm@ 1404

Last change on this file since 1404 was 1404, checked in by say1, 24 years ago

fixed acronyms option file. trimmed text at start of bibliographies to prevent emphermeral acronyms. tightened allow_all_caps code

  • Property svn:keywords set to Author Date Id Revision
File size: 18.9 KB
Line 
1##########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# class to handle acronyms
26###########################################################################
27
28use strict;
29use diagnostics;
30
31package acronym;
32
33###########################################################################
34# global variables
35###########################################################################
36# valiables to control the recall/precision tradeoff
37
38#the maximum range to look for acronyms
39my $max_offset = 30;
40#acronyms must be upper case
41my $upper_case = 1;
42#acronym case must match
43my $case_match = 1;
44#minimum acronym length
45my $min_def_length = 3;
46#minimum acronym length
47my $min_acro_length = 3;
48#minimum acronym length saving
49my $min_length_saving = 4;
50#allow recusive acronyms
51my $allow_recursive = 0;
52#let definitions be all capitals
53my $allow_all_caps = 0;
54
55my @stop_words = split / /, "OF AT THE IN TO AND";
56
57#the text split into an array, one word per element
58my @split_text = ();
59my @acronym_list = ();
60
61#the file to collate acronyms into
62my $acronym_accumulate_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_definitions.pm";
63my $acronym_options_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_options.pm";
64
65my %acronyms_found_in_collection = ();
66my %acronyms_banned_from_collection = ();
67
68my $writing_acronyms = 1;
69my $accumulate_acronyms = 1;
70my $markup_accumulate_acronyms = 1;
71my $markup_local_acronyms = 1;
72
73
74
75###########################################################################
76# file saving / loading stuff
77###########################################################################
78
79sub initialise_acronyms {
80
81 my $local_max_offset = $max_offset;
82 my $local_upper_case = $upper_case;
83 my $local_case_match = $case_match ;
84 my $local_min_def_length = $min_def_length;
85 my $local_min_acro_length = $min_acro_length;
86 my $local_min_length_saving = $min_length_saving;
87 my $local_allow_recursive = $allow_recursive;
88 my $local_allow_all_caps = $allow_all_caps;
89 my @local_stop_words = @stop_words;
90 my $local_acronym_accumulate_file = $acronym_accumulate_file;
91
92
93 my $file_text = "";
94 if (open ACRONYM_HANDLE, "<$acronym_options_file")
95 {
96 $file_text = do { local $/; <ACRONYM_HANDLE> };
97 }
98 if ($file_text eq "")
99 {
100 print STDERR "failed to open $acronym_options_file\n";
101 open ACRONYM_HANDLE, ">$acronym_options_file\n";
102 print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
103 print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
104 print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
105 print ACRONYM_HANDLE "#valid here.\n\n";
106 print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
107 print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
108 print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
109 print ACRONYM_HANDLE "#explain them fully \n\n";
110 print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
111 print ACRONYM_HANDLE "\$local_max_offset = $max_offset;\n\n";
112 print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true (high precision))\n";
113 print ACRONYM_HANDLE "\$local_upper_case = $upper_case;\n\n";
114 print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true (high precision))\n";
115 print ACRONYM_HANDLE "\$local_case_match = $case_match;\n\n";
116 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
117 print ACRONYM_HANDLE "\$local_min_def_length = $min_def_length;\n\n";
118 print ACRONYM_HANDLE "#let definitions be all capitals\n";
119 print ACRONYM_HANDLE "\$local_allow_all_caps = $allow_all_caps;\n\n";
120 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
121 print ACRONYM_HANDLE "\$local_min_acro_length = 3;\n\n";
122 print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
123 print ACRONYM_HANDLE "\$local_min_length_saving = 4;\n\n";
124 print ACRONYM_HANDLE "#allow recusive acronyms (0 = false (high precision), 1 = true)\n";
125 print ACRONYM_HANDLE "\$local_allow_recursive = 0;\n\n";
126 print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
127 print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
128 print ACRONYM_HANDLE "#\@local_stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
129 print ACRONYM_HANDLE "\@local_stop_words = split / /, \"OF AT THE IN TO AND\";\n";
130 print ACRONYM_HANDLE "\n";
131 print ACRONYM_HANDLE "#the file to collate acronyms into\n";
132 print ACRONYM_HANDLE "\$local_acronym_accumulate_file = \$ENV{'GSDLCOLLECTDIR'} . \"/etc/acronym_definitions.pm\";\n";
133 print ACRONYM_HANDLE "\n";
134 print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
135 print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
136 print ACRONYM_HANDLE "# \n";
137 print ACRONYM_HANDLE "# \n";
138 print ACRONYM_HANDLE "# \n";
139 print STDERR "written new options file to $acronym_options_file...\n";
140 } else {
141 print STDERR "read file $acronym_options_file...\n";
142 eval $file_text ;
143 warn $@ if $@;
144 print STDERR "evaluated file $acronym_options_file...\n";
145 }
146
147
148 $max_offset = $local_max_offset;
149 $upper_case = $local_upper_case;
150 $case_match = $local_case_match ;
151 $min_def_length = $local_min_def_length;
152 $min_acro_length = $local_min_acro_length;
153 $min_length_saving = $local_min_length_saving;
154 $allow_recursive = $local_allow_recursive;
155 $allow_all_caps = $local_allow_all_caps;
156 @stop_words = @local_stop_words;
157
158$local_acronym_accumulate_file = $local_acronym_accumulate_file;
159
160 &read_all_acronyms_from_file();
161# rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
162# int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
163 if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
164 {
165 print ACRONYM_HANDLE "#This is an automatically generated file.\n";
166 print ACRONYM_HANDLE "#\n";
167 print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
168 print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
169 print ACRONYM_HANDLE "#read-only. \n";
170 print ACRONYM_HANDLE "#\n";
171 print ACRONYM_HANDLE "#start of acronyms...\n";
172 $writing_acronyms = 1;
173 } else {
174 warn "failed to open $acronym_accumulate_file for writing\n";
175 $writing_acronyms = 0;
176 }
177}
178
179#close the list of accumulated acronyms
180sub finalise_acronyms {
181 if ($writing_acronyms)
182 {
183 print ACRONYM_HANDLE "#end of acronyms.\n";
184 close ACRONYM_HANDLE;
185 }
186}
187
188#eval a file of accumulated acronyms
189sub read_all_acronyms_from_file {
190
191 my $file_text = " ";
192 if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
193 {
194 $file_text = do { local $/; <ACRONYM_HANDLE> };
195 } else {
196 print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
197 }
198 eval $file_text;
199 #promotes warnings/errors from evaluated file to current context
200 warn $@ if $@;
201}
202
203#called from within the file of accumulated acronyms to indicate a good acronym
204sub add {
205 my $self = shift (@_);
206 if (defined ($acronyms_found_in_collection{$self->[0]}))
207 {
208 my $def = $self->to_def_string();
209 if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/)
210 {
211 return;
212 }
213 $acronyms_found_in_collection{$self->[0]} =
214 $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string();
215 } else {
216 $acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
217 }
218}
219
220#called from within the file of accumulated acronyms to indicate a bad acronym
221sub ban {
222 my $self = shift (@_);
223
224 if (!defined $acronyms_banned_from_collection{$self->[0]})
225 {
226 $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
227 } else {
228 $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string();
229 }
230}
231
232
233#write a good acronym to the accumulated acronyms file
234sub write_to_file {
235 my $self = shift (@_);
236 if ($writing_acronyms)
237 {
238 print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
239 $self->to_def_string() .
240 "\")->add();\n";
241 }
242}
243
244
245###########################################################################
246# mark functionality
247###########################################################################
248
249#small routine to sort by length
250sub sort_by_length {
251 length($b) <=> length($a) or $a cmp $b
252}
253
254sub markup_acronyms {
255 my $text = shift (@_);
256 my $verbosity_obj = shift (@_);
257 if (defined $text)
258 {
259 for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
260 {
261 $text =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
262 printf STDERR " " . $acro . ","
263 if ($verbosity_obj->{'verbosity'} >= 2);
264 }
265 }
266 return $text;
267}
268
269
270
271###########################################################################
272# member functions
273###########################################################################
274
275
276sub new {
277 my $trash = shift (@_);
278 my $acro = shift (@_);
279 my $def = shift (@_);
280
281 my $self = [
282 "", # 0 acronym
283 [], # 1 definition
284 ];
285
286 $self->[0] = $acro if defined $acro;
287 push @{$self->[1]}, split / /, $def if defined $def;
288
289 bless $self;
290}
291
292sub clone {
293 my $self = shift (@_);
294
295 my $copy = new acronym;
296
297 $copy->[0] = $self->[0];
298 push @{$copy->[1]}, @{$self->[1]};
299 bless $copy;
300
301 return $copy;
302}
303
304#return the acronym
305sub to_acronym {
306 my $self = shift (@_);
307 my @array = @{$self->[1]};
308
309 return $self->[0];
310}
311
312#return the number of words in the acronym definition
313sub words_in_acronym_definition {
314 my $self = shift (@_);
315 my @array = @{$self->[1]};
316
317 return $#array + 1;
318}
319
320#return the number of letters in the acronym definition
321sub letters_in_acronym_definition {
322 my $self = shift (@_);
323
324 return length($self->to_def_string());
325}
326
327#return the number of letters in the acronym definition
328sub letters_in_acronym {
329 my $self = shift (@_);
330
331 return length($self->to_acronym());
332}
333
334#return the acronym definition
335sub to_def_string {
336 my $self = shift (@_);
337
338 my $result = "";
339
340 # do the definition
341 my @array = @{$self->[1]};
342 my $i = 0;
343 while ($i <= $#array)
344 {
345 $result = $result . $array[$i];
346
347 if ($i+1 <= $#array)
348 {
349 $result = $result . " ";
350 }
351 $i++;
352 }
353 return $result;
354}
355
356
357#print out the kwic for the acronym
358sub to_string_kwic {
359 my $self = shift (@_);
360
361 # the list of all possible combinations
362 my @list = ();
363
364 my $result = "";
365
366 my $j = 0;
367 my @array = @{$self->[1]};
368 while ($j <= $#array)
369 {
370
371 # do the definition
372 my $i = 0;
373
374 #add the key word
375 $result = "<td halign=left>" . $array[$j] . "</td><td halign=right>";
376
377 #add the proceeding words
378 while ($i < $j)
379 {
380 $result = $result . $array[$i] . " ";
381 $i++;
382 }
383 #add the key word
384 $result = $result . "</td><td halign=left>" . $array[$j] .
385 "</td><td halign=left>";
386
387 #add the trailing words
388 $i++;
389 while ($i <= $#array )
390 {
391 $result = $result . $array[$i] . " ";
392 $i++;
393 }
394
395 #add the actual acronym
396
397 $result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
398
399 push @list, $result;
400 $j++;
401 }
402 return @list;
403}
404
405#this is the one used when building the collection ...
406sub to_string {
407 my $self = shift (@_);
408
409 my $result = $self->[0] . " ";
410
411 # do the definition
412 my @array = @{$self->[1]};
413 my $i = 0;
414 while ($i <= $#array)
415 {
416 $result = $result . $array[$i];
417 if ($i+1 <= $#array)
418 {
419 $result = $result . " ";
420 }
421 $i++;
422 }
423 return $result;
424}
425
426sub check {
427 my $self = shift (@_);
428
429 if (length($self->to_acronym()) < $min_acro_length)
430 {
431# print "acronym " . $self->to_string() . " rejected (too short I)\n";
432 return 0;
433 }
434 if ($self->words_in_acronym_definition() < $min_def_length)
435 {
436# print "acronym " . $self->to_string() . " rejected (too short II)\n";
437 return 0;
438 }
439 if ($min_length_saving * $self->letters_in_acronym() >
440 $self->letters_in_acronym_definition())
441 {
442# print "acronym " . $self->to_string() . " rejected (too short III)\n";
443# print "" . $min_length_saving .
444# "|" . $self->letters_in_acronym() .
445# "|" . $self->letters_in_acronym_definition() . "\n";
446 return 0;
447 }
448 if (!$allow_all_caps &&
449 $self->to_def_string() eq uc($self->to_def_string()))
450 {
451# print "acronym " . $self->to_string() . " rejected (all upper)\n";
452 return 0;
453 }
454 if (!$allow_all_caps)
455 {
456 my $upper_count = 0;
457 my $lower_count = 0;
458 my @letters = $self->to_def_string();
459 for my $letter (split //, $self->to_def_string())
460 {
461 if ($letter eq uc($letter))
462 {
463 $upper_count++;
464 } else {
465 $lower_count++;
466 }
467 }
468 return 0 if ($upper_count > $lower_count);
469 }
470 if (!$allow_recursive && $self->to_def_string() =~ /$self->to_acronym()/i )
471 {
472 return 0;
473 }
474# print "acronym " . $self->to_string() . " not rejected\n";
475 return 1;
476}
477
478###########################################################################
479# static functions
480###########################################################################
481
482sub recurse {
483 my ($acro_offset, #offset of word we're finding acronyms for
484 $text_offset,
485 $letter_offset,
486 @def_so_far) = @_;
487
488 my $word = $split_text[$text_offset];
489 my $acro = $split_text[$acro_offset];
490 $word = "" if !defined $word;
491 $acro = "" if !defined $acro;
492
493# print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
494# $letter_offset . ", " . @def_so_far . ")\n";
495
496 #check for termination ...
497 if ($letter_offset >= length($acro))
498 {
499 my $acronym = new acronym();
500 $acronym->[0] = $acro;
501 push @{$acronym->[1]}, @def_so_far;
502 if ($acronym->check())
503 {
504 push @acronym_list, ( $acronym );
505 }
506# print "acronym created\n";
507 return;
508 }
509 #check for recursion
510 if (!$allow_recursive)
511 {
512 if ($word eq $acro)
513 {
514# print "recursion detected\n";
515 return;
516 }
517 }
518
519 #skip a stop-word
520 my $i = 0;
521 if ($letter_offset != 0)
522 {
523 while ($i <= $#stop_words)
524 {
525 if ($stop_words[$i] eq uc($word))
526 {
527# print "::found stop word::" . $stop_words[$i] . "\n";
528 &recurse($acro_offset,
529 $text_offset+1,
530 $letter_offset,
531 @def_so_far, $word);
532 }
533 $i++;
534 }
535 }
536 $i = 1;
537 #using the first $i letters ...
538 while ($letter_offset+$i <= length($acro) )
539 {
540# print "". ((substr $word, 0, $i) . " " .
541# (substr $acro, $letter_offset, $i) . "\n");
542 if (((!$case_match) &&
543 (uc(substr $word, 0, $i) eq
544 uc(substr $acro, $letter_offset, $i)))
545 ||
546 (($case_match) &&
547 ((substr $word, 0, $i) eq
548 (substr $acro, $letter_offset, $i))))
549 {
550# print "::match::\n";
551# print "" . ((substr $word, 0, $i) . " " .
552# (substr $acro, $letter_offset, $i) . "\n");
553 &recurse($acro_offset,
554 $text_offset+1,
555 $letter_offset+$i,
556 @def_so_far, $word);
557 } else {
558 return;
559 }
560 $i++;
561 }
562 return;
563}
564
565#the main
566sub acronyms {
567 #clean up the text
568 my $processed_text = shift @_;
569 $$processed_text =~ s/<[^>]*>/ /g;
570 $$processed_text =~ s/[^\w]/ /g;
571 $$processed_text =~ s/[0-9_]/ /g;
572 $$processed_text =~ s/\s+/ /g;
573 $$processed_text =~ s/(\n|\>)References.*/ /i;
574 $$processed_text =~ s/(\n|\>)Bibliography.*/ /i;
575
576 #clear some global variables
577 @split_text = ();
578 @acronym_list = ();
579
580 return &acronyms_from_clean_text($processed_text);
581}
582
583sub acronyms_from_clean_text {
584 my ($processed_text) = @_;
585
586 @split_text = split / /, $$processed_text;
587
588# my $i = 0;
589# while ($i <= $#split_text)
590# {
591# print $split_text[$i] . "\n";
592# $i++;
593# }
594
595 my $first = 0;
596 my $last = $#split_text +1;
597 my $acro_counter = $first;
598
599 while ($acro_counter < $last)
600 {
601 my $word = $split_text[$acro_counter];
602
603 if ((!$upper_case) ||
604 (uc($word) eq $word))
605 {
606
607 if (length $word >= $min_acro_length)
608 {
609 my $def_counter = 0;
610 if ($acro_counter - $max_offset > 0)
611 {
612 $def_counter = $acro_counter - $max_offset;
613 }
614 my $local_last = $acro_counter + $max_offset;
615 if ($local_last > $last)
616 {
617 $local_last = $last;
618 }
619 while ($def_counter <= $local_last)
620 {
621 &recurse($acro_counter,$def_counter,0,());
622 $def_counter++;
623 }
624 }
625 }
626 $acro_counter++;
627 }
628
629 return \@acronym_list;
630}
631
632
633
634sub test {
635
636# my $blarg = new acronym();
637# my $simple;
638# $simple = 10;
639# $blarg->initialise($simple, $simple, $simple);
640# my $blarg2 = $blarg->clone();
641# print $blarg->to_string();
642# print $blarg2;
643# print "\n";
644#
645 my $tla = new acronym();
646 $tla->[0] = "TLA";
647
648 my @array = ("Three", "Letter", "Acronym");
649# my $i = 0;
650# while ($i <= $#array)
651# {
652# print @array[$i] . "\n";
653# $i++;
654# }
655
656 print "\n";
657 push @{$tla->[1]}, ("Three" );
658 push @{$tla->[1]}, ("Letter" );
659 push @{$tla->[1]}, ("Acronym" );
660 print $tla->to_string(). "\n";
661 print "\n";
662 print "\n";
663 my $tla2 = $tla->clone();
664 push @{$tla2->[1]}, ("One");
665 push @{$tla2->[1]}, ("Two");
666 $tla2->[0] = "ALT";
667 print $tla->to_string(). "\n";
668 print $tla2->to_string(). "\n";
669
670 print "\n";
671 print "\n";
672
673 print "Testing recursion ...\n";
674 my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
675
676 foreach my $acro (@$acros)
677 {
678 if ($acro->check)
679 {
680 print "accepted: " .$acro->to_string() . "\n";
681# print "|" . $acro->to_acronym() . "|" . $acro->to_def_string() .
682# "|" . $acro->words_in_acronym_definition() .
683# "|" . $acro->letters_in_acronym_definition() .
684# "|" . $acro->letters_in_acronym() . "|\n";
685 } else {
686# print "found but rejected: " .$acro->to_string() . "\n";
687 }
688 }
689}
690
691#uncomment this line to test this package
692#&test();
693
6941;
695
696
Note: See TracBrowser for help on using the repository browser.