source: trunk/gsdl/perllib/acronym.pm@ 7645

Last change on this file since 7645 was 7645, checked in by jrm21, 20 years ago

don't fail if we can't load the diagnostics package.

  • Property svn:keywords set to Author Date Id Revision
File size: 19.0 KB
RevLine 
[1361]1##########################################################################
[1242]2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[1361]25# class to handle acronyms
26###########################################################################
[1242]27
[7645]28eval "require diagnostics"; # some perl distros (eg mac) don't have this
[1242]29
30package acronym;
31
[1361]32###########################################################################
33# global variables
34###########################################################################
35# valiables to control the recall/precision tradeoff
[1242]36
[1404]37#the maximum range to look for acronyms
[1361]38my $max_offset = 30;
39#acronyms must be upper case
40my $upper_case = 1;
41#acronym case must match
42my $case_match = 1;
43#minimum acronym length
44my $min_def_length = 3;
45#minimum acronym length
46my $min_acro_length = 3;
47#minimum acronym length saving
[1393]48my $min_length_saving = 4;
[1361]49#allow recusive acronyms
[1404]50my $allow_recursive = 0;
[1396]51#let definitions be all capitals
52my $allow_all_caps = 0;
[1361]53
[1393]54my @stop_words = split / /, "OF AT THE IN TO AND";
[1361]55
56#the text split into an array, one word per element
57my @split_text = ();
58my @acronym_list = ();
59
[1393]60#the file to collate acronyms into
61my $acronym_accumulate_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_definitions.pm";
62my $acronym_options_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_options.pm";
[1361]63
[1393]64my %acronyms_found_in_collection = ();
65my %acronyms_banned_from_collection = ();
66
67my $writing_acronyms = 1;
68my $accumulate_acronyms = 1;
69my $markup_accumulate_acronyms = 1;
70my $markup_local_acronyms = 1;
71
72
73
[1361]74###########################################################################
[1393]75# file saving / loading stuff
76###########################################################################
77
[1396]78sub initialise_acronyms {
[1404]79
80 my $local_max_offset = $max_offset;
81 my $local_upper_case = $upper_case;
82 my $local_case_match = $case_match ;
83 my $local_min_def_length = $min_def_length;
84 my $local_min_acro_length = $min_acro_length;
85 my $local_min_length_saving = $min_length_saving;
86 my $local_allow_recursive = $allow_recursive;
87 my $local_allow_all_caps = $allow_all_caps;
88 my @local_stop_words = @stop_words;
89 my $local_acronym_accumulate_file = $acronym_accumulate_file;
[1393]90
[1404]91
[1393]92 my $file_text = "";
93 if (open ACRONYM_HANDLE, "<$acronym_options_file")
94 {
95 $file_text = do { local $/; <ACRONYM_HANDLE> };
96 }
97 if ($file_text eq "")
98 {
99 print STDERR "failed to open $acronym_options_file\n";
100 open ACRONYM_HANDLE, ">$acronym_options_file\n";
101 print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
102 print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
103 print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
104 print ACRONYM_HANDLE "#valid here.\n\n";
105 print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
106 print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
107 print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
108 print ACRONYM_HANDLE "#explain them fully \n\n";
109 print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
[1404]110 print ACRONYM_HANDLE "\$local_max_offset = $max_offset;\n\n";
111 print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true (high precision))\n";
112 print ACRONYM_HANDLE "\$local_upper_case = $upper_case;\n\n";
113 print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true (high precision))\n";
114 print ACRONYM_HANDLE "\$local_case_match = $case_match;\n\n";
[1393]115 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
[1404]116 print ACRONYM_HANDLE "\$local_min_def_length = $min_def_length;\n\n";
[1396]117 print ACRONYM_HANDLE "#let definitions be all capitals\n";
[1404]118 print ACRONYM_HANDLE "\$local_allow_all_caps = $allow_all_caps;\n\n";
[1393]119 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
[1404]120 print ACRONYM_HANDLE "\$local_min_acro_length = 3;\n\n";
[1393]121 print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
[1404]122 print ACRONYM_HANDLE "\$local_min_length_saving = 4;\n\n";
123 print ACRONYM_HANDLE "#allow recusive acronyms (0 = false (high precision), 1 = true)\n";
124 print ACRONYM_HANDLE "\$local_allow_recursive = 0;\n\n";
[1393]125 print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
126 print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
[1404]127 print ACRONYM_HANDLE "#\@local_stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
128 print ACRONYM_HANDLE "\@local_stop_words = split / /, \"OF AT THE IN TO AND\";\n";
[1393]129 print ACRONYM_HANDLE "\n";
130 print ACRONYM_HANDLE "#the file to collate acronyms into\n";
[1404]131 print ACRONYM_HANDLE "\$local_acronym_accumulate_file = \$ENV{'GSDLCOLLECTDIR'} . \"/etc/acronym_definitions.pm\";\n";
[1393]132 print ACRONYM_HANDLE "\n";
133 print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
134 print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
135 print ACRONYM_HANDLE "# \n";
136 print ACRONYM_HANDLE "# \n";
137 print ACRONYM_HANDLE "# \n";
[1404]138 print STDERR "written new options file to $acronym_options_file...\n";
139 } else {
140 print STDERR "read file $acronym_options_file...\n";
141 eval $file_text ;
142 warn $@ if $@;
143 print STDERR "evaluated file $acronym_options_file...\n";
[1393]144 }
[1405]145
[1404]146 $max_offset = $local_max_offset;
147 $upper_case = $local_upper_case;
148 $case_match = $local_case_match ;
149 $min_def_length = $local_min_def_length;
150 $min_acro_length = $local_min_acro_length;
151 $min_length_saving = $local_min_length_saving;
152 $allow_recursive = $local_allow_recursive;
153 $allow_all_caps = $local_allow_all_caps;
154 @stop_words = @local_stop_words;
155
[1405]156 $local_acronym_accumulate_file = $local_acronym_accumulate_file;
[1404]157
[1405]158
[1393]159 &read_all_acronyms_from_file();
160# rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
161# int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
162 if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
163 {
164 print ACRONYM_HANDLE "#This is an automatically generated file.\n";
165 print ACRONYM_HANDLE "#\n";
166 print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
167 print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
168 print ACRONYM_HANDLE "#read-only. \n";
169 print ACRONYM_HANDLE "#\n";
170 print ACRONYM_HANDLE "#start of acronyms...\n";
171 $writing_acronyms = 1;
172 } else {
173 warn "failed to open $acronym_accumulate_file for writing\n";
174 $writing_acronyms = 0;
175 }
176}
177
178#close the list of accumulated acronyms
[1396]179sub finalise_acronyms {
[1393]180 if ($writing_acronyms)
181 {
182 print ACRONYM_HANDLE "#end of acronyms.\n";
183 close ACRONYM_HANDLE;
184 }
185}
186
187#eval a file of accumulated acronyms
188sub read_all_acronyms_from_file {
189
[1396]190 my $file_text = " ";
[1393]191 if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
192 {
193 $file_text = do { local $/; <ACRONYM_HANDLE> };
194 } else {
195 print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
196 }
197 eval $file_text;
198 #promotes warnings/errors from evaluated file to current context
199 warn $@ if $@;
200}
201
202#called from within the file of accumulated acronyms to indicate a good acronym
203sub add {
204 my $self = shift (@_);
205 if (defined ($acronyms_found_in_collection{$self->[0]}))
206 {
207 my $def = $self->to_def_string();
208 if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/)
209 {
210 return;
211 }
212 $acronyms_found_in_collection{$self->[0]} =
213 $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string();
214 } else {
215 $acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
216 }
217}
218
219#called from within the file of accumulated acronyms to indicate a bad acronym
220sub ban {
221 my $self = shift (@_);
222
223 if (!defined $acronyms_banned_from_collection{$self->[0]})
224 {
225 $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
226 } else {
227 $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string();
228 }
229}
230
231
232#write a good acronym to the accumulated acronyms file
233sub write_to_file {
234 my $self = shift (@_);
235 if ($writing_acronyms)
236 {
237 print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
238 $self->to_def_string() .
239 "\")->add();\n";
240 }
241}
242
243
244###########################################################################
245# mark functionality
246###########################################################################
247
248#small routine to sort by length
249sub sort_by_length {
250 length($b) <=> length($a) or $a cmp $b
251}
252
253sub markup_acronyms {
254 my $text = shift (@_);
255 my $verbosity_obj = shift (@_);
256 if (defined $text)
257 {
258 for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
259 {
260 $text =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
261 printf STDERR " " . $acro . ","
262 if ($verbosity_obj->{'verbosity'} >= 2);
263 }
264 }
265 return $text;
266}
267
268
269
270###########################################################################
[1361]271# member functions
272###########################################################################
273
274
[1242]275sub new {
[1393]276 my $trash = shift (@_);
277 my $acro = shift (@_);
278 my $def = shift (@_);
279
[1242]280 my $self = [
[1393]281 "", # 0 acronym
282 [], # 1 definition
283 ];
284
285 $self->[0] = $acro if defined $acro;
286 push @{$self->[1]}, split / /, $def if defined $def;
287
[1242]288 bless $self;
289}
290
291sub clone {
292 my $self = shift (@_);
293
294 my $copy = new acronym;
295
296 $copy->[0] = $self->[0];
297 push @{$copy->[1]}, @{$self->[1]};
[1361]298 bless $copy;
299
[1242]300 return $copy;
301}
302
[1361]303#return the acronym
304sub to_acronym {
305 my $self = shift (@_);
306 my @array = @{$self->[1]};
[1242]307
[1361]308 return $self->[0];
309}
310
311#return the number of words in the acronym definition
312sub words_in_acronym_definition {
313 my $self = shift (@_);
314 my @array = @{$self->[1]};
315
316 return $#array + 1;
317}
318
319#return the number of letters in the acronym definition
320sub letters_in_acronym_definition {
321 my $self = shift (@_);
322
323 return length($self->to_def_string());
324}
325
326#return the number of letters in the acronym definition
327sub letters_in_acronym {
328 my $self = shift (@_);
329
330 return length($self->to_acronym());
331}
332
333#return the acronym definition
334sub to_def_string {
335 my $self = shift (@_);
336
337 my $result = "";
338
339 # do the definition
340 my @array = @{$self->[1]};
341 my $i = 0;
342 while ($i <= $#array)
343 {
344 $result = $result . $array[$i];
345
346 if ($i+1 <= $#array)
347 {
348 $result = $result . " ";
349 }
350 $i++;
351 }
352 return $result;
353}
354
355
[1242]356#print out the kwic for the acronym
357sub to_string_kwic {
358 my $self = shift (@_);
359
360 # the list of all possible combinations
361 my @list = ();
362
363 my $result = "";
364
365 my $j = 0;
366 my @array = @{$self->[1]};
367 while ($j <= $#array)
368 {
369
370 # do the definition
371 my $i = 0;
[1361]372
373 #add the key word
374 $result = "<td halign=left>" . $array[$j] . "</td><td halign=right>";
375
376 #add the proceeding words
377 while ($i < $j)
[1242]378 {
[1361]379 $result = $result . $array[$i] . " ";
[1242]380 $i++;
381 }
[1361]382 #add the key word
383 $result = $result . "</td><td halign=left>" . $array[$j] .
384 "</td><td halign=left>";
[1242]385
[1361]386 #add the trailing words
387 $i++;
388 while ($i <= $#array )
389 {
390 $result = $result . $array[$i] . " ";
391 $i++;
392 }
393
394 #add the actual acronym
395
396 $result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
397
[1242]398 push @list, $result;
399 $j++;
400 }
401 return @list;
402}
403
404#this is the one used when building the collection ...
405sub to_string {
406 my $self = shift (@_);
407
408 my $result = $self->[0] . " ";
409
410 # do the definition
411 my @array = @{$self->[1]};
412 my $i = 0;
413 while ($i <= $#array)
414 {
[1361]415 $result = $result . $array[$i];
416 if ($i+1 <= $#array)
417 {
418 $result = $result . " ";
419 }
[1242]420 $i++;
421 }
422 return $result;
423}
424
[1361]425sub check {
426 my $self = shift (@_);
427
428 if (length($self->to_acronym()) < $min_acro_length)
429 {
430# print "acronym " . $self->to_string() . " rejected (too short I)\n";
431 return 0;
432 }
433 if ($self->words_in_acronym_definition() < $min_def_length)
434 {
435# print "acronym " . $self->to_string() . " rejected (too short II)\n";
436 return 0;
437 }
438 if ($min_length_saving * $self->letters_in_acronym() >
439 $self->letters_in_acronym_definition())
440 {
441# print "acronym " . $self->to_string() . " rejected (too short III)\n";
442# print "" . $min_length_saving .
[1393]443# "|" . $self->letters_in_acronym() .
444# "|" . $self->letters_in_acronym_definition() . "\n";
[1361]445 return 0;
446 }
[1396]447 if (!$allow_all_caps &&
448 $self->to_def_string() eq uc($self->to_def_string()))
449 {
450# print "acronym " . $self->to_string() . " rejected (all upper)\n";
451 return 0;
452 }
[1404]453 if (!$allow_all_caps)
454 {
455 my $upper_count = 0;
456 my $lower_count = 0;
457 my @letters = $self->to_def_string();
458 for my $letter (split //, $self->to_def_string())
459 {
460 if ($letter eq uc($letter))
461 {
462 $upper_count++;
463 } else {
464 $lower_count++;
465 }
466 }
467 return 0 if ($upper_count > $lower_count);
468 }
469 if (!$allow_recursive && $self->to_def_string() =~ /$self->to_acronym()/i )
470 {
471 return 0;
472 }
[1361]473# print "acronym " . $self->to_string() . " not rejected\n";
474 return 1;
475}
476
477###########################################################################
478# static functions
479###########################################################################
480
481sub recurse {
482 my ($acro_offset, #offset of word we're finding acronyms for
483 $text_offset,
484 $letter_offset,
485 @def_so_far) = @_;
486
487 my $word = $split_text[$text_offset];
488 my $acro = $split_text[$acro_offset];
489 $word = "" if !defined $word;
490 $acro = "" if !defined $acro;
491
492# print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
493# $letter_offset . ", " . @def_so_far . ")\n";
494
495 #check for termination ...
496 if ($letter_offset >= length($acro))
497 {
498 my $acronym = new acronym();
499 $acronym->[0] = $acro;
500 push @{$acronym->[1]}, @def_so_far;
501 if ($acronym->check())
502 {
503 push @acronym_list, ( $acronym );
504 }
505# print "acronym created\n";
506 return;
507 }
508 #check for recursion
509 if (!$allow_recursive)
510 {
511 if ($word eq $acro)
512 {
513# print "recursion detected\n";
514 return;
515 }
516 }
517
518 #skip a stop-word
519 my $i = 0;
520 if ($letter_offset != 0)
521 {
522 while ($i <= $#stop_words)
523 {
524 if ($stop_words[$i] eq uc($word))
525 {
526# print "::found stop word::" . $stop_words[$i] . "\n";
527 &recurse($acro_offset,
528 $text_offset+1,
529 $letter_offset,
530 @def_so_far, $word);
531 }
532 $i++;
533 }
534 }
535 $i = 1;
536 #using the first $i letters ...
537 while ($letter_offset+$i <= length($acro) )
538 {
539# print "". ((substr $word, 0, $i) . " " .
540# (substr $acro, $letter_offset, $i) . "\n");
541 if (((!$case_match) &&
542 (uc(substr $word, 0, $i) eq
543 uc(substr $acro, $letter_offset, $i)))
544 ||
545 (($case_match) &&
546 ((substr $word, 0, $i) eq
547 (substr $acro, $letter_offset, $i))))
548 {
549# print "::match::\n";
550# print "" . ((substr $word, 0, $i) . " " .
551# (substr $acro, $letter_offset, $i) . "\n");
552 &recurse($acro_offset,
553 $text_offset+1,
554 $letter_offset+$i,
555 @def_so_far, $word);
556 } else {
557 return;
558 }
559 $i++;
560 }
561 return;
562}
563
[1404]564#the main
[1242]565sub acronyms {
[1361]566 #clean up the text
[1242]567 my $processed_text = shift @_;
[1404]568 $$processed_text =~ s/<[^>]*>/ /g;
[1405]569 $$processed_text =~ s/\W/ /g;
[1404]570 $$processed_text =~ s/[0-9_]/ /g;
[1242]571 $$processed_text =~ s/\s+/ /g;
[1404]572 $$processed_text =~ s/(\n|\>)References.*/ /i;
573 $$processed_text =~ s/(\n|\>)Bibliography.*/ /i;
[1405]574 $$processed_text =~ s/(\n|\>)(Cited Works?).*/ /i;
575 $$processed_text =~ s/(\n|\>)(Works? Cited).*/ /i;
[1242]576
[1361]577 #clear some global variables
578 @split_text = ();
579 @acronym_list = ();
580
581 return &acronyms_from_clean_text($processed_text);
[1242]582}
583
584sub acronyms_from_clean_text {
585 my ($processed_text) = @_;
586
[1361]587 @split_text = split / /, $$processed_text;
[1242]588
589# my $i = 0;
[1361]590# while ($i <= $#split_text)
[1242]591# {
[1361]592# print $split_text[$i] . "\n";
[1242]593# $i++;
594# }
595
596 my $first = 0;
[1361]597 my $last = $#split_text +1;
[1242]598 my $acro_counter = $first;
599
600 while ($acro_counter < $last)
601 {
[1361]602 my $word = $split_text[$acro_counter];
[1242]603
[1361]604 if ((!$upper_case) ||
605 (uc($word) eq $word))
[1242]606 {
[1361]607
608 if (length $word >= $min_acro_length)
[1242]609 {
[1361]610 my $def_counter = 0;
611 if ($acro_counter - $max_offset > 0)
[1242]612 {
[1361]613 $def_counter = $acro_counter - $max_offset;
[1242]614 }
[1361]615 my $local_last = $acro_counter + $max_offset;
616 if ($local_last > $last)
[1242]617 {
[1361]618 $local_last = $last;
[1242]619 }
[1361]620 while ($def_counter <= $local_last)
621 {
622 &recurse($acro_counter,$def_counter,0,());
623 $def_counter++;
624 }
[1242]625 }
626 }
627 $acro_counter++;
628 }
629
[1361]630 return \@acronym_list;
[1242]631}
632
633
634
635sub test {
636
637# my $blarg = new acronym();
638# my $simple;
639# $simple = 10;
640# $blarg->initialise($simple, $simple, $simple);
641# my $blarg2 = $blarg->clone();
642# print $blarg->to_string();
643# print $blarg2;
644# print "\n";
645#
646 my $tla = new acronym();
647 $tla->[0] = "TLA";
648
649 my @array = ("Three", "Letter", "Acronym");
650# my $i = 0;
651# while ($i <= $#array)
652# {
653# print @array[$i] . "\n";
654# $i++;
655# }
656
657 print "\n";
658 push @{$tla->[1]}, ("Three" );
659 push @{$tla->[1]}, ("Letter" );
660 push @{$tla->[1]}, ("Acronym" );
661 print $tla->to_string(). "\n";
662 print "\n";
663 print "\n";
664 my $tla2 = $tla->clone();
665 push @{$tla2->[1]}, ("One");
666 push @{$tla2->[1]}, ("Two");
667 $tla2->[0] = "ALT";
668 print $tla->to_string(). "\n";
669 print $tla2->to_string(). "\n";
670
671 print "\n";
672 print "\n";
673
[1361]674 print "Testing recursion ...\n";
675 my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
676
677 foreach my $acro (@$acros)
678 {
679 if ($acro->check)
680 {
681 print "accepted: " .$acro->to_string() . "\n";
682# print "|" . $acro->to_acronym() . "|" . $acro->to_def_string() .
683# "|" . $acro->words_in_acronym_definition() .
684# "|" . $acro->letters_in_acronym_definition() .
685# "|" . $acro->letters_in_acronym() . "|\n";
686 } else {
687# print "found but rejected: " .$acro->to_string() . "\n";
688 }
689 }
[1242]690}
691
692#uncomment this line to test this package
693#&test();
694
6951;
[1361]696
697
Note: See TracBrowser for help on using the repository browser.