source: gsdl/trunk/perllib/acronym.pm@ 15117

Last change on this file since 15117 was 15117, checked in by ak19, 15 years ago

Dr Bainbridge made some changes to tidy up the code (replaced file global params with args and removed unnecessary statements)

  • Property svn:keywords set to Author Date Id Revision
File size: 19.1 KB
Line 
1##########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# class to handle acronyms
26###########################################################################
27
28eval "require diagnostics"; # some perl distros (eg mac) don't have this
29
30package acronym;
31
32###########################################################################
33# global variables
34###########################################################################
35# valiables to control the recall/precision tradeoff
36
37#the maximum range to look for acronyms
38my $max_offset = 30;
39#acronyms must be upper case
40my $upper_case = 1;
41#acronym case must match
42my $case_match = 1;
43#minimum acronym length
44my $min_def_length = 3;
45#minimum acronym length
46my $min_acro_length = 3;
47#minimum acronym length saving
48my $min_length_saving = 4;
49#allow recusive acronyms
50my $allow_recursive = 0;
51#let definitions be all capitals
52my $allow_all_caps = 0;
53
54my @stop_words = split / /, "OF AT THE IN TO AND";
55
56#the text split into an array, one word per element
57my @split_text = ();
58my @acronym_list = ();
59
60
61my %acronyms_found_in_collection = ();
62my %acronyms_banned_from_collection = ();
63
64my $writing_acronyms = 1;
65my $accumulate_acronyms = 1;
66my $markup_accumulate_acronyms = 1;
67my $markup_local_acronyms = 1;
68
69
70
71###########################################################################
72# file saving / loading stuff
73###########################################################################
74
75sub initialise_acronyms {
76
77 my $local_max_offset = $max_offset;
78 my $local_upper_case = $upper_case;
79 my $local_case_match = $case_match ;
80 my $local_min_def_length = $min_def_length;
81 my $local_min_acro_length = $min_acro_length;
82 my $local_min_length_saving = $min_length_saving;
83 my $local_allow_recursive = $allow_recursive;
84 my $local_allow_all_caps = $allow_all_caps;
85 my @local_stop_words = @stop_words;
86
87
88 # the file to collate acronyms into
89
90 my $def_acronym_acc_file_pm = '&util::filename_cat($ENV{\'GSDLCOLLECTDIR\'}, "etc","acronym_definitions.pm");';
91 my $acronym_accumulate_file = eval { $def_acronym_acc_file_pm };
92 my $acronym_options_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc","acronym_options.pm");
93
94
95 my $file_text = "";
96 if (open ACRONYM_HANDLE, "<$acronym_options_file")
97 {
98 $file_text = do { local $/; <ACRONYM_HANDLE> };
99 }
100 if ($file_text eq "")
101 {
102 print STDERR "failed to open $acronym_options_file\n";
103 open ACRONYM_HANDLE, ">$acronym_options_file\n";
104 print ACRONYM_HANDLE "use util;\n";
105 print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
106 print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
107 print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
108 print ACRONYM_HANDLE "#valid here.\n\n";
109 print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
110 print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
111 print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
112 print ACRONYM_HANDLE "#explain them fully \n\n";
113 print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
114 print ACRONYM_HANDLE "\$local_max_offset = $max_offset;\n\n";
115 print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true (high precision))\n";
116 print ACRONYM_HANDLE "\$local_upper_case = $upper_case;\n\n";
117 print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true (high precision))\n";
118 print ACRONYM_HANDLE "\$local_case_match = $case_match;\n\n";
119 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
120 print ACRONYM_HANDLE "\$local_min_def_length = $min_def_length;\n\n";
121 print ACRONYM_HANDLE "#let definitions be all capitals\n";
122 print ACRONYM_HANDLE "\$local_allow_all_caps = $allow_all_caps;\n\n";
123 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
124 print ACRONYM_HANDLE "\$local_min_acro_length = 3;\n\n";
125 print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
126 print ACRONYM_HANDLE "\$local_min_length_saving = 4;\n\n";
127 print ACRONYM_HANDLE "#allow recusive acronyms (0 = false (high precision), 1 = true)\n";
128 print ACRONYM_HANDLE "\$local_allow_recursive = 0;\n\n";
129 print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
130 print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
131 print ACRONYM_HANDLE "#\@local_stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
132 print ACRONYM_HANDLE "\@local_stop_words = split / /, \"OF AT THE IN TO AND\";\n";
133 print ACRONYM_HANDLE "\n";
134 print ACRONYM_HANDLE "#the file to collate acronyms into\n";
135 print ACRONYM_HANDLE "\$acronym_accumulate_file = $def_acronym_acc_file_pm\n";
136 print ACRONYM_HANDLE "\n";
137 print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
138 print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
139 print ACRONYM_HANDLE "# \n";
140 print ACRONYM_HANDLE "# \n";
141 print ACRONYM_HANDLE "# \n";
142 print STDERR "written new options file to $acronym_options_file...\n";
143 } else {
144 print STDERR "read file $acronym_options_file...\n";
145 eval $file_text ;
146 warn $@ if $@;
147 print STDERR "evaluated file $acronym_options_file...\n";
148 }
149
150 $max_offset = $local_max_offset;
151 $upper_case = $local_upper_case;
152 $case_match = $local_case_match ;
153 $min_def_length = $local_min_def_length;
154 $min_acro_length = $local_min_acro_length;
155 $min_length_saving = $local_min_length_saving;
156 $allow_recursive = $local_allow_recursive;
157 $allow_all_caps = $local_allow_all_caps;
158 @stop_words = @local_stop_words;
159
160
161 &read_all_acronyms_from_file($acronym_accumulate_file);
162# rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
163# int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
164 if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
165 {
166 print ACRONYM_HANDLE "#This is an automatically generated file.\n";
167 print ACRONYM_HANDLE "#\n";
168 print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
169 print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
170 print ACRONYM_HANDLE "#read-only. \n";
171 print ACRONYM_HANDLE "#\n";
172 print ACRONYM_HANDLE "#start of acronyms...\n";
173 $writing_acronyms = 1;
174 } else {
175 warn "failed to open $acronym_accumulate_file for writing\n";
176 $writing_acronyms = 0;
177 }
178}
179
180#close the list of accumulated acronyms
181sub finalise_acronyms {
182 if ($writing_acronyms)
183 {
184 print ACRONYM_HANDLE "#end of acronyms.\n";
185 close ACRONYM_HANDLE;
186 }
187}
188
189#eval a file of accumulated acronyms
190sub read_all_acronyms_from_file {
191 my ($acronym_accumulate_file) = @_;
192
193 my $file_text = " ";
194 if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
195 {
196 $file_text = do { local $/; <ACRONYM_HANDLE> };
197 } else {
198 print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
199 }
200 eval $file_text;
201 #promotes warnings/errors from evaluated file to current context
202 warn $@ if $@;
203}
204
205#called from within the file of accumulated acronyms to indicate a good acronym
206sub add {
207 my $self = shift (@_);
208 if (defined ($acronyms_found_in_collection{$self->[0]}))
209 {
210 my $def = $self->to_def_string();
211 if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/)
212 {
213 return;
214 }
215 $acronyms_found_in_collection{$self->[0]} =
216 $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string();
217 } else {
218 $acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
219 }
220}
221
222#called from within the file of accumulated acronyms to indicate a bad acronym
223sub ban {
224 my $self = shift (@_);
225
226 if (!defined $acronyms_banned_from_collection{$self->[0]})
227 {
228 $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
229 } else {
230 $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string();
231 }
232}
233
234
235#write a good acronym to the accumulated acronyms file
236sub write_to_file {
237 my $self = shift (@_);
238 if ($writing_acronyms)
239 {
240 print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
241 $self->to_def_string() .
242 "\")->add();\n";
243 }
244}
245
246
247###########################################################################
248# mark functionality
249###########################################################################
250
251#small routine to sort by length
252sub sort_by_length {
253 length($b) <=> length($a) or $a cmp $b
254}
255
256sub markup_acronyms {
257 my $text = shift (@_);
258 my $verbosity_obj = shift (@_);
259 if (defined $text)
260 {
261 for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
262 {
263 $text =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
264 printf STDERR " " . $acro . ","
265 if ($verbosity_obj->{'verbosity'} >= 2);
266 }
267 }
268 return $text;
269}
270
271
272
273###########################################################################
274# member functions
275###########################################################################
276
277
278sub new {
279 my $trash = shift (@_);
280 my $acro = shift (@_);
281 my $def = shift (@_);
282
283 my $self = [
284 "", # 0 acronym
285 [], # 1 definition
286 ];
287
288 $self->[0] = $acro if defined $acro;
289 push @{$self->[1]}, split / /, $def if defined $def;
290
291 bless $self;
292}
293
294sub clone {
295 my $self = shift (@_);
296
297 my $copy = new acronym;
298
299 $copy->[0] = $self->[0];
300 push @{$copy->[1]}, @{$self->[1]};
301 bless $copy;
302
303 return $copy;
304}
305
306#return the acronym
307sub to_acronym {
308 my $self = shift (@_);
309 my @array = @{$self->[1]};
310
311 return $self->[0];
312}
313
314#return the number of words in the acronym definition
315sub words_in_acronym_definition {
316 my $self = shift (@_);
317 my @array = @{$self->[1]};
318
319 return $#array + 1;
320}
321
322#return the number of letters in the acronym definition
323sub letters_in_acronym_definition {
324 my $self = shift (@_);
325
326 return length($self->to_def_string());
327}
328
329#return the number of letters in the acronym definition
330sub letters_in_acronym {
331 my $self = shift (@_);
332
333 return length($self->to_acronym());
334}
335
336#return the acronym definition
337sub to_def_string {
338 my $self = shift (@_);
339
340 my $result = "";
341
342 # do the definition
343 my @array = @{$self->[1]};
344 my $i = 0;
345 while ($i <= $#array)
346 {
347 $result = $result . $array[$i];
348
349 if ($i+1 <= $#array)
350 {
351 $result = $result . " ";
352 }
353 $i++;
354 }
355 return $result;
356}
357
358
359#print out the kwic for the acronym
360sub to_string_kwic {
361 my $self = shift (@_);
362
363 # the list of all possible combinations
364 my @list = ();
365
366 my $result = "";
367
368 my $j = 0;
369 my @array = @{$self->[1]};
370 while ($j <= $#array)
371 {
372
373 # do the definition
374 my $i = 0;
375
376 #add the key word
377 $result = "<td halign=left>" . $array[$j] . "</td><td halign=right>";
378
379 #add the proceeding words
380 while ($i < $j)
381 {
382 $result = $result . $array[$i] . " ";
383 $i++;
384 }
385 #add the key word
386 $result = $result . "</td><td halign=left>" . $array[$j] .
387 "</td><td halign=left>";
388
389 #add the trailing words
390 $i++;
391 while ($i <= $#array )
392 {
393 $result = $result . $array[$i] . " ";
394 $i++;
395 }
396
397 #add the actual acronym
398
399 $result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
400
401 push @list, $result;
402 $j++;
403 }
404 return @list;
405}
406
407#this is the one used when building the collection ...
408sub to_string {
409 my $self = shift (@_);
410
411 my $result = $self->[0] . " ";
412
413 # do the definition
414 my @array = @{$self->[1]};
415 my $i = 0;
416 while ($i <= $#array)
417 {
418 $result = $result . $array[$i];
419 if ($i+1 <= $#array)
420 {
421 $result = $result . " ";
422 }
423 $i++;
424 }
425 return $result;
426}
427
428sub check {
429 my $self = shift (@_);
430
431 if (length($self->to_acronym()) < $min_acro_length)
432 {
433# print "acronym " . $self->to_string() . " rejected (too short I)\n";
434 return 0;
435 }
436 if ($self->words_in_acronym_definition() < $min_def_length)
437 {
438# print "acronym " . $self->to_string() . " rejected (too short II)\n";
439 return 0;
440 }
441 if ($min_length_saving * $self->letters_in_acronym() >
442 $self->letters_in_acronym_definition())
443 {
444# print "acronym " . $self->to_string() . " rejected (too short III)\n";
445# print "" . $min_length_saving .
446# "|" . $self->letters_in_acronym() .
447# "|" . $self->letters_in_acronym_definition() . "\n";
448 return 0;
449 }
450 if (!$allow_all_caps &&
451 $self->to_def_string() eq uc($self->to_def_string()))
452 {
453# print "acronym " . $self->to_string() . " rejected (all upper)\n";
454 return 0;
455 }
456 if (!$allow_all_caps)
457 {
458 my $upper_count = 0;
459 my $lower_count = 0;
460 my @letters = $self->to_def_string();
461 for my $letter (split //, $self->to_def_string())
462 {
463 if ($letter eq uc($letter))
464 {
465 $upper_count++;
466 } else {
467 $lower_count++;
468 }
469 }
470 return 0 if ($upper_count > $lower_count);
471 }
472 if (!$allow_recursive && $self->to_def_string() =~ /$self->to_acronym()/i )
473 {
474 return 0;
475 }
476# print "acronym " . $self->to_string() . " not rejected\n";
477 return 1;
478}
479
480###########################################################################
481# static functions
482###########################################################################
483
484sub recurse {
485 my ($acro_offset, #offset of word we're finding acronyms for
486 $text_offset,
487 $letter_offset,
488 @def_so_far) = @_;
489
490 my $word = $split_text[$text_offset];
491 my $acro = $split_text[$acro_offset];
492 $word = "" if !defined $word;
493 $acro = "" if !defined $acro;
494
495# print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
496# $letter_offset . ", " . @def_so_far . ")\n";
497
498 #check for termination ...
499 if ($letter_offset >= length($acro))
500 {
501 my $acronym = new acronym();
502 $acronym->[0] = $acro;
503 push @{$acronym->[1]}, @def_so_far;
504 if ($acronym->check())
505 {
506 push @acronym_list, ( $acronym );
507 }
508# print "acronym created\n";
509 return;
510 }
511 #check for recursion
512 if (!$allow_recursive)
513 {
514 if ($word eq $acro)
515 {
516# print "recursion detected\n";
517 return;
518 }
519 }
520
521 #skip a stop-word
522 my $i = 0;
523 if ($letter_offset != 0)
524 {
525 while ($i <= $#stop_words)
526 {
527 if ($stop_words[$i] eq uc($word))
528 {
529# print "::found stop word::" . $stop_words[$i] . "\n";
530 &recurse($acro_offset,
531 $text_offset+1,
532 $letter_offset,
533 @def_so_far, $word);
534 }
535 $i++;
536 }
537 }
538 $i = 1;
539 #using the first $i letters ...
540 while ($letter_offset+$i <= length($acro) )
541 {
542# print "". ((substr $word, 0, $i) . " " .
543# (substr $acro, $letter_offset, $i) . "\n");
544 if (((!$case_match) &&
545 (uc(substr $word, 0, $i) eq
546 uc(substr $acro, $letter_offset, $i)))
547 ||
548 (($case_match) &&
549 ((substr $word, 0, $i) eq
550 (substr $acro, $letter_offset, $i))))
551 {
552# print "::match::\n";
553# print "" . ((substr $word, 0, $i) . " " .
554# (substr $acro, $letter_offset, $i) . "\n");
555 &recurse($acro_offset,
556 $text_offset+1,
557 $letter_offset+$i,
558 @def_so_far, $word);
559 } else {
560 return;
561 }
562 $i++;
563 }
564 return;
565}
566
567#the main
568sub acronyms {
569 #clean up the text
570 my $processed_text = shift @_;
571 $$processed_text =~ s/<[^>]*>/ /g;
572 $$processed_text =~ s/\W/ /g;
573 $$processed_text =~ s/[0-9_]/ /g;
574 $$processed_text =~ s/\s+/ /g;
575 $$processed_text =~ s/(\n|\>)References.*/ /i;
576 $$processed_text =~ s/(\n|\>)Bibliography.*/ /i;
577 $$processed_text =~ s/(\n|\>)(Cited Works?).*/ /i;
578 $$processed_text =~ s/(\n|\>)(Works? Cited).*/ /i;
579
580 #clear some global variables
581 @split_text = ();
582 @acronym_list = ();
583
584 return &acronyms_from_clean_text($processed_text);
585}
586
587sub acronyms_from_clean_text {
588 my ($processed_text) = @_;
589
590 @split_text = split / /, $$processed_text;
591
592# my $i = 0;
593# while ($i <= $#split_text)
594# {
595# print $split_text[$i] . "\n";
596# $i++;
597# }
598
599 my $first = 0;
600 my $last = $#split_text +1;
601 my $acro_counter = $first;
602
603 while ($acro_counter < $last)
604 {
605 my $word = $split_text[$acro_counter];
606
607 if ((!$upper_case) ||
608 (uc($word) eq $word))
609 {
610
611 if (length $word >= $min_acro_length)
612 {
613 my $def_counter = 0;
614 if ($acro_counter - $max_offset > 0)
615 {
616 $def_counter = $acro_counter - $max_offset;
617 }
618 my $local_last = $acro_counter + $max_offset;
619 if ($local_last > $last)
620 {
621 $local_last = $last;
622 }
623 while ($def_counter <= $local_last)
624 {
625 &recurse($acro_counter,$def_counter,0,());
626 $def_counter++;
627 }
628 }
629 }
630 $acro_counter++;
631 }
632
633 return \@acronym_list;
634}
635
636
637
638sub test {
639
640# my $blarg = new acronym();
641# my $simple;
642# $simple = 10;
643# $blarg->initialise($simple, $simple, $simple);
644# my $blarg2 = $blarg->clone();
645# print $blarg->to_string();
646# print $blarg2;
647# print "\n";
648#
649 my $tla = new acronym();
650 $tla->[0] = "TLA";
651
652 my @array = ("Three", "Letter", "Acronym");
653# my $i = 0;
654# while ($i <= $#array)
655# {
656# print @array[$i] . "\n";
657# $i++;
658# }
659
660 print "\n";
661 push @{$tla->[1]}, ("Three" );
662 push @{$tla->[1]}, ("Letter" );
663 push @{$tla->[1]}, ("Acronym" );
664 print $tla->to_string(). "\n";
665 print "\n";
666 print "\n";
667 my $tla2 = $tla->clone();
668 push @{$tla2->[1]}, ("One");
669 push @{$tla2->[1]}, ("Two");
670 $tla2->[0] = "ALT";
671 print $tla->to_string(). "\n";
672 print $tla2->to_string(). "\n";
673
674 print "\n";
675 print "\n";
676
677 print "Testing recursion ...\n";
678 my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
679
680 foreach my $acro (@$acros)
681 {
682 if ($acro->check)
683 {
684 print "accepted: " .$acro->to_string() . "\n";
685# print "|" . $acro->to_acronym() . "|" . $acro->to_def_string() .
686# "|" . $acro->words_in_acronym_definition() .
687# "|" . $acro->letters_in_acronym_definition() .
688# "|" . $acro->letters_in_acronym() . "|\n";
689 } else {
690# print "found but rejected: " .$acro->to_string() . "\n";
691 }
692 }
693}
694
695#uncomment this line to test this package
696#&test();
697
6981;
699
700
Note: See TracBrowser for help on using the repository browser.