source: gsdl/trunk/perllib/acronym.pm@ 15894

Last change on this file since 15894 was 15894, checked in by mdewsnip, 16 years ago

Added "use strict" to the files missing it.

  • Property svn:keywords set to Author Date Id Revision
File size: 19.1 KB
Line 
1##########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# class to handle acronyms
26###########################################################################
27
28eval "require diagnostics"; # some perl distros (eg mac) don't have this
29
30package acronym;
31
32use strict;
33
34###########################################################################
35# global variables
36###########################################################################
37# valiables to control the recall/precision tradeoff
38
39#the maximum range to look for acronyms
40my $max_offset = 30;
41#acronyms must be upper case
42my $upper_case = 1;
43#acronym case must match
44my $case_match = 1;
45#minimum acronym length
46my $min_def_length = 3;
47#minimum acronym length
48my $min_acro_length = 3;
49#minimum acronym length saving
50my $min_length_saving = 4;
51#allow recusive acronyms
52my $allow_recursive = 0;
53#let definitions be all capitals
54my $allow_all_caps = 0;
55
56my @stop_words = split / /, "OF AT THE IN TO AND";
57
58#the text split into an array, one word per element
59my @split_text = ();
60my @acronym_list = ();
61
62
63my %acronyms_found_in_collection = ();
64my %acronyms_banned_from_collection = ();
65
66my $writing_acronyms = 1;
67my $accumulate_acronyms = 1;
68my $markup_accumulate_acronyms = 1;
69my $markup_local_acronyms = 1;
70
71
72
73###########################################################################
74# file saving / loading stuff
75###########################################################################
76
77sub initialise_acronyms {
78
79 my $local_max_offset = $max_offset;
80 my $local_upper_case = $upper_case;
81 my $local_case_match = $case_match ;
82 my $local_min_def_length = $min_def_length;
83 my $local_min_acro_length = $min_acro_length;
84 my $local_min_length_saving = $min_length_saving;
85 my $local_allow_recursive = $allow_recursive;
86 my $local_allow_all_caps = $allow_all_caps;
87 my @local_stop_words = @stop_words;
88
89
90 # the file to collate acronyms into
91
92 my $def_acronym_acc_file_pm = '&util::filename_cat($ENV{\'GSDLCOLLECTDIR\'}, "etc","acronym_definitions.pm");';
93 my $acronym_accumulate_file = eval { $def_acronym_acc_file_pm };
94 my $acronym_options_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc","acronym_options.pm");
95
96
97 my $file_text = "";
98 if (open ACRONYM_HANDLE, "<$acronym_options_file")
99 {
100 $file_text = do { local $/; <ACRONYM_HANDLE> };
101 }
102 if ($file_text eq "")
103 {
104 print STDERR "failed to open $acronym_options_file\n";
105 open ACRONYM_HANDLE, ">$acronym_options_file\n";
106 print ACRONYM_HANDLE "use util;\n";
107 print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
108 print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
109 print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
110 print ACRONYM_HANDLE "#valid here.\n\n";
111 print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
112 print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
113 print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
114 print ACRONYM_HANDLE "#explain them fully \n\n";
115 print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
116 print ACRONYM_HANDLE "\$local_max_offset = $max_offset;\n\n";
117 print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true (high precision))\n";
118 print ACRONYM_HANDLE "\$local_upper_case = $upper_case;\n\n";
119 print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true (high precision))\n";
120 print ACRONYM_HANDLE "\$local_case_match = $case_match;\n\n";
121 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
122 print ACRONYM_HANDLE "\$local_min_def_length = $min_def_length;\n\n";
123 print ACRONYM_HANDLE "#let definitions be all capitals\n";
124 print ACRONYM_HANDLE "\$local_allow_all_caps = $allow_all_caps;\n\n";
125 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
126 print ACRONYM_HANDLE "\$local_min_acro_length = 3;\n\n";
127 print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
128 print ACRONYM_HANDLE "\$local_min_length_saving = 4;\n\n";
129 print ACRONYM_HANDLE "#allow recusive acronyms (0 = false (high precision), 1 = true)\n";
130 print ACRONYM_HANDLE "\$local_allow_recursive = 0;\n\n";
131 print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
132 print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
133 print ACRONYM_HANDLE "#\@local_stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
134 print ACRONYM_HANDLE "\@local_stop_words = split / /, \"OF AT THE IN TO AND\";\n";
135 print ACRONYM_HANDLE "\n";
136 print ACRONYM_HANDLE "#the file to collate acronyms into\n";
137 print ACRONYM_HANDLE "\$acronym_accumulate_file = $def_acronym_acc_file_pm\n";
138 print ACRONYM_HANDLE "\n";
139 print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
140 print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
141 print ACRONYM_HANDLE "# \n";
142 print ACRONYM_HANDLE "# \n";
143 print ACRONYM_HANDLE "# \n";
144 print STDERR "written new options file to $acronym_options_file...\n";
145 } else {
146 print STDERR "read file $acronym_options_file...\n";
147 eval $file_text ;
148 warn $@ if $@;
149 print STDERR "evaluated file $acronym_options_file...\n";
150 }
151
152 $max_offset = $local_max_offset;
153 $upper_case = $local_upper_case;
154 $case_match = $local_case_match ;
155 $min_def_length = $local_min_def_length;
156 $min_acro_length = $local_min_acro_length;
157 $min_length_saving = $local_min_length_saving;
158 $allow_recursive = $local_allow_recursive;
159 $allow_all_caps = $local_allow_all_caps;
160 @stop_words = @local_stop_words;
161
162
163 &read_all_acronyms_from_file($acronym_accumulate_file);
164# rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
165# int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
166 if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
167 {
168 print ACRONYM_HANDLE "#This is an automatically generated file.\n";
169 print ACRONYM_HANDLE "#\n";
170 print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
171 print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
172 print ACRONYM_HANDLE "#read-only. \n";
173 print ACRONYM_HANDLE "#\n";
174 print ACRONYM_HANDLE "#start of acronyms...\n";
175 $writing_acronyms = 1;
176 } else {
177 warn "failed to open $acronym_accumulate_file for writing\n";
178 $writing_acronyms = 0;
179 }
180}
181
182#close the list of accumulated acronyms
183sub finalise_acronyms {
184 if ($writing_acronyms)
185 {
186 print ACRONYM_HANDLE "#end of acronyms.\n";
187 close ACRONYM_HANDLE;
188 }
189}
190
191#eval a file of accumulated acronyms
192sub read_all_acronyms_from_file {
193 my ($acronym_accumulate_file) = @_;
194
195 my $file_text = " ";
196 if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
197 {
198 $file_text = do { local $/; <ACRONYM_HANDLE> };
199 } else {
200 print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
201 }
202 eval $file_text;
203 #promotes warnings/errors from evaluated file to current context
204 warn $@ if $@;
205}
206
207#called from within the file of accumulated acronyms to indicate a good acronym
208sub add {
209 my $self = shift (@_);
210 if (defined ($acronyms_found_in_collection{$self->[0]}))
211 {
212 my $def = $self->to_def_string();
213 if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/)
214 {
215 return;
216 }
217 $acronyms_found_in_collection{$self->[0]} =
218 $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string();
219 } else {
220 $acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
221 }
222}
223
224#called from within the file of accumulated acronyms to indicate a bad acronym
225sub ban {
226 my $self = shift (@_);
227
228 if (!defined $acronyms_banned_from_collection{$self->[0]})
229 {
230 $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
231 } else {
232 $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string();
233 }
234}
235
236
237#write a good acronym to the accumulated acronyms file
238sub write_to_file {
239 my $self = shift (@_);
240 if ($writing_acronyms)
241 {
242 print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
243 $self->to_def_string() .
244 "\")->add();\n";
245 }
246}
247
248
249###########################################################################
250# mark functionality
251###########################################################################
252
253#small routine to sort by length
254sub sort_by_length {
255 length($b) <=> length($a) or $a cmp $b
256}
257
258sub markup_acronyms {
259 my $text = shift (@_);
260 my $verbosity_obj = shift (@_);
261 if (defined $text)
262 {
263 for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
264 {
265 $text =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
266 printf STDERR " " . $acro . ","
267 if ($verbosity_obj->{'verbosity'} >= 2);
268 }
269 }
270 return $text;
271}
272
273
274
275###########################################################################
276# member functions
277###########################################################################
278
279
280sub new {
281 my $trash = shift (@_);
282 my $acro = shift (@_);
283 my $def = shift (@_);
284
285 my $self = [
286 "", # 0 acronym
287 [], # 1 definition
288 ];
289
290 $self->[0] = $acro if defined $acro;
291 push @{$self->[1]}, split / /, $def if defined $def;
292
293 bless $self;
294}
295
296sub clone {
297 my $self = shift (@_);
298
299 my $copy = new acronym;
300
301 $copy->[0] = $self->[0];
302 push @{$copy->[1]}, @{$self->[1]};
303 bless $copy;
304
305 return $copy;
306}
307
308#return the acronym
309sub to_acronym {
310 my $self = shift (@_);
311 my @array = @{$self->[1]};
312
313 return $self->[0];
314}
315
316#return the number of words in the acronym definition
317sub words_in_acronym_definition {
318 my $self = shift (@_);
319 my @array = @{$self->[1]};
320
321 return $#array + 1;
322}
323
324#return the number of letters in the acronym definition
325sub letters_in_acronym_definition {
326 my $self = shift (@_);
327
328 return length($self->to_def_string());
329}
330
331#return the number of letters in the acronym definition
332sub letters_in_acronym {
333 my $self = shift (@_);
334
335 return length($self->to_acronym());
336}
337
338#return the acronym definition
339sub to_def_string {
340 my $self = shift (@_);
341
342 my $result = "";
343
344 # do the definition
345 my @array = @{$self->[1]};
346 my $i = 0;
347 while ($i <= $#array)
348 {
349 $result = $result . $array[$i];
350
351 if ($i+1 <= $#array)
352 {
353 $result = $result . " ";
354 }
355 $i++;
356 }
357 return $result;
358}
359
360
361#print out the kwic for the acronym
362sub to_string_kwic {
363 my $self = shift (@_);
364
365 # the list of all possible combinations
366 my @list = ();
367
368 my $result = "";
369
370 my $j = 0;
371 my @array = @{$self->[1]};
372 while ($j <= $#array)
373 {
374
375 # do the definition
376 my $i = 0;
377
378 #add the key word
379 $result = "<td halign=left>" . $array[$j] . "</td><td halign=right>";
380
381 #add the proceeding words
382 while ($i < $j)
383 {
384 $result = $result . $array[$i] . " ";
385 $i++;
386 }
387 #add the key word
388 $result = $result . "</td><td halign=left>" . $array[$j] .
389 "</td><td halign=left>";
390
391 #add the trailing words
392 $i++;
393 while ($i <= $#array )
394 {
395 $result = $result . $array[$i] . " ";
396 $i++;
397 }
398
399 #add the actual acronym
400
401 $result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
402
403 push @list, $result;
404 $j++;
405 }
406 return @list;
407}
408
409#this is the one used when building the collection ...
410sub to_string {
411 my $self = shift (@_);
412
413 my $result = $self->[0] . " ";
414
415 # do the definition
416 my @array = @{$self->[1]};
417 my $i = 0;
418 while ($i <= $#array)
419 {
420 $result = $result . $array[$i];
421 if ($i+1 <= $#array)
422 {
423 $result = $result . " ";
424 }
425 $i++;
426 }
427 return $result;
428}
429
430sub check {
431 my $self = shift (@_);
432
433 if (length($self->to_acronym()) < $min_acro_length)
434 {
435# print "acronym " . $self->to_string() . " rejected (too short I)\n";
436 return 0;
437 }
438 if ($self->words_in_acronym_definition() < $min_def_length)
439 {
440# print "acronym " . $self->to_string() . " rejected (too short II)\n";
441 return 0;
442 }
443 if ($min_length_saving * $self->letters_in_acronym() >
444 $self->letters_in_acronym_definition())
445 {
446# print "acronym " . $self->to_string() . " rejected (too short III)\n";
447# print "" . $min_length_saving .
448# "|" . $self->letters_in_acronym() .
449# "|" . $self->letters_in_acronym_definition() . "\n";
450 return 0;
451 }
452 if (!$allow_all_caps &&
453 $self->to_def_string() eq uc($self->to_def_string()))
454 {
455# print "acronym " . $self->to_string() . " rejected (all upper)\n";
456 return 0;
457 }
458 if (!$allow_all_caps)
459 {
460 my $upper_count = 0;
461 my $lower_count = 0;
462 my @letters = $self->to_def_string();
463 for my $letter (split //, $self->to_def_string())
464 {
465 if ($letter eq uc($letter))
466 {
467 $upper_count++;
468 } else {
469 $lower_count++;
470 }
471 }
472 return 0 if ($upper_count > $lower_count);
473 }
474 if (!$allow_recursive && $self->to_def_string() =~ /$self->to_acronym()/i )
475 {
476 return 0;
477 }
478# print "acronym " . $self->to_string() . " not rejected\n";
479 return 1;
480}
481
482###########################################################################
483# static functions
484###########################################################################
485
486sub recurse {
487 my ($acro_offset, #offset of word we're finding acronyms for
488 $text_offset,
489 $letter_offset,
490 @def_so_far) = @_;
491
492 my $word = $split_text[$text_offset];
493 my $acro = $split_text[$acro_offset];
494 $word = "" if !defined $word;
495 $acro = "" if !defined $acro;
496
497# print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
498# $letter_offset . ", " . @def_so_far . ")\n";
499
500 #check for termination ...
501 if ($letter_offset >= length($acro))
502 {
503 my $acronym = new acronym();
504 $acronym->[0] = $acro;
505 push @{$acronym->[1]}, @def_so_far;
506 if ($acronym->check())
507 {
508 push @acronym_list, ( $acronym );
509 }
510# print "acronym created\n";
511 return;
512 }
513 #check for recursion
514 if (!$allow_recursive)
515 {
516 if ($word eq $acro)
517 {
518# print "recursion detected\n";
519 return;
520 }
521 }
522
523 #skip a stop-word
524 my $i = 0;
525 if ($letter_offset != 0)
526 {
527 while ($i <= $#stop_words)
528 {
529 if ($stop_words[$i] eq uc($word))
530 {
531# print "::found stop word::" . $stop_words[$i] . "\n";
532 &recurse($acro_offset,
533 $text_offset+1,
534 $letter_offset,
535 @def_so_far, $word);
536 }
537 $i++;
538 }
539 }
540 $i = 1;
541 #using the first $i letters ...
542 while ($letter_offset+$i <= length($acro) )
543 {
544# print "". ((substr $word, 0, $i) . " " .
545# (substr $acro, $letter_offset, $i) . "\n");
546 if (((!$case_match) &&
547 (uc(substr $word, 0, $i) eq
548 uc(substr $acro, $letter_offset, $i)))
549 ||
550 (($case_match) &&
551 ((substr $word, 0, $i) eq
552 (substr $acro, $letter_offset, $i))))
553 {
554# print "::match::\n";
555# print "" . ((substr $word, 0, $i) . " " .
556# (substr $acro, $letter_offset, $i) . "\n");
557 &recurse($acro_offset,
558 $text_offset+1,
559 $letter_offset+$i,
560 @def_so_far, $word);
561 } else {
562 return;
563 }
564 $i++;
565 }
566 return;
567}
568
569#the main
570sub acronyms {
571 #clean up the text
572 my $processed_text = shift @_;
573 $$processed_text =~ s/<[^>]*>/ /g;
574 $$processed_text =~ s/\W/ /g;
575 $$processed_text =~ s/[0-9_]/ /g;
576 $$processed_text =~ s/\s+/ /g;
577 $$processed_text =~ s/(\n|\>)References.*/ /i;
578 $$processed_text =~ s/(\n|\>)Bibliography.*/ /i;
579 $$processed_text =~ s/(\n|\>)(Cited Works?).*/ /i;
580 $$processed_text =~ s/(\n|\>)(Works? Cited).*/ /i;
581
582 #clear some global variables
583 @split_text = ();
584 @acronym_list = ();
585
586 return &acronyms_from_clean_text($processed_text);
587}
588
589sub acronyms_from_clean_text {
590 my ($processed_text) = @_;
591
592 @split_text = split / /, $$processed_text;
593
594# my $i = 0;
595# while ($i <= $#split_text)
596# {
597# print $split_text[$i] . "\n";
598# $i++;
599# }
600
601 my $first = 0;
602 my $last = $#split_text +1;
603 my $acro_counter = $first;
604
605 while ($acro_counter < $last)
606 {
607 my $word = $split_text[$acro_counter];
608
609 if ((!$upper_case) ||
610 (uc($word) eq $word))
611 {
612
613 if (length $word >= $min_acro_length)
614 {
615 my $def_counter = 0;
616 if ($acro_counter - $max_offset > 0)
617 {
618 $def_counter = $acro_counter - $max_offset;
619 }
620 my $local_last = $acro_counter + $max_offset;
621 if ($local_last > $last)
622 {
623 $local_last = $last;
624 }
625 while ($def_counter <= $local_last)
626 {
627 &recurse($acro_counter,$def_counter,0,());
628 $def_counter++;
629 }
630 }
631 }
632 $acro_counter++;
633 }
634
635 return \@acronym_list;
636}
637
638
639
640sub test {
641
642# my $blarg = new acronym();
643# my $simple;
644# $simple = 10;
645# $blarg->initialise($simple, $simple, $simple);
646# my $blarg2 = $blarg->clone();
647# print $blarg->to_string();
648# print $blarg2;
649# print "\n";
650#
651 my $tla = new acronym();
652 $tla->[0] = "TLA";
653
654 my @array = ("Three", "Letter", "Acronym");
655# my $i = 0;
656# while ($i <= $#array)
657# {
658# print @array[$i] . "\n";
659# $i++;
660# }
661
662 print "\n";
663 push @{$tla->[1]}, ("Three" );
664 push @{$tla->[1]}, ("Letter" );
665 push @{$tla->[1]}, ("Acronym" );
666 print $tla->to_string(). "\n";
667 print "\n";
668 print "\n";
669 my $tla2 = $tla->clone();
670 push @{$tla2->[1]}, ("One");
671 push @{$tla2->[1]}, ("Two");
672 $tla2->[0] = "ALT";
673 print $tla->to_string(). "\n";
674 print $tla2->to_string(). "\n";
675
676 print "\n";
677 print "\n";
678
679 print "Testing recursion ...\n";
680 my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
681
682 foreach my $acro (@$acros)
683 {
684 if ($acro->check)
685 {
686 print "accepted: " .$acro->to_string() . "\n";
687# print "|" . $acro->to_acronym() . "|" . $acro->to_def_string() .
688# "|" . $acro->words_in_acronym_definition() .
689# "|" . $acro->letters_in_acronym_definition() .
690# "|" . $acro->letters_in_acronym() . "|\n";
691 } else {
692# print "found but rejected: " .$acro->to_string() . "\n";
693 }
694 }
695}
696
697#uncomment this line to test this package
698#&test();
699
7001;
701
702
Note: See TracBrowser for help on using the repository browser.