source: main/trunk/greenstone2/perllib/acronym.pm@ 32578

Last change on this file since 32578 was 18751, checked in by kjdon, 15 years ago

added missing 'use util'

  • Property svn:keywords set to Author Date Id Revision
File size: 19.1 KB
Line 
1##########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# class to handle acronyms
26###########################################################################
27
28eval "require diagnostics"; # some perl distros (eg mac) don't have this
29
30package acronym;
31
32use util;
33use strict;
34
35###########################################################################
36# global variables
37###########################################################################
38# valiables to control the recall/precision tradeoff
39
40#the maximum range to look for acronyms
41my $max_offset = 30;
42#acronyms must be upper case
43my $upper_case = 1;
44#acronym case must match
45my $case_match = 1;
46#minimum acronym length
47my $min_def_length = 3;
48#minimum acronym length
49my $min_acro_length = 3;
50#minimum acronym length saving
51my $min_length_saving = 4;
52#allow recusive acronyms
53my $allow_recursive = 0;
54#let definitions be all capitals
55my $allow_all_caps = 0;
56
57my @stop_words = split / /, "OF AT THE IN TO AND";
58
59#the text split into an array, one word per element
60my @split_text = ();
61my @acronym_list = ();
62
63
64my %acronyms_found_in_collection = ();
65my %acronyms_banned_from_collection = ();
66
67my $writing_acronyms = 1;
68my $accumulate_acronyms = 1;
69my $markup_accumulate_acronyms = 1;
70my $markup_local_acronyms = 1;
71
72
73
74###########################################################################
75# file saving / loading stuff
76###########################################################################
77
78sub initialise_acronyms {
79
80 my $local_max_offset = $max_offset;
81 my $local_upper_case = $upper_case;
82 my $local_case_match = $case_match ;
83 my $local_min_def_length = $min_def_length;
84 my $local_min_acro_length = $min_acro_length;
85 my $local_min_length_saving = $min_length_saving;
86 my $local_allow_recursive = $allow_recursive;
87 my $local_allow_all_caps = $allow_all_caps;
88 my @local_stop_words = @stop_words;
89
90
91 # the file to collate acronyms into
92
93 my $def_acronym_acc_file_pm = '&util::filename_cat($ENV{\'GSDLCOLLECTDIR\'}, "etc","acronym_definitions.pm");';
94 my $acronym_accumulate_file = eval { $def_acronym_acc_file_pm };
95 my $acronym_options_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc","acronym_options.pm");
96
97
98 my $file_text = "";
99 if (open ACRONYM_HANDLE, "<$acronym_options_file")
100 {
101 $file_text = do { local $/; <ACRONYM_HANDLE> };
102 }
103 if ($file_text eq "")
104 {
105 print STDERR "failed to open $acronym_options_file\n";
106 open ACRONYM_HANDLE, ">$acronym_options_file\n";
107 print ACRONYM_HANDLE "use util;\n";
108 print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
109 print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
110 print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
111 print ACRONYM_HANDLE "#valid here.\n\n";
112 print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
113 print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
114 print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
115 print ACRONYM_HANDLE "#explain them fully \n\n";
116 print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
117 print ACRONYM_HANDLE "\$local_max_offset = $max_offset;\n\n";
118 print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true (high precision))\n";
119 print ACRONYM_HANDLE "\$local_upper_case = $upper_case;\n\n";
120 print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true (high precision))\n";
121 print ACRONYM_HANDLE "\$local_case_match = $case_match;\n\n";
122 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
123 print ACRONYM_HANDLE "\$local_min_def_length = $min_def_length;\n\n";
124 print ACRONYM_HANDLE "#let definitions be all capitals\n";
125 print ACRONYM_HANDLE "\$local_allow_all_caps = $allow_all_caps;\n\n";
126 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
127 print ACRONYM_HANDLE "\$local_min_acro_length = 3;\n\n";
128 print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
129 print ACRONYM_HANDLE "\$local_min_length_saving = 4;\n\n";
130 print ACRONYM_HANDLE "#allow recusive acronyms (0 = false (high precision), 1 = true)\n";
131 print ACRONYM_HANDLE "\$local_allow_recursive = 0;\n\n";
132 print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
133 print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
134 print ACRONYM_HANDLE "#\@local_stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
135 print ACRONYM_HANDLE "\@local_stop_words = split / /, \"OF AT THE IN TO AND\";\n";
136 print ACRONYM_HANDLE "\n";
137 print ACRONYM_HANDLE "#the file to collate acronyms into\n";
138 print ACRONYM_HANDLE "\$acronym_accumulate_file = $def_acronym_acc_file_pm\n";
139 print ACRONYM_HANDLE "\n";
140 print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
141 print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
142 print ACRONYM_HANDLE "# \n";
143 print ACRONYM_HANDLE "# \n";
144 print ACRONYM_HANDLE "# \n";
145 print STDERR "written new options file to $acronym_options_file...\n";
146 } else {
147 print STDERR "read file $acronym_options_file...\n";
148 eval $file_text ;
149 warn $@ if $@;
150 print STDERR "evaluated file $acronym_options_file...\n";
151 }
152
153 $max_offset = $local_max_offset;
154 $upper_case = $local_upper_case;
155 $case_match = $local_case_match ;
156 $min_def_length = $local_min_def_length;
157 $min_acro_length = $local_min_acro_length;
158 $min_length_saving = $local_min_length_saving;
159 $allow_recursive = $local_allow_recursive;
160 $allow_all_caps = $local_allow_all_caps;
161 @stop_words = @local_stop_words;
162
163
164 &read_all_acronyms_from_file($acronym_accumulate_file);
165# rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
166# int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
167 if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
168 {
169 print ACRONYM_HANDLE "#This is an automatically generated file.\n";
170 print ACRONYM_HANDLE "#\n";
171 print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
172 print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
173 print ACRONYM_HANDLE "#read-only. \n";
174 print ACRONYM_HANDLE "#\n";
175 print ACRONYM_HANDLE "#start of acronyms...\n";
176 $writing_acronyms = 1;
177 } else {
178 warn "failed to open $acronym_accumulate_file for writing\n";
179 $writing_acronyms = 0;
180 }
181}
182
183#close the list of accumulated acronyms
184sub finalise_acronyms {
185 if ($writing_acronyms)
186 {
187 print ACRONYM_HANDLE "#end of acronyms.\n";
188 close ACRONYM_HANDLE;
189 }
190}
191
192#eval a file of accumulated acronyms
193sub read_all_acronyms_from_file {
194 my ($acronym_accumulate_file) = @_;
195
196 my $file_text = " ";
197 if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
198 {
199 $file_text = do { local $/; <ACRONYM_HANDLE> };
200 } else {
201 print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
202 }
203 eval $file_text;
204 #promotes warnings/errors from evaluated file to current context
205 warn $@ if $@;
206}
207
208#called from within the file of accumulated acronyms to indicate a good acronym
209sub add {
210 my $self = shift (@_);
211 if (defined ($acronyms_found_in_collection{$self->[0]}))
212 {
213 my $def = $self->to_def_string();
214 if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/)
215 {
216 return;
217 }
218 $acronyms_found_in_collection{$self->[0]} =
219 $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string();
220 } else {
221 $acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
222 }
223}
224
225#called from within the file of accumulated acronyms to indicate a bad acronym
226sub ban {
227 my $self = shift (@_);
228
229 if (!defined $acronyms_banned_from_collection{$self->[0]})
230 {
231 $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
232 } else {
233 $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string();
234 }
235}
236
237
238#write a good acronym to the accumulated acronyms file
239sub write_to_file {
240 my $self = shift (@_);
241 if ($writing_acronyms)
242 {
243 print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
244 $self->to_def_string() .
245 "\")->add();\n";
246 }
247}
248
249
250###########################################################################
251# mark functionality
252###########################################################################
253
254#small routine to sort by length
255sub sort_by_length {
256 length($b) <=> length($a) or $a cmp $b
257}
258
259sub markup_acronyms {
260 my $text = shift (@_);
261 my $verbosity_obj = shift (@_);
262 if (defined $text)
263 {
264 for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
265 {
266 $text =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
267 printf STDERR " " . $acro . ","
268 if ($verbosity_obj->{'verbosity'} >= 2);
269 }
270 }
271 return $text;
272}
273
274
275
276###########################################################################
277# member functions
278###########################################################################
279
280
281sub new {
282 my $trash = shift (@_);
283 my $acro = shift (@_);
284 my $def = shift (@_);
285
286 my $self = [
287 "", # 0 acronym
288 [], # 1 definition
289 ];
290
291 $self->[0] = $acro if defined $acro;
292 push @{$self->[1]}, split / /, $def if defined $def;
293
294 bless $self;
295}
296
297sub clone {
298 my $self = shift (@_);
299
300 my $copy = new acronym;
301
302 $copy->[0] = $self->[0];
303 push @{$copy->[1]}, @{$self->[1]};
304 bless $copy;
305
306 return $copy;
307}
308
309#return the acronym
310sub to_acronym {
311 my $self = shift (@_);
312 my @array = @{$self->[1]};
313
314 return $self->[0];
315}
316
317#return the number of words in the acronym definition
318sub words_in_acronym_definition {
319 my $self = shift (@_);
320 my @array = @{$self->[1]};
321
322 return $#array + 1;
323}
324
325#return the number of letters in the acronym definition
326sub letters_in_acronym_definition {
327 my $self = shift (@_);
328
329 return length($self->to_def_string());
330}
331
332#return the number of letters in the acronym definition
333sub letters_in_acronym {
334 my $self = shift (@_);
335
336 return length($self->to_acronym());
337}
338
339#return the acronym definition
340sub to_def_string {
341 my $self = shift (@_);
342
343 my $result = "";
344
345 # do the definition
346 my @array = @{$self->[1]};
347 my $i = 0;
348 while ($i <= $#array)
349 {
350 $result = $result . $array[$i];
351
352 if ($i+1 <= $#array)
353 {
354 $result = $result . " ";
355 }
356 $i++;
357 }
358 return $result;
359}
360
361
362#print out the kwic for the acronym
363sub to_string_kwic {
364 my $self = shift (@_);
365
366 # the list of all possible combinations
367 my @list = ();
368
369 my $result = "";
370
371 my $j = 0;
372 my @array = @{$self->[1]};
373 while ($j <= $#array)
374 {
375
376 # do the definition
377 my $i = 0;
378
379 #add the key word
380 $result = "<td halign=left>" . $array[$j] . "</td><td halign=right>";
381
382 #add the proceeding words
383 while ($i < $j)
384 {
385 $result = $result . $array[$i] . " ";
386 $i++;
387 }
388 #add the key word
389 $result = $result . "</td><td halign=left>" . $array[$j] .
390 "</td><td halign=left>";
391
392 #add the trailing words
393 $i++;
394 while ($i <= $#array )
395 {
396 $result = $result . $array[$i] . " ";
397 $i++;
398 }
399
400 #add the actual acronym
401
402 $result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
403
404 push @list, $result;
405 $j++;
406 }
407 return @list;
408}
409
410#this is the one used when building the collection ...
411sub to_string {
412 my $self = shift (@_);
413
414 my $result = $self->[0] . " ";
415
416 # do the definition
417 my @array = @{$self->[1]};
418 my $i = 0;
419 while ($i <= $#array)
420 {
421 $result = $result . $array[$i];
422 if ($i+1 <= $#array)
423 {
424 $result = $result . " ";
425 }
426 $i++;
427 }
428 return $result;
429}
430
431sub check {
432 my $self = shift (@_);
433
434 if (length($self->to_acronym()) < $min_acro_length)
435 {
436# print "acronym " . $self->to_string() . " rejected (too short I)\n";
437 return 0;
438 }
439 if ($self->words_in_acronym_definition() < $min_def_length)
440 {
441# print "acronym " . $self->to_string() . " rejected (too short II)\n";
442 return 0;
443 }
444 if ($min_length_saving * $self->letters_in_acronym() >
445 $self->letters_in_acronym_definition())
446 {
447# print "acronym " . $self->to_string() . " rejected (too short III)\n";
448# print "" . $min_length_saving .
449# "|" . $self->letters_in_acronym() .
450# "|" . $self->letters_in_acronym_definition() . "\n";
451 return 0;
452 }
453 if (!$allow_all_caps &&
454 $self->to_def_string() eq uc($self->to_def_string()))
455 {
456# print "acronym " . $self->to_string() . " rejected (all upper)\n";
457 return 0;
458 }
459 if (!$allow_all_caps)
460 {
461 my $upper_count = 0;
462 my $lower_count = 0;
463 my @letters = $self->to_def_string();
464 for my $letter (split //, $self->to_def_string())
465 {
466 if ($letter eq uc($letter))
467 {
468 $upper_count++;
469 } else {
470 $lower_count++;
471 }
472 }
473 return 0 if ($upper_count > $lower_count);
474 }
475 if (!$allow_recursive && $self->to_def_string() =~ /$self->to_acronym()/i )
476 {
477 return 0;
478 }
479# print "acronym " . $self->to_string() . " not rejected\n";
480 return 1;
481}
482
483###########################################################################
484# static functions
485###########################################################################
486
487sub recurse {
488 my ($acro_offset, #offset of word we're finding acronyms for
489 $text_offset,
490 $letter_offset,
491 @def_so_far) = @_;
492
493 my $word = $split_text[$text_offset];
494 my $acro = $split_text[$acro_offset];
495 $word = "" if !defined $word;
496 $acro = "" if !defined $acro;
497
498# print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
499# $letter_offset . ", " . @def_so_far . ")\n";
500
501 #check for termination ...
502 if ($letter_offset >= length($acro))
503 {
504 my $acronym = new acronym();
505 $acronym->[0] = $acro;
506 push @{$acronym->[1]}, @def_so_far;
507 if ($acronym->check())
508 {
509 push @acronym_list, ( $acronym );
510 }
511# print "acronym created\n";
512 return;
513 }
514 #check for recursion
515 if (!$allow_recursive)
516 {
517 if ($word eq $acro)
518 {
519# print "recursion detected\n";
520 return;
521 }
522 }
523
524 #skip a stop-word
525 my $i = 0;
526 if ($letter_offset != 0)
527 {
528 while ($i <= $#stop_words)
529 {
530 if ($stop_words[$i] eq uc($word))
531 {
532# print "::found stop word::" . $stop_words[$i] . "\n";
533 &recurse($acro_offset,
534 $text_offset+1,
535 $letter_offset,
536 @def_so_far, $word);
537 }
538 $i++;
539 }
540 }
541 $i = 1;
542 #using the first $i letters ...
543 while ($letter_offset+$i <= length($acro) )
544 {
545# print "". ((substr $word, 0, $i) . " " .
546# (substr $acro, $letter_offset, $i) . "\n");
547 if (((!$case_match) &&
548 (uc(substr $word, 0, $i) eq
549 uc(substr $acro, $letter_offset, $i)))
550 ||
551 (($case_match) &&
552 ((substr $word, 0, $i) eq
553 (substr $acro, $letter_offset, $i))))
554 {
555# print "::match::\n";
556# print "" . ((substr $word, 0, $i) . " " .
557# (substr $acro, $letter_offset, $i) . "\n");
558 &recurse($acro_offset,
559 $text_offset+1,
560 $letter_offset+$i,
561 @def_so_far, $word);
562 } else {
563 return;
564 }
565 $i++;
566 }
567 return;
568}
569
570#the main
571sub acronyms {
572 #clean up the text
573 my $processed_text = shift @_;
574 $$processed_text =~ s/<[^>]*>/ /g;
575 $$processed_text =~ s/\W/ /g;
576 $$processed_text =~ s/[0-9_]/ /g;
577 $$processed_text =~ s/\s+/ /g;
578 $$processed_text =~ s/(\n|\>)References.*/ /i;
579 $$processed_text =~ s/(\n|\>)Bibliography.*/ /i;
580 $$processed_text =~ s/(\n|\>)(Cited Works?).*/ /i;
581 $$processed_text =~ s/(\n|\>)(Works? Cited).*/ /i;
582
583 #clear some global variables
584 @split_text = ();
585 @acronym_list = ();
586
587 return &acronyms_from_clean_text($processed_text);
588}
589
590sub acronyms_from_clean_text {
591 my ($processed_text) = @_;
592
593 @split_text = split / /, $$processed_text;
594
595# my $i = 0;
596# while ($i <= $#split_text)
597# {
598# print $split_text[$i] . "\n";
599# $i++;
600# }
601
602 my $first = 0;
603 my $last = $#split_text +1;
604 my $acro_counter = $first;
605
606 while ($acro_counter < $last)
607 {
608 my $word = $split_text[$acro_counter];
609
610 if ((!$upper_case) ||
611 (uc($word) eq $word))
612 {
613
614 if (length $word >= $min_acro_length)
615 {
616 my $def_counter = 0;
617 if ($acro_counter - $max_offset > 0)
618 {
619 $def_counter = $acro_counter - $max_offset;
620 }
621 my $local_last = $acro_counter + $max_offset;
622 if ($local_last > $last)
623 {
624 $local_last = $last;
625 }
626 while ($def_counter <= $local_last)
627 {
628 &recurse($acro_counter,$def_counter,0,());
629 $def_counter++;
630 }
631 }
632 }
633 $acro_counter++;
634 }
635
636 return \@acronym_list;
637}
638
639
640
641sub test {
642
643# my $blarg = new acronym();
644# my $simple;
645# $simple = 10;
646# $blarg->initialise($simple, $simple, $simple);
647# my $blarg2 = $blarg->clone();
648# print $blarg->to_string();
649# print $blarg2;
650# print "\n";
651#
652 my $tla = new acronym();
653 $tla->[0] = "TLA";
654
655 my @array = ("Three", "Letter", "Acronym");
656# my $i = 0;
657# while ($i <= $#array)
658# {
659# print @array[$i] . "\n";
660# $i++;
661# }
662
663 print "\n";
664 push @{$tla->[1]}, ("Three" );
665 push @{$tla->[1]}, ("Letter" );
666 push @{$tla->[1]}, ("Acronym" );
667 print $tla->to_string(). "\n";
668 print "\n";
669 print "\n";
670 my $tla2 = $tla->clone();
671 push @{$tla2->[1]}, ("One");
672 push @{$tla2->[1]}, ("Two");
673 $tla2->[0] = "ALT";
674 print $tla->to_string(). "\n";
675 print $tla2->to_string(). "\n";
676
677 print "\n";
678 print "\n";
679
680 print "Testing recursion ...\n";
681 my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
682
683 foreach my $acro (@$acros)
684 {
685 if ($acro->check)
686 {
687 print "accepted: " .$acro->to_string() . "\n";
688# print "|" . $acro->to_acronym() . "|" . $acro->to_def_string() .
689# "|" . $acro->words_in_acronym_definition() .
690# "|" . $acro->letters_in_acronym_definition() .
691# "|" . $acro->letters_in_acronym() . "|\n";
692 } else {
693# print "found but rejected: " .$acro->to_string() . "\n";
694 }
695 }
696}
697
698#uncomment this line to test this package
699#&test();
700
7011;
702
703
Note: See TracBrowser for help on using the repository browser.