root/trunk/gsdl/perllib/acronym.pm @ 7645

Revision 7645, 19.0 KB (checked in by jrm21, 16 years ago)

don't fail if we can't load the diagnostics package.

  • Property svn:keywords set to Author Date Id Revision
Line 
1##########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25#    class to handle acronyms
26###########################################################################
27
28eval "require diagnostics"; # some perl distros (eg mac) don't have this
29
30package acronym;
31
32###########################################################################
33#    global variables
34###########################################################################
35# valiables to control the recall/precision tradeoff
36
37#the maximum range to look for acronyms
38my $max_offset = 30;
39#acronyms must be upper case
40my $upper_case = 1;
41#acronym case must match
42my $case_match = 1;
43#minimum acronym length
44my $min_def_length = 3;
45#minimum acronym length
46my $min_acro_length = 3;
47#minimum acronym length saving
48my $min_length_saving = 4;
49#allow recusive acronyms
50my $allow_recursive = 0;
51#let definitions be all capitals
52my $allow_all_caps = 0;
53
54my @stop_words = split / /, "OF AT THE IN TO AND";
55
56#the text split into an array, one word per element
57my @split_text = ();
58my @acronym_list = ();
59
60#the file to collate acronyms into
61my $acronym_accumulate_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_definitions.pm";
62my $acronym_options_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_options.pm";
63
64my %acronyms_found_in_collection = ();
65my %acronyms_banned_from_collection = ();
66
67my $writing_acronyms = 1;
68my $accumulate_acronyms = 1;
69my $markup_accumulate_acronyms = 1;
70my $markup_local_acronyms = 1;
71
72
73
74###########################################################################
75#   file saving / loading stuff
76###########################################################################
77
78sub initialise_acronyms {
79
80    my $local_max_offset = $max_offset;
81    my $local_upper_case = $upper_case;
82    my $local_case_match = $case_match ;
83    my $local_min_def_length = $min_def_length;
84    my $local_min_acro_length = $min_acro_length;
85    my $local_min_length_saving = $min_length_saving;
86    my $local_allow_recursive = $allow_recursive;
87    my $local_allow_all_caps = $allow_all_caps;
88    my @local_stop_words = @stop_words;
89    my $local_acronym_accumulate_file = $acronym_accumulate_file;
90   
91   
92    my $file_text = "";
93    if (open ACRONYM_HANDLE, "<$acronym_options_file")
94    {
95    $file_text = do { local $/; <ACRONYM_HANDLE> }; 
96    }
97    if ($file_text eq "")
98    {
99    print STDERR "failed to open $acronym_options_file\n";
100    open ACRONYM_HANDLE, ">$acronym_options_file\n";
101    print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
102    print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
103    print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
104    print ACRONYM_HANDLE "#valid here.\n\n";
105    print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
106    print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
107    print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
108    print ACRONYM_HANDLE "#explain them fully \n\n";
109    print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
110    print ACRONYM_HANDLE "\$local_max_offset = $max_offset;\n\n";
111    print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true (high precision))\n";
112    print ACRONYM_HANDLE "\$local_upper_case = $upper_case;\n\n";
113    print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true (high precision))\n";
114    print ACRONYM_HANDLE "\$local_case_match = $case_match;\n\n";
115    print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
116    print ACRONYM_HANDLE "\$local_min_def_length = $min_def_length;\n\n";
117    print ACRONYM_HANDLE "#let definitions be all capitals\n";
118    print ACRONYM_HANDLE "\$local_allow_all_caps = $allow_all_caps;\n\n";
119    print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
120    print ACRONYM_HANDLE "\$local_min_acro_length = 3;\n\n";
121    print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
122    print ACRONYM_HANDLE "\$local_min_length_saving = 4;\n\n";
123    print ACRONYM_HANDLE "#allow recusive acronyms (0 = false (high precision), 1 = true)\n";
124    print ACRONYM_HANDLE "\$local_allow_recursive = 0;\n\n";
125    print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
126    print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
127    print ACRONYM_HANDLE "#\@local_stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
128    print ACRONYM_HANDLE "\@local_stop_words = split / /, \"OF AT THE IN TO AND\";\n";
129    print ACRONYM_HANDLE "\n";
130    print ACRONYM_HANDLE "#the file to collate acronyms into\n";
131    print ACRONYM_HANDLE "\$local_acronym_accumulate_file = \$ENV{'GSDLCOLLECTDIR'} . \"/etc/acronym_definitions.pm\";\n";
132    print ACRONYM_HANDLE "\n";
133    print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
134    print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
135    print ACRONYM_HANDLE "# \n";
136    print ACRONYM_HANDLE "# \n";
137    print ACRONYM_HANDLE "# \n";
138    print STDERR "written new options file to $acronym_options_file...\n";
139    } else {
140    print STDERR "read file $acronym_options_file...\n";
141    eval $file_text ;
142    warn $@ if $@;
143    print STDERR "evaluated file $acronym_options_file...\n";
144    }
145
146    $max_offset = $local_max_offset;
147    $upper_case = $local_upper_case;
148    $case_match = $local_case_match ;
149    $min_def_length = $local_min_def_length;
150    $min_acro_length = $local_min_acro_length;
151    $min_length_saving = $local_min_length_saving;
152    $allow_recursive = $local_allow_recursive;
153    $allow_all_caps = $local_allow_all_caps;
154    @stop_words = @local_stop_words;
155   
156    $local_acronym_accumulate_file = $local_acronym_accumulate_file;
157   
158
159    &read_all_acronyms_from_file();
160#    rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
161#   int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
162    if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
163    {
164    print ACRONYM_HANDLE "#This is an automatically generated file.\n";
165    print ACRONYM_HANDLE "#\n";
166    print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
167    print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
168    print ACRONYM_HANDLE "#read-only. \n";
169    print ACRONYM_HANDLE "#\n";
170    print ACRONYM_HANDLE "#start of acronyms...\n";
171    $writing_acronyms = 1;
172    } else {
173    warn "failed to open $acronym_accumulate_file for writing\n";
174    $writing_acronyms = 0;
175    }
176}
177
178#close the list of accumulated acronyms
179sub finalise_acronyms {
180    if ($writing_acronyms)
181    {
182    print ACRONYM_HANDLE "#end of acronyms.\n"; 
183    close ACRONYM_HANDLE;
184    }
185}
186
187#eval a file of accumulated acronyms
188sub read_all_acronyms_from_file {
189   
190    my $file_text = " ";
191    if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
192    {
193    $file_text = do { local $/; <ACRONYM_HANDLE> }; 
194    } else {
195    print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
196    }
197    eval $file_text;
198    #promotes warnings/errors from evaluated file to current context
199    warn $@ if $@;
200}
201
202#called from within the file of accumulated acronyms to indicate a good acronym
203sub add {
204    my $self = shift (@_);
205    if (defined ($acronyms_found_in_collection{$self->[0]}))
206    {
207    my $def = $self->to_def_string();
208    if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/)
209    {
210        return;
211    }
212    $acronyms_found_in_collection{$self->[0]} =
213        $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string();
214    } else {
215    $acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
216    }
217}
218
219#called from within the file of accumulated acronyms to indicate a bad acronym
220sub ban {
221    my $self = shift (@_);
222   
223    if (!defined $acronyms_banned_from_collection{$self->[0]})
224    {
225    $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
226    } else {
227    $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string();
228    }
229}
230
231
232#write a good acronym to the accumulated acronyms file
233sub write_to_file {
234    my $self = shift (@_);
235    if ($writing_acronyms)
236    {
237    print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
238        $self->to_def_string() .
239        "\")->add();\n";
240    }
241}
242
243
244###########################################################################
245# mark functionality   
246###########################################################################
247
248#small routine to sort by length
249sub sort_by_length {
250    length($b) <=> length($a) or $a cmp $b
251}
252
253sub markup_acronyms {
254    my  $text = shift (@_);
255    my  $verbosity_obj = shift (@_);
256    if (defined $text)
257    {
258    for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
259    {
260        $text  =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
261        printf STDERR " " .  $acro . ","
262        if ($verbosity_obj->{'verbosity'} >= 2);
263    }
264    }
265    return $text;
266}
267
268
269
270###########################################################################
271#    member functions
272###########################################################################
273
274
275sub new {
276    my $trash = shift (@_);
277    my $acro = shift (@_);
278    my $def  = shift (@_);
279   
280    my $self = [
281        "", # 0 acronym
282        [], # 1 definition
283               ];
284   
285    $self->[0] = $acro                    if defined $acro;
286    push @{$self->[1]},  split / /, $def  if defined $def;
287   
288    bless $self;
289}
290
291sub clone  {
292    my $self = shift (@_);
293
294    my $copy = new acronym;
295
296    $copy->[0] = $self->[0];
297    push @{$copy->[1]}, @{$self->[1]};
298    bless $copy;
299
300    return $copy;
301}
302
303#return the acronym
304sub to_acronym {
305    my $self = shift (@_);
306    my @array = @{$self->[1]};
307
308    return $self->[0];
309}
310
311#return the number of words in the acronym definition
312sub words_in_acronym_definition {
313    my $self = shift (@_);
314    my @array = @{$self->[1]};
315
316    return $#array + 1;
317}
318
319#return the number of letters in the acronym definition
320sub letters_in_acronym_definition {
321    my $self = shift (@_);
322
323    return length($self->to_def_string());
324}
325
326#return the number of letters in the acronym definition
327sub letters_in_acronym {
328    my $self = shift (@_);
329
330    return length($self->to_acronym());
331}
332
333#return the acronym definition
334sub to_def_string {
335    my $self = shift (@_);
336
337    my $result = "";
338
339    # do the definition
340    my @array = @{$self->[1]};
341    my $i = 0;
342    while ($i <= $#array)
343    {
344    $result = $result . $array[$i];
345
346    if ($i+1 <= $#array)
347    {
348        $result = $result . " ";
349    }
350    $i++;
351    }
352    return $result;
353}
354
355
356#print out the kwic for the acronym
357sub to_string_kwic {
358    my $self = shift (@_);
359
360    # the list of all possible combinations
361    my @list = ();
362
363    my $result = "";
364
365    my $j = 0;
366    my @array = @{$self->[1]};
367    while ($j <= $#array)
368    {
369
370    # do the definition
371    my $i = 0;
372
373    #add the key word
374    $result = "<td halign=left>"  . $array[$j] . "</td><td halign=right>";
375
376    #add the proceeding words
377    while ($i < $j)
378    {
379        $result = $result .  $array[$i] . " ";
380        $i++;
381    }
382    #add the key word
383    $result = $result . "</td><td halign=left>"  . $array[$j] .
384        "</td><td halign=left>";
385
386    #add the trailing words
387    $i++;
388    while ($i <= $#array )
389    {
390        $result = $result .  $array[$i] . " ";
391        $i++;
392    }
393
394    #add the actual acronym
395
396    $result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
397
398    push @list, $result;
399    $j++;
400    }
401    return @list;
402}
403
404#this is the one used when building the collection ...
405sub to_string {
406    my $self = shift (@_);
407
408    my $result = $self->[0] . " ";
409
410    # do the definition
411    my @array = @{$self->[1]};
412    my $i = 0;
413    while ($i <= $#array)
414    {
415    $result = $result . $array[$i];
416    if ($i+1 <= $#array)
417    {
418        $result = $result . " ";
419    }
420    $i++;
421    }
422    return $result;
423}
424
425sub check {
426    my $self = shift (@_);
427
428    if (length($self->to_acronym()) < $min_acro_length)
429    {
430#   print "acronym " . $self->to_string() . " rejected (too short I)\n";
431    return 0;
432    }
433    if ($self->words_in_acronym_definition() < $min_def_length)
434    {
435#   print "acronym " . $self->to_string() . " rejected (too short II)\n";
436    return 0;
437    }
438    if ($min_length_saving * $self->letters_in_acronym() >
439    $self->letters_in_acronym_definition())
440    {
441#   print "acronym " . $self->to_string() . " rejected (too short III)\n";
442#   print "" . $min_length_saving .
443#       "|" . $self->letters_in_acronym() .
444#       "|" . $self->letters_in_acronym_definition() . "\n";
445    return 0;
446    }
447    if (!$allow_all_caps &&
448    $self->to_def_string() eq uc($self->to_def_string()))
449    {
450#   print "acronym " . $self->to_string() . " rejected (all upper)\n";
451    return 0;
452    }
453    if (!$allow_all_caps)
454    {
455    my $upper_count = 0;
456    my $lower_count = 0;
457    my @letters = $self->to_def_string();
458    for my $letter (split //, $self->to_def_string())
459    {
460        if ($letter eq uc($letter))
461        {
462        $upper_count++;
463        } else {
464        $lower_count++;
465        }       
466    }
467    return 0 if ($upper_count > $lower_count);
468    }
469    if (!$allow_recursive && $self->to_def_string() =~ /$self->to_acronym()/i )
470    {
471    return 0;
472    }
473#    print "acronym " . $self->to_string() . " not rejected\n";
474    return 1;
475}
476
477###########################################################################
478#    static functions
479###########################################################################
480
481sub recurse {
482    my ($acro_offset,       #offset of word we're finding acronyms for
483    $text_offset,       
484    $letter_offset,
485    @def_so_far) = @_;
486
487    my $word = $split_text[$text_offset];
488    my $acro = $split_text[$acro_offset];
489    $word = "" if !defined $word;
490    $acro = "" if !defined $acro;
491   
492#    print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
493#   $letter_offset  . ", " . @def_so_far . ")\n";
494
495    #check for termination ...
496    if ($letter_offset >= length($acro))
497    {   
498    my $acronym = new acronym();
499    $acronym->[0] = $acro;
500    push @{$acronym->[1]}, @def_so_far;
501    if ($acronym->check())
502    {
503        push @acronym_list, ( $acronym );
504    }
505#   print "acronym created\n";
506    return;
507    }
508    #check for recursion
509    if (!$allow_recursive)
510    {
511    if ($word eq $acro)
512    {
513#       print "recursion detected\n";
514        return;
515    }
516    }
517   
518    #skip a stop-word
519    my $i = 0;
520    if ($letter_offset != 0)
521    {
522    while ($i <= $#stop_words)
523    {
524        if ($stop_words[$i] eq uc($word))
525        {
526#       print "::found stop word::" . $stop_words[$i] . "\n";
527        &recurse($acro_offset,
528             $text_offset+1,
529             $letter_offset,
530             @def_so_far, $word);
531        }
532        $i++;
533    }
534    }
535    $i = 1;
536    #using the first $i letters ...
537    while ($letter_offset+$i <= length($acro) )
538    {
539#   print "". ((substr $word, 0, $i) . " " .
540#       (substr $acro, $letter_offset, $i) . "\n");
541    if (((!$case_match) &&
542         (uc(substr $word, 0, $i) eq
543          uc(substr $acro, $letter_offset, $i)))
544        ||
545        (($case_match) &&
546         ((substr $word, 0, $i) eq
547          (substr $acro, $letter_offset, $i))))
548    {
549#       print "::match::\n";
550#       print "" . ((substr $word, 0, $i) . " " .
551#          (substr $acro, $letter_offset, $i) . "\n");
552        &recurse($acro_offset,
553             $text_offset+1,
554             $letter_offset+$i,
555             @def_so_far, $word);
556    } else {
557        return;
558    }       
559    $i++;
560    }
561    return;
562}
563
564#the main
565sub acronyms {
566    #clean up the text
567    my $processed_text =  shift @_;
568    $$processed_text =~ s/<[^>]*>/ /g;
569    $$processed_text =~ s/\W/ /g;
570    $$processed_text =~ s/[0-9_]/ /g;
571    $$processed_text =~ s/\s+/ /g;
572    $$processed_text =~ s/(\n|\>)References.*/ /i;
573    $$processed_text =~ s/(\n|\>)Bibliography.*/ /i;
574    $$processed_text =~ s/(\n|\>)(Cited Works?).*/ /i;
575    $$processed_text =~ s/(\n|\>)(Works? Cited).*/ /i;
576
577    #clear some global variables
578    @split_text = ();
579    @acronym_list = ();
580
581    return &acronyms_from_clean_text($processed_text);
582}
583
584sub acronyms_from_clean_text {
585    my ($processed_text) = @_;
586
587    @split_text = split / /, $$processed_text;
588
589#    my $i = 0;
590#    while ($i <= $#split_text)
591#    {
592#   print $split_text[$i] . "\n";
593#   $i++;
594#    }
595
596    my $first = 0;
597    my $last = $#split_text +1;
598    my $acro_counter = $first;
599   
600    while ($acro_counter < $last)
601    {
602    my $word = $split_text[$acro_counter];
603
604    if ((!$upper_case) ||
605        (uc($word) eq $word))
606    {
607       
608        if (length $word >= $min_acro_length)
609        {
610        my $def_counter = 0;
611        if ($acro_counter - $max_offset > 0)
612        {
613            $def_counter = $acro_counter - $max_offset;
614        }
615        my $local_last = $acro_counter  + $max_offset;
616        if ($local_last > $last)
617        {
618            $local_last = $last;
619        }
620        while ($def_counter <= $local_last)
621        {
622            &recurse($acro_counter,$def_counter,0,());
623            $def_counter++;
624        }
625        }
626    }
627    $acro_counter++;
628    }
629
630    return \@acronym_list;
631}
632
633
634
635sub test {
636
637#    my $blarg = new acronym();
638#    my $simple;
639#    $simple = 10;
640#    $blarg->initialise($simple, $simple, $simple);
641#    my $blarg2 = $blarg->clone();
642#    print $blarg->to_string();
643#    print $blarg2;
644#    print "\n";
645#   
646    my $tla = new acronym();
647    $tla->[0] = "TLA";
648
649    my @array = ("Three", "Letter", "Acronym");
650#    my $i = 0;
651#    while ($i <= $#array)
652#    {
653#   print @array[$i] . "\n";
654#   $i++;
655#    }
656
657    print "\n";
658    push @{$tla->[1]}, ("Three" );
659    push @{$tla->[1]}, ("Letter" );
660    push @{$tla->[1]}, ("Acronym" );
661    print $tla->to_string(). "\n";
662    print "\n";
663    print "\n";
664    my $tla2 = $tla->clone();
665    push @{$tla2->[1]}, ("One");
666    push @{$tla2->[1]}, ("Two");
667    $tla2->[0] = "ALT";
668    print $tla->to_string(). "\n";
669    print $tla2->to_string(). "\n";
670
671    print "\n";
672    print "\n";
673
674    print "Testing recursion ...\n";
675    my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
676   
677    foreach my $acro (@$acros)
678    {
679    if ($acro->check)
680        {
681           print "accepted: " .$acro->to_string() . "\n";
682#           print "|" .  $acro->to_acronym() . "|" .  $acro->to_def_string() .
683#              "|" .  $acro->words_in_acronym_definition() .
684#              "|" .  $acro->letters_in_acronym_definition() .
685#              "|" .  $acro->letters_in_acronym() . "|\n";
686        } else {
687#          print "found but rejected: " .$acro->to_string() . "\n";
688        }
689    }
690}
691
692#uncomment this line to test this package
693#&test();
694
6951;
696
697
Note: See TracBrowser for help on using the browser.