root/gsdl/trunk/perllib/acronym.pm @ 15117

Revision 15117, 19.1 KB (checked in by ak19, 13 years ago)

Dr Bainbridge made some changes to tidy up the code (replaced file global params with args and removed unnecessary statements)

  • Property svn:keywords set to Author Date Id Revision
Line 
1##########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25#    class to handle acronyms
26###########################################################################
27
28eval "require diagnostics"; # some perl distros (eg mac) don't have this
29
30package acronym;
31
32###########################################################################
33#    global variables
34###########################################################################
35# valiables to control the recall/precision tradeoff
36
37#the maximum range to look for acronyms
38my $max_offset = 30;
39#acronyms must be upper case
40my $upper_case = 1;
41#acronym case must match
42my $case_match = 1;
43#minimum acronym length
44my $min_def_length = 3;
45#minimum acronym length
46my $min_acro_length = 3;
47#minimum acronym length saving
48my $min_length_saving = 4;
49#allow recusive acronyms
50my $allow_recursive = 0;
51#let definitions be all capitals
52my $allow_all_caps = 0;
53
54my @stop_words = split / /, "OF AT THE IN TO AND";
55
56#the text split into an array, one word per element
57my @split_text = ();
58my @acronym_list = ();
59
60
61my %acronyms_found_in_collection = ();
62my %acronyms_banned_from_collection = ();
63
64my $writing_acronyms = 1;
65my $accumulate_acronyms = 1;
66my $markup_accumulate_acronyms = 1;
67my $markup_local_acronyms = 1;
68
69
70
71###########################################################################
72#   file saving / loading stuff
73###########################################################################
74
75sub initialise_acronyms {
76
77    my $local_max_offset = $max_offset;
78    my $local_upper_case = $upper_case;
79    my $local_case_match = $case_match ;
80    my $local_min_def_length = $min_def_length;
81    my $local_min_acro_length = $min_acro_length;
82    my $local_min_length_saving = $min_length_saving;
83    my $local_allow_recursive = $allow_recursive;
84    my $local_allow_all_caps = $allow_all_caps;
85    my @local_stop_words = @stop_words;
86   
87   
88    # the file to collate acronyms into
89
90    my $def_acronym_acc_file_pm = '&util::filename_cat($ENV{\'GSDLCOLLECTDIR\'}, "etc","acronym_definitions.pm");';
91    my $acronym_accumulate_file = eval { $def_acronym_acc_file_pm };
92    my $acronym_options_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc","acronym_options.pm");
93
94
95    my $file_text = "";
96    if (open ACRONYM_HANDLE, "<$acronym_options_file")
97    {
98    $file_text = do { local $/; <ACRONYM_HANDLE> }; 
99    }
100    if ($file_text eq "")
101    {
102    print STDERR "failed to open $acronym_options_file\n";
103    open ACRONYM_HANDLE, ">$acronym_options_file\n";
104    print ACRONYM_HANDLE "use util;\n";
105    print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
106    print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
107    print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
108    print ACRONYM_HANDLE "#valid here.\n\n";
109    print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
110    print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
111    print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
112    print ACRONYM_HANDLE "#explain them fully \n\n";
113    print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
114    print ACRONYM_HANDLE "\$local_max_offset = $max_offset;\n\n";
115    print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true (high precision))\n";
116    print ACRONYM_HANDLE "\$local_upper_case = $upper_case;\n\n";
117    print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true (high precision))\n";
118    print ACRONYM_HANDLE "\$local_case_match = $case_match;\n\n";
119    print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
120    print ACRONYM_HANDLE "\$local_min_def_length = $min_def_length;\n\n";
121    print ACRONYM_HANDLE "#let definitions be all capitals\n";
122    print ACRONYM_HANDLE "\$local_allow_all_caps = $allow_all_caps;\n\n";
123    print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
124    print ACRONYM_HANDLE "\$local_min_acro_length = 3;\n\n";
125    print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
126    print ACRONYM_HANDLE "\$local_min_length_saving = 4;\n\n";
127    print ACRONYM_HANDLE "#allow recusive acronyms (0 = false (high precision), 1 = true)\n";
128    print ACRONYM_HANDLE "\$local_allow_recursive = 0;\n\n";
129    print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
130    print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
131    print ACRONYM_HANDLE "#\@local_stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
132    print ACRONYM_HANDLE "\@local_stop_words = split / /, \"OF AT THE IN TO AND\";\n";
133    print ACRONYM_HANDLE "\n";
134    print ACRONYM_HANDLE "#the file to collate acronyms into\n";
135    print ACRONYM_HANDLE "\$acronym_accumulate_file = $def_acronym_acc_file_pm\n";
136    print ACRONYM_HANDLE "\n";
137    print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
138    print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
139    print ACRONYM_HANDLE "# \n";
140    print ACRONYM_HANDLE "# \n";
141    print ACRONYM_HANDLE "# \n";
142    print STDERR "written new options file to $acronym_options_file...\n";
143    } else {
144    print STDERR "read file $acronym_options_file...\n";
145    eval $file_text ;
146    warn $@ if $@;
147    print STDERR "evaluated file $acronym_options_file...\n";
148    }
149
150    $max_offset = $local_max_offset;
151    $upper_case = $local_upper_case;
152    $case_match = $local_case_match ;
153    $min_def_length = $local_min_def_length;
154    $min_acro_length = $local_min_acro_length;
155    $min_length_saving = $local_min_length_saving;
156    $allow_recursive = $local_allow_recursive;
157    $allow_all_caps = $local_allow_all_caps;
158    @stop_words = @local_stop_words;
159       
160
161    &read_all_acronyms_from_file($acronym_accumulate_file);
162#    rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
163#   int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
164    if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
165    {
166    print ACRONYM_HANDLE "#This is an automatically generated file.\n";
167    print ACRONYM_HANDLE "#\n";
168    print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
169    print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
170    print ACRONYM_HANDLE "#read-only. \n";
171    print ACRONYM_HANDLE "#\n";
172    print ACRONYM_HANDLE "#start of acronyms...\n";
173    $writing_acronyms = 1;
174    } else {
175    warn "failed to open $acronym_accumulate_file for writing\n";
176    $writing_acronyms = 0;
177    }
178}
179
180#close the list of accumulated acronyms
181sub finalise_acronyms {
182    if ($writing_acronyms)
183    {
184    print ACRONYM_HANDLE "#end of acronyms.\n"; 
185    close ACRONYM_HANDLE;
186    }
187}
188
189#eval a file of accumulated acronyms
190sub read_all_acronyms_from_file {
191    my ($acronym_accumulate_file) = @_;
192
193    my $file_text = " ";
194    if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
195    {
196    $file_text = do { local $/; <ACRONYM_HANDLE> }; 
197    } else {
198    print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
199    }
200    eval $file_text;
201    #promotes warnings/errors from evaluated file to current context
202    warn $@ if $@;
203}
204
205#called from within the file of accumulated acronyms to indicate a good acronym
206sub add {
207    my $self = shift (@_);
208    if (defined ($acronyms_found_in_collection{$self->[0]}))
209    {
210    my $def = $self->to_def_string();
211    if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/)
212    {
213        return;
214    }
215    $acronyms_found_in_collection{$self->[0]} =
216        $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string();
217    } else {
218    $acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
219    }
220}
221
222#called from within the file of accumulated acronyms to indicate a bad acronym
223sub ban {
224    my $self = shift (@_);
225   
226    if (!defined $acronyms_banned_from_collection{$self->[0]})
227    {
228    $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
229    } else {
230    $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string();
231    }
232}
233
234
235#write a good acronym to the accumulated acronyms file
236sub write_to_file {
237    my $self = shift (@_);
238    if ($writing_acronyms)
239    {
240    print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
241        $self->to_def_string() .
242        "\")->add();\n";
243    }
244}
245
246
247###########################################################################
248# mark functionality   
249###########################################################################
250
251#small routine to sort by length
252sub sort_by_length {
253    length($b) <=> length($a) or $a cmp $b
254}
255
256sub markup_acronyms {
257    my  $text = shift (@_);
258    my  $verbosity_obj = shift (@_);
259    if (defined $text)
260    {
261    for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
262    {
263        $text  =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
264        printf STDERR " " .  $acro . ","
265        if ($verbosity_obj->{'verbosity'} >= 2);
266    }
267    }
268    return $text;
269}
270
271
272
273###########################################################################
274#    member functions
275###########################################################################
276
277
278sub new {
279    my $trash = shift (@_);
280    my $acro = shift (@_);
281    my $def  = shift (@_);
282   
283    my $self = [
284        "", # 0 acronym
285        [], # 1 definition
286               ];
287   
288    $self->[0] = $acro                    if defined $acro;
289    push @{$self->[1]},  split / /, $def  if defined $def;
290   
291    bless $self;
292}
293
294sub clone  {
295    my $self = shift (@_);
296
297    my $copy = new acronym;
298
299    $copy->[0] = $self->[0];
300    push @{$copy->[1]}, @{$self->[1]};
301    bless $copy;
302
303    return $copy;
304}
305
306#return the acronym
307sub to_acronym {
308    my $self = shift (@_);
309    my @array = @{$self->[1]};
310
311    return $self->[0];
312}
313
314#return the number of words in the acronym definition
315sub words_in_acronym_definition {
316    my $self = shift (@_);
317    my @array = @{$self->[1]};
318
319    return $#array + 1;
320}
321
322#return the number of letters in the acronym definition
323sub letters_in_acronym_definition {
324    my $self = shift (@_);
325
326    return length($self->to_def_string());
327}
328
329#return the number of letters in the acronym definition
330sub letters_in_acronym {
331    my $self = shift (@_);
332
333    return length($self->to_acronym());
334}
335
336#return the acronym definition
337sub to_def_string {
338    my $self = shift (@_);
339
340    my $result = "";
341
342    # do the definition
343    my @array = @{$self->[1]};
344    my $i = 0;
345    while ($i <= $#array)
346    {
347    $result = $result . $array[$i];
348
349    if ($i+1 <= $#array)
350    {
351        $result = $result . " ";
352    }
353    $i++;
354    }
355    return $result;
356}
357
358
359#print out the kwic for the acronym
360sub to_string_kwic {
361    my $self = shift (@_);
362
363    # the list of all possible combinations
364    my @list = ();
365
366    my $result = "";
367
368    my $j = 0;
369    my @array = @{$self->[1]};
370    while ($j <= $#array)
371    {
372
373    # do the definition
374    my $i = 0;
375
376    #add the key word
377    $result = "<td halign=left>"  . $array[$j] . "</td><td halign=right>";
378
379    #add the proceeding words
380    while ($i < $j)
381    {
382        $result = $result .  $array[$i] . " ";
383        $i++;
384    }
385    #add the key word
386    $result = $result . "</td><td halign=left>"  . $array[$j] .
387        "</td><td halign=left>";
388
389    #add the trailing words
390    $i++;
391    while ($i <= $#array )
392    {
393        $result = $result .  $array[$i] . " ";
394        $i++;
395    }
396
397    #add the actual acronym
398
399    $result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
400
401    push @list, $result;
402    $j++;
403    }
404    return @list;
405}
406
407#this is the one used when building the collection ...
408sub to_string {
409    my $self = shift (@_);
410
411    my $result = $self->[0] . " ";
412
413    # do the definition
414    my @array = @{$self->[1]};
415    my $i = 0;
416    while ($i <= $#array)
417    {
418    $result = $result . $array[$i];
419    if ($i+1 <= $#array)
420    {
421        $result = $result . " ";
422    }
423    $i++;
424    }
425    return $result;
426}
427
428sub check {
429    my $self = shift (@_);
430
431    if (length($self->to_acronym()) < $min_acro_length)
432    {
433#   print "acronym " . $self->to_string() . " rejected (too short I)\n";
434    return 0;
435    }
436    if ($self->words_in_acronym_definition() < $min_def_length)
437    {
438#   print "acronym " . $self->to_string() . " rejected (too short II)\n";
439    return 0;
440    }
441    if ($min_length_saving * $self->letters_in_acronym() >
442    $self->letters_in_acronym_definition())
443    {
444#   print "acronym " . $self->to_string() . " rejected (too short III)\n";
445#   print "" . $min_length_saving .
446#       "|" . $self->letters_in_acronym() .
447#       "|" . $self->letters_in_acronym_definition() . "\n";
448    return 0;
449    }
450    if (!$allow_all_caps &&
451    $self->to_def_string() eq uc($self->to_def_string()))
452    {
453#   print "acronym " . $self->to_string() . " rejected (all upper)\n";
454    return 0;
455    }
456    if (!$allow_all_caps)
457    {
458    my $upper_count = 0;
459    my $lower_count = 0;
460    my @letters = $self->to_def_string();
461    for my $letter (split //, $self->to_def_string())
462    {
463        if ($letter eq uc($letter))
464        {
465        $upper_count++;
466        } else {
467        $lower_count++;
468        }       
469    }
470    return 0 if ($upper_count > $lower_count);
471    }
472    if (!$allow_recursive && $self->to_def_string() =~ /$self->to_acronym()/i )
473    {
474    return 0;
475    }
476#    print "acronym " . $self->to_string() . " not rejected\n";
477    return 1;
478}
479
480###########################################################################
481#    static functions
482###########################################################################
483
484sub recurse {
485    my ($acro_offset,       #offset of word we're finding acronyms for
486    $text_offset,       
487    $letter_offset,
488    @def_so_far) = @_;
489
490    my $word = $split_text[$text_offset];
491    my $acro = $split_text[$acro_offset];
492    $word = "" if !defined $word;
493    $acro = "" if !defined $acro;
494   
495#    print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
496#   $letter_offset  . ", " . @def_so_far . ")\n";
497
498    #check for termination ...
499    if ($letter_offset >= length($acro))
500    {   
501    my $acronym = new acronym();
502    $acronym->[0] = $acro;
503    push @{$acronym->[1]}, @def_so_far;
504    if ($acronym->check())
505    {
506        push @acronym_list, ( $acronym );
507    }
508#   print "acronym created\n";
509    return;
510    }
511    #check for recursion
512    if (!$allow_recursive)
513    {
514    if ($word eq $acro)
515    {
516#       print "recursion detected\n";
517        return;
518    }
519    }
520   
521    #skip a stop-word
522    my $i = 0;
523    if ($letter_offset != 0)
524    {
525    while ($i <= $#stop_words)
526    {
527        if ($stop_words[$i] eq uc($word))
528        {
529#       print "::found stop word::" . $stop_words[$i] . "\n";
530        &recurse($acro_offset,
531             $text_offset+1,
532             $letter_offset,
533             @def_so_far, $word);
534        }
535        $i++;
536    }
537    }
538    $i = 1;
539    #using the first $i letters ...
540    while ($letter_offset+$i <= length($acro) )
541    {
542#   print "". ((substr $word, 0, $i) . " " .
543#       (substr $acro, $letter_offset, $i) . "\n");
544    if (((!$case_match) &&
545         (uc(substr $word, 0, $i) eq
546          uc(substr $acro, $letter_offset, $i)))
547        ||
548        (($case_match) &&
549         ((substr $word, 0, $i) eq
550          (substr $acro, $letter_offset, $i))))
551    {
552#       print "::match::\n";
553#       print "" . ((substr $word, 0, $i) . " " .
554#          (substr $acro, $letter_offset, $i) . "\n");
555        &recurse($acro_offset,
556             $text_offset+1,
557             $letter_offset+$i,
558             @def_so_far, $word);
559    } else {
560        return;
561    }       
562    $i++;
563    }
564    return;
565}
566
567#the main
568sub acronyms {
569    #clean up the text
570    my $processed_text =  shift @_;
571    $$processed_text =~ s/<[^>]*>/ /g;
572    $$processed_text =~ s/\W/ /g;
573    $$processed_text =~ s/[0-9_]/ /g;
574    $$processed_text =~ s/\s+/ /g;
575    $$processed_text =~ s/(\n|\>)References.*/ /i;
576    $$processed_text =~ s/(\n|\>)Bibliography.*/ /i;
577    $$processed_text =~ s/(\n|\>)(Cited Works?).*/ /i;
578    $$processed_text =~ s/(\n|\>)(Works? Cited).*/ /i;
579
580    #clear some global variables
581    @split_text = ();
582    @acronym_list = ();
583
584    return &acronyms_from_clean_text($processed_text);
585}
586
587sub acronyms_from_clean_text {
588    my ($processed_text) = @_;
589
590    @split_text = split / /, $$processed_text;
591
592#    my $i = 0;
593#    while ($i <= $#split_text)
594#    {
595#   print $split_text[$i] . "\n";
596#   $i++;
597#    }
598
599    my $first = 0;
600    my $last = $#split_text +1;
601    my $acro_counter = $first;
602   
603    while ($acro_counter < $last)
604    {
605    my $word = $split_text[$acro_counter];
606
607    if ((!$upper_case) ||
608        (uc($word) eq $word))
609    {
610       
611        if (length $word >= $min_acro_length)
612        {
613        my $def_counter = 0;
614        if ($acro_counter - $max_offset > 0)
615        {
616            $def_counter = $acro_counter - $max_offset;
617        }
618        my $local_last = $acro_counter  + $max_offset;
619        if ($local_last > $last)
620        {
621            $local_last = $last;
622        }
623        while ($def_counter <= $local_last)
624        {
625            &recurse($acro_counter,$def_counter,0,());
626            $def_counter++;
627        }
628        }
629    }
630    $acro_counter++;
631    }
632
633    return \@acronym_list;
634}
635
636
637
638sub test {
639
640#    my $blarg = new acronym();
641#    my $simple;
642#    $simple = 10;
643#    $blarg->initialise($simple, $simple, $simple);
644#    my $blarg2 = $blarg->clone();
645#    print $blarg->to_string();
646#    print $blarg2;
647#    print "\n";
648#   
649    my $tla = new acronym();
650    $tla->[0] = "TLA";
651
652    my @array = ("Three", "Letter", "Acronym");
653#    my $i = 0;
654#    while ($i <= $#array)
655#    {
656#   print @array[$i] . "\n";
657#   $i++;
658#    }
659
660    print "\n";
661    push @{$tla->[1]}, ("Three" );
662    push @{$tla->[1]}, ("Letter" );
663    push @{$tla->[1]}, ("Acronym" );
664    print $tla->to_string(). "\n";
665    print "\n";
666    print "\n";
667    my $tla2 = $tla->clone();
668    push @{$tla2->[1]}, ("One");
669    push @{$tla2->[1]}, ("Two");
670    $tla2->[0] = "ALT";
671    print $tla->to_string(). "\n";
672    print $tla2->to_string(). "\n";
673
674    print "\n";
675    print "\n";
676
677    print "Testing recursion ...\n";
678    my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
679   
680    foreach my $acro (@$acros)
681    {
682    if ($acro->check)
683        {
684           print "accepted: " .$acro->to_string() . "\n";
685#           print "|" .  $acro->to_acronym() . "|" .  $acro->to_def_string() .
686#              "|" .  $acro->words_in_acronym_definition() .
687#              "|" .  $acro->letters_in_acronym_definition() .
688#              "|" .  $acro->letters_in_acronym() . "|\n";
689        } else {
690#          print "found but rejected: " .$acro->to_string() . "\n";
691        }
692    }
693}
694
695#uncomment this line to test this package
696#&test();
697
6981;
699
700
Note: See TracBrowser for help on using the browser.