Ignore:
Timestamp:
2000-08-11T11:12:15+12:00 (24 years ago)
Author:
say1
Message:

acronym markup functionality

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/acronym.pm

    r1361 r1393  
    4747my $min_acro_length = 3;
    4848#minimum acronym length saving
    49 my $min_length_saving = 3;
     49my $min_length_saving = 4;
    5050#allow recursive acronyms
    5151my $allow_recursive = "";
    5252
    53 my @stop_words = split / /, "A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND";
    54 #my @stop_words = split / /, "OF AT THE IN TO AND";
     53my @stop_words = split / /, "OF AT THE IN TO AND";
    5554
    5655#the text split into an array, one word per element
     
    5857my @acronym_list = ();
    5958
     59#the file to collate acronyms into
     60my $acronym_accumulate_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_definitions.pm";
     61my $acronym_options_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_options.pm";
     62
     63my %acronyms_found_in_collection = ();
     64my %acronyms_banned_from_collection = ();
     65
     66my $writing_acronyms = 1;
     67my $accumulate_acronyms = 1;
     68my $markup_accumulate_acronyms = 1;
     69my $markup_local_acronyms = 1;
     70
     71
     72
     73###########################################################################
     74#   file saving / loading stuff
     75###########################################################################
     76
     77sub init_acronyms {
     78   
     79    my $file_text = "";
     80    if (open ACRONYM_HANDLE, "<$acronym_options_file")
     81    {
     82    $file_text = do { local $/; <ACRONYM_HANDLE> }; 
     83    }
     84    if ($file_text eq "")
     85    {
     86    print STDERR "failed to open $acronym_options_file\n";
     87    open ACRONYM_HANDLE, ">$acronym_options_file\n";
     88    print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
     89    print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
     90    print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
     91    print ACRONYM_HANDLE "#valid here.\n\n";
     92    print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
     93    print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
     94    print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
     95    print ACRONYM_HANDLE "#explain them fully \n\n";
     96    print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
     97    print ACRONYM_HANDLE "\$max_offset = 30;\n\n";
     98    print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true)\n";
     99    print ACRONYM_HANDLE "\$upper_case = 1;\n\n";
     100    print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true)\n";
     101    print ACRONYM_HANDLE "\$case_match = 1;\n\n";
     102    print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
     103    print ACRONYM_HANDLE "\$min_def_length = 3;\n\n";
     104    print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
     105    print ACRONYM_HANDLE "\$min_acro_length = 3;\n\n";
     106    print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
     107    print ACRONYM_HANDLE "\$min_length_saving = 4;\n\n";
     108    print ACRONYM_HANDLE "#allow recusive acronyms (0 = false, 1 = true)\n";
     109    print ACRONYM_HANDLE "\$allow_recursive = 0;\n\n";
     110    print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
     111    print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
     112    print ACRONYM_HANDLE "#\@stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
     113    print ACRONYM_HANDLE "\@stop_words = split / /, \"OF AT THE IN TO AND\";\n";
     114    print ACRONYM_HANDLE "\n";
     115    print ACRONYM_HANDLE "#the file to collate acronyms into\n";
     116    print ACRONYM_HANDLE "\$acronym_accumulate_file = \"$ENV{'GSDLCOLLECTDIR'}\" . \"/etc/acronym_definitions.pm\";\n";
     117    print ACRONYM_HANDLE "\n";
     118    print ACRONYM_HANDLE "\$accumulate_acronyms = 1;\n\n";
     119    print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
     120    print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
     121    print ACRONYM_HANDLE "# \n";
     122    print ACRONYM_HANDLE "# \n";
     123    print ACRONYM_HANDLE "# \n";
     124    }
     125    eval $file_text;
     126    #promotes warnings/errors from evaluated file to current context
     127    warn $@ if $@;
     128
     129
     130
     131    &read_all_acronyms_from_file();
     132#    rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
     133#   int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
     134    if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
     135    {
     136    print ACRONYM_HANDLE "#This is an automatically generated file.\n";
     137    print ACRONYM_HANDLE "#\n";
     138    print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
     139    print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
     140    print ACRONYM_HANDLE "#read-only. \n";
     141    print ACRONYM_HANDLE "#\n";
     142    print ACRONYM_HANDLE "#start of acronyms...\n";
     143    $writing_acronyms = 1;
     144    } else {
     145    warn "failed to open $acronym_accumulate_file for writing\n";
     146    $writing_acronyms = 0;
     147    }
     148}
     149
     150#close the list of accumulated acronyms
     151END {
     152    if ($writing_acronyms)
     153    {
     154    print ACRONYM_HANDLE "#end of acronyms.\n"; 
     155    close ACRONYM_HANDLE;
     156    }
     157}
     158
     159#eval a file of accumulated acronyms
     160sub read_all_acronyms_from_file {
     161   
     162    my $file_text = "";
     163    if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
     164    {
     165    $file_text = do { local $/; <ACRONYM_HANDLE> }; 
     166    } else {
     167    print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
     168    }
     169    eval $file_text;
     170    #promotes warnings/errors from evaluated file to current context
     171    warn $@ if $@;
     172}
     173
     174#called from within the file of accumulated acronyms to indicate a good acronym
     175sub add {
     176    my $self = shift (@_);
     177    if (defined ($acronyms_found_in_collection{$self->[0]}))
     178    {
     179    my $def = $self->to_def_string();
     180    if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/)
     181    {
     182        return;
     183    }
     184    $acronyms_found_in_collection{$self->[0]} =
     185        $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string();
     186    } else {
     187    $acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
     188    }
     189}
     190
     191#called from within the file of accumulated acronyms to indicate a bad acronym
     192sub ban {
     193    my $self = shift (@_);
     194   
     195    if (!defined $acronyms_banned_from_collection{$self->[0]})
     196    {
     197    $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
     198    } else {
     199    $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string();
     200    }
     201}
     202
     203
     204#write a good acronym to the accumulated acronyms file
     205sub write_to_file {
     206    my $self = shift (@_);
     207    if ($writing_acronyms)
     208    {
     209    print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
     210        $self->to_def_string() .
     211        "\")->add();\n";
     212    }
     213}
     214
     215
     216###########################################################################
     217# mark functionality   
     218###########################################################################
     219
     220#small routine to sort by length
     221sub sort_by_length {
     222    length($b) <=> length($a) or $a cmp $b
     223}
     224
     225sub markup_acronyms {
     226    my  $text = shift (@_);
     227    my  $verbosity_obj = shift (@_);
     228    if (defined $text)
     229    {
     230    for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
     231    {
     232        $text  =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
     233        printf STDERR " " .  $acro . ","
     234        if ($verbosity_obj->{'verbosity'} >= 2);
     235    }
     236    }
     237    return $text;
     238}
     239
     240
    60241
    61242###########################################################################
     
    65246
    # Constructor.
    #
    # Arguments:
    #   $class - invocant (class name or an existing object)
    #   $acro  - optional acronym string, stored in slot 0
    #   $def   - optional definition; split on single spaces into the word
    #            list held in slot 1
    #
    # Returns a blessed array reference [ acronym, [ definition words ] ].
    sub new {
        my $class = shift (@_);
        my $acro  = shift (@_);
        my $def   = shift (@_);

        my $self = [
            "", # 0 acronym
            [], # 1 definition
        ];

        $self->[0] = $acro                    if defined $acro;
        push @{$self->[1]},  split / /, $def  if defined $def;

        # Two-argument bless honours the invocant, so subclasses get objects
        # of their own class; falls back to this package when no usable class
        # was passed.
        return bless $self, (ref($class) || $class || __PACKAGE__);
    }
    73 
    74 
    75262
    76263sub clone  {
     
    226413#   print "acronym " . $self->to_string() . " rejected (too short III)\n";
    227414#   print "" . $min_length_saving .
    228         "|" . $self->letters_in_acronym() .
    229         "|" . $self->letters_in_acronym_definition() . "\n";
     415#       "|" . $self->letters_in_acronym() .
     416#       "|" . $self->letters_in_acronym_definition() . "\n";
    230417    return 0;
    231418    }
     
    446633#&test();
    447634
     635&init_acronyms();
     636
    4486371;
    449638
Note: See TracChangeset for help on using the changeset viewer.