Changeset 1404 for trunk


Ignore:
Timestamp:
2000-08-15T14:28:47+12:00 (24 years ago)
Author:
say1
Message:

fixed acronyms option file. trimmed text at start of bibliographies to prevent ephemeral acronyms. tightened allow_all_caps code

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/acronym.pm

    r1396 r1404  
    3636# valiables to control the recall/precision tradeoff
    3737
    38 #the maximum range to look for acronyms
     38#the maximum range to look for acronyms 
    3939my $max_offset = 30;
    4040#acronyms must be upper case
     
    4949my $min_length_saving = 4;
    5050#allow recusive acronyms
    51 my $allow_recursive = "";
     51my $allow_recursive = 0;
    5252#let definitions be all capitals
    5353my $allow_all_caps = 0;
     
    7878
    7979sub initialise_acronyms {
     80
     81    my $local_max_offset = $max_offset;
     82    my $local_upper_case = $upper_case;
     83    my $local_case_match = $case_match ;
     84    my $local_min_def_length = $min_def_length;
     85    my $local_min_acro_length = $min_acro_length;
     86    my $local_min_length_saving = $min_length_saving;
     87    my $local_allow_recursive = $allow_recursive;
     88    my $local_allow_all_caps = $allow_all_caps;
     89    my @local_stop_words = @stop_words;
     90    my $local_acronym_accumulate_file = $acronym_accumulate_file;
     91   
    8092   
    8193    my $file_text = "";
     
    97109    print ACRONYM_HANDLE "#explain them fully \n\n";
    98110    print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
    99     print ACRONYM_HANDLE "\$max_offset = 30;\n\n";
    100     print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true)\n";
    101     print ACRONYM_HANDLE "\$upper_case = 1;\n\n";
    102     print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true)\n";
    103     print ACRONYM_HANDLE "\$case_match = 1;\n\n";
     111    print ACRONYM_HANDLE "\$local_max_offset = $max_offset;\n\n";
     112    print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true (high precision))\n";
     113    print ACRONYM_HANDLE "\$local_upper_case = $upper_case;\n\n";
     114    print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true (high precision))\n";
     115    print ACRONYM_HANDLE "\$local_case_match = $case_match;\n\n";
    104116    print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
    105     print ACRONYM_HANDLE "\$min_def_length = 3;\n\n";
     117    print ACRONYM_HANDLE "\$local_min_def_length = $min_def_length;\n\n";
    106118    print ACRONYM_HANDLE "#let definitions be all capitals\n";
    107     print ACRONYM_HANDLE "\$allow_all_caps = 0;\n\n";
     119    print ACRONYM_HANDLE "\$local_allow_all_caps = $allow_all_caps;\n\n";
    108120    print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
    109     print ACRONYM_HANDLE "\$min_acro_length = 3;\n\n";
     121    print ACRONYM_HANDLE "\$local_min_acro_length = 3;\n\n";
    110122    print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
    111     print ACRONYM_HANDLE "\$min_length_saving = 4;\n\n";
    112     print ACRONYM_HANDLE "#allow recusive acronyms (0 = false, 1 = true)\n";
    113     print ACRONYM_HANDLE "\$allow_recursive = 0;\n\n";
     123    print ACRONYM_HANDLE "\$local_min_length_saving = 4;\n\n";
     124    print ACRONYM_HANDLE "#allow recusive acronyms (0 = false (high precision), 1 = true)\n";
     125    print ACRONYM_HANDLE "\$local_allow_recursive = 0;\n\n";
    114126    print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
    115127    print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
    116     print ACRONYM_HANDLE "#\@stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
    117     print ACRONYM_HANDLE "\@stop_words = split / /, \"OF AT THE IN TO AND\";\n";
     128    print ACRONYM_HANDLE "#\@local_stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
     129    print ACRONYM_HANDLE "\@local_stop_words = split / /, \"OF AT THE IN TO AND\";\n";
    118130    print ACRONYM_HANDLE "\n";
    119131    print ACRONYM_HANDLE "#the file to collate acronyms into\n";
    120     print ACRONYM_HANDLE "\$acronym_accumulate_file = \"$ENV{'GSDLCOLLECTDIR'}\" . \"/etc/acronym_definitions.pm\";\n";
     132    print ACRONYM_HANDLE "\$local_acronym_accumulate_file = \$ENV{'GSDLCOLLECTDIR'} . \"/etc/acronym_definitions.pm\";\n";
    121133    print ACRONYM_HANDLE "\n";
    122     print ACRONYM_HANDLE "\$accumulate_acronyms = 1;\n\n";
    123134    print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
    124135    print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
     
    126137    print ACRONYM_HANDLE "# \n";
    127138    print ACRONYM_HANDLE "# \n";
    128     }
    129     eval $file_text;
    130     #promotes warnings/errors from evaluated file to current context
    131     warn $@ if $@;
    132 
    133 
    134 
     139    print STDERR "written new options file to $acronym_options_file...\n";
     140    } else {
     141    print STDERR "read file $acronym_options_file...\n";
     142    eval $file_text ;
     143    warn $@ if $@;
     144    print STDERR "evaluated file $acronym_options_file...\n";
     145    }
     146   
     147   
     148    $max_offset = $local_max_offset;
     149    $upper_case = $local_upper_case;
     150    $case_match = $local_case_match ;
     151    $min_def_length = $local_min_def_length;
     152    $min_acro_length = $local_min_acro_length;
     153    $min_length_saving = $local_min_length_saving;
     154    $allow_recursive = $local_allow_recursive;
     155    $allow_all_caps = $local_allow_all_caps;
     156    @stop_words = @local_stop_words;
     157   
     158$local_acronym_accumulate_file = $local_acronym_accumulate_file;
     159   
    135160    &read_all_acronyms_from_file();
    136161#    rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
     
    427452    return 0;
    428453    }
     454    if (!$allow_all_caps)
     455    {
     456    my $upper_count = 0;
     457    my $lower_count = 0;
     458    my @letters = $self->to_def_string();
     459    for my $letter (split //, $self->to_def_string())
     460    {
     461        if ($letter eq uc($letter))
     462        {
     463        $upper_count++;
     464        } else {
     465        $lower_count++;
     466        }       
     467    }
     468    return 0 if ($upper_count > $lower_count);
     469    }
     470    if (!$allow_recursive && $self->to_def_string() =~ /$self->to_acronym()/i )
     471    {
     472    return 0;
     473    }
    429474#    print "acronym " . $self->to_string() . " not rejected\n";
    430475    return 1;
     
    518563}
    519564
    520 
     565#the main
    521566sub acronyms {
    522567    #clean up the text
    523568    my $processed_text =  shift @_;
    524     $$processed_text =~ s/[^A-Za-z]/ /g;
     569    $$processed_text =~ s/<[^>]*>/ /g;
     570    $$processed_text =~ s/[^\w]/ /g;
     571    $$processed_text =~ s/[0-9_]/ /g;
    525572    $$processed_text =~ s/\s+/ /g;
    526     $$processed_text =~ s/(\n|\>)References.*/ /g;
    527     $$processed_text =~ s/(\n|\>)Bibliography.*/ /g;
     573    $$processed_text =~ s/(\n|\>)References.*/ /i;
     574    $$processed_text =~ s/(\n|\>)Bibliography.*/ /i;
    528575
    529576    #clear some global variables
Note: See TracChangeset for help on using the changeset viewer.