- Timestamp: 2000-08-15T14:28:47+12:00 (24 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/acronym.pm
r1396 r1404 36 36 # valiables to control the recall/precision tradeoff 37 37 38 #the maximum range to look for acronyms 38 #the maximum range to look for acronyms 39 39 my $max_offset = 30; 40 40 #acronyms must be upper case … … 49 49 my $min_length_saving = 4; 50 50 #allow recusive acronyms 51 my $allow_recursive = "";51 my $allow_recursive = 0; 52 52 #let definitions be all capitals 53 53 my $allow_all_caps = 0; … … 78 78 79 79 sub initialise_acronyms { 80 81 my $local_max_offset = $max_offset; 82 my $local_upper_case = $upper_case; 83 my $local_case_match = $case_match ; 84 my $local_min_def_length = $min_def_length; 85 my $local_min_acro_length = $min_acro_length; 86 my $local_min_length_saving = $min_length_saving; 87 my $local_allow_recursive = $allow_recursive; 88 my $local_allow_all_caps = $allow_all_caps; 89 my @local_stop_words = @stop_words; 90 my $local_acronym_accumulate_file = $acronym_accumulate_file; 91 80 92 81 93 my $file_text = ""; … … 97 109 print ACRONYM_HANDLE "#explain them fully \n\n"; 98 110 print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n"; 99 print ACRONYM_HANDLE "\$ max_offset = 30;\n\n";100 print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true )\n";101 print ACRONYM_HANDLE "\$ upper_case = 1;\n\n";102 print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true )\n";103 print ACRONYM_HANDLE "\$ case_match = 1;\n\n";111 print ACRONYM_HANDLE "\$local_max_offset = $max_offset;\n\n"; 112 print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true (high precision))\n"; 113 print ACRONYM_HANDLE "\$local_upper_case = $upper_case;\n\n"; 114 print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true (high precision))\n"; 115 print ACRONYM_HANDLE "\$local_case_match = $case_match;\n\n"; 104 116 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n"; 105 print ACRONYM_HANDLE "\$ min_def_length = 3;\n\n";117 print ACRONYM_HANDLE 
"\$local_min_def_length = $min_def_length;\n\n"; 106 118 print ACRONYM_HANDLE "#let definitions be all capitals\n"; 107 print ACRONYM_HANDLE "\$ allow_all_caps = 0;\n\n";119 print ACRONYM_HANDLE "\$local_allow_all_caps = $allow_all_caps;\n\n"; 108 120 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n"; 109 print ACRONYM_HANDLE "\$ min_acro_length = 3;\n\n";121 print ACRONYM_HANDLE "\$local_min_acro_length = 3;\n\n"; 110 122 print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n"; 111 print ACRONYM_HANDLE "\$ min_length_saving = 4;\n\n";112 print ACRONYM_HANDLE "#allow recusive acronyms (0 = false , 1 = true)\n";113 print ACRONYM_HANDLE "\$ allow_recursive = 0;\n\n";123 print ACRONYM_HANDLE "\$local_min_length_saving = 4;\n\n"; 124 print ACRONYM_HANDLE "#allow recusive acronyms (0 = false (high precision), 1 = true)\n"; 125 print ACRONYM_HANDLE "\$local_allow_recursive = 0;\n\n"; 114 126 print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n"; 115 127 print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n"; 116 print ACRONYM_HANDLE "#\@ stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";117 print ACRONYM_HANDLE "\@ stop_words = split / /, \"OF AT THE IN TO AND\";\n";128 print ACRONYM_HANDLE "#\@local_stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n"; 129 print ACRONYM_HANDLE "\@local_stop_words = split / /, \"OF AT THE IN TO AND\";\n"; 118 130 print ACRONYM_HANDLE "\n"; 119 131 print ACRONYM_HANDLE "#the file to collate acronyms into\n"; 120 print ACRONYM_HANDLE "\$ acronym_accumulate_file = \"$ENV{'GSDLCOLLECTDIR'}\". \"/etc/acronym_definitions.pm\";\n";132 print ACRONYM_HANDLE "\$local_acronym_accumulate_file = \$ENV{'GSDLCOLLECTDIR'} . 
\"/etc/acronym_definitions.pm\";\n"; 121 133 print ACRONYM_HANDLE "\n"; 122 print ACRONYM_HANDLE "\$accumulate_acronyms = 1;\n\n";123 134 print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n"; 124 135 print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n"; … … 126 137 print ACRONYM_HANDLE "# \n"; 127 138 print ACRONYM_HANDLE "# \n"; 128 } 129 eval $file_text; 130 #promotes warnings/errors from evaluated file to current context 131 warn $@ if $@; 132 133 134 139 print STDERR "written new options file to $acronym_options_file...\n"; 140 } else { 141 print STDERR "read file $acronym_options_file...\n"; 142 eval $file_text ; 143 warn $@ if $@; 144 print STDERR "evaluated file $acronym_options_file...\n"; 145 } 146 147 148 $max_offset = $local_max_offset; 149 $upper_case = $local_upper_case; 150 $case_match = $local_case_match ; 151 $min_def_length = $local_min_def_length; 152 $min_acro_length = $local_min_acro_length; 153 $min_length_saving = $local_min_length_saving; 154 $allow_recursive = $local_allow_recursive; 155 $allow_all_caps = $local_allow_all_caps; 156 @stop_words = @local_stop_words; 157 158 $local_acronym_accumulate_file = $local_acronym_accumulate_file; 159 135 160 &read_all_acronyms_from_file(); 136 161 # rename $acronym_file, $acronym_file . "." . int(rand (2<<7)). … … 427 452 return 0; 428 453 } 454 if (!$allow_all_caps) 455 { 456 my $upper_count = 0; 457 my $lower_count = 0; 458 my @letters = $self->to_def_string(); 459 for my $letter (split //, $self->to_def_string()) 460 { 461 if ($letter eq uc($letter)) 462 { 463 $upper_count++; 464 } else { 465 $lower_count++; 466 } 467 } 468 return 0 if ($upper_count > $lower_count); 469 } 470 if (!$allow_recursive && $self->to_def_string() =~ /$self->to_acronym()/i ) 471 { 472 return 0; 473 } 429 474 # print "acronym " . $self->to_string() . 
" not rejected\n"; 430 475 return 1; … … 518 563 } 519 564 520 565 #the main 521 566 sub acronyms { 522 567 #clean up the text 523 568 my $processed_text = shift @_; 524 $$processed_text =~ s/[^A-Za-z]/ /g; 569 $$processed_text =~ s/<[^>]*>/ /g; 570 $$processed_text =~ s/[^\w]/ /g; 571 $$processed_text =~ s/[0-9_]/ /g; 525 572 $$processed_text =~ s/\s+/ /g; 526 $$processed_text =~ s/(\n|\>)References.*/ / g;527 $$processed_text =~ s/(\n|\>)Bibliography.*/ / g;573 $$processed_text =~ s/(\n|\>)References.*/ /i; 574 $$processed_text =~ s/(\n|\>)Bibliography.*/ /i; 528 575 529 576 #clear some global variables
Note: See TracChangeset for help on using the changeset viewer.