Changeset 1393 for trunk/gsdl/perllib/acronym.pm
- Timestamp:
- 2000-08-11T11:12:15+12:00 (24 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/acronym.pm
# trunk/gsdl/perllib/acronym.pm as of r1393 -- only the hunks that differ from
# r1361 are visible here.  Regions that this changeset view does not show are
# marked "[... not shown ...]".

my $min_acro_length = 3;
#minimum acronym length saving
my $min_length_saving = 4;
#allow recursive acronyms
my $allow_recursive = "";

#stop words (words allowed in acronyms)
my @stop_words = split / /, "OF AT THE IN TO AND";

#the text split into an array, one word per element
# [... acronym.pm line 56 not shown ...]
my @acronym_list = ();

#the file to collate acronyms into
my $acronym_accumulate_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_definitions.pm";
#the per-collection options file, eval'ed by init_acronyms()
my $acronym_options_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_options.pm";

#acronym => "def one|def two|..." for acronyms registered through add()
my %acronyms_found_in_collection = ();
#acronym => "def one|def two|..." for acronyms registered through ban()
my %acronyms_banned_from_collection = ();

my $writing_acronyms = 1;
my $accumulate_acronyms = 1;
my $markup_accumulate_acronyms = 1;
my $markup_local_acronyms = 1;

#true only while ACRONYM_HANDLE is open for writing the accumulate file;
#keeps the END block from printing to a handle that was never opened
my $acronym_handle_open = 0;


###########################################################################
# file saving / loading stuff
###########################################################################

# Load the per-collection options, load previously accumulated acronyms,
# then (re)open the accumulate file for writing.
#
# If the options file is missing or empty, a commented default options file
# is written in its place.  The options file is run with string eval, so it
# can assign to any of the file-scoped settings above.
#
# On return $writing_acronyms is true only if the accumulate file is open
# for writing on ACRONYM_HANDLE.
sub init_acronyms {

    my $file_text = "";
    if (open my $options_in, '<', $acronym_options_file)
    {
	$file_text = do { local $/; <$options_in> };
	$file_text = "" unless defined $file_text;
	close $options_in;
    }
    if ($file_text eq "")
    {
	print STDERR "failed to open $acronym_options_file\n";
	_write_default_acronym_options();
    }

    # NOTE(review): this string-evals a file from the collection directory,
    # so anyone who can write that file can run arbitrary Perl here.
    eval $file_text;
    #promotes warnings/errors from evaluated file to current context
    warn $@ if $@;

    read_all_acronyms_from_file();

    $acronym_handle_open = 0;
    # The options file may have switched writing off; that is not a failure,
    # so say nothing and leave the accumulate file untouched.
    return unless $writing_acronyms;

    if (open ACRONYM_HANDLE, '>', $acronym_accumulate_file)
    {
	print ACRONYM_HANDLE join '',
	    "#This is an automatically generated file.\n",
	    "#\n",
	    "#If you edit this file and it will be overwritten the next\n",
	    "#time the acronym code runs unless you set the file to \n",
	    "#read-only. \n",
	    "#\n",
	    "#start of acronyms...\n";
	$writing_acronyms = 1;
	$acronym_handle_open = 1;
    } else {
	warn "failed to open $acronym_accumulate_file for writing\n";
	$writing_acronyms = 0;
    }
    return;
}

# Write the commented default options file to $acronym_options_file.
# A failure to create or close the file is reported with warn and is
# otherwise ignored; the built-in defaults above stay in effect either way.
sub _write_default_acronym_options {
    my $options_out;
    unless (open $options_out, '>', $acronym_options_file)
    {
	warn "failed to create $acronym_options_file: $!\n";
	return;
    }
    print {$options_out} join '',
	"#Config file for acronym extraction. EDIT THIS FILE, it should\n",
	"#not be overridden by the software. It's read by GSDL using perl's\n",
	"#'eval' function, so pretty much anything that's valid in perl is \n",
	"#valid here.\n\n",
	"#Quite a few things here are defined in terms of recall and precision\n",
	"#which are the key measures from Information Retreval (IR). If you\n",
	"#don't understand recall and precision, any good IR textbook should\n",
	"#explain them fully \n\n",
	"#the maximum range to look for acronyms (raise to raise precision)\n",
	"\$max_offset = 30;\n\n",
	"#acronyms must be upper case (0 = false, 1 = true)\n",
	"\$upper_case = 1;\n\n",
	"#acronym case must match (0 = false, 1 = true)\n",
	"\$case_match = 1;\n\n",
	"#minimum acronym length (raise to raise precision)\n",
	"\$min_def_length = 3;\n\n",
	"#minimum acronym length (raise to raise precision)\n",
	"\$min_acro_length = 3;\n\n",
	"#minimum acronym length saving (raise to raise precision)\n",
	"\$min_length_saving = 4;\n\n",
	"#allow recusive acronyms (0 = false, 1 = true)\n",
	"\$allow_recursive = 0;\n\n",
	"#stop words-words allowed in acronyms (the multi-lingual version\n",
	"#slows down acronym extraction slightly so is not the default)\n",
	"#\@stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n",
	"\@stop_words = split / /, \"OF AT THE IN TO AND\";\n",
	"\n",
	"#the file to collate acronyms into\n",
	"\$acronym_accumulate_file = \"$ENV{'GSDLCOLLECTDIR'}\" . \"/etc/acronym_definitions.pm\";\n",
	"\n",
	"\$accumulate_acronyms = 1;\n\n",
	"# any acronym definitions which should always be marked up can be copied here\n",
	"# from the acronym_accumulate_file file ...\n",
	"# \n",
	"# \n",
	"# \n";
    close $options_out
	or warn "failed to write $acronym_options_file: $!\n";
    return;
}

#close the list of accumulated acronyms
END {
    if ($writing_acronyms && $acronym_handle_open)
    {
	print ACRONYM_HANDLE "#end of acronyms.\n";
	# buffered write errors only surface at close, so check it
	close ACRONYM_HANDLE
	    or warn "failed to close $acronym_accumulate_file: $!\n";
	$acronym_handle_open = 0;
    }
}

#eval a file of accumulated acronyms
#
# The file is expected to hold the statements produced by write_to_file();
# running them calls add() and so fills %acronyms_found_in_collection.
# A missing file is reported on STDERR and otherwise ignored.
sub read_all_acronyms_from_file {

    my $file_text = "";
    if (open my $accumulated_in, '<', $acronym_accumulate_file)
    {
	$file_text = do { local $/; <$accumulated_in> };
	$file_text = "" unless defined $file_text;
	close $accumulated_in;
    } else {
	print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
    }
    # NOTE(review): string eval of file contents -- see init_acronyms().
    eval $file_text;
    #promotes warnings/errors from evaluated file to current context
    warn $@ if $@;
    return;
}

#called from within the file of accumulated acronyms to indicate a good acronym
#
# Appends this object's definition string to the "|"-separated list kept for
# its acronym, unless that exact definition is already in the list.
sub add {
    my $self = shift (@_);
    my $acro = $self->[0];
    my $def = $self->to_def_string();

    if (defined ($acronyms_found_in_collection{$acro}))
    {
	# \Q..\E: the definition is data, not a pattern
	return if $acronyms_found_in_collection{$acro} =~ m/(?:^|\|)\Q$def\E(?:\||$)/;
	$acronyms_found_in_collection{$acro} .= "|" . $def;
    } else {
	$acronyms_found_in_collection{$acro} = $def;
    }
    return;
}

#called from within the file of accumulated acronyms to indicate a bad acronym
#
# Appends this object's definition string to the "|"-separated list kept for
# its acronym in %acronyms_banned_from_collection (no duplicate check).
sub ban {
    my $self = shift (@_);
    my $acro = $self->[0];
    my $def = $self->to_def_string();

    if (defined $acronyms_banned_from_collection{$acro})
    {
	$acronyms_banned_from_collection{$acro} .= "|" . $def;
    } else {
	$acronyms_banned_from_collection{$acro} = $def;
    }
    return;
}

# Backslash-escape the characters that are special inside a double-quoted
# Perl string (\ " $ @) so the text survives being written out and eval'ed.
sub _escape_for_double_quotes {
    my $text = shift (@_);
    $text =~ s/([\\"\$\@])/\\$1/g;
    return $text;
}

#write a good acronym to the accumulated acronyms file
#
# Emits one 'new acronym("ACRO","definition")->add();' statement.  Does
# nothing unless the accumulate file is open for writing.
sub write_to_file {
    my $self = shift (@_);
    return unless $writing_acronyms && $acronym_handle_open;

    my $acro = _escape_for_double_quotes($self->[0]);
    my $def = _escape_for_double_quotes($self->to_def_string());
    print ACRONYM_HANDLE "new acronym(\"$acro\",\"$def\")->add();\n";
    return;
}


###########################################################################
# mark functionality
###########################################################################

#small routine to sort by length
#(longest first, ties broken by string comparison; for use as a sort SUBNAME)
sub sort_by_length {
    length($b) <=> length($a) or $a cmp $b
}

# Mark up known acronyms in a piece of text.
#
#   $text          - text to mark up; undef is returned unchanged
#   $verbosity_obj - hash ref; when its 'verbosity' entry is >= 2 each
#                    acronym tried is echoed to STDERR
#
# For every acronym in %acronyms_found_in_collection, longest first, an
# <img> tag carrying the definition list as alt text is inserted directly
# after the acronym.  The pattern is anchored at the start of a line and
# skips over complete <...> tags, so text inside tags is left alone and at
# most one occurrence per line is marked for each acronym.
#
# Returns the marked-up text.
sub markup_acronyms {
    my $text = shift (@_);
    my $verbosity_obj = shift (@_);
    return $text unless defined $text;

    my $verbose = ref $verbosity_obj
	&& defined $verbosity_obj->{'verbosity'}
	&& $verbosity_obj->{'verbosity'} >= 2;

    for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
    {
	# The definition goes inside alt="..."; neutralise the characters
	# that would end the attribute or the tag.  '&' is left alone
	# because the text may already contain entities.
	my $alt = $acronyms_found_in_collection{$acro};
	$alt =~ s/"/&quot;/g;
	$alt =~ s/</&lt;/g;
	$alt =~ s/>/&gt;/g;

	# \Q..\E: match the acronym literally, not as a pattern
	$text =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)\Q$acro\E([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$alt\"\>$2/gm;

	# print, not printf: the acronym is data, not a format string
	print STDERR " " . $acro . "," if $verbose;
    }
    return $text;
}


###########################################################################
# [... acronym.pm lines 243-245 not shown ...]

# Construct an acronym object: an array ref holding the acronym string
# (element 0) and the list of definition words (element 1).
#
#   first argument - ignored (the class name when called as a constructor)
#   $acro          - optional acronym text
#   $def           - optional definition; split on single spaces into words
#
# The object is always blessed into this package, whatever the first
# argument is.
sub new {
    my $trash = shift (@_);
    my $acro = shift (@_);
    my $def = shift (@_);

    my $self = [
	"",     # 0 acronym
	[],     # 1 definition
    ];

    $self->[0] = $acro if defined $acro;
    push @{$self->[1]}, split (/ /, $def) if defined $def;

    return bless $self, __PACKAGE__;
}

# sub clone { ... }
# [... the body of clone and acronym.pm lines 264-412 are not shown ...]

# Tail of a rejection branch (acronym.pm lines 413-418); the enclosing sub
# is not shown.  r1393 comments out the two continuation lines of the
# debugging print, which were previously left uncommented:
#
#	# print "acronym " . $self->to_string() . " rejected (too short III)\n";
#	# print "" . $min_length_saving .
#	# "|" . $self->letters_in_acronym() .
#	# "|" . $self->letters_in_acronym_definition() . "\n";
#	return 0;
#    }
#
# [... acronym.pm lines 419-632 not shown ...]

#&test();

init_acronyms();

1;
Note:
See TracChangeset
for help on using the changeset viewer.