Changeset 1361
- Timestamp:
- 2000-08-05T13:50:33+12:00 (24 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/acronym.pm
r1336 r1361 1 ########################################################################## #1 ########################################################################## 2 2 # 3 3 # acronym.pm -- … … 23 23 # 24 24 ########################################################################### 25 26 # class to hold acronyms25 # class to handle acronyms 26 ########################################################################### 27 27 28 28 use strict; 29 #use diagnostics;29 use diagnostics; 30 30 31 31 package acronym; 32 #use Class::Struct; 32 33 ########################################################################### 34 # global variables 35 ########################################################################### 36 # valiables to control the recall/precision tradeoff 37 38 #the maximum range to look for acronyms 39 my $max_offset = 30; 40 #acronyms must be upper case 41 my $upper_case = 1; 42 #acronym case must match 43 my $case_match = 1; 44 #minimum acronym length 45 my $min_def_length = 3; 46 #minimum acronym length 47 my $min_acro_length = 3; 48 #minimum acronym length saving 49 my $min_length_saving = 3; 50 #allow recusive acronyms 51 my $allow_recursive = ""; 52 53 my @stop_words = split / /, "A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND"; 54 #my @stop_words = split / /, "OF AT THE IN TO AND"; 55 56 #the text split into an array, one word per element 57 my @split_text = (); 58 my @acronym_list = (); 59 60 61 ########################################################################### 62 # member functions 63 ########################################################################### 33 64 34 65 … … 37 68 "", # 0 acronym 38 69 [], # 1 definition 39 [], # 2 stop_words40 0, # 3 letters_for_far41 42 70 ]; 43 71 bless $self; … … 53 81 $copy->[0] = $self->[0]; 54 82 push @{$copy->[1]}, @{$self->[1]}; 55 push @{$copy->[2]}, @{$self->[2]}; 56 $copy->[3] = $self->[3]; 57 $copy->[4] = $self->[4]; 58 $copy->[5] = $self->[5]; 59 $copy->[6] = $self->[6]; 60 83 bless $copy; 84 61 85 return $copy; 62 86 } 63 87 64 65 #print out the kwic for the acronym 66 sub to_string_kwic { 67 my $self = shift (@_); 68 69 # the list of all possible combinations 70 my @list = (); 88 #return the acronym 89 sub to_acronym { 90 my $self = shift (@_); 91 my @array = @{$self->[1]}; 92 93 return $self->[0]; 94 } 95 96 #return the number of words in the acronym definition 97 sub words_in_acronym_definition { 98 my $self = shift (@_); 99 my @array = @{$self->[1]}; 100 101 return $#array + 1; 102 } 103 104 #return the number of letters in the acronym definition 105 sub letters_in_acronym_definition { 106 my $self = shift (@_); 107 108 return length($self->to_def_string()); 109 } 110 111 #return the number of letters in the acronym definition 112 sub letters_in_acronym { 113 my $self = shift (@_); 114 115 return length($self->to_acronym()); 116 } 117 118 #return the acronym definition 119 sub to_def_string { 120 my $self = shift (@_); 71 121 72 122 my $result = ""; 73 74 my $j = 0;75 my @array = @{$self->[1]};76 while ($j <= $#array)77 {78 79 $result = "";80 81 # do the definition82 my $i = 0;83 while ($i <= $#array)84 {85 my $current = ($i + $j) % ($#array+1);86 $result = $result . $array[$current] . " ";87 $i++;88 }89 $result = $result . "(" . $self->[0] . ")";90 91 push @list, $result;92 $j++;93 }94 return @list;95 }96 97 #this is the one used when building the collection ...98 sub to_string {99 my $self = shift (@_);100 101 my $result = $self->[0] . " ";102 123 103 124 # do the definition … … 106 127 while ($i <= $#array) 107 128 { 108 my $resultnext = $result . $array[$i] . " "; 109 $result = $resultnext; 129 $result = $result . $array[$i]; 130 131 if ($i+1 <= $#array) 132 { 133 $result = $result . " "; 134 } 110 135 $i++; 111 136 } … … 113 138 } 114 139 140 141 #print out the kwic for the acronym 142 sub to_string_kwic { 143 my $self = shift (@_); 144 145 # the list of all possible combinations 146 my @list = (); 147 148 my $result = ""; 149 150 my $j = 0; 151 my @array = @{$self->[1]}; 152 while ($j <= $#array) 153 { 154 155 # do the definition 156 my $i = 0; 157 158 #add the key word 159 $result = "<td halign=left>" . $array[$j] . "</td><td halign=right>"; 160 161 #add the proceeding words 162 while ($i < $j) 163 { 164 $result = $result . $array[$i] . " "; 165 $i++; 166 } 167 #add the key word 168 $result = $result . "</td><td halign=left>" . $array[$j] . 169 "</td><td halign=left>"; 170 171 #add the trailing words 172 $i++; 173 while ($i <= $#array ) 174 { 175 $result = $result . $array[$i] . " "; 176 $i++; 177 } 178 179 #add the actual acronym 180 181 $result = $result . "</td><td halign=left>" . $self->[0] . "</td>"; 182 183 push @list, $result; 184 $j++; 185 } 186 return @list; 187 } 188 189 #this is the one used when building the collection ... 190 sub to_string { 191 my $self = shift (@_); 192 193 my $result = $self->[0] . " "; 194 195 # do the definition 196 my @array = @{$self->[1]}; 197 my $i = 0; 198 while ($i <= $#array) 199 { 200 $result = $result . $array[$i]; 201 if ($i+1 <= $#array) 202 { 203 $result = $result . " "; 204 } 205 $i++; 206 } 207 return $result; 208 } 209 210 sub check { 211 my $self = shift (@_); 212 213 if (length($self->to_acronym()) < $min_acro_length) 214 { 215 # print "acronym " . $self->to_string() . " rejected (too short I)\n"; 216 return 0; 217 } 218 if ($self->words_in_acronym_definition() < $min_def_length) 219 { 220 # print "acronym " . $self->to_string() . " rejected (too short II)\n"; 221 return 0; 222 } 223 if ($min_length_saving * $self->letters_in_acronym() > 224 $self->letters_in_acronym_definition()) 225 { 226 # print "acronym " . $self->to_string() . " rejected (too short III)\n"; 227 # print "" . $min_length_saving . 228 "|" . $self->letters_in_acronym() . 229 "|" . $self->letters_in_acronym_definition() . "\n"; 230 return 0; 231 } 232 # print "acronym " . $self->to_string() . " not rejected\n"; 233 return 1; 234 } 235 236 ########################################################################### 237 # static functions 238 ########################################################################### 239 240 sub recurse { 241 my ($acro_offset, #offset of word we're finding acronyms for 242 $text_offset, 243 $letter_offset, 244 @def_so_far) = @_; 245 246 my $word = $split_text[$text_offset]; 247 my $acro = $split_text[$acro_offset]; 248 $word = "" if !defined $word; 249 $acro = "" if !defined $acro; 250 251 # print "recurse(" . $acro_offset . ", " . $text_offset . ", " . 252 # $letter_offset . ", " . @def_so_far . ")\n"; 253 254 #check for termination ... 255 if ($letter_offset >= length($acro)) 256 { 257 my $acronym = new acronym(); 258 $acronym->[0] = $acro; 259 push @{$acronym->[1]}, @def_so_far; 260 if ($acronym->check()) 261 { 262 push @acronym_list, ( $acronym ); 263 } 264 # print "acronym created\n"; 265 return; 266 } 267 #check for recursion 268 if (!$allow_recursive) 269 { 270 if ($word eq $acro) 271 { 272 # print "recursion detected\n"; 273 return; 274 } 275 } 276 277 #skip a stop-word 278 my $i = 0; 279 if ($letter_offset != 0) 280 { 281 while ($i <= $#stop_words) 282 { 283 if ($stop_words[$i] eq uc($word)) 284 { 285 # print "::found stop word::" . $stop_words[$i] . "\n"; 286 &recurse($acro_offset, 287 $text_offset+1, 288 $letter_offset, 289 @def_so_far, $word); 290 } 291 $i++; 292 } 293 } 294 $i = 1; 295 #using the first $i letters ... 296 while ($letter_offset+$i <= length($acro) ) 297 { 298 # print "". ((substr $word, 0, $i) . " " . 299 # (substr $acro, $letter_offset, $i) . "\n"); 300 if (((!$case_match) && 301 (uc(substr $word, 0, $i) eq 302 uc(substr $acro, $letter_offset, $i))) 303 || 304 (($case_match) && 305 ((substr $word, 0, $i) eq 306 (substr $acro, $letter_offset, $i)))) 307 { 308 # print "::match::\n"; 309 # print "" . ((substr $word, 0, $i) . " " . 310 # (substr $acro, $letter_offset, $i) . "\n"); 311 &recurse($acro_offset, 312 $text_offset+1, 313 $letter_offset+$i, 314 @def_so_far, $word); 315 } else { 316 return; 317 } 318 $i++; 319 } 320 return; 321 } 322 323 115 324 sub acronyms { 325 #clean up the text 116 326 my $processed_text = shift @_; 117 327 $$processed_text =~ s/[^A-Za-z]/ /g; 118 328 $$processed_text =~ s/\s+/ /g; 119 329 120 return &acronyms_from_clean_text($processed_text) 330 #clear some global variables 331 @split_text = (); 332 @acronym_list = (); 333 334 return &acronyms_from_clean_text($processed_text); 121 335 } 122 336 123 337 sub acronyms_from_clean_text { 124 338 my ($processed_text) = @_; 125 my @acro_list = (); 126 my $max_offset = 50; 127 128 my @text = split / /, $$processed_text; 339 340 @split_text = split / /, $$processed_text; 129 341 130 342 # my $i = 0; 131 # while ($i <= $# text)343 # while ($i <= $#split_text) 132 344 # { 133 # print $ text->[$i] . "\n";345 # print $split_text[$i] . "\n"; 134 346 # $i++; 135 347 # } 136 348 137 349 my $first = 0; 138 my $last = $# text +1;350 my $last = $#split_text +1; 139 351 my $acro_counter = $first; 140 352 141 353 while ($acro_counter < $last) 142 354 { 143 my $word = $text[$acro_counter]; 144 145 # the tests on the following line are VERY important 146 # to the performance of this algorithm be VERY careful 147 # when relaxing them... 148 if (length $word >= 3 && (uc($word) eq $word)) 149 { 150 my $def_counter = 0; 151 if ($acro_counter - $max_offset > 0) 355 my $word = $split_text[$acro_counter]; 356 357 if ((!$upper_case) || 358 (uc($word) eq $word)) 359 { 360 361 if (length $word >= $min_acro_length) 152 362 { 153 $def_counter = $acro_counter - $max_offset; 363 my $def_counter = 0; 364 if ($acro_counter - $max_offset > 0) 365 { 366 $def_counter = $acro_counter - $max_offset; 367 } 368 my $local_last = $acro_counter + $max_offset; 369 if ($local_last > $last) 370 { 371 $local_last = $last; 372 } 373 while ($def_counter <= $local_last) 374 { 375 &recurse($acro_counter,$def_counter,0,()); 376 $def_counter++; 377 } 154 378 } 155 my $local_last = $acro_counter + $max_offset;156 if ($local_last > $last)157 {158 $local_last = $last;159 }160 while ($def_counter <= $local_last)161 {162 my $letter_counter = 0;163 my $match = 1;164 while ($letter_counter < length($word))165 {166 if ($def_counter+$letter_counter >= $local_last)167 {168 $match = 0;169 last;170 }171 my $def_word = $text[$def_counter+$letter_counter];172 173 #throw it out if it's recursing...174 if (uc($word) eq uc($def_word))175 {176 $match = 0;177 last178 }179 if (substr($word, $letter_counter, 1) ne substr($def_word, 0, 1))180 {181 $match = 0;182 last;183 }184 $letter_counter++;185 }186 # this line should perhaps be more sophisticated ...187 # it encodes what we consider to be a valid acronym188 if ($match == 1 && $letter_counter > 0 &&189 (abs($def_counter - $acro_counter)< $max_offset))190 {191 my $acro = new acronym();192 $acro->[0] = $word;193 push @{$acro->[1]}, @text[$def_counter .. $def_counter + $letter_counter - 1 ];194 $acro->[3] = $letter_counter;195 # my @tmp = ( $acro );196 push @acro_list, ( $acro );197 # print $acro->to_string(). "\n";198 $match = 0;199 }200 $def_counter++;201 }202 379 } 203 380 $acro_counter++; 204 381 } 205 382 206 return \@acro _list;383 return \@acronym_list; 207 384 } 208 385 … … 234 411 push @{$tla->[1]}, ("Three" ); 235 412 push @{$tla->[1]}, ("Letter" ); 236 push @{$tla->[1]}, ("Letter" );237 413 push @{$tla->[1]}, ("Acronym" ); 238 414 print $tla->to_string(). "\n"; … … 249 425 print "\n"; 250 426 427 print "Testing recursion ...\n"; 428 my $acros = &acronyms("TLA Three Letter Acronym in tla TlA"); 429 430 foreach my $acro (@$acros) 431 { 432 if ($acro->check) 433 { 434 print "accepted: " .$acro->to_string() . "\n"; 435 # print "|" . $acro->to_acronym() . "|" . $acro->to_def_string() . 436 # "|" . $acro->words_in_acronym_definition() . 437 # "|" . $acro->letters_in_acronym_definition() . 438 # "|" . $acro->letters_in_acronym() . "|\n"; 439 } else { 440 # print "found but rejected: " .$acro->to_string() . "\n"; 441 } 442 } 251 443 } 252 444 … … 255 447 256 448 1; 449 450
Note:
See TracChangeset
for help on using the changeset viewer.