Changeset 1336
- Timestamp:
- 2000-08-03T00:41:40+12:00 (24 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/acronym.pm
r1244 r1336 44 44 } 45 45 46 #struct (47 # # core items48 # acronym => '$', # the acronym (a string)49 # definition => '@', # the acronyms defintion (an array of strings)50 # stop_words => '@', # an array of 1 (stop word) and 0 (non stop word)51 #52 # # items related to the context in which the acronym was mined53 # word_count => '$', # the index of the acronym within the text54 # definition_offset => '$', # the distance between the acronym and the definition.55 #56 # #temporary items used during the mining of the acronym57 # letters_for_far => '$', # how many letters have we found so far ?58 #59 # #calculated items60 # #...61 #62 # );63 64 #sub definition {65 # my $self = shift (@_);66 # my @def = @$self->[2];67 # print "definition::\@de+f = " . @def . "\n";68 # print "definition::\@_ = " . @_ . "\n";69 # if (@_) {70 # push @def, @_;71 # @$self->[2] = @def;72 # }73 # return @def;74 #}75 76 46 77 47 … … 92 62 } 93 63 94 #sub initialise {95 # my $self = shift (@_);96 #97 # # initialise the struct from the parameters ...98 # my($acro, $wc, $def) = @_;99 # $self->acronym($acro);100 # $self->word_count($wc);101 # $self->definition_offset($def);102 #103 # $self->letters_for_far(0);104 #}105 64 106 65 #print out the kwic for the acronym … … 154 113 } 155 114 156 # called when the acronym is complete and after altering any stats to compute stats etc.157 sub stablise {158 159 160 }161 162 115 sub acronyms { 163 116 my $processed_text = shift @_; … … 171 124 my ($processed_text) = @_; 172 125 my @acro_list = (); 126 my $max_offset = 50; 173 127 174 128 my @text = split / /, $$processed_text; … … 189 143 my $word = $text[$acro_counter]; 190 144 191 # the tests on the following line are VERY important to the performance of this algorithm 192 # be VERY careful when relaxing them... 145 # the tests on the following line are VERY important 146 # to the performance of this algorithm be VERY careful 147 # when relaxing them... 
193 148 if (length $word >= 3 && (uc($word) eq $word)) 194 149 { 195 150 my $def_counter = 0; 196 while ($def_counter <= $last) 151 if ($acro_counter - $max_offset > 0) 152 { 153 $def_counter = $acro_counter - $max_offset; 154 } 155 my $local_last = $acro_counter + $max_offset; 156 if ($local_last > $last) 157 { 158 $local_last = $last; 159 } 160 while ($def_counter <= $local_last) 197 161 { 198 162 my $letter_counter = 0; … … 200 164 while ($letter_counter < length($word)) 201 165 { 202 if ($def_counter+$letter_counter >= $last) 166 if ($def_counter+$letter_counter >= $local_last) 203 167 { 204 168 $match = 0; … … 220 184 $letter_counter++; 221 185 } 222 # this line should perhaps be more sophisticated ... it encodes what we consider 223 # to be a valid acronym 224 if ($match == 1 && $letter_counter > 0 && (abs($def_counter - $acro_counter)< 50)) 186 # this line should perhaps be more sophisticated ... 187 # it encodes what we consider to be a valid acronym 188 if ($match == 1 && $letter_counter > 0 && 189 (abs($def_counter - $acro_counter)< $max_offset)) 190 { 191 my $acro = new acronym();
Note:
See TracChangeset
for help on using the changeset viewer.