Changeset 1336


Ignore:
Timestamp:
2000-08-03T00:41:40+12:00 (24 years ago)
Author:
say1
Message:

fixed acronym extraction so that it now runs in time linear in the document length (was quadratic, l^2)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/acronym.pm

    r1244 r1336  
    4444}
    4545
    46 #struct (
    47 #   # core items
    48 #   acronym => '$',     # the acronym (a string)
    49 #   definition => '@',  # the acronym's definition (an array of strings)
    50 #        stop_words => '@',  # an array of 1 (stop word) and 0 (non stop word)
    51 #
    52 #        # items related to the context in which the acronym was mined
    53 #        word_count => '$',        # the index of the acronym within the text
    54 #   definition_offset => '$', # the distance between the acronym and the definition.
    55 #       
    56 #        #temporary items used during the mining of the acronym
    57 #        letters_for_far => '$', # how many letters have we found so far ?
    58 #
    59 #   #calculated items
    60 #   #...
    61 #
    62 #   );
    63 
    64 #sub definition {
    65 #    my $self = shift (@_);
    66 #    my @def = @$self->[2];
    67 #    print "definition::\@de+f = " . @def . "\n";
    68 #    print "definition::\@_ = " . @_ . "\n";
    69 #    if (@_) {
    70 #   push  @def, @_;
    71 #   @$self->[2] = @def;
    72 #    }
    73 #    return @def;
    74 #}
    75 
    7646
    7747
     
    9262}
    9363
    94 #sub initialise {
    95 #    my $self = shift (@_);
    96 #
    97 #    # initialise the struct from the parameters ...
    98 #    my($acro, $wc, $def) = @_;
    99 #    $self->acronym($acro);
    100 #    $self->word_count($wc);
    101 #    $self->definition_offset($def);
    102 #
    103 #    $self->letters_for_far(0);
    104 #}
    10564
    10665#print out the kwic for the acronym
     
    154113}
    155114
    156 # called when the acronym is complete and after altering any stats to compute stats etc.
    157 sub stablise {
    158 
    159 
    160 }
    161 
    162115sub acronyms {
    163116    my $processed_text =  shift @_;
     
    171124    my ($processed_text) = @_;
    172125    my @acro_list = ();
     126    my $max_offset = 50;
    173127
    174128    my @text = split / /, $$processed_text;
     
    189143    my $word = $text[$acro_counter];
    190144
    191     # the tests on the following line are VERY important to the performance of this algorithm
    192     # be VERY careful when relaxing them...
     145    # the tests on the following line are VERY important
     146        # to the performance of this algorithm; be VERY careful
     147        # when relaxing them...
    193148    if (length $word >= 3 && (uc($word) eq $word))
    194149    {
    195150        my $def_counter = 0;
    196         while ($def_counter <= $last)
     151        if ($acro_counter - $max_offset > 0)
     152        {
     153        $def_counter = $acro_counter - $max_offset;
     154        }
     155        my $local_last = $acro_counter  + $max_offset;
     156        if ($local_last > $last)
     157        {
     158        $local_last = $last;
     159        }
     160        while ($def_counter <= $local_last)
    197161        {
    198162        my $letter_counter = 0;
     
    200164        while ($letter_counter < length($word))
    201165        {
    202             if ($def_counter+$letter_counter >= $last)
     166            if ($def_counter+$letter_counter >= $local_last)
    203167            {
    204168            $match = 0;
     
    220184            $letter_counter++;
    221185        }
    222         # this line should perhaps be more sophisticated ... it encodes what we consider
    223                 # to be a valid acronym
    224         if ($match == 1 && $letter_counter > 0 && (abs($def_counter - $acro_counter)< 50))
     186        # this line should perhaps be more sophisticated ...
     187                # it encodes what we consider to be a valid acronym
     188        if ($match == 1 && $letter_counter > 0 &&
     189            (abs($def_counter - $acro_counter)< $max_offset))
    225190        {
    226191            my $acro = new acronym();
Note: See TracChangeset for help on using the changeset viewer.