Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1336

Timestamp:

2000-08-03T00:41:40+12:00 (24 years ago)

Author:

say1

Message:

fixed acronym extraction so that it now runs in time linear in the document length (was l²)

File:

: 1 edited

trunk/gsdl/perllib/acronym.pm (modified) (7 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/acronym.pm

-              r1244
+              r1336
+}
-#struct (
-#   # core items
-#   acronym => '$',     # the acronym (a string)
-#   definition => '@',  # the acronyms defintion (an array of strings)
-#        stop_words => '@',  # an array of 1 (stop word) and 0 (non stop word)
+#
-#        # items related to the context in which the acronym was mined
-#        word_count => '$',        # the index of the acronym within the text
-#   definition_offset => '$', # the distance between the acronym and the definition.
+#
-#        #temporary items used during the mining of the acronym
-#        letters_for_far => '$', # how many letters have we found so far ?
+#
-#   #calculated items
-#   #...
+#
-#   );
-#sub definition {
-#    my $self = shift (@_);
-#    my @def = @$self->[2];
-#    print "definition::\@de+f = " . @def . "\n";
-#    print "definition::\@_ = " . @_ . "\n";
-#    if (@_) {
-#   push  @def, @_;
-#   @$self->[2] = @def;
-#    }
-#    return @def;
-#}
 …
+}
-#sub initialise {
-#    my $self = shift (@_);
+#
-#    # initialise the struct from the parameters ...
-#    my($acro, $wc, $def) = @_;
-#    $self->acronym($acro);
-#    $self->word_count($wc);
-#    $self->definition_offset($def);
+#
-#    $self->letters_for_far(0);
-#}
 #print out the kwic for the acronym
 …
+}
-# called when the acronym is complete and after altering any stats to compute stats etc.
-sub stablise {
+}
 sub acronyms {
     my $processed_text =  shift @_;
 …
     my ($processed_text) = @_;
     my @acro_list = ();
+    my $max_offset = 50;
     my @text = split / /, $$processed_text;
 …
     my $word = $text[$acro_counter];
+    # the tests on the following line are VERY important to the performance of this algorithm
+    # be VERY careful when relaxing them...
+    # the tests on the following line are VERY important
+        # to the performance of this algorithm be VERY careful
+        # when relaxing them...
     if (length $word >= 3 && (uc($word) eq $word))
+    {
         my $def_counter = 0;
+        while ($def_counter <= $last)
+        if ($acro_counter - $max_offset > 0)
+        {
+        $def_counter = $acro_counter - $max_offset;
+        }
+        my $local_last = $acro_counter  + $max_offset;
+        if ($local_last > $last)
+        {
+        $local_last = $last;
+        }
+        while ($def_counter <= $local_last)
+        {
         my $letter_counter = 0;
 …
         while ($letter_counter < length($word))
+        {
             if ($def_counter+$letter_counter >= $last)
+            if ($def_counter+$letter_counter >= $local_last)
+            {
             $match = 0;
 …
             $letter_counter++;
+        }
+        # this line should perhaps be more sophisticated ... it encodes what we consider
+                # to be a valid acronym
+        if ($match == 1 && $letter_counter > 0 && (abs($def_counter - $acro_counter)< 50))
+        # this line should perhaps be more sophisticated ...
+                # it encodes what we consider to be a valid acronym
+        if ($match == 1 && $letter_counter > 0 &&
+            (abs($def_counter - $acro_counter)< $max_offset))
+        {
             my $acro = new acronym();

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1336

Legend:

trunk/gsdl/perllib/acronym.pm

Download in other formats: