Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1361

Timestamp:

2000-08-05T13:50:33+12:00 (24 years ago)

Author:

say1

Message:

rewrote recursively to handle stop words and more cases

File:

: 1 edited

trunk/gsdl/perllib/acronym.pm (modified) (9 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/acronym.pm

-              r1336
+              r1361
 ###########################################################################
+##########################################################################
+#
 # acronym.pm --
 …
+#
 ###########################################################################
 # class to hold acronyms
+#    class to handle acronyms
+###########################################################################
 use strict;
 #use diagnostics;
+use diagnostics;
 package acronym;
+#use Class::Struct;
+###########################################################################
+#    global variables
+###########################################################################
+# variables to control the recall/precision tradeoff
+#the maximum range to look for acronyms
+my $max_offset = 30;
+#acronyms must be upper case
+my $upper_case = 1;
+#acronym case must match
+my $case_match = 1;
+#minimum definition length, in words (check() compares it with words_in_acronym_definition())
+my $min_def_length = 3;
+#minimum acronym length, in letters
+my $min_acro_length = 3;
+#minimum length saving: check() rejects a candidate when this factor times the acronym length exceeds the definition string length
+my $min_length_saving = 3;
+#allow recursive acronyms (empty string is false, so recurse() refuses definitions containing the acronym word itself)
+my $allow_recursive = "";
+my @stop_words = split / /, "A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND";    #upper-case words recurse() may keep in a definition without consuming an acronym letter
+#my @stop_words = split / /, "OF AT THE IN TO AND";
+#the text split into an array, one word per element
+my @split_text = ();
+my @acronym_list = ();    #accepted acronym objects, filled by recurse()
+###########################################################################
+#    member functions
+###########################################################################
 …
     "", # 0 acronym
     [], # 1 definition
-    [], # 2 stop_words
-,  # 3 letters_for_far
     ];
     bless $self;
 …
     $copy->[0] = $self->[0];
     push @{$copy->[1]}, @{$self->[1]};
+    push @{$copy->[2]}, @{$self->[2]};
+    $copy->[3] = $self->[3];
+    $copy->[4] = $self->[4];
+    $copy->[5] = $self->[5];
+    $copy->[6] = $self->[6];
+    bless $copy;
     return $copy;
+}
+#print out the kwic for the acronym
+sub to_string_kwic {
+    my $self = shift (@_);
+    # the list of all possible combinations
+    my @list = ();
+#return the acronym string (slot 0 of the object)
+sub to_acronym {
+    my $self = shift (@_);
+    my @array = @{$self->[1]};    # NOTE(review): @array is never used in this sub
+    return $self->[0];
+}
+#return the number of words in the acronym definition (the element count of slot 1)
+sub words_in_acronym_definition {
+    my $self = shift (@_);
+    my @array = @{$self->[1]};
+    return $#array + 1;    #last index + 1 == number of elements
+}
+#return the number of characters in the acronym definition, i.e. the length of the string returned by to_def_string()
+sub letters_in_acronym_definition {
+    my $self = shift (@_);
+    return length($self->to_def_string());
+}
+#return the number of letters in the acronym itself (the length of to_acronym())
+sub letters_in_acronym {
+    my $self = shift (@_);
+    return length($self->to_acronym());
+}
+#return the acronym definition
+sub to_def_string {
+    my $self = shift (@_);
     my $result = "";
-    my $j = 0;
-    my @array = @{$self->[1]};
-    while ($j <= $#array)
+    {
-    $result = "";
-    # do the definition
-    my $i = 0;
-    while ($i <= $#array)
+    {
-        my $current = ($i + $j) % ($#array+1);
-        $result = $result . $array[$current] . " ";
-        $i++;
+    }
-    $result = $result . "(" . $self->[0] . ")";
-    push @list, $result;
-    $j++;
+    }
-    return @list;
+}
-#this is the one used when building the collection ...
-sub to_string {
-    my $self = shift (@_);
-    my $result = $self->[0] . " ";
     # do the definition
 …
     while ($i <= $#array)
+    {
+    my $resultnext = $result . $array[$i] . " ";
+    $result = $resultnext;
+    $result = $result . $array[$i];
+    if ($i+1 <= $#array)
+    {
+        $result = $result . " ";
+    }
     $i++;
+    }
 …
+}
+#build the kwic rows for the acronym: returns a list with one string of <td> cells per definition word (nothing is printed here)
+sub to_string_kwic {
+    my $self = shift (@_);
+    # the list of all possible combinations
+    my @list = ();
+    my $result = "";
+    my $j = 0;    #index of the definition word used as the key word of the current row
+    my @array = @{$self->[1]};
+    while ($j <= $#array)
+    {
+    # do the definition
+    my $i = 0;
+    #start the row with the key word in its own cell, then open the cell for the preceding words
+    $result = "<td halign=left>"  . $array[$j] . "</td><td halign=right>";
+    #add the preceding words, each followed by a space
+    while ($i < $j)
+    {
+        $result = $result .  $array[$i] . " ";
+        $i++;
+    }
+    #add the key word again, then open the cell for the trailing words
+    $result = $result . "</td><td halign=left>"  . $array[$j] .
+        "</td><td halign=left>";
+    #add the trailing words, each followed by a space
+    $i++;    #$i equals $j here, so this steps over the key word itself
+    while ($i <= $#array )
+    {
+        $result = $result .  $array[$i] . " ";
+        $i++;
+    }
+    #add the actual acronym
+    $result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
+    push @list, $result;
+    $j++;
+    }
+    return @list;
+}
+#this is the one used when building the collection ... returns the acronym, a space, then the definition words separated by single spaces
+sub to_string {
+    my $self = shift (@_);
+    my $result = $self->[0] . " ";
+    # do the definition
+    my @array = @{$self->[1]};
+    my $i = 0;
+    while ($i <= $#array)
+    {
+    $result = $result . $array[$i];
+    if ($i+1 <= $#array)    #only put a separator between words, not after the last one
+    {
+        $result = $result . " ";
+    }
+    $i++;
+    }
+    return $result;
+}
+#decide whether this candidate acronym is worth keeping
+#returns 1 when the candidate is accepted and 0 when it is rejected
+#a candidate is rejected when
+#  I)   the acronym has fewer than $min_acro_length letters, or
+#  II)  the definition has fewer than $min_def_length words, or
+#  III) $min_length_saving times the acronym length exceeds the
+#       length of the definition string
+sub check {
+    my $self = shift (@_);
+    if (length($self->to_acronym()) < $min_acro_length)
+    {
+#   print "acronym " . $self->to_string() . " rejected (too short I)\n";
+    return 0;
+    }
+    if ($self->words_in_acronym_definition() < $min_def_length)
+    {
+#   print "acronym " . $self->to_string() . " rejected (too short II)\n";
+    return 0;
+    }
+    if ($min_length_saving * $self->letters_in_acronym() >
+    $self->letters_in_acronym_definition())
+    {
+#   print "acronym " . $self->to_string() . " rejected (too short III)\n";
+    #the whole debug print below is commented out; before, only its first
+    #line was, which left a void-context concatenation that still called
+    #both length methods and produced a "useless use" warning
+#   print "" . $min_length_saving .
+#       "|" . $self->letters_in_acronym() .
+#       "|" . $self->letters_in_acronym_definition() . "\n";
+    return 0;
+    }
+#    print "acronym " . $self->to_string() . " not rejected\n";
+    return 1;
+}
+###########################################################################
+#    static functions
+###########################################################################
+sub recurse {    #search for definitions of the word at $acro_offset; accepted candidates are pushed onto @acronym_list, nothing useful is returned
+    my ($acro_offset,       #offset of word we're finding acronyms for
+    $text_offset,           #offset in @split_text of the word currently tried as the next definition word
+    $letter_offset,         #number of leading acronym letters already matched
+    @def_so_far) = @_;      #definition words collected so far
+    my $word = $split_text[$text_offset];
+    my $acro = $split_text[$acro_offset];
+    $word = "" if !defined $word;    #offsets past the end of the text behave like an empty word
+    $acro = "" if !defined $acro;
+#    print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
+#   $letter_offset  . ", " . @def_so_far . ")\n";
+    #check for termination: every acronym letter has been matched, so build the candidate and keep it only if check() accepts it
+    if ($letter_offset >= length($acro))
+    {
+    my $acronym = new acronym();
+    $acronym->[0] = $acro;
+    push @{$acronym->[1]}, @def_so_far;
+    if ($acronym->check())
+    {
+        push @acronym_list, ( $acronym );
+    }
+#   print "acronym created\n";
+    return;
+    }
+    #check for recursion: unless $allow_recursive is set, a definition may not contain the acronym word itself
+    if (!$allow_recursive)
+    {
+    if ($word eq $acro)
+    {
+#       print "recursion detected\n";
+        return;
+    }
+    }
+    #skip a stop-word: when the upper-cased word is in @stop_words, also try keeping it in the definition without consuming an acronym letter (never for the first definition word)
+    my $i = 0;
+    if ($letter_offset != 0)
+    {
+    while ($i <= $#stop_words)
+    {
+        if ($stop_words[$i] eq uc($word))
+        {
+#       print "::found stop word::" . $stop_words[$i] . "\n";
+        &recurse($acro_offset,
+             $text_offset+1,
+             $letter_offset,
+             @def_so_far, $word);
+        }
+        $i++;
+    }
+    }
+    $i = 1;    #$i is reused: from here on it is the number of leading letters of $word being matched
+    #using the first $i letters ... compare them with the next $i acronym letters for growing $i and give up at the first $i that does not match
+    while ($letter_offset+$i <= length($acro) )
+    {
+#   print "". ((substr $word, 0, $i) . " " .
+#       (substr $acro, $letter_offset, $i) . "\n");
+    if (((!$case_match) &&
+         (uc(substr $word, 0, $i) eq
+          uc(substr $acro, $letter_offset, $i)))
+        ||
+        (($case_match) &&
+         ((substr $word, 0, $i) eq
+          (substr $acro, $letter_offset, $i))))
+    {
+#       print "::match::\n";
+#       print "" . ((substr $word, 0, $i) . " " .
+#          (substr $acro, $letter_offset, $i) . "\n");
+        &recurse($acro_offset,
+             $text_offset+1,
+             $letter_offset+$i,
+             @def_so_far, $word);
+    } else {
+        return;
+    }
+    $i++;
+    }
+    return;
+}
 sub acronyms {
+    #clean up the text
     my $processed_text =  shift @_;
     $$processed_text =~ s/[^A-Za-z]/ /g;
     $$processed_text =~ s/\s+/ /g;
+    return &acronyms_from_clean_text($processed_text)
+    #clear some global variables
+    @split_text = ();
+    @acronym_list = ();
+    return &acronyms_from_clean_text($processed_text);
+}
 sub acronyms_from_clean_text {
     my ($processed_text) = @_;
+    my @acro_list = ();
+    my $max_offset = 50;
+    my @text = split / /, $$processed_text;
+    @split_text = split / /, $$processed_text;
 #    my $i = 0;
 #    while ($i <= $#text)
+#    while ($i <= $#split_text)
 #    {
 #   print $text->[$i] . "\n";
+#   print $split_text[$i] . "\n";
 #   $i++;
 #    }
     my $first = 0;
     my $last = $#text +1;
+    my $last = $#split_text +1;
     my $acro_counter = $first;
     while ($acro_counter < $last)
+    {
+    my $word = $text[$acro_counter];
+    # the tests on the following line are VERY important
+        # to the performance of this algorithm be VERY careful
+        # when relaxing them...
+    if (length $word >= 3 && (uc($word) eq $word))
+    {
+        my $def_counter = 0;
+        if ($acro_counter - $max_offset > 0)
+    my $word = $split_text[$acro_counter];
+    if ((!$upper_case) ||
+        (uc($word) eq $word))
+    {
+        if (length $word >= $min_acro_length)
+        {
+        $def_counter = $acro_counter - $max_offset;
+        my $def_counter = 0;
+        if ($acro_counter - $max_offset > 0)
+        {
+            $def_counter = $acro_counter - $max_offset;
+        }
+        my $local_last = $acro_counter  + $max_offset;
+        if ($local_last > $last)
+        {
+            $local_last = $last;
+        }
+        while ($def_counter <= $local_last)
+        {
+            &recurse($acro_counter,$def_counter,0,());
+            $def_counter++;
+        }
+        }
-        my $local_last = $acro_counter  + $max_offset;
-        if ($local_last > $last)
+        {
-        $local_last = $last;
+        }
-        while ($def_counter <= $local_last)
+        {
-        my $letter_counter = 0;
-        my $match = 1;
-        while ($letter_counter < length($word))
+        {
-            if ($def_counter+$letter_counter >= $local_last)
+            {
-            $match = 0;
-            last;
+            }
-            my $def_word = $text[$def_counter+$letter_counter];
-            #throw it out if it's recursing...
-            if (uc($word) eq uc($def_word))
+            {
-            $match = 0;
-            last
+            }
-            if (substr($word, $letter_counter, 1) ne substr($def_word, 0, 1))
+            {
-            $match = 0;
-            last;
+            }
-            $letter_counter++;
+        }
-        # this line should perhaps be more sophisticated ...
-                # it encodes what we consider to be a valid acronym
-        if ($match == 1 && $letter_counter > 0 &&
-            (abs($def_counter - $acro_counter)< $max_offset))
+        {
-            my $acro = new acronym();
-            $acro->[0] = $word;
-            push @{$acro->[1]}, @text[$def_counter .. $def_counter + $letter_counter - 1 ];
-            $acro->[3] = $letter_counter;
-#           my @tmp = ( $acro );
-            push @acro_list, ( $acro );
-#           print $acro->to_string(). "\n";
-            $match = 0;
+        }
-        $def_counter++;
+        }
+    }
     $acro_counter++;
+    }
     return \@acro_list;
+    return \@acronym_list;
+}
 …
     push @{$tla->[1]}, ("Three" );
     push @{$tla->[1]}, ("Letter" );
-    push @{$tla->[1]}, ("Letter" );
     push @{$tla->[1]}, ("Acronym" );
     print $tla->to_string(). "\n";
 …
     print "\n";
+    print "Testing recursion ...\n";
+    my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
+    foreach my $acro (@$acros)
+    {
+    if ($acro->check)
+        {
+           print "accepted: " .$acro->to_string() . "\n";
+#           print "|" .  $acro->to_acronym() . "|" .  $acro->to_def_string() .
+#              "|" .  $acro->words_in_acronym_definition() .
+#              "|" .  $acro->letters_in_acronym_definition() .
+#              "|" .  $acro->letters_in_acronym() . "|\n";
+        } else {
+#          print "found but rejected: " .$acro->to_string() . "\n";
+        }
+    }
+}
 …
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1361

Legend:

trunk/gsdl/perllib/acronym.pm

Download in other formats: