########################################################################### # # acronym.pm -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # class to hold acronyms use strict; #use diagnostics; package acronym; #use Class::Struct; sub new { my $self = [ "", # 0 acronym [], # 1 definition [], # 2 stop_words 0, # 3 letters_for_far ]; bless $self; } #struct ( # # core items # acronym => '$', # the acronym (a string) # definition => '@', # the acronyms defintion (an array of strings) # stop_words => '@', # an array of 1 (stop word) and 0 (non stop word) # # # items related to the context in which the acronym was mined # word_count => '$', # the index of the acronym within the text # definition_offset => '$', # the distance between the acronym and the definition. # # #temporary items used during the mining of the acronym # letters_for_far => '$', # how many letters have we found so far ? # # #calculated items # #... # # ); #sub definition { # my $self = shift (@_); # my @def = @$self->[2]; # print "definition::\@de+f = " . @def . "\n"; # print "definition::\@_ = " . @_ . "\n"; # if (@_) { # push @def, @_; # @$self->[2] = @def; # } # return @def; #} sub clone { my $self = shift (@_); my $copy = new acronym; $copy->[0] = $self->[0]; push @{$copy->[1]}, @{$self->[1]}; push @{$copy->[2]}, @{$self->[2]}; $copy->[3] = $self->[3]; $copy->[4] = $self->[4]; $copy->[5] = $self->[5]; $copy->[6] = $self->[6]; return $copy; } #sub initialise { # my $self = shift (@_); # # # initialise the struct from the parameters ... # my($acro, $wc, $def) = @_; # $self->acronym($acro); # $self->word_count($wc); # $self->definition_offset($def); # # $self->letters_for_far(0); #} #print out the kwic for the acronym sub to_string_kwic { my $self = shift (@_); # the list of all possible combinations my @list = (); my $result = ""; my $j = 0; my @array = @{$self->[1]}; while ($j <= $#array) { $result = ""; # do the definition my $i = 0; while ($i <= $#array) { my $current = ($i + $j) % ($#array+1); $result = $result . $array[$current] . " "; $i++; } $result = $result . "(" . $self->[0] . ")"; push @list, $result; $j++; } return @list; } #this is the one used when building the collection ... sub to_string { my $self = shift (@_); my $result = $self->[0] . " "; # do the definition my @array = @{$self->[1]}; my $i = 0; while ($i <= $#array) { my $resultnext = $result . $array[$i] . " "; $result = $resultnext; $i++; } return $result; } # called when the acronym is complete and after altering any stats to compute stats etc. sub stablise { } sub acronyms { my $processed_text = shift @_; $$processed_text =~ s/[^A-Za-z]/ /g; $$processed_text =~ s/\s+/ /g; return &acronyms_from_clean_text($processed_text) } sub acronyms_from_clean_text { my ($processed_text) = @_; my @acro_list = (); my @text = split / /, $$processed_text; # my $i = 0; # while ($i <= $#text) # { # print $text->[$i] . "\n"; # $i++; # } my $first = 0; my $last = $#text +1; my $acro_counter = $first; while ($acro_counter < $last) { my $word = $text[$acro_counter]; # the tests on the following line are VERY important to the performance of this algorithm # be VERY careful when relaxing them... if (length $word >= 3 && (uc($word) eq $word)) { my $def_counter = 0; while ($def_counter <= $last) { my $letter_counter = 0; my $match = 1; while ($letter_counter < length($word)) { if ($def_counter+$letter_counter >= $last) { $match = 0; last; } my $def_word = $text[$def_counter+$letter_counter]; #throw it out if it's recursing... if (uc($word) eq uc($def_word)) { $match = 0; last } if (substr($word, $letter_counter, 1) ne substr($def_word, 0, 1)) { $match = 0; last; } $letter_counter++; } # this line should perhaps be more sophisticated ... it encodes what we consider # to be a valid acronym if ($match == 1 && $letter_counter > 0 && (abs($def_counter - $acro_counter)< 50)) { my $acro = new acronym(); $acro->[0] = $word; push @{$acro->[1]}, @text[$def_counter .. $def_counter + $letter_counter - 1 ]; $acro->[3] = $letter_counter; # my @tmp = ( $acro ); push @acro_list, ( $acro ); # print $acro->to_string(). "\n"; $match = 0; } $def_counter++; } } $acro_counter++; } return \@acro_list; } sub test { # my $blarg = new acronym(); # my $simple; # $simple = 10; # $blarg->initialise($simple, $simple, $simple); # my $blarg2 = $blarg->clone(); # print $blarg->to_string(); # print $blarg2; # print "\n"; # my $tla = new acronym(); $tla->[0] = "TLA"; my @array = ("Three", "Letter", "Acronym"); # my $i = 0; # while ($i <= $#array) # { # print @array[$i] . "\n"; # $i++; # } print "\n"; push @{$tla->[1]}, ("Three" ); push @{$tla->[1]}, ("Letter" ); push @{$tla->[1]}, ("Letter" ); push @{$tla->[1]}, ("Acronym" ); print $tla->to_string(). "\n"; print "\n"; print "\n"; my $tla2 = $tla->clone(); push @{$tla2->[1]}, ("One"); push @{$tla2->[1]}, ("Two"); $tla2->[0] = "ALT"; print $tla->to_string(). "\n"; print $tla2->to_string(). "\n"; print "\n"; print "\n"; } #uncomment this line to test this package #&test(); 1;