package Kea;

use strict;
use warnings;

# Keyphrase extraction front end for the stand-alone Kea program.
#
# extract_KeyPhrases is called by BasPlug.pm when a flag in a collection
# configuration document specifies that keyphrase metadata must be gathered
# for that collection.
#
# Arguments:
#   $doc  - the document's text (HTML tags are stripped before extraction).
#   $args - optional string of Kea options separated by spaces. Each option
#           is written as "X" or "X,value" (no leading dash; it is added
#           here). The "E" option sets the suffix of Kea's output file, which
#           is otherwise "kea".
#
# Kea expects a file argument, so the text is written to
# $GSDLHOME/tmp/doc.txt, Kea is run on that file, and its output is read
# back from $GSDLHOME/tmp/doc.<suffix>. Each output line is taken to be
# "keyphrase<TAB>stem".
#
# Returns a two-element list: the keyphrases joined with ", " and the stems
# joined with ", ". Returns an empty list when the output file does not
# exist, either because an option was wrongly specified or because no
# keyphrases were found.
#
# Dies if doc.txt cannot be created or written.
sub extract_KeyPhrases {
    my $doc  = shift(@_);    # document's text
    my $args = shift(@_);    # any options (may be undef)

    my $gsdlhome = defined $ENV{'GSDLHOME'} ? $ENV{'GSDLHOME'} : '';
    $doc = '' unless defined $doc;

    # A plain conditional expression is used here instead of
    # "my @list = ... if (...)", whose behaviour perlsyn documents as
    # undefined.
    my @optionlist = defined $args ? split(/ +/, $args) : ();

    my $suffix = 'kea';      # default output file will be called doc.kea
    my @kea_args;            # option words handed to Kea, one per element

    print STDERR "optionlist: @optionlist\n";
    foreach my $element (@optionlist) {
        # Split the option letter from its value (if a value was given).
        my ($option, $file) = split(/,/, $element);

        # A leading blank in $args, or a stray ",", produces an element with
        # no option letter; skip it rather than pass a bare "-" to Kea.
        next unless defined $option && length $option;

        $option = "-" . $option;
        $file   = "" unless defined $file;
        $suffix = $file if $option eq '-E';    # extension (suffix) option

        push @kea_args, $option;
        push @kea_args, $file if length $file;
    }
    print STDERR "Using output suffix: $suffix\n";

    # Remove all HTML tags. Paragraph and heading-style tags become line
    # breaks, every other tag is simply dropped.
    # NOTE(review): the first two patterns were garbled in the copy this was
    # recovered from (only "]*>" survived); "<p" and "<h" are a
    # reconstruction - confirm against the upstream Kea.pm.
    $doc =~ s/<p[^>]*>/\n/sgi;
    $doc =~ s/<h[^>]*>/\n/sgi;
    $doc =~ s/<[^>]*>//sgi;
    $doc =~ tr/\n/\n/s;      # squeeze runs of newlines into one

    # Write the text to a file, because Kea expects a file argument.
    my $text_path = "$gsdlhome/tmp/doc.txt";
    open(my $out, '>', $text_path)
        or die "In Kea.pm doc.txt could not be created\n";
    print {$out} $doc;
    # Buffered write errors only surface at close, so check it.
    close($out)
        or die "In Kea.pm doc.txt could not be written\n";

    # Call Kea with the specified options. The list form bypasses the shell,
    # so option values and paths containing spaces or metacharacters are
    # passed through verbatim. Kea's standard output is read and discarded,
    # which is what the earlier backtick call did.
    my @kea_command = ("$gsdlhome/perllib/Kea-1.1.4/Kea", @kea_args, $text_path);
    if (open(my $kea, '-|', @kea_command)) {
        while (defined(my $discarded = <$kea>)) { }
        close($kea);
    }
    else {
        # Best effort: a missing output file is reported to the caller below
        # as "no keyphrases".
        print STDERR "Kea.pm: could not run $kea_command[0]: $!\n";
    }

    # Read doc.<suffix> with the keywords. If it cannot be opened it does not
    # exist, either because an option was wrongly specified or because no
    # keyphrases were found.
    my $result_path = "$gsdlhome/tmp/doc.$suffix";
    open(my $in, '<', $result_path) or return;

    my @keylist;
    my @stemlist;
    while (defined(my $line = <$in>)) {
        chomp $line;
        my ($key, $stem) = split(/\t/, $line);
        next unless defined $key && length $key;    # blank line
        push @keylist,  $key;
        push @stemlist, defined $stem ? $stem : '';
    }
    close($in);

    # Delete doc.<suffix> so that a later call does not read stale data.
    unlink $result_path;

    # Return keywords + stems to BasPlug in the format it expects.
    return (join(", ", @keylist), join(", ", @stemlist));
}

1;