package Kea; # This function is called by BasPlug.pm when a flag in a collection # configuration document specifies that keyphrase metadata must be gathered for # that collection. # It is passed, as arguments, the document's text and, optionally, options for # how the keyphrase data is to be collected (if the keyphrase option flag was # set in the collection configuration file). This module then writes the # document's text to a file, because the stand-alone program Kea, which is # called to do the actual extraction of the keyphrases, expects a file argument. # Once Kea has run, the file containing the keyphrase data # gathered by Kea should be stored in gsdl/tmp; this file is read, and the data # we are interested in is extracted and passed back to BasPlug.pm in an # appropriate format. sub extract_KeyPhrases { my $gsdlhome = $ENV{'GSDLHOME'}; my $doc = shift(@_); #documents text my $args = shift(@_); #any options my @optionlist = split(/ +/, $args) if (defined($args)); #list of options my $suffix = 'kea'; #default file will be called .kea my $command = ""; my @keylist; my @stemlist; print STDERR "optionlist: @optionlist\n"; foreach $element (@optionlist){ #for each option my ($option, $file) = split(/,/, $element); #split option letter and file (if file exist) $option = "-".$option; #place dash in front of option $file = "" if(!defined($file)); #no file options specified $suffix = $file if($option eq '-E'); #if option is extension (suffix) option $command .= " $option $file "; #add to list of commands } print STDERR "Using output suffix: $suffix\n"; # remove all HTML tags $doc =~ s/
]*>/\n/sgi;
$doc =~ s/