package Kea;

use strict;
use warnings;

# This module is used by BasPlug.pm when a flag in a collection configuration
# document specifies that keyphrase metadata must be gathered for that
# collection.
#
# The actual extraction is done by the stand-alone program Kea, which expects
# a file argument.  The document's text is therefore written to a temporary
# file under $GSDLHOME/tmp, Kea is run on it, and the file of keyphrase data
# that Kea is expected to leave in $GSDLHOME/tmp is read back, parsed and
# returned to BasPlug.pm in an appropriate format.

# extract_KeyPhrases($doc, $args)
#
#   $doc  - the document's text.  HTML tags and the &gt; / &lt; / &amp;
#           entities are stripped before the text is handed to Kea.
#   $args - optional string of space-separated Kea options.  Each option is
#           either "LETTER" or "LETTER,VALUE"; it is passed to Kea as
#           "-LETTER" followed by VALUE (if given).  Option "E,VALUE" also
#           changes the suffix of the output file that is read back
#           (default suffix: "kea").
#
# Returns a two-element list: the keyphrases and their stems, each as a
# single string joined with ", ".  Returns the empty list when the output
# file cannot be opened (an option was wrongly specified, Kea could not be
# run, or no keyphrases were found).
#
# Dies if the temporary document file cannot be created or written.
sub extract_KeyPhrases {
    my $doc  = shift(@_);    # document's text
    my $args = shift(@_);    # any options

    # An unset GSDLHOME is treated as an empty prefix, as it always has been.
    my $gsdlhome = defined($ENV{'GSDLHOME'}) ? $ENV{'GSDLHOME'} : '';
    $doc = '' unless defined($doc);

    # Split the option string; drop empty tokens produced by leading or
    # repeated spaces so they do not turn into a bare "-" argument.
    my @optionlist =
        defined($args) ? grep { length($_) } split(/ +/, $args) : ();

    my $suffix = 'kea';      # default output file is doc.kea
    my @kea_options;

    foreach my $element (@optionlist) {
        # Split option letter and its value (if a value was given).
        my ($option, $file) = split(/,/, $element);
        next unless defined($option) && length($option);

        push(@kea_options, "-$option");
        push(@kea_options, $file) if defined($file);

        # -E chooses the extension (suffix) of Kea's output file; only
        # honour it when a usable value was actually supplied.
        if ($option eq 'E' && defined($file) && length($file)) {
            $suffix = $file;
        }
    }

    # Remove all HTML tags.
    $doc =~ s/<[ph][^>]*>/\n/sgi;    # tags starting with p or h become newlines
    $doc =~ s/<[^>]*>/ /sgi;         # every other tag becomes a space
    # Replace the &gt; &lt; &amp; entities with a space.
    $doc =~ s/\&(?:gt|lt|amp)\;/ /gi;

    # Write the cleaned text where Kea can read it.
    my $tmpfile = "$gsdlhome/tmp/doc.txt";
    open(my $out, '>', $tmpfile)
        or die "Kea.pm could not create doc.txt: $!\n";
    print $out $doc;
    close($out)
        or die "Kea.pm could not write doc.txt: $!\n";

    # Call Kea with the specified options.  The list form of system() does
    # not go through a shell, so option values are passed through verbatim.
    {
        no warnings 'exec';    # a failed exec is reported explicitly below
        my $status =
            system("$gsdlhome/perllib/Kea-1.1.4/Kea", @kea_options, $tmpfile);
        if ($status == -1) {
            print STDERR "Kea.pm could not run Kea: $!\n";
        }
    }
    unlink($tmpfile);          # don't need this file anymore

    # Read doc.<suffix> with the keywords.  If this file doesn't exist, then
    # either an option was wrongly specified or no keyphrases were found.
    my $inputfile = "$gsdlhome/tmp/doc.$suffix";
    open(my $in, '<', $inputfile) or return ();

    my @keylist;
    my @stemlist;
    while (my $line = <$in>) {
        chomp($line);
        next unless length($line);    # ignore blank lines

        # Each line is "keyphrase<TAB>stem".
        my ($key, $stem) = split(/\t/, $line);
        push(@keylist,  defined($key)  ? $key  : '');
        push(@stemlist, defined($stem) ? $stem : '');
    }
    close($in);

    # Delete doc.<suffix> so that a later call will not open and read it.
    unlink($inputfile);

    # Return keywords + stems to BasPlug.
    return (join(", ", @keylist), join(", ", @stemlist));
}

1;