package Kea;

use strict;
use warnings;

# Keyphrase extraction via the stand-alone Java program Kea.
#
# extract_KeyPhrases() is called by BasPlug.pm when a flag in a collection
# configuration file specifies that keyphrase metadata must be gathered for
# that collection.
#
# Kea is handed a directory rather than a single document, so the document
# text is first written to $GSDLHOME/tmp/doc.txt. Kea is then run on
# $GSDLHOME/tmp; the keyphrases it writes to $GSDLHOME/tmp/doc.key are read
# back, joined into one string and returned to the caller.

# Fallback Java installation directory, used only when no "java" executable
# is found on the PATH. Callers may set $Kea::java_home; when it is left
# undefined, $ENV{JAVA_HOME} is consulted instead.
our $java_home;

# extract_KeyPhrases($doc, $args)
#
#   $doc  - the document's text (may contain HTML markup, which is removed)
#   $args - optional whitespace-separated option string. Each option is a
#           single letter optionally followed by its value, e.g. "n5" is
#           passed to Kea as "-n 5". The option "m<name>" selects the model
#           file <name> inside the Kea directory; if that file does not
#           exist, or no model is given at all, the default model (CSTR)
#           is used.
#
# Returns a string of comma-separated keyphrases, or "" when Kea produced no
# doc.key file (an option was wrongly specified, Kea could not be run, or no
# keyphrases were found). Dies if doc.txt cannot be written.
sub extract_KeyPhrases {
    my ($doc, $args) = @_;
    $doc = '' unless defined $doc;

    # split(' ', ...) ignores leading whitespace, so no empty option is
    # produced for an option string such as " n5".
    my @optionlist = defined $args ? split(' ', $args) : ();

    # Directory and file names
    my $gsdlhome     = $ENV{'GSDLHOME'};
    my $keahome      = "$gsdlhome/packages/kea/kea-3.0";
    my $defaultmodel = "$keahome/CSTR";
    my $tmpdir       = "$gsdlhome/tmp";
    my $textfile     = "$tmpdir/doc.txt";
    my $keyfile      = "$tmpdir/doc.key";

    # Settings for the java executable
    my $java_classpath = ".:$keahome";

    # Prefer a java executable found on the PATH.
    my $java_exec;
    if (system("which java >/dev/null 2>/dev/null") == 0) {
        $java_exec = `which java`;
        chomp $java_exec;
    }
    else {
        my $home = defined $java_home ? $java_home : $ENV{'JAVA_HOME'};
        $java_exec = defined $home ? "$home/bin/java" : "java";
    }

    # Translate the option string into Kea command-line arguments. Flags and
    # their values are kept as separate list elements because Kea is started
    # without a shell (see below).
    my @options;
    my $modelspec = 0;
    for my $element (@optionlist) {
        if (length($element) == 1) {
            push @options, "-$element";
            next;
        }

        my $option = substr($element, 0, 1);
        my $value  = substr($element, 1);

        if ($option ne 'm') {
            push @options, "-$option", $value;
            next;
        }

        $modelspec = 1;
        if (-e "$keahome/$value") {
            push @options, '-m', "$keahome/$value";
        }
        else {
            print STDERR "Couldn't find model $value. Using the default model instead\n";
            push @options, '-m', $defaultmodel;
        }
    }

    # If none of the options specifies the model, use the default one.
    push @options, '-m', $defaultmodel unless $modelspec;

    # Remove all HTML tags from the original text. Paragraph and line-break
    # tags become newlines, every other tag is dropped, and runs of newlines
    # are squeezed into one.
    # NOTE(review): the first two patterns were garbled in the copy of this
    # file that was reviewed; they are reconstructed here as <p ...> and
    # <br ...> -- confirm against the upstream Kea.pm.
    $doc =~ s/<p\b[^>]*>/\n/sgi;
    $doc =~ s/<br\b[^>]*>/\n/sgi;
    $doc =~ s/<[^>]*>//sgi;
    $doc =~ tr/\n/\n/s;

    # Write the text to the temporary file doc.txt.
    open(my $out, '>', $textfile)
        or die "In Kea.pm doc.txt could not be created: $!\n";
    print {$out} $doc;
    close($out)
        or die "In Kea.pm doc.txt could not be created: $!\n";

    # A doc.key left behind by an earlier, aborted run must not be mistaken
    # for the result of this run.
    unlink($keyfile) if -e $keyfile;

    # EXECUTE KEA with the chosen options. The list form of system() bypasses
    # the shell, so paths containing spaces or shell metacharacters are
    # passed through intact. The exit status is deliberately not checked:
    # failure shows up as a missing doc.key below.
    system($java_exec, '-classpath', $java_classpath,
           'KEAKeyphraseExtractor', '-l', $tmpdir, @options);

    # Read the resulting doc.key, which contains one keyphrase per line.
    # If it does not exist, either an option was wrongly specified or no
    # keyphrases were found.
    open(my $in, '<', $keyfile) or return "";
    my @keylist;
    while (my $line = <$in>) {
        chomp $line;
        push @keylist, $line;
    }
    close($in);

    # Delete doc.key so that in future it will not be opened and read.
    # Otherwise KEA sees it as more keyphrases!
    unlink($keyfile);

    return join(", ", @keylist);
}

1;