source: trunk/gsdl/perllib/Kea.pm@ 9674

Last change on this file since 9674 was 9409, checked in by jrm21, 19 years ago

use unlink() instead of rm

  • Property svn:keywords set to Author Date Id Revision
File size: 3.9 KB
Line 
1package Kea;
2
# This function is called by BasPlug.pm when a flag in a collection
# configuration document specifies that keyphrase metadata must be gathered
# for that collection.
# It is passed the document's text and, optionally, a string of options
# describing how the keyphrase data is to be collected (taken from the
# keyphrase option flag in the collection configuration file).
# The module writes the document's text to a file in a temporary directory,
# because the stand-alone program Kea, which does the actual keyphrase
# extraction, expects a directory containing one or more files as its
# argument.
# Once Kea has run, the file containing the keyphrases it gathered should be
# in gsdl/tmp. That file is read, and the keyphrases are extracted and passed
# back to BasPlug.pm in an appropriate format.
15
# Extract keyphrases from a document by running the stand-alone KEA program.
#
# Arguments:
#   $doc  - the document's text; HTML tags are stripped before extraction.
#   $args - optional string of whitespace-separated options. A one-character
#           element "x" is passed to KEA as "-x"; a longer element such as
#           "n5" is split into its first character and the remainder and
#           passed as "-n 5". For the "m" (model) option the remainder names
#           a file below the KEA home directory; if that file does not exist
#           the default model is used instead. If no "m" option with a value
#           is given, the default model is appended.
#
# Returns a string containing comma-separated keyphrases, or "" if no
# doc.key file could be opened after KEA ran (for example because an option
# was wrongly specified or no keyphrases were found).
#
# Side effects: writes $GSDLHOME/tmp/doc.txt, runs java, and deletes
# $GSDLHOME/tmp/doc.key after reading it. Dies if doc.txt cannot be written.
sub extract_KeyPhrases {
    my ($doc, $args) = @_;
    $doc = "" unless defined($doc);

    # split(' ', ...) skips leading whitespace, so an option string such as
    # " n5" does not produce an empty first element (and a bogus "-" option).
    my @optionlist = defined($args) ? split(' ', $args) : ();

    # Specifying directory names
    my $gsdlhome     = $ENV{'GSDLHOME'};
    my $keahome      = "$gsdlhome/packages/kea/kea-3.0";
    my $defaultmodel = "$keahome/CSTR";
    my $tmpdir       = "$gsdlhome/tmp";
    my $textfile     = "$tmpdir/doc.txt";
    my $keyfile      = "$tmpdir/doc.key";

    # Settings for the java executable:
    my $java_classpath = ".:$keahome";

    # Use the java found on the PATH if there is one, otherwise fall back to
    # $JAVA_HOME/bin/java, and finally to a bare "java".
    my $java_exec = `which java 2>/dev/null`;
    if ($? == 0 && defined($java_exec) && $java_exec =~ /\S/) {
        chomp $java_exec;
    } elsif (defined($ENV{'JAVA_HOME'})) {
        $java_exec = "$ENV{'JAVA_HOME'}/bin/java";
    } else {
        $java_exec = "java";
    }

    # Parsing options for keyphrase extraction. Each option is kept as
    # separate list elements so that the command can be run without a shell.
    my @options   = ();
    my $modelspec = 0;
    foreach my $element (@optionlist) {
        if (length($element) == 1) {
            push(@options, "-$element");
            next;
        }

        my $option = substr($element, 0, 1);
        my $value  = substr($element, 1);
        if ($option eq "m") {
            $modelspec = 1;
            if (-e "$keahome/$value") {
                push(@options, "-m", "$keahome/$value");
            } else {
                print STDERR "Couldn't find model $value. Using the default model instead\n";
                push(@options, "-m", $defaultmodel);
            }
        } else {
            push(@options, "-$option", $value);
        }
    }

    # If none of the options specifies the model, set the default one.
    push(@options, "-m", $defaultmodel) unless $modelspec;

    # Remove all HTML tags from the original text. Tags starting with P or H
    # become line breaks; runs of newlines are then squeezed into one.
    $doc =~ s/<P[^>]*>/\n/sgi;
    $doc =~ s/<H[^>]*>/\n/sgi;
    $doc =~ s/<[^>]*>//sgi;
    $doc =~ tr/\n/\n/s;

    # Write text to a temporary file doc.txt
    open(my $out, '>', $textfile)
        or die "In Kea.pm doc.txt could not be created\n";
    print $out $doc;
    # Buffered write errors only surface at close, so check it.
    close($out) or die "In Kea.pm doc.txt could not be written\n";

    # EXECUTE KEA with specific options. The list form of system() bypasses
    # the shell, so paths and option values are passed through verbatim.
    my @command = ($java_exec, "-classpath", $java_classpath,
                   "KEAKeyphraseExtractor", "-l", $tmpdir, @options);
    system(@command);
    if ($? == -1) {
        print STDERR "Kea.pm: could not run $java_exec: $!\n";
    }

    # Read the resulting doc.key, which contains keyphrases. If it cannot be
    # opened, either an option was wrongly specified or no keyphrases were
    # found.
    open(my $in, '<', $keyfile) or return "";
    my @keylist;
    while (my $line = <$in>) {
        chomp $line;
        push(@keylist, $line);
    }
    close($in);

    # Delete doc.key so that in future it will not be opened and read.
    # Otherwise KEA sees it as more keyphrases!
    unlink($keyfile);

    return join(", ", @keylist);
}
123
1241;
Note: See TracBrowser for help on using the repository browser.