source: trunk/gsdl/perllib/Kea.pm@ 8891

Last change on this file since 8891 was 8814, checked in by mdewsnip, 19 years ago

Updated files for Kea 3.0, thanks to Olena.

  • Property svn:keywords set to Author Date Id Revision
File size: 3.9 KB
Line 
1package Kea;
2
3# This function is called by BasPlug.pm when a flag in a collection
4# configuration document specifies that keyphrase metadata must be gathered for
5# that collection.
6# Its arguments are the document's text and, optionally, some options for
7# how the keyphrase data is to be collected if the keyphrase option flag was
8# set in the collection configuration file. This module then writes the
9# document's text to a file in a temporary directory because the stand-alone program Kea, which is
10# called to do the actual extraction of the keyphrases, expects a directory with one or more files as its argument.
11# Once Kea has run, the file containing the keyphrase data
12# it gathered should be in gsdl/tmp; this file is read, the data
13# we are interested in is extracted and passed back to BasPlug.pm in an
14# appropriate format.
15
# Extracts keyphrases from a document's text by running the stand-alone
# KEAKeyphraseExtractor java program on it.
#
# Arguments:
#   $doc  - the document's text. HTML tags are stripped before the text is
#           written to $GSDLHOME/tmp/doc.txt for Kea to read.
#   $args - optional string of whitespace-separated options. A one-letter
#           option "x" is passed to Kea as "-x"; a longer option such as
#           "xVALUE" is passed as "-x VALUE". For the "m" (model) option the
#           value is taken relative to the Kea home directory, and the default
#           model is used when that file does not exist or no "m" option is
#           given.
#
# Returns:
#   The lines of $GSDLHOME/tmp/doc.key joined with ", " as a single string,
#   or an empty list (0 in scalar context) if Kea produced no doc.key file.
#
# Dies if doc.txt cannot be created or written.
sub extract_KeyPhrases {

    # Parsing arguments of the function
    my $doc  = shift(@_);    # document's text
    my $args = shift(@_);    # any options

    # "my ... if ..." has undefined behaviour in Perl, so the list is built
    # with a ternary. split(' ', ...) also skips leading whitespace, which
    # would otherwise yield an empty first option.
    my @optionlist = defined($args) ? split(' ', $args) : ();

    # Specifying directory and file names
    my $gsdlhome     = $ENV{'GSDLHOME'};
    my $keahome      = "$gsdlhome/packages/kea/kea-3.0";
    my $defaultmodel = "$keahome/CSTR";
    my $tmpdir       = "$gsdlhome/tmp";
    my $textfile     = "$tmpdir/doc.txt";
    my $keyfile      = "$tmpdir/doc.key";

    # Returned when Kea leaves no doc.key behind. Kept as an (empty) array so
    # the caller sees exactly what it always has: () in list context and 0 in
    # scalar context.
    my @emptykeylist = ();

    # Settings for the java executable:

    # CLASSPATH:
    my $java_classpath = ".:$keahome";

    # See if the java executable is on the path
    my $java_exec = `which java 2>/dev/null`;
    my $which_status = $?;
    $java_exec = "" unless (defined($java_exec));
    chomp($java_exec);
    if ($which_status != 0 || $java_exec eq "") {
        # Not on the path: fall back to JAVA_HOME if it is set.
        $java_exec = defined($ENV{'JAVA_HOME'})
            ? "$ENV{'JAVA_HOME'}/bin/java"
            : "java";
    }

    # Parsing options for keyphrase extraction. Each option and its value are
    # kept as separate list elements so that the command can be run without
    # going through a shell.
    my @options   = ();
    my $modelspec = 0;
    foreach my $element (@optionlist) {
        if (length($element) == 1) {
            push(@options, "-$element");
            next;
        }

        my $option = substr($element, 0, 1);
        my $value  = substr($element, 1);
        if ($option eq "m") {
            $modelspec = 1;
            if (-e "$keahome/$value") {
                push(@options, "-m", "$keahome/$value");
            }
            else {
                print STDERR "Couldn't find model $value. Using the default model instead\n";
                push(@options, "-m", $defaultmodel);
            }
        }
        else {
            push(@options, "-$option", $value);
        }
    }

    # If none of the options specifies the model, set the default one
    push(@options, "-m", $defaultmodel) unless ($modelspec);

    # Remove all HTML tags from the original text; paragraph and heading
    # tags become line breaks, and runs of newlines are squeezed to one.
    $doc =~ s/<P[^>]*>/\n/sgi;
    $doc =~ s/<H[^>]*>/\n/sgi;
    $doc =~ s/<[^>]*>//sgi;
    $doc =~ tr/\n/\n/s;

    # Write text to a temporary file doc.txt
    open(my $out, '>', $textfile) or die "In Kea.pm doc.txt could not be created\n";
    print $out $doc;
    close($out) or die "In Kea.pm doc.txt could not be written\n";

    # A doc.key left over from an earlier run would be seen by KEA as author
    # keyphrases, and would be returned as this document's keyphrases if KEA
    # failed, so make sure none is present before running KEA.
    unlink($keyfile) if (-e $keyfile);

    # EXECUTE KEA with specific options. The list form of system() bypasses
    # the shell, so paths and option values need no quoting.
    my @command = ($java_exec, "-classpath", $java_classpath,
                   "KEAKeyphraseExtractor", "-l", $tmpdir, @options);
    if (system(@command) == -1) {
        print STDERR "Kea.pm: could not run $java_exec: $!\n";
    }

    # Read the resulting doc.key, which contains keyphrases.
    # If it does not exist, either an option was wrongly specified
    # or no keyphrases were found.
    open(my $in, '<', $keyfile) or return @emptykeylist;
    my @keylist;
    while (my $line = <$in>) {
        chomp($line);
        push(@keylist, $line);
    }
    close($in);

    # Delete doc.key so that in future it will not be opened and read.
    # Otherwise KEA sees it as author keyphrases!
    unlink($keyfile);

    return join(", ", @keylist);
}
122
1231;
Note: See TracBrowser for help on using the repository browser.