source: trunk/gsdl/perllib/Kea.pm@ 11069

Last change on this file since 11069 was 11069, checked in by mdewsnip, 18 years ago

Added an option to use Kea 4.0 -- this isn't included with Greenstone, but it will tell you where to get it.

  • Property svn:keywords set to Author Date Id Revision
File size: 4.3 KB
Line 
1package Kea;
2
3# This function is called by BasPlug.pm when a flag in a collection
4# configuration document specifies that keyphrase metadata must be gathered for
5# that collection.
6# It is passed as arguments, the documents text and possibly some options for
7# how the keyphrase data is to be collected if the keyphrase option flag was
8# set in the collection configuration file. This module then writes the
9# documents text to a file in a temporary directory because the stand-alone program Kea which will be
10# called to do the actual extraction of the keyphrases expects a directory with one or more files as argument.
11# Once Kea has been called upon, the file containing the keyphrase data
12# gathered by Kea should be stored in gsdl/tmp and this file is read, the data
13# we are interested in is extracted and passed back to BasPlug.pm in an
14# appropriate format.
15
16sub get_Kea_directory
17{
18 my $kea_version = shift(@_);
19 return &util::filename_cat($ENV{'GSDLHOME'}, "packages", "kea", "kea-$kea_version");
20}
21
22# returns a string containing comma-separated keyphrases
23sub extract_KeyPhrases {
24
25 # Parsing arguments of the function
26 my $kea_version = shift(@_);
27 my $doc = shift(@_); # documents text
28 my $args = shift(@_); # any options
29 my @optionlist = split(/\s+/, $args) if (defined($args)); #list of options
30
31 # Specifying directory names
32 my $keahome = &get_Kea_directory($kea_version);
33 my $defaultmodel = &util::filename_cat($keahome, "CSTR-20");
34 if ($kea_version eq "4.0") {
35 # Use a different model for Kea 4.0
36 $defaultmodel = &util::filename_cat($keahome, "FAO-20docs");
37 }
38
39 # Initializing variables:
40 my $command = "";
41 my @keylist;
42 my @options = ();
43 $modelspec = 0;
44
45 # Settings for the java executable:
46
47 # CLASSPATH:
48 $java_classpath = ".:$keahome";
49
50 # See if java executable is on path
51 my $java_exec="";
52 if (system("which java >/dev/null 2>/dev/null")==0) {
53 $java_exec=`which java`;
54 chomp $java_exec;
55 } else {
56 $java_exec="$java_home/bin/java";
57 }
58
59 # The actual java command is based on these other variables:
60 $java_command = "$java_exec -classpath \"$java_classpath\"";
61
62 # end of java settings
63
64 # Parsing options for keyphrase extraction:
65 if (@optionlist) {
66 foreach $element (@optionlist){ #for each option
67 if (length($element) == 1) {
68 push(@options, "-$element");
69 } else {
70 $option = substr($element, 0, 1);
71 $value = substr($element,1);
72 if (($option eq "m") && (-e "$keahome/$value")) {
73 $modelspec = 1;
74 push(@options, "-$option $keahome/$value");
75 } elsif ($option eq "m") {
76 $modelspec = 1;
77 print STDERR "Couldn't find model $value. Using the default model instead\n";
78 push(@options, "-$option $defaultmodel");
79 } else {
80 push(@options, "-$option $value");
81 }
82
83 }
84 }
85 # if none of the option specifies the model, set the default one:
86 if ($modelspec != 1) {
87 push(@options, "-m $defaultmodel");
88 }
89 $options = join(" ",@options);
90 # print STDERR "OPTIONS: $options\n";
91 } else {
92 # If no options were specified: Set default value for the model
93 $options = "-m $defaultmodel";
94 }
95
96 # Remove all HTML tags from the original text
97 $doc =~ s/<P[^>]*>/\n/sgi;
98 $doc =~ s/<H[^>]*>/\n/sgi;
99 $doc =~ s/<[^>]*>//sgi;
100 $doc =~ tr/\n/\n/s;
101
102 # Write text to a temporary file doc.txt
103 my $gsdlhome = $ENV{'GSDLHOME'};
104 open(OUT, ">$gsdlhome/tmp/doc.txt") or die "In Kea.pm doc.txt could not be created\n";
105 print OUT $doc;
106 close(OUT);
107
108
109 # EXECUTE KEA with specific options:
110 $command = "$java_command KEAKeyphraseExtractor -l $gsdlhome/tmp $options";
111 system ("$command");
112
113 # Read the resulting doc.key, which contains keyphrases:
114
115 open(IN, "<$gsdlhome/tmp/doc.key") or return "";
116 #this means doc.key does not exist
117 #either because an option was wrongly specified
118 #or no keyphrases were found
119 while(<IN>){
120 chomp;
121 push(@keylist,$_);
122 }
123 close(IN);
124
125 $keylist = join(", ", @keylist);
126
127 # Delete doc.key so that in future it will not be opened and read.
128 # Otherwise KEA sees it as more keyphrases!
129
130 unlink("$gsdlhome/tmp/doc.key");
131
132 return $keylist;
133}
134
1351;
Note: See TracBrowser for help on using the repository browser.