1 | package Kea;
|
---|
2 |
|
---|
# This function is called by BasPlug.pm when a flag in a collection
# configuration document specifies that keyphrase metadata must be gathered for
# that collection.
# It is passed, as arguments, the document's text and, if the keyphrase option
# flag was set in the collection configuration file, some options controlling
# how the keyphrase data is to be collected. This module then writes the
# document's text to a file in a temporary directory, because the stand-alone
# program Kea, which is called to do the actual extraction of the keyphrases,
# expects a directory containing one or more files as its argument.
# Once Kea has run, the file containing the keyphrase data it gathered should
# be stored in gsdl/tmp; this file is read, the data we are interested in is
# extracted, and it is passed back to BasPlug.pm in an appropriate format.
|
---|
15 |
|
---|
16 | # returns a string containing comma-separated keyphrases
|
---|
# extract_KeyPhrases($doc, $args)
#
# Runs the stand-alone Kea keyphrase extractor over a single document.
#
#   $doc   - the document's text; HTML tags are stripped before extraction.
#   $args  - optional whitespace-separated option string. A one-character
#            element "x" becomes the Kea switch "-x"; a longer element "xVALUE"
#            becomes "-x VALUE". For "m" (model) the value is looked up
#            relative to the Kea home directory, and the default model is
#            used if that file does not exist.
#
# Side effects: writes $GSDLHOME/tmp/doc.txt, runs java, and reads and then
# deletes $GSDLHOME/tmp/doc.key.
#
# Returns the keyphrases joined with ", ", or "" if doc.key could not be
# opened after running Kea.
# Dies if doc.txt cannot be created or written.
sub extract_KeyPhrases {

    # Parsing arguments of the function
    my $doc  = shift(@_);    # document's text
    my $args = shift(@_);    # any options

    # Declare first, assign conditionally: "my ... if ..." has undefined
    # behaviour. split(' ', ...) also skips leading whitespace, so an option
    # string such as " n5" no longer yields an empty first element.
    my @optionlist = ();
    @optionlist = split(' ', $args) if (defined($args));

    # Specifying directory and file names
    my $gsdlhome     = $ENV{'GSDLHOME'};
    my $keahome      = "$gsdlhome/packages/kea/kea-3.0";
    my $defaultmodel = "$keahome/CSTR";
    my $tmpdir       = "$gsdlhome/tmp";
    my $textfile     = "$tmpdir/doc.txt";
    my $keyfile      = "$tmpdir/doc.key";

    # Settings for the java executable
    my $java_classpath = ".:$keahome";
    my $java_exec      = _find_java_executable();

    # Parsing options for keyphrase extraction. Each switch and its value are
    # kept as separate list elements so that they can be handed to system()
    # without passing through a shell.
    my @options   = ();
    my $modelspec = 0;
    foreach my $element (@optionlist) {
        if (length($element) == 1) {
            push(@options, "-$element");
            next;
        }

        my $option = substr($element, 0, 1);
        my $value  = substr($element, 1);

        if ($option eq "m") {
            $modelspec = 1;
            if (-e "$keahome/$value") {
                push(@options, "-m", "$keahome/$value");
            } else {
                print STDERR "Couldn't find model $value. Using the default model instead\n";
                push(@options, "-m", $defaultmodel);
            }
        } else {
            push(@options, "-$option", $value);
        }
    }

    # If none of the options specifies the model (or no options were given
    # at all), use the default one.
    push(@options, "-m", $defaultmodel) if (!$modelspec);

    # Remove all HTML tags from the original text; paragraph and H* tags
    # become line breaks, and runs of newlines are squeezed into one.
    $doc =~ s/<P[^>]*>/\n/sgi;
    $doc =~ s/<H[^>]*>/\n/sgi;
    $doc =~ s/<[^>]*>//sgi;
    $doc =~ tr/\n/\n/s;

    # Write text to a temporary file doc.txt
    open(my $out, '>', $textfile) or die "In Kea.pm doc.txt could not be created\n";
    print $out $doc;
    # Buffered write errors only surface at close, so check it.
    close($out) or die "In Kea.pm doc.txt could not be written\n";

    # Remove any doc.key left behind by an interrupted run, so that a failed
    # Kea invocation cannot make us return another document's keyphrases.
    unlink($keyfile);

    # EXECUTE KEA with specific options. The list form of system() bypasses
    # the shell, so option values and paths are passed through verbatim.
    system($java_exec, "-classpath", $java_classpath,
           "KEAKeyphraseExtractor", "-l", $tmpdir, @options);

    # Read the resulting doc.key, which contains keyphrases. If it does not
    # exist, either an option was wrongly specified or no keyphrases were
    # found.
    open(my $in, '<', $keyfile) or return "";
    my @keylist = ();
    while (my $line = <$in>) {
        chomp($line);
        push(@keylist, $line);
    }
    close($in);

    # Delete doc.key so that in future it will not be opened and read.
    # Otherwise KEA sees it as more keyphrases!
    unlink($keyfile);

    return join(", ", @keylist);
}

# Locates the java executable used to run Kea.
# Returns the first regular, executable file named "java" found in the
# directories of PATH; failing that, $JAVA_HOME/bin/java when JAVA_HOME is
# set; otherwise the bare name "java".
sub _find_java_executable {
    require File::Spec;

    foreach my $dir (File::Spec->path()) {
        my $candidate = File::Spec->catfile($dir, "java");
        return $candidate if (-f $candidate && -x _);
    }

    my $java_home = $ENV{'JAVA_HOME'};
    if (defined($java_home) && $java_home ne "") {
        return "$java_home/bin/java";
    }

    return "java";
}
|
---|
123 |
|
---|
124 | 1;
|
---|