1 | package Kea;
|
---|
2 |
|
---|
3 | # This function is called by BasPlug.pm when a flag in a collection
|
---|
4 | # configuration document specifies that keyphrase metadata must be gathered for
|
---|
5 | # that collection.
|
---|
6 | # It is passed as arguments, the documents text and possibly some options for
|
---|
7 | # how the keyphrase data is to be collected if the keyphrase option flag was
|
---|
8 | # set in the collection configuration file. This module then writes the
|
---|
9 | # documents text to a file in a temporary directory because the stand-alone program Kea which will be
|
---|
10 | # called to do the actual extraction of the keyphrases expects a directory with one or more files as argument.
|
---|
11 | # Once Kea has been called upon, the file containing the keyphrase data
|
---|
12 | # gathered by Kea should be stored in gsdl/tmp and this file is read, the data
|
---|
13 | # we are interested in is extracted and passed back to BasPlug.pm in an
|
---|
14 | # appropriate format.
|
---|
15 |
|
---|
16 | sub get_Kea_directory
|
---|
17 | {
|
---|
18 | my $kea_version = shift(@_);
|
---|
19 | return &util::filename_cat($ENV{'GSDLHOME'}, "packages", "kea", "kea-$kea_version");
|
---|
20 | }
|
---|
21 |
|
---|
22 | # returns a string containing comma-separated keyphrases
|
---|
23 | sub extract_KeyPhrases {
|
---|
24 |
|
---|
25 | # Parsing arguments of the function
|
---|
26 | my $kea_version = shift(@_);
|
---|
27 | my $doc = shift(@_); # documents text
|
---|
28 | my $args = shift(@_); # any options
|
---|
29 | my @optionlist = split(/\s+/, $args) if (defined($args)); #list of options
|
---|
30 |
|
---|
31 | # Specifying directory names
|
---|
32 | my $keahome = &get_Kea_directory($kea_version);
|
---|
33 | my $defaultmodel = &util::filename_cat($keahome, "CSTR-20");
|
---|
34 | if ($kea_version eq "4.0") {
|
---|
35 | # Use a different model for Kea 4.0
|
---|
36 | $defaultmodel = &util::filename_cat($keahome, "FAO-20docs");
|
---|
37 | }
|
---|
38 |
|
---|
39 | # Initializing variables:
|
---|
40 | my $command = "";
|
---|
41 | my @keylist;
|
---|
42 | my @options = ();
|
---|
43 | $modelspec = 0;
|
---|
44 |
|
---|
45 | # Settings for the java executable:
|
---|
46 |
|
---|
47 | # CLASSPATH:
|
---|
48 | $java_classpath = ".:$keahome";
|
---|
49 |
|
---|
50 | # See if java executable is on path
|
---|
51 | my $java_exec="";
|
---|
52 | if (system("which java >/dev/null 2>/dev/null")==0) {
|
---|
53 | $java_exec=`which java`;
|
---|
54 | chomp $java_exec;
|
---|
55 | } else {
|
---|
56 | $java_exec="$java_home/bin/java";
|
---|
57 | }
|
---|
58 |
|
---|
59 | # The actual java command is based on these other variables:
|
---|
60 | $java_command = "$java_exec -classpath \"$java_classpath\"";
|
---|
61 |
|
---|
62 | # end of java settings
|
---|
63 |
|
---|
64 | # Parsing options for keyphrase extraction:
|
---|
65 | if (@optionlist) {
|
---|
66 | foreach $element (@optionlist){ #for each option
|
---|
67 | if (length($element) == 1) {
|
---|
68 | push(@options, "-$element");
|
---|
69 | } else {
|
---|
70 | $option = substr($element, 0, 1);
|
---|
71 | $value = substr($element,1);
|
---|
72 | if (($option eq "m") && (-e "$keahome/$value")) {
|
---|
73 | $modelspec = 1;
|
---|
74 | push(@options, "-$option $keahome/$value");
|
---|
75 | } elsif ($option eq "m") {
|
---|
76 | $modelspec = 1;
|
---|
77 | print STDERR "Couldn't find model $value. Using the default model instead\n";
|
---|
78 | push(@options, "-$option $defaultmodel");
|
---|
79 | } else {
|
---|
80 | push(@options, "-$option $value");
|
---|
81 | }
|
---|
82 |
|
---|
83 | }
|
---|
84 | }
|
---|
85 | # if none of the option specifies the model, set the default one:
|
---|
86 | if ($modelspec != 1) {
|
---|
87 | push(@options, "-m $defaultmodel");
|
---|
88 | }
|
---|
89 | $options = join(" ",@options);
|
---|
90 | # print STDERR "OPTIONS: $options\n";
|
---|
91 | } else {
|
---|
92 | # If no options were specified: Set default value for the model
|
---|
93 | $options = "-m $defaultmodel";
|
---|
94 | }
|
---|
95 |
|
---|
96 | # Remove all HTML tags from the original text
|
---|
97 | $doc =~ s/<P[^>]*>/\n/sgi;
|
---|
98 | $doc =~ s/<H[^>]*>/\n/sgi;
|
---|
99 | $doc =~ s/<[^>]*>//sgi;
|
---|
100 | $doc =~ tr/\n/\n/s;
|
---|
101 |
|
---|
102 | # Write text to a temporary file doc.txt
|
---|
103 | my $gsdlhome = $ENV{'GSDLHOME'};
|
---|
104 | open(OUT, ">$gsdlhome/tmp/doc.txt") or die "In Kea.pm doc.txt could not be created\n";
|
---|
105 | print OUT $doc;
|
---|
106 | close(OUT);
|
---|
107 |
|
---|
108 |
|
---|
109 | # EXECUTE KEA with specific options:
|
---|
110 | $command = "$java_command KEAKeyphraseExtractor -l $gsdlhome/tmp $options";
|
---|
111 | system ("$command");
|
---|
112 |
|
---|
113 | # Read the resulting doc.key, which contains keyphrases:
|
---|
114 |
|
---|
115 | open(IN, "<$gsdlhome/tmp/doc.key") or return "";
|
---|
116 | #this means doc.key does not exist
|
---|
117 | #either because an option was wrongly specified
|
---|
118 | #or no keyphrases were found
|
---|
119 | while(<IN>){
|
---|
120 | chomp;
|
---|
121 | push(@keylist,$_);
|
---|
122 | }
|
---|
123 | close(IN);
|
---|
124 |
|
---|
125 | $keylist = join(", ", @keylist);
|
---|
126 |
|
---|
127 | # Delete doc.key so that in future it will not be opened and read.
|
---|
128 | # Otherwise KEA sees it as more keyphrases!
|
---|
129 |
|
---|
130 | unlink("$gsdlhome/tmp/doc.key");
|
---|
131 |
|
---|
132 | return $keylist;
|
---|
133 | }
|
---|
134 |
|
---|
135 | 1;
|
---|