1 | package Kea;
|
---|
2 |
|
---|
3 | use strict;
|
---|
4 |
|
---|
5 | use util;
|
---|
6 | use FileUtils;
|
---|
7 |
|
---|
8 | # This function is called by BasPlug.pm when a flag in a collection
|
---|
9 | # configuration document specifies that keyphrase metadata must be gathered for
|
---|
10 | # that collection.
|
---|
11 | # It is passed, as arguments, the document's text and possibly some options for
|
---|
12 | # how the keyphrase data is to be collected if the keyphrase option flag was
|
---|
13 | # set in the collection configuration file. This module then writes the
|
---|
14 | # document's text to a file in a temporary directory because the stand-alone program Kea, which will be
|
---|
15 | # called to do the actual extraction of the keyphrases expects a directory with one or more files as argument.
|
---|
16 | # Once Kea has been called upon, the file containing the keyphrase data
|
---|
17 | # gathered by Kea should be stored in gsdl/tmp and this file is read, the data
|
---|
18 | # we are interested in is extracted and passed back to BasPlug.pm in an
|
---|
19 | # appropriate format.
|
---|
20 |
|
---|
# Locates the installation directory of the requested Kea version.
#
# Arguments:
#   the Kea version string, appended to "kea-" to form the directory name.
#
# Returns the path under $GSDLHOME/build-src/packages/kea when that directory
# exists; otherwise the path under $GSDLHOME/packages/kea (which is returned
# without being checked for existence).
sub get_Kea_directory
{
    my ($version) = @_;

    my $gsdl_home = $ENV{'GSDLHOME'};
    my $kea_dirname = 'kea-' . $version;

    # Preferred location: installations where the packages live under build-src
    my $split_layout_path = &util::filename_cat($gsdl_home, 'build-src', 'packages', 'kea', $kea_dirname);
    if (&FileUtils::directoryExists($split_layout_path)) {
        return $split_layout_path;
    }

    # Legacy support for systems that predate the splitting of packages
    my $legacy_layout_path = &util::filename_cat($gsdl_home, 'packages', 'kea', $kea_dirname);
    return $legacy_layout_path;
}
|
---|
32 |
|
---|
# Extracts keyphrases from a document's text by running the stand-alone Kea
# program over it.
#
# Arguments (positional):
#   $kea_version - Kea version string; selects the Kea directory and, when it
#                  is "4.0", the "FAO-20docs" default model instead of "CSTR-20".
#   $doc         - the document's text; HTML tags are stripped before it is
#                  handed to Kea.
#   $args        - optional whitespace-separated Kea options, each written as
#                  the option letter immediately followed by its value (for
#                  example "mCSTR-20"); a lone letter is passed as a bare flag.
#
# Returns a string containing comma-separated keyphrases, or "" when no
# doc.key file could be read after running Kea.
# Dies if the temporary doc.txt file cannot be written.
sub extract_KeyPhrases
{
    my $kea_version = shift(@_);
    my $doc = shift(@_);  # Document's text
    my $args = shift(@_); # Options

    # Set default models (Kea 4.0 ships a different default model)
    my $kea_home = &get_Kea_directory($kea_version);
    my $default_model_name = ($kea_version eq "4.0") ? "FAO-20docs" : "CSTR-20";
    my $default_model_path = &util::filename_cat($kea_home, $default_model_name);

    # Parse the Kea options
    my @kea_options = &_get_Kea_option_list($kea_home, $default_model_path, $args);

    # Remove all HTML tags from the original text. Tags starting with P or H
    # become line breaks, every other tag is dropped, and runs of newlines
    # are squeezed into one.
    $doc =~ s/<P[^>]*>/\n/sgi;
    $doc =~ s/<H[^>]*>/\n/sgi;
    $doc =~ s/<[^>]*>//sgi;
    $doc =~ tr/\n/\n/s;

    my $tmp_directory_path = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
    my $doc_txt_file_path = &util::filename_cat($tmp_directory_path, "doc.txt");
    my $doc_key_file_path = &util::filename_cat($tmp_directory_path, "doc.key");

    # A doc.key left behind by an aborted earlier run must not be picked up,
    # either by Kea (which sees it as more keyphrases) or by the read below.
    unlink($doc_key_file_path) if (-e $doc_key_file_path);

    # Write text to a temporary file doc.txt. The close is checked as well,
    # because buffered write errors only surface there.
    open(my $doc_txt_fh, '>:utf8', $doc_txt_file_path) or die "Error: Could not write $doc_txt_file_path in Kea.pm.\n";
    print $doc_txt_fh $doc;
    close($doc_txt_fh) or die "Error: Could not write $doc_txt_file_path in Kea.pm.\n";

    # Run Kea with the specified options. The list form of system() bypasses
    # the shell, so paths containing spaces and option values containing shell
    # metacharacters reach Kea unchanged.
    # NOTE(review): "taskset -c 7" pins the process to CPU 7, which presumably
    # only works on Linux hosts with at least eight CPUs -- confirm this is intended.
    my @cmd = ('taskset', '-c', '7',
               'java', '-classpath', $kea_home, 'KEAKeyphraseExtractor',
               '-l', $tmp_directory_path,
               @kea_options);
    system(@cmd);
    if ($? == -1) {
        print STDERR "Warning: Could not run Kea (" . join(' ', @cmd) . "): $!\n";
    }

    # Read the resulting doc.key file which contains the keyphrases
    my $doc_key_fh;
    if (!open($doc_key_fh, '<', $doc_key_file_path)) {
        # The doc.key file does not exist (either an option was wrongly specified, or no keyphrases were found)
        return "";
    }
    my @keyphrase_list = <$doc_key_fh>;
    chomp(@keyphrase_list);
    close($doc_key_fh);

    # Delete doc.key so that in future it will not be opened and read (otherwise KEA sees it as more keyphrases!)
    unlink($doc_key_file_path);

    return join(", ", @keyphrase_list);
}

# Converts the option string given to extract_KeyPhrases into the list of
# command-line arguments to pass to Kea.
#
# Arguments:
#   $kea_home           - Kea directory; a model named with "m" is resolved relative to it.
#   $default_model_path - model used when none is named, or the named one does not exist.
#   $args               - whitespace-separated options, or undef for none.
#
# Returns a list of arguments that always contains a "-m <model path>" pair.
sub _get_Kea_option_list
{
    my ($kea_home, $default_model_path, $args) = @_;

    # split(' ', ...) skips leading whitespace, so no empty option is produced
    my @args_list = defined($args) ? split(' ', $args) : ();

    my @options = ();
    my $model_specified = 0;
    foreach my $arg (@args_list) {
        # A lone letter is a flag without a value.
        # NOTE(review): a bare "m" ends up as "-m -m <default model>" -- confirm intended.
        if (length($arg) == 1) {
            push(@options, "-$arg");
            next;
        }

        my $option = substr($arg, 0, 1);
        my $value = substr($arg, 1);
        if ($option eq "m") {
            my $model_path = &util::filename_cat($kea_home, $value);
            if (-e $model_path) {
                push(@options, "-m", $model_path);
            }
            else {
                print STDERR "Warning: Couldn't find model $model_path; using the default model instead.\n";
                push(@options, "-m", $default_model_path);
            }
            $model_specified = 1;
        }
        else {
            push(@options, "-$option", $value);
        }
    }

    # If none of the options specifies the model (or there are no options), use the default one
    if (!$model_specified) {
        push(@options, "-m", $default_model_path);
    }

    return @options;
}
|
---|
126 |
|
---|
127 | 1;
|
---|