source: gs2-extensions/parallel-building/trunk/src/perllib/Kea.pm@ 28187

Last change on this file since 28187 was 28187, checked in by jmt12, 11 years ago

A customized version of Kea.pm that looks in the correct place for newer versions of Greenstone - maybe commit this back to trunk at some stage

File size: 4.3 KB
Line 
1package Kea;
2
3use strict;
4
5use util;
6use FileUtils;
7
# This function is called by BasPlug.pm when a flag in a collection
# configuration document specifies that keyphrase metadata must be gathered
# for that collection.
# It is passed, as arguments, the document's text and possibly some options
# for how the keyphrase data is to be collected, if the keyphrase option flag
# was set in the collection configuration file. This module then writes the
# document's text to a file in a temporary directory, because the stand-alone
# program Kea, which is called to do the actual extraction of the keyphrases,
# expects a directory with one or more files as its argument.
# Once Kea has run, the file containing the keyphrase data it gathered should
# be stored in gsdl/tmp; this file is read, the data we are interested in is
# extracted and passed back to BasPlug.pm in an appropriate format.
20
# Works out which directory holds the requested release of Kea.
#
# Arguments:
#   $kea_version - the Kea release number, appended to "kea-" to form the
#                  name of the release directory
#
# Returns the path below GSDLHOME/build-src/packages/kea when that directory
# exists, otherwise the path below GSDLHOME/packages/kea (which is returned
# without checking that it exists).
sub get_Kea_directory
{
  my ($kea_version) = @_;
  my $release_dir = 'kea-' . $kea_version;

  # Preferred location: packages live below build-src
  my $split_packages_path = &util::filename_cat($ENV{'GSDLHOME'}, 'build-src', 'packages', 'kea', $release_dir);
  if (&FileUtils::directoryExists($split_packages_path))
  {
    return $split_packages_path;
  }

  # Legacy support for systems that predate the splitting of packages
  my $legacy_path = &util::filename_cat($ENV{'GSDLHOME'}, 'packages', 'kea', $release_dir);
  return $legacy_path;
}
32
# Extracts keyphrases from a document's text by running the stand-alone Kea
# program over it.
#
# Arguments:
#   $kea_version - the Kea release to use; selects the Kea directory and the
#                  default model ("FAO-20docs" for "4.0", otherwise "CSTR-20")
#   $doc         - the document's text (HTML tags are stripped before use)
#   $args        - optional whitespace-separated Kea options, each written as
#                  an option letter immediately followed by its value (for
#                  example "mCSTR-20"); a lone letter is passed as a bare flag
#
# Returns a string containing the keyphrases separated by ", ", or "" when no
# doc.key file is available after Kea has been run.
# Dies if the temporary doc.txt file cannot be written.
#
# NOTE(review): doc.txt and doc.key are fixed names inside GSDLHOME/tmp, so
# two builds sharing one GSDLHOME would overwrite each other's files - confirm
# whether callers can ever run concurrently.
sub extract_KeyPhrases
{
  my $kea_version = shift(@_);
  my $doc = shift(@_);  # Document's text
  my $args = shift(@_); # Options

  # Set default models (a different default model is used for Kea 4.0)
  my $kea_home = &get_Kea_directory($kea_version);
  my $default_model_name = ($kea_version eq "4.0") ? "FAO-20docs" : "CSTR-20";
  my $default_model_path = &util::filename_cat($kea_home, $default_model_name);

  # Parse the Kea options into separate command-line arguments
  my @kea_options = &_parse_Kea_options($kea_home, $default_model_path, $args);

  # Remove all HTML tags from the original text
  $doc =~ s/<P[^>]*>/\n/sgi;
  $doc =~ s/<H[^>]*>/\n/sgi;
  $doc =~ s/<[^>]*>//sgi;
  $doc =~ tr/\n/\n/s; # squeeze runs of newlines into one

  my $tmp_directory_path = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
  my $doc_txt_file_path = &util::filename_cat($tmp_directory_path, "doc.txt");
  my $doc_key_file_path = &util::filename_cat($tmp_directory_path, "doc.key");

  # Remove any doc.key left over from an earlier run, so that it cannot be
  # mistaken for the result of this run if Kea fails to produce one
  unlink($doc_key_file_path);

  # Write text to a temporary file doc.txt
  open(my $doc_txt_fh, '>:utf8', $doc_txt_file_path)
    or die "Error: Could not write $doc_txt_file_path in Kea.pm: $!\n";
  print $doc_txt_fh $doc;
  # Buffered write errors only surface when the handle is closed
  close($doc_txt_fh)
    or die "Error: Could not write $doc_txt_file_path in Kea.pm: $!\n";

  # Run Kea with the specified options. The list form of system() bypasses
  # the shell, so paths containing spaces or shell metacharacters are passed
  # through intact. The "taskset -c 7" prefix is kept from the original
  # command; remove it to run Java without taskset.
  my @cmd = ('taskset', '-c', '7',
             'java', '-classpath', $kea_home, 'KEAKeyphraseExtractor',
             '-l', $tmp_directory_path, @kea_options);
  if (system(@cmd) == -1) {
    # -1 means the command could not be started at all
    print STDERR "Warning: Could not run Kea (" . join(' ', @cmd) . "): $!\n";
  }

  # Read the resulting doc.key file which contains the keyphrases
  my $doc_key_fh;
  if (!open($doc_key_fh, '<', $doc_key_file_path)) {
    # The doc.key file does not exist (either an option was wrongly specified, or no keyphrases were found)
    return "";
  }
  # A lexical array is used rather than while (<FH>), which would overwrite
  # the caller's global $_
  my @keyphrase_list = <$doc_key_fh>;
  chomp(@keyphrase_list);
  close($doc_key_fh);

  # Delete doc.key so that in future it will not be opened and read (otherwise KEA sees it as more keyphrases!)
  unlink($doc_key_file_path);

  return join(", ", @keyphrase_list);
}

# Private helper: turns the option string from the collection configuration
# into the list of command-line arguments to hand to Kea.
#
# Arguments:
#   $kea_home           - the Kea directory; model names are resolved against it
#   $default_model_path - model used when none is given or the given one is missing
#   $args               - the raw option string (may be undefined or empty)
#
# Returns a list of arguments that always contains a "-m <model>" pair.
sub _parse_Kea_options
{
  my ($kea_home, $default_model_path, $args) = @_;

  # split(' ', ...) skips leading whitespace, so no empty first field appears
  my @args_list = defined($args) ? split(' ', $args) : ();

  my @kea_options = ();
  my $model_specified = 0;
  foreach my $arg (@args_list) {
    if (length($arg) == 1) {
      # A lone letter is a flag without a value
      push(@kea_options, "-$arg");
      next;
    }

    my $option = substr($arg, 0, 1);
    my $value = substr($arg, 1);
    if ($option eq "m") {
      my $model_path = &util::filename_cat($kea_home, $value);
      if (!-e $model_path) {
        print STDERR "Warning: Couldn't find model $model_path; using the default model instead.\n";
        $model_path = $default_model_path;
      }
      push(@kea_options, "-m", $model_path);
      $model_specified = 1;
    }
    else {
      push(@kea_options, "-$option", $value);
    }
  }

  # If none of the options specifies the model, use the default one
  if (!$model_specified) {
    push(@kea_options, "-m", $default_model_path);
  }

  return @kea_options;
}
126
1271;
Note: See TracBrowser for help on using the repository browser.