source: gs2-extensions/parallel-building/trunk/src/perllib/Kea.pm@ 30281

Last change on this file since 30281 was 29259, checked in by jmt12, 10 years ago

Kea override allowing for fixed processor affinity if necessary (commented out at the moment

File size: 4.4 KB
Line 
1package Kea;
2
3use strict;
4
5use util;
6use FileUtils;
7
# The extract_KeyPhrases function in this module is called by BasPlug.pm when
# a flag in a collection configuration document specifies that keyphrase
# metadata must be gathered for that collection.
# It is passed the document's text and, if the keyphrase option flag was set
# in the collection configuration file, some options for how the keyphrase
# data is to be collected. The module writes the document's text to a file in
# a temporary directory, because the stand-alone program Kea (which is called
# to do the actual extraction of the keyphrases) expects a directory
# containing one or more files as its argument.
# Once Kea has run, the file containing the keyphrase data it gathered should
# be present in that same temporary directory (GSDLHOME/tmp). This file is
# read, and the data we are interested in is extracted and passed back to
# BasPlug.pm in an appropriate format.
20
# Locates the installation directory of the requested Kea version.
#
# Arguments:
#   $kea_version - version string appended to 'kea-' to form the directory name
#
# Returns the path under $GSDLHOME/build-src/packages/kea when that directory
# exists; otherwise the path under $GSDLHOME/packages/kea (which is returned
# without being checked for existence).
sub get_Kea_directory
{
    my ($kea_version) = @_;
    my $kea_dir_name = 'kea-' . $kea_version;

    # Current layout: packages live beneath build-src
    my $split_packages_path = &util::filename_cat($ENV{'GSDLHOME'}, 'build-src', 'packages', 'kea', $kea_dir_name);
    if (&FileUtils::directoryExists($split_packages_path))
    {
        return $split_packages_path;
    }

    # Legacy support for systems that predate the splitting of packages
    my $legacy_path = &util::filename_cat($ENV{'GSDLHOME'}, 'packages', 'kea', $kea_dir_name);
    return $legacy_path;
}
32
# Extracts keyphrases from a document's text by running the stand-alone Kea
# program (KEAKeyphraseExtractor) over it.
#
# Arguments:
#   $kea_version - Kea version string; used to locate the Kea directory and to
#                  pick the default model ("FAO-20docs" for "4.0", "CSTR-20"
#                  otherwise)
#   $doc         - the document's text; HTML tags are stripped before use
#   $args        - optional whitespace-separated Kea options. Each option is a
#                  single flag letter, optionally followed directly by its
#                  value (e.g. "mCSTR-20" selects the model <kea dir>/CSTR-20)
#
# Returns a string containing comma-separated keyphrases, or "" when Kea did
# not produce a doc.key file (either an option was wrongly specified, or no
# keyphrases were found).
#
# Dies if the temporary doc.txt file cannot be written.
sub extract_KeyPhrases
{
    my $kea_version = shift(@_);
    my $doc = shift(@_);  # Document's text
    my $args = shift(@_); # Options

    # Set default models (Kea 4.0 uses a different default model)
    my $kea_home = &get_Kea_directory($kea_version);
    my $default_model_name = ($kea_version eq "4.0") ? "FAO-20docs" : "CSTR-20";
    my $default_model_path = &util::filename_cat($kea_home, $default_model_name);

    # Parse the Kea options. The declaration is kept separate from the
    # condition because "my ... if ..." has undefined behaviour in Perl, and
    # the awk-style split(' ', ...) is used so that leading whitespace does not
    # produce an empty first option.
    my @args_list = ();
    if (defined($args)) {
        @args_list = split(' ', $args);
    }

    # Every option appended below starts with a space so that the string can
    # be concatenated directly onto the command. Paths are double-quoted (as
    # the classpath is) so that a GSDLHOME containing spaces still works.
    my $options_string = "";
    my $model_specified = 0;
    foreach my $arg (@args_list) {
        if (length($arg) == 1) {
            # A bare flag with no value
            $options_string .= " -$arg";
            next;
        }

        my $option = substr($arg, 0, 1);
        my $value = substr($arg, 1);
        if ($option eq "m") {
            # Model names are resolved relative to the Kea directory
            my $model_path = &util::filename_cat($kea_home, $value);
            if (-e $model_path) {
                $options_string .= " -m \"$model_path\"";
            }
            else {
                print STDERR "Warning: Couldn't find model $model_path; using the default model instead.\n";
                $options_string .= " -m \"$default_model_path\"";
            }
            $model_specified = 1;
        }
        else {
            $options_string .= " -$option $value";
        }
    }

    # If none of the options specifies the model (or no options were given at
    # all), use the default one
    if ($model_specified != 1) {
        $options_string .= " -m \"$default_model_path\"";
    }

    # Remove all HTML tags from the original text; paragraph and heading tags
    # become line breaks, and runs of newlines are squeezed into one
    $doc =~ s/<P[^>]*>/\n/sgi;
    $doc =~ s/<H[^>]*>/\n/sgi;
    $doc =~ s/<[^>]*>//sgi;
    $doc =~ tr/\n/\n/s;

    my $tmp_directory_path = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
    my $doc_txt_file_path = &util::filename_cat($tmp_directory_path, "doc.txt");
    my $doc_key_file_path = &util::filename_cat($tmp_directory_path, "doc.key");

    # Remove any doc.key left behind by an earlier run: if Kea fails this time
    # we must not return another document's keyphrases (and KEA would see the
    # stale file as more keyphrases!)
    if (-e $doc_key_file_path) {
        unlink($doc_key_file_path);
    }

    # Write text to a temporary file doc.txt. The close is checked as well,
    # since buffered write errors only surface there.
    open(my $doc_txt_fh, '>:utf8', $doc_txt_file_path) or die "Error: Could not write $doc_txt_file_path in Kea.pm: $!\n";
    print $doc_txt_fh $doc;
    close($doc_txt_fh) or die "Error: Could not write $doc_txt_file_path in Kea.pm: $!\n";

    # Run Kea with the specified options.
    # (To give Kea a fixed processor affinity, prefix the command with
    # e.g. 'taskset -c 7 ' - disabled at the moment.)
    my $cmd = 'java -classpath "' . $kea_home . '" KEAKeyphraseExtractor -l "' . $tmp_directory_path . '"' . $options_string;
    my $status = system($cmd);
    if ($status != 0) {
        print STDERR "Warning: Kea command returned non-zero status $status: $cmd\n";
    }

    # Read the resulting doc.key file which contains the keyphrases
    my $doc_key_fh;
    if (!open($doc_key_fh, '<', $doc_key_file_path)) {
        # The doc.key file does not exist (either an option was wrongly specified, or no keyphrases were found)
        return "";
    }

    # A lexical line variable is used so the caller's $_ is left untouched
    my @keyphrase_list = ();
    while (my $keyphrase = <$doc_key_fh>) {
        chomp($keyphrase);
        push(@keyphrase_list, $keyphrase);
    }
    close($doc_key_fh);

    # Delete doc.key so that in future it will not be opened and read (otherwise KEA sees it as more keyphrases!)
    unlink($doc_key_file_path);

    return join(", ", @keyphrase_list);
}
127
1281;
Note: See TracBrowser for help on using the repository browser.