[28187] | 1 | package Kea;
|
---|
| 2 |
|
---|
| 3 | use strict;
|
---|
| 4 |
|
---|
| 5 | use util;
|
---|
| 6 | use FileUtils;
|
---|
| 7 |
|
---|
| 8 | # This function is called by BasPlug.pm when a flag in a collection
|
---|
| 9 | # configuration document specifies that keyphrase metadata must be gathered for
|
---|
| 10 | # that collection.
|
---|
| 11 | # It is passed as arguments, the document's text and possibly some options for
|
---|
| 12 | # how the keyphrase data is to be collected if the keyphrase option flag was
|
---|
| 13 | # set in the collection configuration file. This module then writes the
|
---|
| 14 | # document's text to a file in a temporary directory because the stand-alone program Kea which will be
|
---|
| 15 | # called to do the actual extraction of the keyphrases expects a directory with one or more files as argument.
|
---|
| 16 | # Once Kea has been called upon, the file containing the keyphrase data
|
---|
| 17 | # gathered by Kea should be stored in gsdl/tmp and this file is read, the data
|
---|
| 18 | # we are interested in is extracted and passed back to BasPlug.pm in an
|
---|
| 19 | # appropriate format.
|
---|
| 20 |
|
---|
# Returns the path of the directory holding the given version of Kea.
#
# Argument:
#   the Kea version string, which is appended to 'kea-' to form the
#   directory name
#
# The directory is looked for under $GSDLHOME/build-src/packages/kea first.
# When that directory does not exist, the path under $GSDLHOME/packages/kea is
# returned instead (the layout used before packages were split out); that
# second path is returned without checking that it exists.
sub get_Kea_directory
{
    my ($kea_version) = @_;

    my $gsdl_home = $ENV{'GSDLHOME'};
    my $kea_dir_name = 'kea-' . $kea_version;

    # Preferred location: the split-out packages area
    my $split_packages_path = &util::filename_cat($gsdl_home, 'build-src', 'packages', 'kea', $kea_dir_name);

    # Legacy support for systems that predate the splitting of packages
    my $kea_path = &FileUtils::directoryExists($split_packages_path)
        ? $split_packages_path
        : &util::filename_cat($gsdl_home, 'packages', 'kea', $kea_dir_name);

    return $kea_path;
}
|
---|
| 32 |
|
---|
# Turns the whitespace-separated Kea options string into the list of
# command-line arguments to pass to Kea.
#
# Arguments:
#   $kea_home            directory that model names are resolved against
#   $default_model_path  model to use when none is given or the one given
#                        cannot be found
#   $args                options string (may be undefined or empty)
#
# Each option is a single letter optionally followed directly by its value,
# e.g. "N5" becomes ("-N", "5") and "mMyModel" selects the model "MyModel".
# The returned list always contains exactly one usable "-m <model path>" pair
# unless the options themselves name a model more than once.
sub _parse_Kea_options
{
    my ($kea_home, $default_model_path, $args) = @_;

    # split(' ', ...) skips leading whitespace, so a padded options string
    # does not produce an empty first option. The list is built with a plain
    # conditional expression rather than "my ... if ...", whose behaviour is
    # undefined and can leak options from a previous call.
    my @args_list = defined($args) ? split(' ', $args) : ();

    my @kea_options = ();
    my $model_specified = 0;
    foreach my $arg (@args_list) {
        if (length($arg) == 1) {
            # A lone letter is a flag without a value
            push(@kea_options, "-$arg");
            next;
        }

        my $option = substr($arg, 0, 1);
        my $value = substr($arg, 1);
        if ($option eq "m") {
            my $model_path = &util::filename_cat($kea_home, $value);
            if (!-e $model_path) {
                print STDERR "Warning: Couldn't find model $model_path; using the default model instead.\n";
                $model_path = $default_model_path;
            }
            push(@kea_options, "-m", $model_path);
            $model_specified = 1;
        }
        else {
            push(@kea_options, "-$option", $value);
        }
    }

    # If none of the options specifies the model, use the default one
    if (!$model_specified) {
        push(@kea_options, "-m", $default_model_path);
    }

    return @kea_options;
}

# Extracts keyphrases from a document by running the stand-alone Kea program.
#
# Arguments (positional):
#   the Kea version string (selects the Kea directory and the default model)
#   the document's text (HTML tags are stripped before it is given to Kea)
#   an optional string of whitespace-separated Kea options
#
# The stripped text is written to $GSDLHOME/tmp/doc.txt, Kea is run over that
# directory, and the resulting $GSDLHOME/tmp/doc.key file is read and deleted.
#
# Returns a string containing comma-separated keyphrases, or the empty string
# when no doc.key file could be opened after running Kea.
# Dies if doc.txt cannot be written.
sub extract_KeyPhrases
{
    my $kea_version = shift(@_);
    my $doc = shift(@_);   # Document's text
    my $args = shift(@_);  # Options

    # Set default models (Kea 4.0 uses a different default model)
    my $kea_home = &get_Kea_directory($kea_version);
    my $default_model_name = ($kea_version eq "4.0") ? "FAO-20docs" : "CSTR-20";
    my $default_model_path = &util::filename_cat($kea_home, $default_model_name);

    # Parse the Kea options
    my @kea_options = &_parse_Kea_options($kea_home, $default_model_path, $args);

    # Remove all HTML tags from the original text; paragraph and heading
    # tags become line breaks, and runs of line breaks are squeezed to one
    $doc =~ s/<P[^>]*>/\n/sgi;
    $doc =~ s/<H[^>]*>/\n/sgi;
    $doc =~ s/<[^>]*>//sgi;
    $doc =~ tr/\n/\n/s;

    # Write text to a temporary file doc.txt
    my $tmp_directory_path = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
    my $doc_txt_file_path = &util::filename_cat($tmp_directory_path, "doc.txt");
    open(my $doc_txt_fh, '>:utf8', $doc_txt_file_path) or die "Error: Could not write $doc_txt_file_path in Kea.pm.\n";
    print $doc_txt_fh $doc;
    # Buffered write errors only surface at close, so check it as well
    close($doc_txt_fh) or die "Error: Could not write $doc_txt_file_path in Kea.pm.\n";

    # Run Kea with the specified options. The list form of system() passes
    # every argument to java as-is, so paths containing spaces or shell
    # metacharacters cannot break (or alter) the command.
    my @cmd = ('java', '-classpath', $kea_home, 'KEAKeyphraseExtractor', '-l', $tmp_directory_path, @kea_options);
    system(@cmd);
    if ($? == -1) {
        # No shell is involved any more, so report a failed launch ourselves
        print STDERR "Warning: Could not run Kea (java): $!\n";
    }

    # Read the resulting doc.key file which contains the keyphrases
    my $doc_key_file_path = &util::filename_cat($tmp_directory_path, "doc.key");
    my $doc_key_fh;
    if (!open($doc_key_fh, '<', $doc_key_file_path)) {
        # The doc.key file does not exist (either an option was wrongly specified, or no keyphrases were found)
        return "";
    }

    my @keyphrase_list = ();
    while (my $line = <$doc_key_fh>) {
        chomp($line);
        push(@keyphrase_list, $line);
    }
    close($doc_key_fh);

    # Delete doc.key so that in future it will not be opened and read (otherwise KEA sees it as more keyphrases!)
    unlink($doc_key_file_path);

    return join(", ", @keyphrase_list);
}
|
---|
| 127 |
|
---|
| 128 | 1;
|
---|