[1954] | 1 | package Kea;
|
---|
| 2 |
|
---|
[11070] | 3 | use strict;
|
---|
| 4 |
|
---|
[2018] | 5 | # This function is called by BasPlug.pm when a flag in a collection
|
---|
| 6 | # configuration document specifies that keyphrase metadata must be gathered for
|
---|
| 7 | # that collection.
|
---|
| 8 | # It is passed as arguments, the documents text and possibly some options for
|
---|
| 9 | # how the keyphrase data is to be collected if the keyphrase option flag was
|
---|
| 10 | # set in the collection configuration file. This module then writes the
|
---|
[8814] | 11 | # documents text to a file in a temporary directory because the stand-alone program Kea which will be
|
---|
| 12 | # called to do the actual extraction of the keyphrases expects a directory with one or more files as argument.
|
---|
[2018] | 13 | # Once Kea has been called upon, the file containing the keyphrase data
|
---|
| 14 | # gathered by Kea should be stored in gsdl/tmp and this file is read, the data
|
---|
| 15 | # we are interested in is extracted and passed back to BasPlug.pm in an
|
---|
| 16 | # appropriate format.
|
---|
[1954] | 17 |
|
---|
[11069] | 18 | sub get_Kea_directory
|
---|
| 19 | {
|
---|
| 20 | my $kea_version = shift(@_);
|
---|
| 21 | return &util::filename_cat($ENV{'GSDLHOME'}, "packages", "kea", "kea-$kea_version");
|
---|
| 22 | }
|
---|
| 23 |
|
---|
[9409] | 24 | # returns a string containing comma-separated keyphrases
|
---|
[11070] | 25 | sub extract_KeyPhrases
|
---|
| 26 | {
|
---|
[11069] | 27 | my $kea_version = shift(@_);
|
---|
[11070] | 28 | my $doc = shift(@_); # Document's text
|
---|
| 29 | my $args = shift(@_); # Options
|
---|
[8814] | 30 |
|
---|
[11070] | 31 | # Set default models
|
---|
| 32 | my $kea_home = &get_Kea_directory($kea_version);
|
---|
| 33 | my $default_model_path = &util::filename_cat($kea_home, "CSTR-20");
|
---|
[11069] | 34 | if ($kea_version eq "4.0") {
|
---|
[11070] | 35 | # Use a different default model for Kea 4.0
|
---|
| 36 | $default_model_path = &util::filename_cat($kea_home, "FAO-20docs");
|
---|
[11069] | 37 | }
|
---|
| 38 |
|
---|
[11070] | 39 | # Parse the Kea options
|
---|
| 40 | my $options_string;
|
---|
| 41 | my @args_list = split(/\s+/, $args) if (defined($args));
|
---|
| 42 | if (@args_list) {
|
---|
| 43 | my $model_specified = 0;
|
---|
| 44 | foreach my $arg (@args_list) {
|
---|
| 45 | if (length($arg) == 1) {
|
---|
| 46 | $options_string .= " -$arg";
|
---|
| 47 | }
|
---|
| 48 | else {
|
---|
| 49 | my $option = substr($arg, 0, 1);
|
---|
| 50 | my $value = substr($arg, 1);
|
---|
| 51 | if ($option eq "m") {
|
---|
| 52 | my $model_path = &util::filename_cat($kea_home, $value);
|
---|
| 53 | if (-e $model_path) {
|
---|
| 54 | $options_string .= " -m $model_path";
|
---|
| 55 | }
|
---|
| 56 | else {
|
---|
| 57 | print STDERR "Warning: Couldn't find model $model_path; using the default model instead.\n";
|
---|
| 58 | $options_string .= " -m $default_model_path";
|
---|
| 59 | }
|
---|
| 60 | $model_specified = 1;
|
---|
[8814] | 61 | }
|
---|
[11070] | 62 | else {
|
---|
| 63 | $options_string .= " -$option $value";
|
---|
| 64 | }
|
---|
[8814] | 65 | }
|
---|
[6792] | 66 | }
|
---|
[11070] | 67 |
|
---|
| 68 | # If none of the option specifies the model, use the default one
|
---|
| 69 | if ($model_specified != 1) {
|
---|
| 70 | $options_string .= " -m $default_model_path";
|
---|
[8814] | 71 | }
|
---|
[1954] | 72 | }
|
---|
[11070] | 73 | else {
|
---|
| 74 | # If no options were specified, use the default model
|
---|
| 75 | $options_string = "-m $default_model_path";
|
---|
| 76 | }
|
---|
[1954] | 77 |
|
---|
[8814] | 78 | # Remove all HTML tags from the original text
|
---|
| 79 | $doc =~ s/<P[^>]*>/\n/sgi;
|
---|
| 80 | $doc =~ s/<H[^>]*>/\n/sgi;
|
---|
| 81 | $doc =~ s/<[^>]*>//sgi;
|
---|
| 82 | $doc =~ tr/\n/\n/s;
|
---|
[1954] | 83 |
|
---|
[11070] | 84 | # Write text to a temporary file doc.txt
|
---|
| 85 | my $tmp_directory_path = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
|
---|
| 86 | my $doc_txt_file_path = &util::filename_cat($tmp_directory_path, "doc.txt");
|
---|
| 87 | open(DOC_TXT, ">$doc_txt_file_path") or die "Error: Could not write $doc_txt_file_path in Kea.pm.\n";
|
---|
| 88 | print DOC_TXT $doc;
|
---|
| 89 | close(DOC_TXT);
|
---|
[1954] | 90 |
|
---|
[11070] | 91 | # Run Kea with the specified options
|
---|
| 92 | system("java -classpath \"$kea_home\" KEAKeyphraseExtractor -l $tmp_directory_path $options_string");
|
---|
[1954] | 93 |
|
---|
[11070] | 94 | # Read the resulting doc.key file which contains the keyphrases
|
---|
| 95 | my $doc_key_file_path = &util::filename_cat($tmp_directory_path, "doc.key");
|
---|
| 96 | if (!open(IN, "<$doc_key_file_path")) {
|
---|
| 97 | # The doc.key file does not exist (either an option was wrongly specified, or no keyphrases were found)
|
---|
| 98 | return "";
|
---|
| 99 | }
|
---|
[6792] | 100 |
|
---|
[11070] | 101 | my @keyphrase_list = ();
|
---|
| 102 | while (<IN>) {
|
---|
[1954] | 103 | chomp;
|
---|
[11070] | 104 | push(@keyphrase_list, $_);
|
---|
[1954] | 105 | }
|
---|
| 106 | close(IN);
|
---|
| 107 |
|
---|
[11070] | 108 | # Delete doc.key so that in future it will not be opened and read (otherwise KEA sees it as more keyphrases!)
|
---|
| 109 | unlink($doc_key_file_path);
|
---|
[1954] | 110 |
|
---|
[11070] | 111 | my $keyphrases = join(", ", @keyphrase_list);
|
---|
| 112 | return $keyphrases;
|
---|
[1954] | 113 | }
|
---|
| 114 |
|
---|
| 115 | 1;
|
---|