source: gsdl/trunk/perllib/Kea.pm@ 18463

Last change on this file since 18463 was 11070, checked in by mdewsnip, 18 years ago

A much tidier Kea.pm that now also works on Windows.

  • Property svn:keywords set to Author Date Id Revision
File size: 3.8 KB
Line 
1package Kea;
2
3use strict;
4
5# This function is called by BasPlug.pm when a flag in a collection
6# configuration document specifies that keyphrase metadata must be gathered for
7# that collection.
8# It is passed as arguments, the documents text and possibly some options for
9# how the keyphrase data is to be collected if the keyphrase option flag was
10# set in the collection configuration file. This module then writes the
11# documents text to a file in a temporary directory because the stand-alone program Kea which will be
12# called to do the actual extraction of the keyphrases expects a directory with one or more files as argument.
13# Once Kea has been called upon, the file containing the keyphrase data
14# gathered by Kea should be stored in gsdl/tmp and this file is read, the data
15# we are interested in is extracted and passed back to BasPlug.pm in an
16# appropriate format.
17
18sub get_Kea_directory
19{
20 my $kea_version = shift(@_);
21 return &util::filename_cat($ENV{'GSDLHOME'}, "packages", "kea", "kea-$kea_version");
22}
23
24# returns a string containing comma-separated keyphrases
25sub extract_KeyPhrases
26{
27 my $kea_version = shift(@_);
28 my $doc = shift(@_); # Document's text
29 my $args = shift(@_); # Options
30
31 # Set default models
32 my $kea_home = &get_Kea_directory($kea_version);
33 my $default_model_path = &util::filename_cat($kea_home, "CSTR-20");
34 if ($kea_version eq "4.0") {
35 # Use a different default model for Kea 4.0
36 $default_model_path = &util::filename_cat($kea_home, "FAO-20docs");
37 }
38
39 # Parse the Kea options
40 my $options_string;
41 my @args_list = split(/\s+/, $args) if (defined($args));
42 if (@args_list) {
43 my $model_specified = 0;
44 foreach my $arg (@args_list) {
45 if (length($arg) == 1) {
46 $options_string .= " -$arg";
47 }
48 else {
49 my $option = substr($arg, 0, 1);
50 my $value = substr($arg, 1);
51 if ($option eq "m") {
52 my $model_path = &util::filename_cat($kea_home, $value);
53 if (-e $model_path) {
54 $options_string .= " -m $model_path";
55 }
56 else {
57 print STDERR "Warning: Couldn't find model $model_path; using the default model instead.\n";
58 $options_string .= " -m $default_model_path";
59 }
60 $model_specified = 1;
61 }
62 else {
63 $options_string .= " -$option $value";
64 }
65 }
66 }
67
68 # If none of the option specifies the model, use the default one
69 if ($model_specified != 1) {
70 $options_string .= " -m $default_model_path";
71 }
72 }
73 else {
74 # If no options were specified, use the default model
75 $options_string = "-m $default_model_path";
76 }
77
78 # Remove all HTML tags from the original text
79 $doc =~ s/<P[^>]*>/\n/sgi;
80 $doc =~ s/<H[^>]*>/\n/sgi;
81 $doc =~ s/<[^>]*>//sgi;
82 $doc =~ tr/\n/\n/s;
83
84 # Write text to a temporary file doc.txt
85 my $tmp_directory_path = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
86 my $doc_txt_file_path = &util::filename_cat($tmp_directory_path, "doc.txt");
87 open(DOC_TXT, ">$doc_txt_file_path") or die "Error: Could not write $doc_txt_file_path in Kea.pm.\n";
88 print DOC_TXT $doc;
89 close(DOC_TXT);
90
91 # Run Kea with the specified options
92 system("java -classpath \"$kea_home\" KEAKeyphraseExtractor -l $tmp_directory_path $options_string");
93
94 # Read the resulting doc.key file which contains the keyphrases
95 my $doc_key_file_path = &util::filename_cat($tmp_directory_path, "doc.key");
96 if (!open(IN, "<$doc_key_file_path")) {
97 # The doc.key file does not exist (either an option was wrongly specified, or no keyphrases were found)
98 return "";
99 }
100
101 my @keyphrase_list = ();
102 while (<IN>) {
103 chomp;
104 push(@keyphrase_list, $_);
105 }
106 close(IN);
107
108 # Delete doc.key so that in future it will not be opened and read (otherwise KEA sees it as more keyphrases!)
109 unlink($doc_key_file_path);
110
111 my $keyphrases = join(", ", @keyphrase_list);
112 return $keyphrases;
113}
114
1151;
Note: See TracBrowser for help on using the repository browser.