source: trunk/gsdl/perllib/Kea.pm@ 14071

Last change on this file since 14071 was 11070, checked in by mdewsnip, 18 years ago

A much tidier Kea.pm that now also works on Windows.

  • Property svn:keywords set to Author Date Id Revision
File size: 3.8 KB
RevLine 
[1954]1package Kea;
2
[11070]3use strict;
4
[2018]5# This function is called by BasPlug.pm when a flag in a collection
6# configuration document specifies that keyphrase metadata must be gathered for
7# that collection.
8# It is passed as arguments, the documents text and possibly some options for
9# how the keyphrase data is to be collected if the keyphrase option flag was
10# set in the collection configuration file. This module then writes the
[8814]11# documents text to a file in a temporary directory because the stand-alone program Kea which will be
12# called to do the actual extraction of the keyphrases expects a directory with one or more files as argument.
[2018]13# Once Kea has been called upon, the file containing the keyphrase data
14# gathered by Kea should be stored in gsdl/tmp and this file is read, the data
15# we are interested in is extracted and passed back to BasPlug.pm in an
16# appropriate format.
[1954]17
[11069]18sub get_Kea_directory
19{
20 my $kea_version = shift(@_);
21 return &util::filename_cat($ENV{'GSDLHOME'}, "packages", "kea", "kea-$kea_version");
22}
23
[9409]24# returns a string containing comma-separated keyphrases
[11070]25sub extract_KeyPhrases
26{
[11069]27 my $kea_version = shift(@_);
[11070]28 my $doc = shift(@_); # Document's text
29 my $args = shift(@_); # Options
[8814]30
[11070]31 # Set default models
32 my $kea_home = &get_Kea_directory($kea_version);
33 my $default_model_path = &util::filename_cat($kea_home, "CSTR-20");
[11069]34 if ($kea_version eq "4.0") {
[11070]35 # Use a different default model for Kea 4.0
36 $default_model_path = &util::filename_cat($kea_home, "FAO-20docs");
[11069]37 }
38
[11070]39 # Parse the Kea options
40 my $options_string;
41 my @args_list = split(/\s+/, $args) if (defined($args));
42 if (@args_list) {
43 my $model_specified = 0;
44 foreach my $arg (@args_list) {
45 if (length($arg) == 1) {
46 $options_string .= " -$arg";
47 }
48 else {
49 my $option = substr($arg, 0, 1);
50 my $value = substr($arg, 1);
51 if ($option eq "m") {
52 my $model_path = &util::filename_cat($kea_home, $value);
53 if (-e $model_path) {
54 $options_string .= " -m $model_path";
55 }
56 else {
57 print STDERR "Warning: Couldn't find model $model_path; using the default model instead.\n";
58 $options_string .= " -m $default_model_path";
59 }
60 $model_specified = 1;
[8814]61 }
[11070]62 else {
63 $options_string .= " -$option $value";
64 }
[8814]65 }
[6792]66 }
[11070]67
68 # If none of the option specifies the model, use the default one
69 if ($model_specified != 1) {
70 $options_string .= " -m $default_model_path";
[8814]71 }
[1954]72 }
[11070]73 else {
74 # If no options were specified, use the default model
75 $options_string = "-m $default_model_path";
76 }
[1954]77
[8814]78 # Remove all HTML tags from the original text
79 $doc =~ s/<P[^>]*>/\n/sgi;
80 $doc =~ s/<H[^>]*>/\n/sgi;
81 $doc =~ s/<[^>]*>//sgi;
82 $doc =~ tr/\n/\n/s;
[1954]83
[11070]84 # Write text to a temporary file doc.txt
85 my $tmp_directory_path = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
86 my $doc_txt_file_path = &util::filename_cat($tmp_directory_path, "doc.txt");
87 open(DOC_TXT, ">$doc_txt_file_path") or die "Error: Could not write $doc_txt_file_path in Kea.pm.\n";
88 print DOC_TXT $doc;
89 close(DOC_TXT);
[1954]90
[11070]91 # Run Kea with the specified options
92 system("java -classpath \"$kea_home\" KEAKeyphraseExtractor -l $tmp_directory_path $options_string");
[1954]93
[11070]94 # Read the resulting doc.key file which contains the keyphrases
95 my $doc_key_file_path = &util::filename_cat($tmp_directory_path, "doc.key");
96 if (!open(IN, "<$doc_key_file_path")) {
97 # The doc.key file does not exist (either an option was wrongly specified, or no keyphrases were found)
98 return "";
99 }
[6792]100
[11070]101 my @keyphrase_list = ();
102 while (<IN>) {
[1954]103 chomp;
[11070]104 push(@keyphrase_list, $_);
[1954]105 }
106 close(IN);
107
[11070]108 # Delete doc.key so that in future it will not be opened and read (otherwise KEA sees it as more keyphrases!)
109 unlink($doc_key_file_path);
[1954]110
[11070]111 my $keyphrases = join(", ", @keyphrase_list);
112 return $keyphrases;
[1954]113}
114
1151;
Note: See TracBrowser for help on using the repository browser.