source: trunk/gsdl/perllib/Kea.pm@ 1954

Last change on this file since 1954 was 1954, checked in by jmt14, 23 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 2.9 KB
Line 
1package Kea;
2
3
4use BasPlug;
5
6#This function is called by BasPlug.pm when a flag in a collection configuration
7#document specifies that keyphrase metadata must be gathered for that collection.
8#It is passed as arguments, the documents text and possibly some options for how
9#the keyphrase data is to be collected if the keyphrase option flag was set in
10#the collection configuration file. This module then writes the documents text
11#to a file because the stand-alone program Kea which will be called to do the
12#actual extraction of the keyphrases expects a file argument. Once Kea has been
13#called upon, the file containing the keyphrase data gathered by Kea should be
14#stored in gsdl/tmp and this file is read, the data we are interested in is extracted
15#and passed back to BasPlug.pm in an appropriate format.
16
17sub extract_KeyPhrases {
18
19 my $gsdlhome = $ENV{'GSDLHOME'};
20 my $doc = shift(@_); #documents text
21 my $args = shift(@_); #any options
22 my @optionlist = split(/ +/, $args) if (defined($args)); #list of options
23 my $suffix = 'kea'; #default file will be called .kea
24 my $command = "";
25 my @keylist;
26 my @stemlist;
27
28 print STDERR "optionlist: @optionlist\n";
29
30 foreach $element (@optionlist){ #for each option
31 my ($option, $file) = split(/,/, $element); #split option letter and file (if file exist)
32
33 $option = "-".$option; #place dash in front of option
34 $file = "" if(!defined($file)); #no file options specified
35 $suffix = $file if($option eq '-E'); #if option is extension (suffix) option
36 $command .= " $option $file "; #add to list of commands
37 }
38
39 print STDERR "Using output suffix: $suffix\n";
40
41 # remove all HTML tags
42 $doc =~ s/<P[^>]*>/\n/sgi;
43 $doc =~ s/<H[^>]*>/\n/sgi;
44 $doc =~ s/<[^>]*>//sgi;
45 $doc =~ tr/\n/\n/s;
46
47 #write text to a file eg doc.txt
48 open(OUT, ">$gsdlhome/tmp/doc.txt") or die "In Kea.pm doc.txt could not be created\n";
49 print OUT $doc;
50 close(OUT);
51
52 #call Kea with specifed options
53 `$gsdlhome/perllib/Kea-1.1.4/Kea $command $gsdlhome/tmp/doc.txt`;
54
55 #read doc.kea with keywords
56 open(IN, "<$gsdlhome/tmp/doc.$suffix") or return @emptykeylist;
57 #this means doc.kea does not exist
58 #either because an option was wrongly specified
59 #or no keyphrases were found
60 while(<IN>){
61 chomp;
62 @key = split(/\t/); #split into array separated by a tab
63 push(@keylist, $key[0]); #add to list of keywords
64 push(@stemlist, $key[1]); #add to list of stems
65 }
66 close(IN);
67
68 #put data into appropriate format
69 $keylist = join(", ", @keylist);
70 $stemlist = join(", ", @stemlist);
71
72 #delete doc.extension so that in future it will not be opened and read
73 `rm $gsdlhome/tmp/doc.$suffix`;
74
75 #return keywords + stems to basplug
76 my @keystemlist = ($keylist, $stemlist);
77 return @keystemlist;
78
79}
80
81
82
831;
84
85
Note: See TracBrowser for help on using the repository browser.