root/trunk/gsdl/perllib/Kea.pm @ 1954

Revision 1954, 2.9 KB (checked in by jmt14, 19 years ago)

*** empty log message ***

  • Property svn:keywords set to Author Date Id Revision
Line 
1package Kea;
2
3
4use BasPlug;
5
6#This function is called by BasPlug.pm when a flag in a collection configuration
7#document specifies that keyphrase metadata must be gathered for that collection.
8#It is passed as arguments, the documents text and possibly some options for how
9#the keyphrase data is to be collected if the keyphrase option flag was set in
10#the collection configuration file.  This module then writes the documents text
11#to a file because the stand-alone program Kea which will be called to do the
12#actual extraction of the keyphrases expects a file argument. Once Kea has been
13#called upon, the file containing the keyphrase data gathered by Kea should be
14#stored in gsdl/tmp and this file is read, the data we are interested in is extracted
15#and passed back to BasPlug.pm in an appropriate format.
16
17sub extract_KeyPhrases {
18
19    my $gsdlhome = $ENV{'GSDLHOME'};
20    my $doc = shift(@_); #documents text 
21    my $args = shift(@_); #any options
22    my @optionlist = split(/ +/, $args) if (defined($args)); #list of options
23    my $suffix = 'kea'; #default file will be called .kea
24    my $command = "";
25    my @keylist;
26    my @stemlist;
27
28    print STDERR "optionlist: @optionlist\n";
29       
30    foreach $element (@optionlist){ #for each option
31    my ($option, $file) = split(/,/, $element); #split option letter and file (if file exist)
32
33    $option  = "-".$option; #place dash in front of option
34    $file = "" if(!defined($file)); #no file options specified
35    $suffix = $file if($option eq '-E'); #if option is extension (suffix) option
36    $command .= " $option $file "; #add to list of commands
37    }
38
39    print STDERR "Using output suffix: $suffix\n";
40
41    # remove all HTML tags
42    $doc =~ s/<P[^>]*>/\n/sgi;
43    $doc =~ s/<H[^>]*>/\n/sgi;
44    $doc =~ s/<[^>]*>//sgi;
45    $doc =~ tr/\n/\n/s;
46
47    #write text to a file eg doc.txt
48    open(OUT, ">$gsdlhome/tmp/doc.txt") or die "In Kea.pm doc.txt could not be created\n"; 
49    print OUT $doc;
50    close(OUT);
51
52    #call Kea with specifed options
53    `$gsdlhome/perllib/Kea-1.1.4/Kea $command $gsdlhome/tmp/doc.txt`;
54
55    #read doc.kea with keywords
56    open(IN, "<$gsdlhome/tmp/doc.$suffix") or return @emptykeylist;
57                                          #this means doc.kea does not exist
58                                          #either because an option was wrongly specified
59                                      #or no keyphrases were found
60    while(<IN>){
61    chomp;
62    @key = split(/\t/); #split into array separated by a tab
63    push(@keylist, $key[0]); #add to list of keywords
64    push(@stemlist, $key[1]); #add to list of stems 
65    }
66    close(IN);
67
68    #put data into appropriate format
69    $keylist = join(", ", @keylist);
70    $stemlist = join(", ", @stemlist); 
71 
72    #delete doc.extension so that in future it will not be opened and read
73    `rm $gsdlhome/tmp/doc.$suffix`;
74
75    #return keywords + stems to basplug
76    my @keystemlist = ($keylist, $stemlist);
77    return @keystemlist;
78
79}
80
81
82
831;
84
85
Note: See TracBrowser for help on using the browser.