root/trunk/gsdl/perllib/Kea.pm @ 2018

Revision 2018, 2.9 KB (checked in by jrm21, 19 years ago)

removed "use BasPlug?" lines from metadata extractors, as they shouldn't
be there - BasPlug? uses these, not other way around.

  • Property svn:keywords set to Author Date Id Revision
Line 
1package Kea;
2
3# This function is called by BasPlug.pm when a flag in a collection
4# configuration document specifies that keyphrase metadata must be gathered for
5# that collection.
6# It is passed as arguments, the documents text and possibly some options for
7# how the keyphrase data is to be collected if the keyphrase option flag was
8# set in the collection configuration file.  This module then writes the
9# documents text to a file because the stand-alone program Kea which will be
10# called to do the actual extraction of the keyphrases expects a file argument.
11# Once Kea has been called upon, the file containing the keyphrase data
12# gathered by Kea should be stored in gsdl/tmp and this file is read, the data
13# we are interested in is extracted and passed back to BasPlug.pm in an
14# appropriate format.
15
16sub extract_KeyPhrases {
17
18    my $gsdlhome = $ENV{'GSDLHOME'};
19    my $doc = shift(@_); #documents text 
20    my $args = shift(@_); #any options
21    my @optionlist = split(/ +/, $args) if (defined($args)); #list of options
22    my $suffix = 'kea'; #default file will be called .kea
23    my $command = "";
24    my @keylist;
25    my @stemlist;
26
27    print STDERR "optionlist: @optionlist\n";
28       
29    foreach $element (@optionlist){ #for each option
30    my ($option, $file) = split(/,/, $element); #split option letter and file (if file exist)
31
32    $option  = "-".$option; #place dash in front of option
33    $file = "" if(!defined($file)); #no file options specified
34    $suffix = $file if($option eq '-E'); #if option is extension (suffix) option
35    $command .= " $option $file "; #add to list of commands
36    }
37
38    print STDERR "Using output suffix: $suffix\n";
39
40    # remove all HTML tags
41    $doc =~ s/<P[^>]*>/\n/sgi;
42    $doc =~ s/<H[^>]*>/\n/sgi;
43    $doc =~ s/<[^>]*>//sgi;
44    $doc =~ tr/\n/\n/s;
45
46    #write text to a file eg doc.txt
47    open(OUT, ">$gsdlhome/tmp/doc.txt") or die "In Kea.pm doc.txt could not be created\n"; 
48    print OUT $doc;
49    close(OUT);
50
51    #call Kea with specifed options
52    `$gsdlhome/perllib/Kea-1.1.4/Kea $command $gsdlhome/tmp/doc.txt`;
53
54    #read doc.kea with keywords
55    open(IN, "<$gsdlhome/tmp/doc.$suffix") or return @emptykeylist;
56                                          #this means doc.kea does not exist
57                                          #either because an option was wrongly specified
58                                      #or no keyphrases were found
59    while(<IN>){
60    chomp;
61    @key = split(/\t/); #split into array separated by a tab
62    push(@keylist, $key[0]); #add to list of keywords
63    push(@stemlist, $key[1]); #add to list of stems 
64    }
65    close(IN);
66
67    #put data into appropriate format
68    $keylist = join(", ", @keylist);
69    $stemlist = join(", ", @stemlist); 
70 
71    #delete doc.extension so that in future it will not be opened and read
72    `rm $gsdlhome/tmp/doc.$suffix`;
73
74    #return keywords + stems to basplug
75    my @keystemlist = ($keylist, $stemlist);
76    return @keystemlist;
77
78}
79
80
81
821;
83
84
Note: See TracBrowser for help on using the browser.