1 | package Kea;
|
---|
2 |
|
---|
3 |
|
---|
4 | use BasPlug;
|
---|
5 |
|
---|
6 | #This function is called by BasPlug.pm when a flag in a collection configuration
|
---|
7 | #document specifies that keyphrase metadata must be gathered for that collection.
|
---|
8 | #It is passed as arguments, the documents text and possibly some options for how
|
---|
9 | #the keyphrase data is to be collected if the keyphrase option flag was set in
|
---|
10 | #the collection configuration file. This module then writes the documents text
|
---|
11 | #to a file because the stand-alone program Kea which will be called to do the
|
---|
12 | #actual extraction of the keyphrases expects a file argument. Once Kea has been
|
---|
13 | #called upon, the file containing the keyphrase data gathered by Kea should be
|
---|
14 | #stored in gsdl/tmp and this file is read, the data we are interested in is extracted
|
---|
15 | #and passed back to BasPlug.pm in an appropriate format.
|
---|
16 |
|
---|
17 | sub extract_KeyPhrases {
|
---|
18 |
|
---|
19 | my $gsdlhome = $ENV{'GSDLHOME'};
|
---|
20 | my $doc = shift(@_); #documents text
|
---|
21 | my $args = shift(@_); #any options
|
---|
22 | my @optionlist = split(/ +/, $args) if (defined($args)); #list of options
|
---|
23 | my $suffix = 'kea'; #default file will be called .kea
|
---|
24 | my $command = "";
|
---|
25 | my @keylist;
|
---|
26 | my @stemlist;
|
---|
27 |
|
---|
28 | print STDERR "optionlist: @optionlist\n";
|
---|
29 |
|
---|
30 | foreach $element (@optionlist){ #for each option
|
---|
31 | my ($option, $file) = split(/,/, $element); #split option letter and file (if file exist)
|
---|
32 |
|
---|
33 | $option = "-".$option; #place dash in front of option
|
---|
34 | $file = "" if(!defined($file)); #no file options specified
|
---|
35 | $suffix = $file if($option eq '-E'); #if option is extension (suffix) option
|
---|
36 | $command .= " $option $file "; #add to list of commands
|
---|
37 | }
|
---|
38 |
|
---|
39 | print STDERR "Using output suffix: $suffix\n";
|
---|
40 |
|
---|
41 | # remove all HTML tags
|
---|
42 | $doc =~ s/<P[^>]*>/\n/sgi;
|
---|
43 | $doc =~ s/<H[^>]*>/\n/sgi;
|
---|
44 | $doc =~ s/<[^>]*>//sgi;
|
---|
45 | $doc =~ tr/\n/\n/s;
|
---|
46 |
|
---|
47 | #write text to a file eg doc.txt
|
---|
48 | open(OUT, ">$gsdlhome/tmp/doc.txt") or die "In Kea.pm doc.txt could not be created\n";
|
---|
49 | print OUT $doc;
|
---|
50 | close(OUT);
|
---|
51 |
|
---|
52 | #call Kea with specifed options
|
---|
53 | `$gsdlhome/perllib/Kea-1.1.4/Kea $command $gsdlhome/tmp/doc.txt`;
|
---|
54 |
|
---|
55 | #read doc.kea with keywords
|
---|
56 | open(IN, "<$gsdlhome/tmp/doc.$suffix") or return @emptykeylist;
|
---|
57 | #this means doc.kea does not exist
|
---|
58 | #either because an option was wrongly specified
|
---|
59 | #or no keyphrases were found
|
---|
60 | while(<IN>){
|
---|
61 | chomp;
|
---|
62 | @key = split(/\t/); #split into array separated by a tab
|
---|
63 | push(@keylist, $key[0]); #add to list of keywords
|
---|
64 | push(@stemlist, $key[1]); #add to list of stems
|
---|
65 | }
|
---|
66 | close(IN);
|
---|
67 |
|
---|
68 | #put data into appropriate format
|
---|
69 | $keylist = join(", ", @keylist);
|
---|
70 | $stemlist = join(", ", @stemlist);
|
---|
71 |
|
---|
72 | #delete doc.extension so that in future it will not be opened and read
|
---|
73 | `rm $gsdlhome/tmp/doc.$suffix`;
|
---|
74 |
|
---|
75 | #return keywords + stems to basplug
|
---|
76 | my @keystemlist = ($keylist, $stemlist);
|
---|
77 | return @keystemlist;
|
---|
78 |
|
---|
79 | }
|
---|
80 |
|
---|
81 |
|
---|
82 |
|
---|
83 | 1;
|
---|
84 |
|
---|
85 |
|
---|