source: trunk/gsdl/perllib/Kea.pm@ 2018

Last change on this file since 2018 was 2018, checked in by jrm21, 23 years ago

removed "use BasPlug" lines from metadata extractors, as they shouldn't
be there - BasPlug uses these, not other way around.

  • Property svn:keywords set to Author Date Id Revision
File size: 2.9 KB
RevLine 
[1954]1package Kea;
2
[2018]3# This function is called by BasPlug.pm when a flag in a collection
4# configuration document specifies that keyphrase metadata must be gathered for
5# that collection.
6# It is passed as arguments, the documents text and possibly some options for
7# how the keyphrase data is to be collected if the keyphrase option flag was
8# set in the collection configuration file. This module then writes the
9# documents text to a file because the stand-alone program Kea which will be
10# called to do the actual extraction of the keyphrases expects a file argument.
11# Once Kea has been called upon, the file containing the keyphrase data
12# gathered by Kea should be stored in gsdl/tmp and this file is read, the data
13# we are interested in is extracted and passed back to BasPlug.pm in an
14# appropriate format.
[1954]15
16sub extract_KeyPhrases {
17
18 my $gsdlhome = $ENV{'GSDLHOME'};
19 my $doc = shift(@_); #documents text
20 my $args = shift(@_); #any options
21 my @optionlist = split(/ +/, $args) if (defined($args)); #list of options
22 my $suffix = 'kea'; #default file will be called .kea
23 my $command = "";
24 my @keylist;
25 my @stemlist;
26
27 print STDERR "optionlist: @optionlist\n";
28
29 foreach $element (@optionlist){ #for each option
30 my ($option, $file) = split(/,/, $element); #split option letter and file (if file exist)
31
32 $option = "-".$option; #place dash in front of option
33 $file = "" if(!defined($file)); #no file options specified
34 $suffix = $file if($option eq '-E'); #if option is extension (suffix) option
35 $command .= " $option $file "; #add to list of commands
36 }
37
38 print STDERR "Using output suffix: $suffix\n";
39
40 # remove all HTML tags
41 $doc =~ s/<P[^>]*>/\n/sgi;
42 $doc =~ s/<H[^>]*>/\n/sgi;
43 $doc =~ s/<[^>]*>//sgi;
44 $doc =~ tr/\n/\n/s;
45
46 #write text to a file eg doc.txt
47 open(OUT, ">$gsdlhome/tmp/doc.txt") or die "In Kea.pm doc.txt could not be created\n";
48 print OUT $doc;
49 close(OUT);
50
51 #call Kea with specifed options
52 `$gsdlhome/perllib/Kea-1.1.4/Kea $command $gsdlhome/tmp/doc.txt`;
53
54 #read doc.kea with keywords
55 open(IN, "<$gsdlhome/tmp/doc.$suffix") or return @emptykeylist;
56 #this means doc.kea does not exist
57 #either because an option was wrongly specified
58 #or no keyphrases were found
59 while(<IN>){
60 chomp;
61 @key = split(/\t/); #split into array separated by a tab
62 push(@keylist, $key[0]); #add to list of keywords
63 push(@stemlist, $key[1]); #add to list of stems
64 }
65 close(IN);
66
67 #put data into appropriate format
68 $keylist = join(", ", @keylist);
69 $stemlist = join(", ", @stemlist);
70
71 #delete doc.extension so that in future it will not be opened and read
72 `rm $gsdlhome/tmp/doc.$suffix`;
73
74 #return keywords + stems to basplug
75 my @keystemlist = ($keylist, $stemlist);
76 return @keystemlist;
77
78}
79
80
81
821;
83
84
Note: See TracBrowser for help on using the repository browser.