1 | package Kea;
|
---|
2 |
|
---|
3 | use strict;
|
---|
4 |
|
---|
5 | # This function is called by BasPlug.pm when a flag in a collection
|
---|
6 | # configuration document specifies that keyphrase metadata must be gathered for
|
---|
7 | # that collection.
|
---|
8 | # It is passed as arguments, the documents text and possibly some options for
|
---|
9 | # how the keyphrase data is to be collected if the keyphrase option flag was
|
---|
10 | # set in the collection configuration file. This module then writes the
|
---|
11 | # documents text to a file because the stand-alone program Kea which will be
|
---|
12 | # called to do the actual extraction of the keyphrases expects a file argument.
|
---|
13 | # Once Kea has been called upon, the file containing the keyphrase data
|
---|
14 | # gathered by Kea should be stored in gsdl/tmp and this file is read, the data
|
---|
15 | # we are interested in is extracted and passed back to BasPlug.pm in an
|
---|
16 | # appropriate format.
|
---|
17 |
|
---|
18 | sub extract_KeyPhrases {
|
---|
19 |
|
---|
20 | my $gsdlhome = $ENV{'GSDLHOME'};
|
---|
21 | my $doc = shift(@_); #documents text
|
---|
22 | my $args = shift(@_); #any options
|
---|
23 | my @optionlist = split(/ +/, $args) if (defined($args)); #list of options
|
---|
24 | my $suffix = 'kea'; #default file will be called .kea
|
---|
25 | my @kea_options;
|
---|
26 | my @keylist;
|
---|
27 | my @stemlist;
|
---|
28 |
|
---|
29 |
|
---|
30 | foreach my $element (@optionlist){ #for each option
|
---|
31 | my ($option, $file) = split(/,/, $element); #split option letter and file (if file exist)
|
---|
32 |
|
---|
33 | $option = "-".$option; #place dash in front of option
|
---|
34 | push @kea_options, $option;
|
---|
35 | if (defined($file)) {
|
---|
36 | push @kea_options, $file;
|
---|
37 | }
|
---|
38 |
|
---|
39 | if ($option eq '-E') # option is extension (suffix) option
|
---|
40 | { $suffix = $file }
|
---|
41 | }
|
---|
42 |
|
---|
43 | # print STDERR "Using output suffix: $suffix\n";
|
---|
44 |
|
---|
45 | # remove all HTML tags
|
---|
46 | $doc =~ s/<[ph][^>]*>/\n/sgi; # replace headings/paragraphs with newline
|
---|
47 | $doc =~ s/<[^>]*>/ /sgi; # replace all others with a space
|
---|
48 |
|
---|
49 | # > lt amp
|
---|
50 | $doc =~ s/\&(?:gt|lt|amp)\;/ /gi;
|
---|
51 |
|
---|
52 | my $tmpfile="$gsdlhome/tmp/doc.txt";
|
---|
53 | open(OUT, ">$tmpfile") or die "Kea.pm could not create doc.txt: $!\n";
|
---|
54 | print OUT $doc;
|
---|
55 | close(OUT);
|
---|
56 |
|
---|
57 | # call Kea with specifed options
|
---|
58 | system("$gsdlhome/perllib/Kea-1.1.4/Kea", @kea_options,
|
---|
59 | $tmpfile);
|
---|
60 |
|
---|
61 | unlink($tmpfile); # don't need this file anymore
|
---|
62 |
|
---|
63 | # read doc.kea with keywords
|
---|
64 | my $inputfile="$gsdlhome/tmp/doc.$suffix";
|
---|
65 |
|
---|
66 | # If this file doesn't exist, then either an option was wrongly specified
|
---|
67 | # or no keyphrases were found
|
---|
68 | open(IN, "<$inputfile") or return ();
|
---|
69 |
|
---|
70 | while(<IN>){
|
---|
71 | chomp;
|
---|
72 | my @key = split(/\t/); #split into array separated by a tab
|
---|
73 | push(@keylist, $key[0]); #add to list of keywords
|
---|
74 | push(@stemlist, $key[1]); #add to list of stems
|
---|
75 | }
|
---|
76 | close(IN);
|
---|
77 |
|
---|
78 | #put data into appropriate format
|
---|
79 | my $keylistref = join(", ", @keylist);
|
---|
80 | my $stemlistref = join(", ", @stemlist);
|
---|
81 |
|
---|
82 | # delete doc.extension so that in future it will not be opened and read
|
---|
83 | unlink($inputfile);
|
---|
84 |
|
---|
85 | # return keywords + stems to basplug
|
---|
86 | return ($keylistref, $stemlistref);
|
---|
87 | }
|
---|
88 |
|
---|
89 |
|
---|
90 |
|
---|
91 | 1;
|
---|
92 |
|
---|
93 |
|
---|