source: main/tags/2.52/gsdl/perllib/Kea.pm@ 25422

Last change on this file since 25422 was 6792, checked in by jrm21, 20 years ago

tidied up code quite a bit, remove tmp files after use.

  • Property svn:keywords set to Author Date Id Revision
File size: 2.9 KB
Line 
1package Kea;
2
3use strict;
4
5# This function is called by BasPlug.pm when a flag in a collection
6# configuration document specifies that keyphrase metadata must be gathered for
7# that collection.
8# It is passed as arguments, the documents text and possibly some options for
9# how the keyphrase data is to be collected if the keyphrase option flag was
10# set in the collection configuration file. This module then writes the
11# documents text to a file because the stand-alone program Kea which will be
12# called to do the actual extraction of the keyphrases expects a file argument.
13# Once Kea has been called upon, the file containing the keyphrase data
14# gathered by Kea should be stored in gsdl/tmp and this file is read, the data
15# we are interested in is extracted and passed back to BasPlug.pm in an
16# appropriate format.
17
18sub extract_KeyPhrases {
19
20 my $gsdlhome = $ENV{'GSDLHOME'};
21 my $doc = shift(@_); #documents text
22 my $args = shift(@_); #any options
23 my @optionlist = split(/ +/, $args) if (defined($args)); #list of options
24 my $suffix = 'kea'; #default file will be called .kea
25 my @kea_options;
26 my @keylist;
27 my @stemlist;
28
29
30 foreach my $element (@optionlist){ #for each option
31 my ($option, $file) = split(/,/, $element); #split option letter and file (if file exist)
32
33 $option = "-".$option; #place dash in front of option
34 push @kea_options, $option;
35 if (defined($file)) {
36 push @kea_options, $file;
37 }
38
39 if ($option eq '-E') # option is extension (suffix) option
40 { $suffix = $file }
41 }
42
43 # print STDERR "Using output suffix: $suffix\n";
44
45 # remove all HTML tags
46 $doc =~ s/<[ph][^>]*>/\n/sgi; # replace headings/paragraphs with newline
47 $doc =~ s/<[^>]*>/ /sgi; # replace all others with a space
48
49 # &gt; lt amp
50 $doc =~ s/\&(?:gt|lt|amp)\;/ /gi;
51
52 my $tmpfile="$gsdlhome/tmp/doc.txt";
53 open(OUT, ">$tmpfile") or die "Kea.pm could not create doc.txt: $!\n";
54 print OUT $doc;
55 close(OUT);
56
57 # call Kea with specifed options
58 system("$gsdlhome/perllib/Kea-1.1.4/Kea", @kea_options,
59 $tmpfile);
60
61 unlink($tmpfile); # don't need this file anymore
62
63 # read doc.kea with keywords
64 my $inputfile="$gsdlhome/tmp/doc.$suffix";
65
66 # If this file doesn't exist, then either an option was wrongly specified
67 # or no keyphrases were found
68 open(IN, "<$inputfile") or return ();
69
70 while(<IN>){
71 chomp;
72 my @key = split(/\t/); #split into array separated by a tab
73 push(@keylist, $key[0]); #add to list of keywords
74 push(@stemlist, $key[1]); #add to list of stems
75 }
76 close(IN);
77
78 #put data into appropriate format
79 my $keylistref = join(", ", @keylist);
80 my $stemlistref = join(", ", @stemlist);
81
82 # delete doc.extension so that in future it will not be opened and read
83 unlink($inputfile);
84
85 # return keywords + stems to basplug
86 return ($keylistref, $stemlistref);
87}
88
89
90
911;
92
93
Note: See TracBrowser for help on using the repository browser.