source: gsdl/trunk/perllib/plugins/KeyphraseExtractor.pm@ 15918

Last change on this file since 15918 was 15887, checked in by mdewsnip, 16 years ago

Added "use strict" to the few files that were missing it, and fixed the resulting problems in MediaWikiPlug.pm.

  • Property svn:executable set to *
File size: 3.0 KB
Line 
1package KeyphraseExtractor;
2
3use Kea;
4use PrintInfo;
5use strict;
6no strict 'subs';
7
# Set up inheritance at compile time: KeyphraseExtractor derives from
# PrintInfo.
BEGIN {
    @KeyphraseExtractor::ISA = qw(PrintInfo);
}
11
# Arguments contributed by this extractor: two flags that switch keyphrase
# extraction on (the second one selecting Kea 4.0, see extract_keyphrases)
# and one free-form string of extra Kea options.  The 'desc' values are
# brace-wrapped keys rather than literal text -- presumably looked up in a
# resource bundle elsewhere; confirm against the PrintInfo implementation.
my $arguments = [
    {
        'name' => 'extract_keyphrases',
        'desc' => '{KeyphraseExtractor.extract_keyphrases}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'extract_keyphrases_kea4',
        'desc' => '{KeyphraseExtractor.extract_keyphrases_kea4}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'extract_keyphrase_options',
        'desc' => '{KeyphraseExtractor.extract_keyphrase_options}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
];

# Description record for this (abstract) class; it carries the argument
# list declared above under 'args'.
my $options = {
    'name'     => 'KeyphraseExtractor',
    'desc'     => '{KeyphraseExtractor.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
33
34
# Constructor.
#
# Arguments:
#   $class           - class name to bless the new object into.
#   $pluginlist      - array reference; $class is appended to it.
#   $inputargs       - passed through unchanged to PrintInfo's constructor.
#   $hashArgOptLists - hash reference; this class's argument definitions are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry.
#
# Returns the object built by PrintInfo's constructor, re-blessed into
# $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call instead of the indirect-object form
    # "new PrintInfo(...)": that form is parsed heuristically and is easy to
    # misread in a package that itself defines a sub called "new".
    # NOTE(review): the meaning of the trailing 1 is defined by
    # PrintInfo::new, which is not visible here.
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless $self, $class;
}
# Entry point for keyphrase metadata extraction.
#
# Hands $doc_obj to extract_keyphrases() when either of the
# 'extract_keyphrases' / 'extract_keyphrases_kea4' settings on $self is
# true; does nothing otherwise.
sub extract_keyphrase_metadata {
    my ($self, $doc_obj) = @_;

    my $extraction_requested = $self->{'extract_keyphrases'}
        || $self->{'extract_keyphrases_kea4'};

    $self->extract_keyphrases($doc_obj) if $extraction_requested;
}
59
60
# Run Kea over the full text of a document and store the resulting
# keyphrases as metadata on the document's top section.
#
# Arguments:
#   $self    - extractor object; the 'extract_keyphrases_kea4',
#              'extract_keyphrase_options' and 'verbosity' entries are read.
#   $doc_obj - document object; must provide get_top_section(),
#              get_next_section(), get_text() and add_metadata().
#
# Metadata written (only when Kea returns a true value):
#   "Keyphrases" - the string returned by Kea, unmodified.
#   "Keyphrase"  - one entry per comma-separated phrase, with surrounding
#                  whitespace removed; phrases that are empty after
#                  trimming are skipped.
#
# Returns nothing.  If the Kea directory does not exist, a message is
# reported through gsprintf and the document is left untouched.
#
# NOTE(review): no "use gsprintf" is visible in this file -- confirm that
# gsprintf is actually in scope in this package.
sub extract_keyphrases
{
    my ($self, $doc_obj) = @_;

    # Use Kea 3.0 unless 4.0 has been specified
    my $kea_version = $self->{'extract_keyphrases_kea4'} ? "4.0" : "3.0";

    # Check that Kea exists, and tell the user where to get it if not
    my $keahome = &Kea::get_Kea_directory($kea_version);
    if (!-e $keahome) {
        gsprintf(STDERR, "{KeyphraseExtractor.missing_kea}\n", $keahome, $kea_version);
        return;
    }

    # Kea works on the whole document, so concatenate the text of every
    # section in order.
    my $text = "";
    my $thissection = $doc_obj->get_top_section();
    while (defined $thissection) {
        $text .= $doc_obj->get_text($thissection);
        $thissection = $doc_obj->get_next_section($thissection);
    }

    # Only pass the options argument when some options were configured.
    my $list;
    if ($self->{'extract_keyphrase_options'}) {
        $list = &Kea::extract_KeyPhrases($kea_version, $text, $self->{'extract_keyphrase_options'});
    }
    else {
        $list = &Kea::extract_KeyPhrases($kea_version, $text);
    }

    # Nothing to record if Kea produced no keyphrases
    return unless $list;

    if ($self->{'verbosity'}) {
        # Pass the keyphrases as a format argument rather than splicing them
        # into the format string, so a '%' inside a keyphrase is printed
        # literally instead of being read as a conversion specifier.
        gsprintf(STDERR, "{KeyphraseExtractor.keyphrases}: %s\n", $list);
    }

    # All keyphrase metadata goes on the top section
    my $topsection = $doc_obj->get_top_section();

    # add all key phrases as one metadata
    $doc_obj->add_metadata($topsection, "Keyphrases", $list);

    # add individual key phrases as multiple metadata
    foreach my $keyphrase (split(',', $list)) {
        $keyphrase =~ s/^\s+|\s+$//g;

        # An entry such as the middle one in "a, ,b" is empty after
        # trimming; do not store it as metadata.
        next if $keyphrase eq "";

        $doc_obj->add_metadata($topsection, "Keyphrase", $keyphrase);
    }

    return;
}
116
1171;
Note: See TracBrowser for help on using the repository browser.