source: gsdl/trunk/perllib/plugins/KeyphraseExtractor.pm@ 15867

Last change on this file since 15867 was 15867, checked in by kjdon, 16 years ago

plugin overhaul: automatic metadata extraction moved out of BasPlug into several extractor plugins (Keyphrase, Date, Acronym, EmailAddress Extractors). These are used by the AutoExtractMetadata plugin to add this functionality to BasePlugin (using multiple inheritance)

  • Property svn:executable set to *
File size: 3.0 KB
Line 
1package KeyphraseExtractor;
2
3use Kea;
4use PrintInfo;
5
6BEGIN {
7 @KeyphraseExtractor::ISA = ('PrintInfo');
8}
9
10my $arguments = [
11 { 'name' => "extract_keyphrases",
12 'desc' => "{KeyphraseExtractor.extract_keyphrases}",
13 'type' => "flag",
14 'reqd' => "no" },
15 { 'name' => "extract_keyphrases_kea4",
16 'desc' => "{KeyphraseExtractor.extract_keyphrases_kea4}",
17 'type' => "flag",
18 'reqd' => "no" },
19 { 'name' => "extract_keyphrase_options",
20 'desc' => "{KeyphraseExtractor.extract_keyphrase_options}",
21 'type' => "string",
22 'deft' => "",
23 'reqd' => "no" }
24 ];
25
26my $options = { 'name' => "KeyphraseExtractor",
27 'desc' => "{KeyphraseExtractor.desc}",
28 'abstract' => "yes",
29 'inherits' => "yes",
30 'args' => $arguments };
31
32
33sub new {
34 my ($class) = shift (@_);
35 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
36 push(@$pluginlist, $class);
37
38 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
39 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
40
41 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists);
42
43 return bless $self, $class;
44
45}
46# extract metadata
47sub extract_keyphrase_metadata {
48
49 my $self = shift (@_);
50 my ($doc_obj) = @_;
51
52 if ($self->{'extract_keyphrases'} || $self->{'extract_keyphrases_kea4'}) {
53 $self->extract_keyphrases($doc_obj);
54 }
55
56}
57
58
59#adding kea keyphrases
60sub extract_keyphrases
61{
62 my $self = shift(@_);
63 my $doc_obj = shift(@_);
64
65 # Use Kea 3.0 unless 4.0 has been specified
66 my $kea_version = "3.0";
67 if ($self->{'extract_keyphrases_kea4'}) {
68 $kea_version = "4.0";
69 }
70
71 # Check that Kea exists, and tell the user where to get it if not
72 my $keahome = &Kea::get_Kea_directory($kea_version);
73 if (!-e $keahome) {
74 gsprintf(STDERR, "{KeyphraseExtractor.missing_kea}\n", $keahome, $kea_version);
75 return;
76 }
77
78 my $thissection = $doc_obj->get_top_section();
79 my $text = "";
80 my $list;
81
82 #loop through sections to gather whole doc
83 while (defined $thissection) {
84 my $sectiontext = $doc_obj->get_text($thissection);
85 $text = $text.$sectiontext;
86 $thissection = $doc_obj->get_next_section ($thissection);
87 }
88
89 if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
90 $list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
91 } else { #otherwise call Kea with no options
92 $list = &Kea::extract_KeyPhrases ($kea_version, $text);
93 }
94
95 if ($list){
96 # if a list of kea keyphrases was returned (ie not empty)
97 if ($self->{'verbosity'}) {
98 gsprintf(STDERR, "{KeyphraseExtractor.keyphrases}: $list\n");
99 }
100
101 #add metadata to top section
102 $thissection = $doc_obj->get_top_section();
103
104 # add all key phrases as one metadata
105 $doc_obj->add_metadata($thissection, "Keyphrases", $list);
106
107 # add individual key phrases as multiple metadata
108 foreach my $keyphrase (split(',', $list)) {
109 $keyphrase =~ s/^\s+|\s+$//g;
110 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
111 }
112 }
113}
114
1151;
Note: See TracBrowser for help on using the repository browser.