source: main/trunk/greenstone2/perllib/plugins/KeyphraseExtractor.pm@ 22597

Last change on this file since 22597 was 18748, checked in by kjdon, 15 years ago

added use gsprintf

  • Property svn:executable set to *
File size: 4.1 KB
Line 
1###########################################################################
2#
3# KeyphraseExtractor - helper plugin to extract key phrases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2008 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package KeyphraseExtractor;
28
29use Kea;
30use PrintInfo;
31use gsprintf 'gsprintf';
32
33use strict;
34no strict 'subs';
35
# Set up inheritance at compile time: KeyphraseExtractor derives from
# PrintInfo.
BEGIN {
    @KeyphraseExtractor::ISA = qw(PrintInfo);
}
39
# Arguments this helper plugin contributes: two flags that switch keyphrase
# extraction on (default Kea, or Kea 4) and one free-form option string.
# The 'desc' values are brace-delimited keys rather than literal text
# (presumably looked up in the Greenstone string dictionary -- confirm).
my $arguments = [
    {
        name => 'extract_keyphrases',
        desc => '{KeyphraseExtractor.extract_keyphrases}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'extract_keyphrases_kea4',
        desc => '{KeyphraseExtractor.extract_keyphrases_kea4}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'extract_keyphrase_options',
        desc => '{KeyphraseExtractor.extract_keyphrase_options}',
        type => 'string',
        deft => '',
        reqd => 'no',
    },
];
55
# Option record describing this plugin; new() appends it to the caller's
# "OptList". 'args' refers to the $arguments table defined above.
my $options = {
    name     => 'KeyphraseExtractor',
    desc     => '{KeyphraseExtractor.desc}',
    abstract => 'yes',
    inherits => 'yes',
    args     => $arguments,
};
61
62
# Constructor.
#
# Parameters:
#   $class           - name of the class being constructed
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to PrintInfo->new
#   $hashArgOptLists - hash ref; this plugin's argument records are appended
#                      to its "ArgList" entry and its option record to its
#                      "OptList" entry
#
# Returns the object produced by PrintInfo->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use the direct method-call form rather than the indirect-object form
    # ("new PrintInfo(...)"), whose parsing depends on compile-time
    # heuristics. The trailing 1 is forwarded exactly as before; its meaning
    # is defined by PrintInfo.
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless $self, $class;
}
# Entry point for keyphrase extraction on one document.
#
# Parameters:
#   $self    - plugin object; its 'extract_keyphrases' and
#              'extract_keyphrases_kea4' flags are consulted
#   $doc_obj - document object, forwarded to extract_keyphrases
#
# Delegates to extract_keyphrases when either flag is set; otherwise does
# nothing.
sub extract_keyphrase_metadata {
    my ($self, $doc_obj) = @_;

    my $extraction_requested =
        $self->{'extract_keyphrases'} || $self->{'extract_keyphrases_kea4'};

    $self->extract_keyphrases($doc_obj) if $extraction_requested;
}
87
88
# Run Kea over the full text of a document and store the resulting key
# phrases as metadata on the document's top section.
#
# Parameters:
#   $self    - plugin object; reads 'extract_keyphrases_kea4',
#              'extract_keyphrase_options' and 'verbosity'
#   $doc_obj - document object; get_top_section, get_next_section, get_text
#              and add_metadata are called on it
#
# Metadata added to the top section when Kea returns a non-empty list:
#   Keyphrases - the whole list exactly as returned by Kea
#   Keyphrase  - one value per comma-separated phrase, whitespace-trimmed;
#                empty fragments are skipped
#
# Returns early, after printing a message to STDERR, if the Kea directory
# for the selected version does not exist. The return value is not
# meaningful.
sub extract_keyphrases
{
    my $self = shift(@_);
    my $doc_obj = shift(@_);

    # Use Kea 3.0 unless 4.0 has been specified
    my $kea_version = $self->{'extract_keyphrases_kea4'} ? "4.0" : "3.0";

    # Check that Kea exists, and tell the user where to get it if not.
    # An undefined directory is treated the same as a missing one.
    my $keahome = &Kea::get_Kea_directory($kea_version);
    if (!defined $keahome || !-e $keahome) {
        gsprintf(STDERR, "{KeyphraseExtractor.missing_kea}\n",
                 (defined $keahome ? $keahome : ""), $kea_version);
        return;
    }

    # Concatenate the text of every section so Kea sees the whole document
    my $text = "";
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        my $section_text = $doc_obj->get_text($section);
        $text .= $section_text if defined $section_text;
        $section = $doc_obj->get_next_section($section);
    }

    # Pass the user-supplied Kea options only when some were given
    my @kea_args = ($kea_version, $text);
    if ($self->{'extract_keyphrase_options'}) {
        push(@kea_args, $self->{'extract_keyphrase_options'});
    }
    my $list = &Kea::extract_KeyPhrases(@kea_args);

    # Nothing to record if Kea returned no key phrases
    return unless $list;

    if ($self->{'verbosity'}) {
        # Pass the phrases as a printf-style argument (as the missing_kea
        # message above does) rather than interpolating them into the
        # format string, so '%' or '{...}' inside a phrase is printed
        # literally instead of being interpreted.
        gsprintf(STDERR, "{KeyphraseExtractor.keyphrases}: %s\n", $list);
    }

    # All metadata goes on the top section
    my $top_section = $doc_obj->get_top_section();

    # add all key phrases as one metadata
    $doc_obj->add_metadata($top_section, "Keyphrases", $list);

    # add individual key phrases as multiple metadata
    foreach my $keyphrase (split(',', $list)) {
        $keyphrase =~ s/^\s+|\s+$//g;
        # Skip empty fragments (e.g. from "a,,b" or a stray ", ")
        next if $keyphrase eq "";
        $doc_obj->add_metadata($top_section, "Keyphrase", $keyphrase);
    }
}
144
1451;
Note: See TracBrowser for help on using the repository browser.