source: gsdl/trunk/perllib/plugins/KeyphraseExtractor.pm@ 18327

Last change on this file since 18327 was 16025, checked in by kjdon, 16 years ago

added license info

  • Property svn:executable set to *
File size: 4.1 KB
Line 
1###########################################################################
2#
3# KeyphraseExtractor - helper plugin to extract key phrases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2008 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package KeyphraseExtractor;
28
29use Kea;
30use PrintInfo;
31use strict;
32no strict 'subs';
33
# Establish inheritance at compile time: KeyphraseExtractor is a subclass
# of PrintInfo.
BEGIN {
    @KeyphraseExtractor::ISA = qw(PrintInfo);
}
37
# Option descriptors contributed by this helper plugin.  Each 'desc' value
# is a brace-wrapped key (presumably resolved against a string resource
# file elsewhere -- confirm against the code that consumes these tables).
my $arguments = [
    # Flag: request keyphrase extraction.
    {
        'name' => 'extract_keyphrases',
        'desc' => '{KeyphraseExtractor.extract_keyphrases}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    # Flag: request keyphrase extraction with Kea 4.0.
    {
        'name' => 'extract_keyphrases_kea4',
        'desc' => '{KeyphraseExtractor.extract_keyphrases_kea4}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    # String: extra options handed to Kea; defaults to the empty string.
    {
        'name' => 'extract_keyphrase_options',
        'desc' => '{KeyphraseExtractor.extract_keyphrase_options}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
];

# Plugin-level description record; 'args' points at the table above.
my $options = {
    'name'     => 'KeyphraseExtractor',
    'desc'     => '{KeyphraseExtractor.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
59
60
# Constructor.
#
# Arguments:
#   $class           - class name to bless the new object into.
#   $pluginlist      - array ref; $class is appended to it.
#   $inputargs       - passed through unchanged to the PrintInfo constructor.
#   $hashArgOptLists - hash ref; this module's argument descriptors are
#                      appended to its "ArgList" array and its options
#                      record to its "OptList" array.
#
# Returns the object built by PrintInfo, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method call instead of indirect object syntax ("new PrintInfo
    # (...)"), whose parsing depends on what has been compiled so far.
    # NOTE(review): the meaning of the trailing 1 is defined by PrintInfo,
    # which is not visible here.
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless $self, $class;
}
# Entry point for keyphrase metadata extraction.
#
# Arguments:
#   $self    - plugin object; its 'extract_keyphrases' and
#              'extract_keyphrases_kea4' settings are consulted.
#   $doc_obj - document object, handed on to extract_keyphrases().
#
# Does nothing unless at least one of the two settings is true.
sub extract_keyphrase_metadata {
    my ($self, $doc_obj) = @_;

    my $kea3_requested = $self->{'extract_keyphrases'};

    if ($kea3_requested || $self->{'extract_keyphrases_kea4'}) {
        $self->extract_keyphrases($doc_obj);
    }
}
85
86
# Run Kea over the whole text of a document and record the key phrases it
# returns as metadata on the document's top section.
#
# Arguments:
#   $self    - plugin object; reads 'extract_keyphrases_kea4',
#              'extract_keyphrase_options' and 'verbosity'.
#   $doc_obj - document object; get_top_section, get_text,
#              get_next_section and add_metadata are called on it.
#
# Metadata written (only when Kea returns a true value):
#   "Keyphrases" - the string returned by Kea, unmodified.
#   "Keyphrase"  - one entry per comma-separated phrase, whitespace-trimmed;
#                  fragments that are empty after trimming are skipped.
#
# No meaningful return value.
#
# NOTE(review): gsprintf is not imported in the visible part of this file;
# confirm it is in scope for this package at run time.
sub extract_keyphrases
{
    my ($self, $doc_obj) = @_;

    # Use Kea 3.0 unless 4.0 has been specified
    my $kea_version = $self->{'extract_keyphrases_kea4'} ? "4.0" : "3.0";

    # Check that Kea exists, and tell the user where to get it if not
    my $keahome = Kea::get_Kea_directory($kea_version);
    if (!defined $keahome || !-e $keahome) {
        gsprintf(STDERR, "{KeyphraseExtractor.missing_kea}\n", $keahome, $kea_version);
        return;
    }

    # Walk every section to gather the text of the whole document
    my $text = "";
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        my $section_text = $doc_obj->get_text($section);
        $text .= $section_text if defined $section_text;
        $section = $doc_obj->get_next_section($section);
    }

    # Only pass an options argument when one has actually been configured
    my $kea_options = $self->{'extract_keyphrase_options'};
    my $list;
    if ($kea_options) {
        $list = Kea::extract_KeyPhrases($kea_version, $text, $kea_options);
    }
    else {
        $list = Kea::extract_KeyPhrases($kea_version, $text);
    }

    # Nothing to record if Kea returned no key phrases
    return unless $list;

    if ($self->{'verbosity'}) {
        # Pass the phrases as an argument rather than splicing them into the
        # message string, so characters such as '%' or '{...}' in a phrase
        # cannot be treated as formatting directives.
        # NOTE(review): assumes gsprintf applies printf-style formatting, as
        # the missing_kea call above suggests -- confirm.
        gsprintf(STDERR, "{KeyphraseExtractor.keyphrases}: %s\n", $list);
    }

    my $top_section = $doc_obj->get_top_section();

    # add all key phrases as one metadata
    $doc_obj->add_metadata($top_section, "Keyphrases", $list);

    # add individual key phrases as multiple metadata
    foreach my $keyphrase (split(/,/, $list)) {
        $keyphrase =~ s/^\s+|\s+$//g;
        next if $keyphrase eq '';    # e.g. "a,,b" or "a, ,b"
        $doc_obj->add_metadata($top_section, "Keyphrase", $keyphrase);
    }

    return;
}
142
1431;
Note: See TracBrowser for help on using the repository browser.