root/gsdl/trunk/perllib/plugins/KeyphraseExtractor.pm @ 18748

Revision 18748, 4.1 KB (checked in by kjdon, 11 years ago)

added use gsprintf

  • Property svn:executable set to *
Line 
1###########################################################################
2#
3# KeyphraseExtractor - helper plugin to extract key phrases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2008 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package KeyphraseExtractor;
28
29use Kea;
30use PrintInfo;
31use gsprintf 'gsprintf';
32
33use strict;
34no strict 'subs';
35
# Inherit from PrintInfo.  The assignment lives in a BEGIN block so that
# @ISA is already populated while the remainder of this file is compiled.
BEGIN {
    @KeyphraseExtractor::ISA = qw(PrintInfo);
}
39
# Arguments this helper contributes to a plugin's argument list.
# Each 'desc' is a "{...}" key rather than literal text (presumably looked
# up in a string resource elsewhere -- confirm against gsprintf/PrintInfo).
my $arguments = [
    {
        'name' => 'extract_keyphrases',
        'desc' => '{KeyphraseExtractor.extract_keyphrases}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'extract_keyphrases_kea4',
        'desc' => '{KeyphraseExtractor.extract_keyphrases_kea4}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'extract_keyphrase_options',
        'desc' => '{KeyphraseExtractor.extract_keyphrase_options}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
];

# Self-description of this module; 'args' points at the table above.
my $options = {
    'name'     => 'KeyphraseExtractor',
    'desc'     => '{KeyphraseExtractor.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
61
62
# Constructor.
#
# Appends $class to the caller's plugin list, registers this module's
# argument and option tables in $hashArgOptLists, then lets PrintInfo build
# the object, which is re-blessed into $class.
#
#   $pluginlist      - array ref; $class is pushed onto it
#   $inputargs       - passed through to PrintInfo unchanged
#   $hashArgOptLists - hash ref; its "ArgList" and "OptList" arrays are
#                      extended (and created if they do not exist yet)
#
# Returns the object produced by PrintInfo, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call instead of the indirect-object form
    # "new PrintInfo(...)", whose parse depends on which subs and packages
    # are known at compile time.  The trailing 1 is handed to PrintInfo
    # exactly as before; its meaning is defined by PrintInfo, not here.
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless $self, $class;
}
# Entry point for keyphrase extraction on one document.
#
# Calls extract_keyphrases on $doc_obj when either the extract_keyphrases
# or the extract_keyphrases_kea4 flag is set on this object; otherwise it
# does nothing.
sub extract_keyphrase_metadata {
    my ($self, $doc_obj) = @_;

    my $extraction_requested = $self->{'extract_keyphrases'}
                            || $self->{'extract_keyphrases_kea4'};

    $self->extract_keyphrases($doc_obj) if $extraction_requested;
}
87
88
# Extract keyphrases from a document with Kea and store them as metadata.
#
# The text of every section of $doc_obj is concatenated and handed to Kea
# (version "4.0" when the extract_keyphrases_kea4 flag is set, otherwise
# "3.0").  A non-empty extract_keyphrase_options value is forwarded to Kea
# as a third argument.  If Kea returns a non-empty result it is stored on
# the top section twice: once whole as "Keyphrases", and once per
# comma-separated phrase as "Keyphrase".
#
#   $doc_obj - document object; must provide get_top_section,
#              get_next_section, get_text and add_metadata
#
# Returns nothing.  If the Kea directory does not exist, a message is sent
# to STDERR via gsprintf and the document is left untouched.
sub extract_keyphrases
{
    my ($self, $doc_obj) = @_;

    # Use Kea 3.0 unless 4.0 has been specified
    my $kea_version = $self->{'extract_keyphrases_kea4'} ? "4.0" : "3.0";

    # Check that Kea exists, and tell the user where to get it if not
    my $keahome = &Kea::get_Kea_directory($kea_version);
    if (!-e $keahome) {
        gsprintf(STDERR, "{KeyphraseExtractor.missing_kea}\n", $keahome, $kea_version);
        return;
    }

    # Loop through the sections to gather the text of the whole document
    my $text = "";
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        $text .= $doc_obj->get_text($section);
        $section = $doc_obj->get_next_section($section);
    }

    # Only pass an options argument when one was actually supplied, so Kea
    # sees the same argument count as it did with the two separate calls.
    my @kea_args = ($kea_version, $text);
    my $kea_options = $self->{'extract_keyphrase_options'};
    push(@kea_args, $kea_options) if $kea_options;
    my $list = &Kea::extract_KeyPhrases(@kea_args);

    # Nothing to record if Kea produced no keyphrases
    return unless $list;

    if ($self->{'verbosity'}) {
        # gsprintf is called with a format string plus arguments (see the
        # missing_kea call above), so the phrases are passed as a %s
        # argument: interpolating them into the format would let a '%' or
        # '{...}' inside a keyphrase be treated as a directive.
        gsprintf(STDERR, "{KeyphraseExtractor.keyphrases}: %s\n", $list);
    }

    # All metadata goes on the top section
    my $top_section = $doc_obj->get_top_section();

    # add all key phrases as one metadata
    $doc_obj->add_metadata($top_section, "Keyphrases", $list);

    # add individual key phrases as multiple metadata
    foreach my $keyphrase (split(',', $list)) {
        $keyphrase =~ s/^\s+|\s+$//g;

        # "a, ,b" or a stray comma yields an empty fragment; do not store it
        next if $keyphrase eq '';

        $doc_obj->add_metadata($top_section, "Keyphrase", $keyphrase);
    }

    return;
}
144
1451;
Note: See TracBrowser for help on using the browser.