source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/AcronymExtractor.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

  • Property svn:executable set to *
File size: 5.0 KB
Line 
1###########################################################################
2#
3# AcronymExtractor - helper plugin that extracts acronyms from text
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2008 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package AcronymExtractor;
28
29use acronym;
30use PrintInfo;
31use strict;
32
# AcronymExtractor inherits from PrintInfo. The assignment sits in a BEGIN
# block so the inheritance is established at compile time.
BEGIN {
    @AcronymExtractor::ISA = qw(PrintInfo);
}

# Descriptions of the two flags this helper plugin contributes. Both are
# optional flags; the 'desc' values are brace-delimited keys rather than
# literal help text.
# NOTE(review): presumably the {Package.key} strings are looked up in a
# resource bundle elsewhere - confirm.
my $arguments = [
    {
        'name' => 'extract_acronyms',
        'desc' => '{AcronymExtractor.extract_acronyms}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'markup_acronyms',
        'desc' => '{AcronymExtractor.markup_acronyms}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Option block describing the plugin itself; its 'args' entry points at the
# argument list above. The constructor pushes this onto the shared "OptList".
my $options = {
    'name'     => 'AcronymExtractor',
    'desc'     => '{AcronymExtractor.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
52
53
# Constructor.
#
# Arguments (following the class name):
#   $pluginlist      - array ref; this class's name is appended to it
#   $inputargs       - forwarded unchanged to the PrintInfo constructor
#   $hashArgOptLists - hash ref; its "ArgList" array receives this plugin's
#                      argument descriptions and its "OptList" array receives
#                      this plugin's option block
#
# Returns the object created by PrintInfo, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use an explicit class-method call instead of the indirect-object form
    # "new PrintInfo(...)". The indirect form is resolved by parser
    # heuristics, which is fragile in a package that defines its own new().
    # The trailing 1 is forwarded as-is; its meaning is defined by PrintInfo.
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless $self, $class;
}
67
68
# Set up the acronym module before any documents are processed.
#
# acronym::initialise_acronyms() is called only when at least one of the
# extract_acronyms / markup_acronyms options is set on this plugin;
# otherwise nothing happens.
sub initialise_acronym_extractor {
    my ($self) = @_;

    my $acronyms_wanted = $self->{'extract_acronyms'} || $self->{'markup_acronyms'};
    if ($acronyms_wanted) {
        &acronym::initialise_acronyms();
    }
}
77
# Tear down the acronym module once processing is finished.
#
# acronym::finalise_acronyms() is called only when at least one of the
# extract_acronyms / markup_acronyms options is set on this plugin;
# otherwise nothing happens.
sub finalise_acronym_extractor {
    my ($self) = @_;

    my $acronyms_wanted = $self->{'extract_acronyms'} || $self->{'markup_acronyms'};
    if ($acronyms_wanted) {
        &acronym::finalise_acronyms();
    }
}
86
# Run the enabled acronym passes over every section of a document.
#
#   $doc_obj - document object; sections are walked with get_top_section()
#              and get_next_section() until the latter returns undef
#
# Two independent passes are made, in this order:
#   1. extract_acronyms option: each section whose text matches /./ (i.e.
#      contains at least one non-newline character) is handed to
#      $self->extract_acronyms() by reference.
#   2. markup_acronyms option: each section's text is replaced by whatever
#      $self->markup_acronyms() returns for it.
sub extract_acronym_metadata {
    my ($self, $doc_obj) = @_;

    if ($self->{'extract_acronyms'}) {
        for (my $section = $doc_obj->get_top_section();
             defined $section;
             $section = $doc_obj->get_next_section($section)) {
            my $text = $doc_obj->get_text($section);
            next unless $text =~ /./;
            $self->extract_acronyms(\$text, $doc_obj, $section);
        }
    }

    if ($self->{'markup_acronyms'}) {
        for (my $section = $doc_obj->get_top_section();
             defined $section;
             $section = $doc_obj->get_next_section($section)) {
            my $original  = $doc_obj->get_text($section);
            my $marked_up = $self->markup_acronyms($original, $doc_obj, $section);

            # Swap the section's text for the marked-up version.
            $doc_obj->delete_text($section);
            $doc_obj->add_text($section, $marked_up);
        }
    }
}
115
116
117
# Extract acronyms from one section of a document and record each new one
# as "Acronym" metadata on that section.
#
#   $textref     - reference to the section text; passed to acronym::acronyms()
#   $doc_obj     - document object that receives the metadata
#   $thissection - identifier of the section being processed
#
# For every acronym object returned by acronym::acronyms(), its string form
# is compared with the "Acronym" metadata already on the section. Acronyms
# not yet present are written out via write_to_file() and added as metadata;
# ones already present are skipped. Progress is reported to the plugin's
# outhandle according to its verbosity.
#
# NOTE(review): gsprintf is called unqualified, but none of the "use" lines
# visible in this file imports it explicitly - confirm it resolves in this
# package.
# NOTE(review): the acronym text is concatenated into the message string
# given to gsprintf; if gsprintf treats that string as a printf-style
# format, a '%' in an acronym would be misread - confirm.
sub extract_acronyms {
    my ($self, $textref, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    gsprintf($outhandle, " {AcronymExtractor.extracting_acronyms}...\n")
        if ($verbosity > 2);

    my $acro_array = &acronym::acronyms($textref);

    foreach my $acro (@$acro_array) {
        # Stringify once per acronym instead of on every comparison/log line.
        my $acro_string = $acro->to_string();

        # Re-read the metadata on each iteration so that acronyms added
        # earlier in this same loop are taken into account.
        my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");

        my $seen_before = 0;
        foreach my $known (@$previous_data) {
            next unless $known eq $acro_string;

            $seen_before = 1;
            gsprintf($outhandle, " {AcronymExtractor.already_seen} " .
                     $acro_string . "\n")
                if ($verbosity >= 4);

            # One match is enough; stop scanning (and stop re-logging).
            last;
        }
        next if $seen_before;

        # First occurrence in this section: write it to the file ...
        $acro->write_to_file();

        # ... and record the normal acronym metadata.
        $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro_string);
        gsprintf($outhandle, " {AcronymExtractor.adding} " . $acro_string . "\n")
            if ($verbosity > 3);
    }

    gsprintf($outhandle, " {AcronymExtractor.done_acronym_extract}\n")
        if ($verbosity > 2);
}
162
# Mark up the acronyms in a piece of text and return the result.
#
#   $text - text to mark up
#
# Callers also pass a document object and a section identifier after $text;
# they are accepted but not used here. The work is delegated to
# acronym::markup_acronyms(), which is also given this plugin object.
# Progress messages go to the plugin's outhandle when verbosity exceeds 2.
#
# NOTE(review): gsprintf is called unqualified, but none of the "use" lines
# visible in this file imports it explicitly - confirm it resolves in this
# package.
sub markup_acronyms {
    my ($self, $text) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {AcronymExtractor.marking_up_acronyms}...\n");
    }

    # $self is passed along so the acronym module can check the verbosity.
    my $marked_up = &acronym::markup_acronyms($text, $self);

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {AcronymExtractor.done_acronym_markup}\n");
    }

    return $marked_up;
}
179
1801;
Note: See TracBrowser for help on using the repository browser.