Context Navigation

AcronymExtractor.pm@ 31745

Last change on this file since 31745 was 25797, checked in by kjdon, 12 years ago
need to define gsprintf in order to use it
Property svn:executable set to `*`
File size: 5.1 KB

Rev	Line
[16025]	1	###########################################################################
	2	#
	3	# AcronymExtractor - helper plugin that extracts acronyms from text
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright (C) 2008 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
[15867]	27	package AcronymExtractor;
	28
	29	use acronym;
	30	use PrintInfo;
[15887]	31	use strict;
[15867]	32
[25797]	33	use gsprintf 'gsprintf';
	34
# Establish inheritance at compile time: AcronymExtractor is-a PrintInfo.
BEGIN {
    @AcronymExtractor::ISA = qw(PrintInfo);
}
	38
	39	my $arguments = [
	40	{ 'name' => "extract_acronyms",
	41	'desc' => "{AcronymExtractor.extract_acronyms}",
	42	'type' => "flag",
	43	'reqd' => "no" },
	44	{ 'name' => "markup_acronyms",
	45	'desc' => "{AcronymExtractor.markup_acronyms}",
	46	'type' => "flag",
	47	'reqd' => "no" } ];
	48
	49	my $options = { 'name' => "AcronymExtractor",
	50	'desc' => "{AcronymExtractor.desc}",
	51	'abstract' => "yes",
	52	'inherits' => "yes",
	53	'args' => $arguments };
	54
	55
	56	sub new {
	57	my ($class) = shift (@_);
	58	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	59	push(@$pluginlist, $class);
	60
[15918]	61	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	62	push(@{$hashArgOptLists->{"OptList"}},$options);
[15867]	63
[15881]	64	my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,1);
[15867]	65
	66	return bless $self, $class;
	67
	68	}
	69
	70
	71	# initialise metadata extractors
	72	sub initialise_acronym_extractor {
	73	my $self = shift (@_);
	74
	75	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
	76	&acronym::initialise_acronyms();
	77	}
	78	}
	79
	80	# finalise metadata extractors
	81	sub finalise_acronym_extractor {
	82	my $self = shift (@_);
	83
	84	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
	85	&acronym::finalise_acronyms();
	86	}
	87	}
	88
	89	# extract metadata
	90	sub extract_acronym_metadata {
	91
	92	my $self = shift (@_);
	93	my ($doc_obj) = @_;
	94
	95
	96	if ($self->{'extract_acronyms'}) {
	97	my $thissection = $doc_obj->get_top_section();
	98	while (defined $thissection) {
	99	my $text = $doc_obj->get_text($thissection);
	100	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
	101	$thissection = $doc_obj->get_next_section ($thissection);
	102	}
	103	}
	104
	105	if ($self->{'markup_acronyms'}) {
	106	my $thissection = $doc_obj->get_top_section();
	107	while (defined $thissection) {
	108	my $text = $doc_obj->get_text($thissection);
	109	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
	110	$doc_obj->delete_text($thissection);
	111	$doc_obj->add_text($thissection, $text);
	112	$thissection = $doc_obj->get_next_section ($thissection);
	113	}
	114	}
	115
	116	}
	117
	118
	119
	120	# extract acronyms from a section in a document. progress is
	121	# reported to outhandle based on the verbosity. both the Acronym
	122	# and the AcronymKWIC metadata items are created.
	123
	124	sub extract_acronyms {
	125	my $self = shift (@_);
	126	my ($textref, $doc_obj, $thissection) = @_;
	127	my $outhandle = $self->{'outhandle'};
	128
	129	# print $outhandle " extracting acronyms ...\n"
[16013]	130	gsprintf($outhandle, " {AcronymExtractor.extracting_acronyms}...\n")
[15867]	131	if ($self->{'verbosity'} > 2);
	132
	133	my $acro_array = &acronym::acronyms($textref);
	134
	135	foreach my $acro (@$acro_array) {
	136
	137	#check that this is the first time ...
	138	my $seen_before = "false";
	139	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
	140	foreach my $thisAcro (@$previous_data) {
	141	if ($thisAcro eq $acro->to_string()) {
	142	$seen_before = "true";
	143	if ($self->{'verbosity'} >= 4) {
[16013]	144	gsprintf($outhandle, " {AcronymExtractor.already_seen} " .
[15867]	145	$acro->to_string() . "\n");
	146	}
	147	}
	148	}
	149
	150	if ($seen_before eq "false") {
	151	#write it to the file ...
	152	$acro->write_to_file();
	153
	154	#do the normal acronym
	155	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
[16013]	156	gsprintf($outhandle, " {AcronymExtractor.adding} ".$acro->to_string()."\n")
[15867]	157	if ($self->{'verbosity'} > 3);
	158	}
	159	}
	160
[16013]	161	gsprintf($outhandle, " {AcronymExtractor.done_acronym_extract}\n")
[15867]	162	if ($self->{'verbosity'} > 2);
	163	}
	164
	165	sub markup_acronyms {
	166	my $self = shift (@_);
	167	my ($text, $doc_obj, $thissection) = @_;
	168	my $outhandle = $self->{'outhandle'};
	169
[16013]	170	gsprintf($outhandle, " {AcronymExtractor.marking_up_acronyms}...\n")
[15867]	171	if ($self->{'verbosity'} > 2);
	172
	173	#self is passed in to check for verbosity ...
	174	$text = &acronym::markup_acronyms($text, $self);
	175
[16013]	176	gsprintf($outhandle, " {AcronymExtractor.done_acronym_markup}\n")
[15867]	177	if ($self->{'verbosity'} > 2);
	178
	179	return $text;
	180	}
	181
	182	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/AcronymExtractor.pm@ 31745

Download in other formats: