source: main/trunk/greenstone2/perllib/plugins/AutoExtractMetadata.pm@ 21746

Last change on this file since 21746 was 17112, checked in by kjdon, 16 years ago

CJK text segmentation now done at indexing level (in buildproc), not plugin level. CJKTextSegmenter deleted, and not used in AutoExtractMetadata

  • Property svn:executable set to *
File size: 4.6 KB
Line 
1###########################################################################
2#
3# AutoExtractMetadata.pm -- base plugin for all plugins that want to do metadata extraction from text and/or metadata
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2008 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This plugin uses the supporting Extractors to add metadata extraction
27# functionality to BasePlugin.
28
29
30package AutoExtractMetadata;
31
32use strict;
33no strict 'subs';
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
36use BasePlugin;
37use AcronymExtractor;
38use KeyphraseExtractor;
39use EmailAddressExtractor;
40use DateExtractor;
41use GISExtractor;
42
# Set up inheritance at compile time. BasePlugin comes first; the extractor
# classes that follow contribute the metadata-extraction methods this plugin
# calls.
BEGIN {
    @AutoExtractMetadata::ISA = qw(
        BasePlugin
        AcronymExtractor
        KeyphraseExtractor
        EmailAddressExtractor
        DateExtractor
        GISExtractor
    );
}
46
# Arguments specific to this plugin. 'first' is an optional string; it is
# read by extract_first_NNNN_characters as a comma-separated list of sizes.
my $arguments = [
    {
        'name' => "first",
        'desc' => "{AutoExtractMetadata.first}",
        'type' => "string",
        'reqd' => "no",
    },
];

# Option description for this (abstract) plugin; new() pushes it onto the
# shared "OptList".
my $options = {
    'name'     => "AutoExtractMetadata",
    'desc'     => "{AutoExtractMetadata.desc}",
    'abstract' => "yes",
    'inherits' => "no",
    'args'     => $arguments,
};
60
61
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the extractor and
#                      BasePlugin constructors
#   $hashArgOptLists - hash ref; this plugin's arguments and options are
#                      appended to its "ArgList" and "OptList" entries
#   $auxiliary       - passed through to the BasePlugin constructor only
#
# Returns the object built by BasePlugin, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists, $auxiliary) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # Load up the options and args for the supporting extractors. The
    # objects they return are discarded; only the shared lists matter here.
    foreach my $extractor_class (qw(
        AcronymExtractor
        KeyphraseExtractor
        EmailAddressExtractor
        DateExtractor
        GISExtractor
    )) {
        $extractor_class->new($pluginlist, $inputargs, $hashArgOptLists);
    }

    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists, $auxiliary);

    return bless $self, $class;
}
83
# begin hook: forwards all arguments to the parent class's begin(), then
# initialises those extractors that need initialisation.
#
# Arguments after $self are ($pluginfo, $base_dir, $processor, $maxdocs);
# they are not used here, only handed on to SUPER::begin.
sub begin {
    my $self = shift;

    $self->SUPER::begin(@_);

    # Only the acronym and GIS extractors are initialised here
    $self->initialise_acronym_extractor();
    $self->initialise_gis_extractor();
}
95
# end hook: potentially called at the end of each plugin pass (import.pl
# only has one plugin pass, but buildcol.pl has multiple ones).
sub end {
    my $self = shift;

    # Finalise those extractors that need finalisation; here that is only
    # the acronym extractor.
    $self->finalise_acronym_extractor();
}
104
# Runs the supporting extractors over a document object.
#
# If the 'first' option is set, every section (starting at the top section
# and following get_next_section until it returns undef) whose text contains
# at least one non-newline character is handed to
# extract_first_NNNN_characters. After that the acronym, keyphrase, email,
# date and GIS extractors are each called once with the document object.
sub auto_extract_metadata {
    my ($self, $doc_obj) = @_;

    if ($self->{'first'}) {
        for (my $section = $doc_obj->get_top_section();
             defined $section;
             $section = $doc_obj->get_next_section($section)) {
            my $section_text = $doc_obj->get_text($section);

            # Nothing to extract from a section with no visible text
            next unless $section_text =~ /./;

            $self->extract_first_NNNN_characters(\$section_text, $doc_obj, $section);
        }
    }

    # Methods supplied by the supporting extractor plugins
    $self->extract_acronym_metadata($doc_obj);
    $self->extract_keyphrase_metadata($doc_obj);
    $self->extract_email_metadata($doc_obj);
    $self->extract_date_metadata($doc_obj);
    $self->extract_gis_metadata($doc_obj);
}
125
126
# FIRSTNNN: extract the first NNN characters of a section as metadata.
#
# Arguments:
#   $textref     - reference to the section text (the referenced text is
#                  not modified)
#   $doc_obj     - document object; metadata is stored on it through
#                  add_utf8_metadata
#   $thissection - section the metadata is attached to
#
# $self->{'first'} holds a comma-separated list of sizes. For each size N the
# whitespace-normalised text is stored as "FirstN" metadata. Text longer than
# N characters is cut to N characters and anything after the last whitespace
# in the cut text is replaced by an ellipsis; text that already fits is
# stored complete.
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    my $sizelist = $self->{'first'};
    return unless defined $sizelist;

    # The whitespace clean-up does not depend on the size, so do it once:
    # trim both ends and collapse every whitespace run to a single space.
    my $cleantext = $$textref;
    $cleantext =~ s/^\s+//;
    $cleantext =~ s/\s+$//;
    $cleantext =~ s/\s+/ /gs;
    my $textlength = length($cleantext);

    foreach my $size (split /,/, $sizelist) {
        # Accept "100, 200" style lists, and skip entries that are not plain
        # non-negative integers rather than passing them on to substr.
        $size =~ s/^\s+|\s+$//g;
        next unless $size =~ /^\d+$/;

        my $tmptext = $cleantext;
        # Only truncate (and mark the cut with an ellipsis) when the text is
        # actually longer than the requested size; otherwise the last word
        # of a short text would be thrown away.
        if ($textlength > $size) {
            $tmptext = substr ($tmptext, 0, $size);
            $tmptext =~ s/\s\S*$/…/;
        }
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
142
# Clean-up hook: runs the parent class's
# clean_up_after_doc_obj_processing first, then calls GISExtractor's
# clean_up_temp_files explicitly by its fully qualified name.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    $self->SUPER::clean_up_after_doc_obj_processing();
    $self->GISExtractor::clean_up_temp_files();
}
149
1501;
Note: See TracBrowser for help on using the repository browser.