root/gsdl/trunk/perllib/plugins/AutoExtractMetadata.pm @ 17112

Revision 17112, 4.6 KB (checked in by kjdon, 11 years ago)

CJK text segmentation is now done at the indexing level (in buildproc), not at the plugin level. CJKTextSegmenter has been deleted and is no longer used in AutoExtractMetadata.

  • Property svn:executable set to *
Line 
1###########################################################################
2#
3# AutoExtractMetadata.pm -- base plugin for all plugins that want to do metadata extraction from text and/or metadata
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2008 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This plugin uses the supporting Extractors to add metadata extraction
27# functionality to BasePlugin.
28
29
30package AutoExtractMetadata;
31
32use strict;
33no strict 'subs';
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
36use BasePlugin;
37use AcronymExtractor;
38use KeyphraseExtractor;
39use EmailAddressExtractor;
40use DateExtractor;
41use GISExtractor;
42
# Establish the inheritance chain at compile time. BasePlugin is listed
# first, followed by the supporting extractor classes.
BEGIN {
    @AutoExtractMetadata::ISA = qw(
        BasePlugin
        AcronymExtractor
        KeyphraseExtractor
        EmailAddressExtractor
        DateExtractor
        GISExtractor
    );
}
46
# Arguments contributed by this plugin. The value of 'first' is read by
# extract_first_NNNN_characters as a comma-separated list of sizes.
my $arguments = [
    {
        'name' => 'first',
        'desc' => '{AutoExtractMetadata.first}',
        'type' => 'string',
        'reqd' => 'no',
    },
];


# Description record for this plugin; new() appends it to the shared
# "OptList" and appends the arguments above to the shared "ArgList".
my $options = {
    'name'     => 'AutoExtractMetadata',
    'desc'     => '{AutoExtractMetadata.desc}',
    'abstract' => 'yes',
    'inherits' => 'no',
    'args'     => $arguments,
};
60
61
# Constructor.
#
# Registers this class, its arguments and its options in the shared lists,
# lets each supporting extractor do the same, and then builds the object
# through BasePlugin.
#
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - handed unchanged to the extractors and to BasePlugin
#   $hashArgOptLists - hash ref; its "ArgList" and "OptList" arrays are extended
#   $auxiliary       - handed to BasePlugin only
#
# Returns the object created by BasePlugin, blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists, $auxiliary) = @_;

    push(@{$pluginlist}, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Load up the options and args for the supporting plugins. The
    # constructors are called for that side effect only; their return
    # values are discarded. Class->new(...) is used instead of the indirect
    # "new Class(...)" form, which Perl has to parse by guesswork.
    AcronymExtractor->new($pluginlist, $inputargs, $hashArgOptLists);
    KeyphraseExtractor->new($pluginlist, $inputargs, $hashArgOptLists);
    EmailAddressExtractor->new($pluginlist, $inputargs, $hashArgOptLists);
    DateExtractor->new($pluginlist, $inputargs, $hashArgOptLists);
    GISExtractor->new($pluginlist, $inputargs, $hashArgOptLists);

    # The object itself comes from BasePlugin, the only constructor that
    # also receives $auxiliary.
    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists, $auxiliary);

    return bless $self, $class;
}
83
# Runs the inherited begin() with the caller's arguments and then
# initialises the extractors that need it.
#
# Arguments after $self: ($pluginfo, $base_dir, $processor, $maxdocs).
# They are not used here, only forwarded unchanged.
sub begin {
    my $self = shift;

    $self->SUPER::begin(@_);

    # Only the acronym and GIS extractors are initialised here.
    $self->initialise_acronym_extractor();
    $self->initialise_gis_extractor();

}
95
# Potentially called at the end of each plugin pass: import.pl has a single
# plugin pass, whereas buildcol.pl has several.
sub end {
    my $self = shift;

    # Finalise those extractors that need it (currently just the acronym
    # extractor).
    return $self->finalise_acronym_extractor();
}
104
# Here is where the methods from the supporting extractor plugins get
# called on a document object.
sub auto_extract_metadata {
    my ($self, $doc_obj) = @_;

    # "First<size>" metadata is produced per section, and only when the
    # 'first' option is set. Sections whose text contains no character
    # matched by /./ are skipped.
    if ($self->{'first'}) {
        for (my $section = $doc_obj->get_top_section();
             defined $section;
             $section = $doc_obj->get_next_section($section)) {
            my $section_text = $doc_obj->get_text($section);
            next unless $section_text =~ /./;
            $self->extract_first_NNNN_characters(\$section_text, $doc_obj, $section);
        }
    }

    # Whole-document extractors, always run and always in this order.
    $self->extract_acronym_metadata($doc_obj);
    $self->extract_keyphrase_metadata($doc_obj);
    $self->extract_email_metadata($doc_obj);
    $self->extract_date_metadata($doc_obj);
    $self->extract_gis_metadata($doc_obj);

}
125
126
# FIRSTNNN: extract the first NNN characters of a section's text as metadata.
#
#   $textref     - reference to the section text; the referenced string is
#                  copied and never modified
#   $doc_obj     - document object that receives the metadata through
#                  add_utf8_metadata
#   $thissection - section the metadata is attached to
#
# $self->{'first'} is a comma-separated list of sizes. For every size one
# "First<size>" value is added, holding the whitespace-normalised text cut
# down to at most <size> characters.
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    # Normalise once for all sizes: strip leading and trailing whitespace
    # and collapse every internal whitespace run to a single space.
    my $cleantext = $$textref;
    $cleantext =~ s/^\s+//;
    $cleantext =~ s/\s+$//;
    $cleantext =~ s/\s+/ /gs;
    my $cleanlength = length($cleantext);

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = substr ($cleantext, 0, $size);

        # Only when something really was cut off: replace the final
        # (possibly partial) word with an ellipsis. Text that already fits
        # within $size is stored whole, rather than losing its last word.
        if (length($tmptext) < $cleanlength) {
            $tmptext =~ s/\s\S*$/…/;
        }

        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
142
# Runs the inherited clean-up and then GISExtractor's clean_up_temp_files().
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    $self->SUPER::clean_up_after_doc_obj_processing();

    # Fully qualified call: method lookup starts in GISExtractor.
    return $self->GISExtractor::clean_up_temp_files();
}
149
1501;
Note: See TracBrowser for help on using the browser.