source: gsdl/trunk/perllib/plugins/EmailAddressExtractor.pm@ 15880

Last change on this file since 15880 was 15867, checked in by kjdon, 16 years ago

plugin overhaul: automatic metadata extraction moved out of BasPlug into several extractor plugins (Keyphrase, Date, Acronym, EmailAddress Extractors). These are used by the AutoExtractMetadata plugin to add this functionality to BasePlugin (using multiple inheritance)

  • Property svn:executable set to *
File size: 2.5 KB
Line 
1package EmailAddressExtractor;
2
3use PrintInfo;
4
5BEGIN {
6 @EmailAddressExtractor::ISA = ('PrintInfo');
7}
8
9my $arguments = [
10 { 'name' => "extract_email",
11 'desc' => "{EmailAddressExtractor.extract_email}",
12 'type' => "flag",
13 'reqd' => "no" },
14 { 'name' => "new_extract_email",
15 'desc' => "",
16 'type' => "flag",
17 'reqd' => "no",
18 'hiddengli' => "yes" }
19 ];
20
21my $options = { 'name' => "EmailAddressExtractor",
22 'desc' => "{EmailAddressExtractor.desc}",
23 'abstract' => "yes",
24 'inherits' => "yes",
25 'args' => $arguments };
26
27
28sub new {
29 my ($class) = shift (@_);
30 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
31 push(@$pluginlist, $class);
32
33 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
34 push(@{$hashArgOptLists->{"OptList"}},$options);
35
36 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists);
37
38 return bless $self, $class;
39
40}
41
42# extract metadata
43sub extract_email_metadata {
44
45 my $self = shift (@_);
46 my ($doc_obj) = @_;
47
48 if ($self->{'extract_email'}) {
49 my $thissection = $doc_obj->get_top_section();
50 while (defined $thissection) {
51 my $text = $doc_obj->get_text($thissection);
52 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
53 $thissection = $doc_obj->get_next_section ($thissection);
54 }
55 }
56
57}
58
59sub extract_email {
60 my $self = shift (@_);
61 my ($textref, $doc_obj, $thissection) = @_;
62 my $outhandle = $self->{'outhandle'};
63
64 gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
65 if ($self->{'verbosity'} > 2);
66
67 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
68 @email = sort @email;
69
70# if($self->{"new_extract_email"} == 0)
71# {
72# my @email2 = ();
73# foreach my $address (@email)
74# {
75# if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))
76# {
77# push @email2, $address;
78# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
79# # print $outhandle " extracting $address\n"
80# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
81# if ($self->{'verbosity'} > 3);
82# }
83# }
84# }
85# else
86# {
87 my $hashExistMail = {};
88 foreach my $address (@email) {
89 if (!(defined $hashExistMail->{$address}))
90 {
91 $hashExistMail->{$address} = 1;
92 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
93 gsprintf($outhandle, " {BasPlug.extracting} $address\n")
94 if ($self->{'verbosity'} > 3);
95 }
96 }
97 gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
98 if ($self->{'verbosity'} > 2);
99}
100
101
1021;
Note: See TracBrowser for help on using the repository browser.