source: gsdl/trunk/perllib/plugins/EmailAddressExtractor.pm@ 16013

Last change on this file since 16013 was 16013, checked in by kjdon, 16 years ago

updated soem plugin names in some of the keys for strings.properties

  • Property svn:executable set to *
File size: 2.6 KB
Line 
1package EmailAddressExtractor;
2
3use PrintInfo;
4use strict;
5
6BEGIN {
7 @EmailAddressExtractor::ISA = ('PrintInfo');
8}
9
10my $arguments = [
11 { 'name' => "extract_email",
12 'desc' => "{EmailAddressExtractor.extract_email}",
13 'type' => "flag",
14 'reqd' => "no" },
15 { 'name' => "new_extract_email",
16 'desc' => "",
17 'type' => "flag",
18 'reqd' => "no",
19 'hiddengli' => "yes" }
20 ];
21
22my $options = { 'name' => "EmailAddressExtractor",
23 'desc' => "{EmailAddressExtractor.desc}",
24 'abstract' => "yes",
25 'inherits' => "yes",
26 'args' => $arguments };
27
28
29sub new {
30 my ($class) = shift (@_);
31 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
32 push(@$pluginlist, $class);
33
34 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
35 push(@{$hashArgOptLists->{"OptList"}},$options);
36
37 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists, 1);
38
39 return bless $self, $class;
40
41}
42
43# extract metadata
44sub extract_email_metadata {
45
46 my $self = shift (@_);
47 my ($doc_obj) = @_;
48
49 if ($self->{'extract_email'}) {
50 my $thissection = $doc_obj->get_top_section();
51 while (defined $thissection) {
52 my $text = $doc_obj->get_text($thissection);
53 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
54 $thissection = $doc_obj->get_next_section ($thissection);
55 }
56 }
57
58}
59
60sub extract_email {
61 my $self = shift (@_);
62 my ($textref, $doc_obj, $thissection) = @_;
63 my $outhandle = $self->{'outhandle'};
64
65 gsprintf($outhandle, " {EmailAddressExtractor.extracting_emails}...\n")
66 if ($self->{'verbosity'} > 2);
67
68 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
69 @email = sort @email;
70
71# if($self->{"new_extract_email"} == 0)
72# {
73# my @email2 = ();
74# foreach my $address (@email)
75# {
76# if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))
77# {
78# push @email2, $address;
79# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
80# # print $outhandle " extracting $address\n"
81# &gsprintf($outhandle, " {AutoExtractMetadata.extracting} $address\n")
82# if ($self->{'verbosity'} > 3);
83# }
84# }
85# }
86# else
87# {
88 my $hashExistMail = {};
89 foreach my $address (@email) {
90 if (!(defined $hashExistMail->{$address}))
91 {
92 $hashExistMail->{$address} = 1;
93 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
94 gsprintf($outhandle, " {AutoExtractMetadata.extracting} $address\n")
95 if ($self->{'verbosity'} > 3);
96 }
97 }
98 gsprintf($outhandle, " {EmailAddressExtractor.done_email_extract}\n")
99 if ($self->{'verbosity'} > 2);
100}
101
102
1031;
Note: See TracBrowser for help on using the repository browser.