source: gsdl/trunk/perllib/plugins/EmailAddressExtractor.pm@ 15881

Last change on this file since 15881 was 15881, checked in by kjdon, 16 years ago

auxiliary plugins now pass an extra argument to the PrintInfo constructor so that argument parsing is not done - parsing needs to be done by the main plugin which has all the arguments

  • Property svn:executable set to *
File size: 2.5 KB
Line 
1package EmailAddressExtractor;
2
3use PrintInfo;
4
5BEGIN {
6 @EmailAddressExtractor::ISA = ('PrintInfo');
7}
8
9my $arguments = [
10 { 'name' => "extract_email",
11 'desc' => "{EmailAddressExtractor.extract_email}",
12 'type' => "flag",
13 'reqd' => "no" },
14 { 'name' => "new_extract_email",
15 'desc' => "",
16 'type' => "flag",
17 'reqd' => "no",
18 'hiddengli' => "yes" }
19 ];
20
21my $options = { 'name' => "EmailAddressExtractor",
22 'desc' => "{EmailAddressExtractor.desc}",
23 'abstract' => "yes",
24 'inherits' => "yes",
25 'args' => $arguments };
26
27
28sub new {
29 my ($class) = shift (@_);
30 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
31 push(@$pluginlist, $class);
32
33 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
34 push(@{$hashArgOptLists->{"OptList"}},$options);
35
36 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists, 1);
37
38 return bless $self, $class;
39
40}
41
42# extract metadata
43sub extract_email_metadata {
44
45 my $self = shift (@_);
46 my ($doc_obj) = @_;
47
48 if ($self->{'extract_email'}) {
49 my $thissection = $doc_obj->get_top_section();
50 while (defined $thissection) {
51 my $text = $doc_obj->get_text($thissection);
52 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
53 $thissection = $doc_obj->get_next_section ($thissection);
54 }
55 }
56
57}
58
59sub extract_email {
60 my $self = shift (@_);
61 my ($textref, $doc_obj, $thissection) = @_;
62 my $outhandle = $self->{'outhandle'};
63
64 gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
65 if ($self->{'verbosity'} > 2);
66
67 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
68 @email = sort @email;
69
70# if($self->{"new_extract_email"} == 0)
71# {
72# my @email2 = ();
73# foreach my $address (@email)
74# {
75# if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))
76# {
77# push @email2, $address;
78# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
79# # print $outhandle " extracting $address\n"
80# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
81# if ($self->{'verbosity'} > 3);
82# }
83# }
84# }
85# else
86# {
87 my $hashExistMail = {};
88 foreach my $address (@email) {
89 if (!(defined $hashExistMail->{$address}))
90 {
91 $hashExistMail->{$address} = 1;
92 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
93 gsprintf($outhandle, " {BasPlug.extracting} $address\n")
94 if ($self->{'verbosity'} > 3);
95 }
96 }
97 gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
98 if ($self->{'verbosity'} > 2);
99}
100
101
1021;
Note: See TracBrowser for help on using the repository browser.