source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 1895

Last change on this file since 1895 was 1895, checked in by jrm21, 23 years ago

Email plug now uses SplitPlug for mbox mail files. Hopefully this won't
cause problems with the other file formats (eg maildir) which don't use
the magic regexp /From / as a mail separator...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.0 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter ([email protected])
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in mh_mail folders) or with the
35# extension .email
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata:
42# $Headers All the header content
43# $Subject Subject: header
44# $To To: header
45# $From From: header - this will be stored as Creator
46# $DateText Date: header
47# $Date Date: header in GSDL format (eg: 19990924)
48
49package EMAILPlug;
50
51use SplitPlug;
52
53use sorttools;
54use util;
55
56
57# EMAILPlug is a sub-class of SplitPlug.
58
59sub BEGIN {
60 @ISA = ('SplitPlug');
61}
62
63# Create a new EMAILPlug object with which to parse a file.
64# Accomplished by creating a new BasPlug and using bless to
65# turn it into an EMAILPlug.
66
67sub new {
68 my ($class) = @_;
69 my $self = new BasPlug ("EMAILPlug", @_);
70
71 return bless $self, $class;
72}
73
74sub get_default_process_exp {
75 my $self = shift (@_);
76
77 return q^\d+(\.email)?$^;
78}
79
80# This plugin splits the mbox mail files at lines starting with From<sp>
81sub get_default_split_exp {
82 return q^From .*\n^;
83}
84
85
86# do plugin specific processing of doc_obj
87sub process {
88 my $self = shift (@_);
89 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
90 my $outhandle = $self->{'outhandle'};
91
92 # Check that we're dealing with a valid mail file
93 return undef unless (($$textref =~ /From:/) || ($$textref =~ /To:/));
94
95 # slightly more strict validity check, to prevent us from matching
96 # .so.x files ...
97 return undef unless (($$textref =~ /^From /) ||
98 ($$textref =~ /^[-A-Za-z]{2,100}:/));
99
100 print $outhandle "EMAILPlug: processing $file\n"
101 if $self->{'verbosity'} > 1;
102
103 my $cursection = $doc_obj->get_top_section();
104
105 #
106 # Parse the document's text and extract metadata
107 #
108
109 # Separate header from body of message
110 my $Headers = $$textref;
111 $Headers =~ s/\n\n.*//s;
112 $$textref = substr $$textref, (length $Headers);
113
114 # Extract basic metadata from header
115 my @headers = ("From", "To", "Subject", "Date");
116 my %raw;
117 foreach my $name (@headers) {
118 $raw{$name} = "No $name value";
119 }
120
121 # Examine each line of the headers
122 my ($line, $name, $value);
123 my @parts;
124 foreach $line (split(/\n/, $Headers)) {
125
126 # Ignore lines with no content or which begin with whitespace
127 next unless ($line =~ /:/);
128 next if ($line =~ /^\s/);
129
130 # Find out what metadata is on this line
131 @parts = split(/:/, $line);
132 $name = shift @parts;
133 next unless $name;
134 next unless ($raw{$name});
135
136 # Find the value of that metadata
137 $value = join(":", @parts);
138 $value =~ s/^\s+//;
139 $value =~ s/\s+$//;
140
141 # Store the metadata
142 $raw{$name} = $value;
143 }
144
145 # Process Date information
146 if ($raw{"Date"} !~ /No Date/) {
147 $raw{"DateText"} = $raw{"Date"};
148
149 # Convert the date text to internal date format
150 $value = $raw{"Date"};
151 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
152 if ($year < 100) { $year += 1900; }
153 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
154
155 } else {
156 # We have not extracted a date
157 $raw{"DateText"} = "Unknown.";
158 $raw{"Date"} = "19000000";
159 }
160
161
162 # Add extracted metadata to document object
163 foreach my $name (keys %raw) {
164 $value = $raw{$name};
165 if ($value) {
166 $value = &text_into_html($value);
167 } else {
168 $value = "No $name field";
169 }
170 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
171 }
172
173 # Add "All headers" metadata
174 $Headers = &text_into_html($Headers);
175 $Headers = "No headers" unless ($Headers =~ /\w/);
176 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
177
178 # Add text to document object
179 $$textref = &text_into_html($$textref);
180 $$textref = "No message" unless ($$textref =~ /\w/);
181 $doc_obj->add_utf8_text($cursection, $$textref);
182
183 return 1;
184}
185
186
187# Convert a text string into HTML.
188#
189# The HTML is going to be inserted into a GML file, so
190# we have to be careful not to use symbols like ">",
191# which ocurs frequently in email messages (and use
192# &gt instead.
193#
194# This function also turns links and email addresses into hyperlinks,
195# and replaces carriage returns with <BR> tags (and multiple carriage
196# returns with <P> tags).
197
198
199sub text_into_html {
200 my ($text) = @_;
201
202 # Convert problem characters into HTML symbols
203 $text =~ s/&/&amp;/go;
204 $text =~ s/</&lt;/go;
205 $text =~ s/>/&gt;/go;
206 $text =~ s/\"/&quot;/go;
207
208 # convert email addresses and URLs into links
209 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
210 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g;
211
212 # Clean up whitespace and convert \n charaters to <BR> or <P>
213 $text =~ s/ +/ /go;
214 $text =~ s/\s*$//o;
215 $text =~ s/^\s*//o;
216 $text =~ s/\n/\n<BR>/go;
217 $text =~ s/<BR>\s*<BR>/<P>/go;
218
219 return $text;
220}
221
222
223# Perl packages have to return true if they are run.
2241;
Note: See TracBrowser for help on using the repository browser.