Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 2630

Last change on this file since 2630 was 2630, checked in by jrm21, 23 years ago
Mime support for multipart messages. Doesn't extract attachments yet... Also made sure we don't use textcat to guess language - it runs over the whole file, and grows really big really quickly.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 17.6 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata:
42	# $Headers All the header content
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header - this will be stored as Creator
46	# $FromName Name of sender (where available)
47	# $FromAddr E-mail address of sender
48	# $DateText Date: header
49	# $Date Date: header in GSDL format (eg: 19990924)
50	#
51	#
52	# John McPherson - June/July 2001
53	# added (basic) MIME support and quoted-printable and base64 decodings.
54	# Minor fixes for names that are actually email addresses (ie <...> was lost)
55	#
56	# See: * RFC 822 - ARPA Internet Text Messages
57	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
58	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
59	# * RFC 1806 - Content Dispositions (ie inline/attachment)
60	package EMAILPlug;
61
62	use SplitPlug;
63
64	use sorttools;
65	use util;
66
67
68	# EMAILPlug is a sub-class of SplitPlug.
69
70	sub BEGIN {
71	@ISA = ('SplitPlug');
72	}
73
74	# Create a new EMAILPlug object with which to parse a file.
75	# Accomplished by creating a new BasPlug and using bless to
76	# turn it into an EMAILPlug.
77
78	sub new {
79	my ($class) = @_;
80	my $self = new BasPlug ("EMAILPlug", @_);
81	# make sure we don't run textcat (defaults to "auto");
82	$self->{'input_encoding'}="ascii";
83	return bless $self, $class;
84	}
85
86	sub get_default_process_exp {
87	my $self = shift (@_);
88	# mbx/email for mailbox file format, \d+ for maildir (each message is
89	# in a separate file, with a unique number for filename)
90	return q@([\\/]\d+\|\.(mbx\|email))$@;
91	}
92
93	# This plugin splits the mbox mail files at lines starting with From<sp>
94	sub get_default_split_exp {
95	return q^\nFrom .*\n^;
96	}
97
98
99	# do plugin specific processing of doc_obj
100	sub process {
101
102	my $self = shift (@_);
103	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
104	my $outhandle = $self->{'outhandle'};
105
106	# Check that we're dealing with a valid mail file
107	return undef unless (($$textref =~ /From:/m) \|\| ($$textref =~ /To:/m));
108
109	# slightly more strict validity check, to prevent us from matching
110	# .so.x files ...
111	return undef unless (($$textref =~ /^From /) \|\|
112	($$textref =~ /^[-A-Za-z]{2,100}:/m));
113
114	print $outhandle "EMAILPlug: processing $file\n"
115	if $self->{'verbosity'} > 1;
116
117	my $cursection = $doc_obj->get_top_section();
118
119	#
120	# Parse the document's text and extract metadata
121	#
122
123	# Separate header from body of message
124	my $Headers = $$textref;
125	$Headers =~ s/\r?\n\r?\n(.*)$//s;
126	$$textref = $1;
127
128	# Unfold headers - see rfc822
129	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
130	# Extract basic metadata from header
131	my @headers = ("From", "To", "Subject", "Date");
132	my %raw;
133	foreach my $name (@headers) {
134	$raw{$name} = "No $name value";
135	}
136
137	# Examine each line of the headers
138	my ($line, $name, $value);
139	my @parts;
140	foreach $line (split(/\n/, $Headers)) {
141
142	# Ignore lines with no content or which begin with whitespace
143	next unless ($line =~ /:/);
144	next if ($line =~ /^\s/);
145
146	# Find out what metadata is on this line
147	@parts = split(/:/, $line);
148	$name = shift @parts;
149	# uppercase the first character according to the current locale
150	$name=~s/(.+)/\u$1/;
151	next unless $name;
152	next unless ($raw{$name});
153
154	# Find the value of that metadata
155	$value = join(":", @parts);
156	$value =~ s/^\s+//;
157	$value =~ s/\s+$//;
158
159	# Store the metadata
160	$raw{$name} = $value;
161	}
162
163	# Extract the name and e-mail address from the From metadata
164	$frommeta = $raw{"From"};
165	$frommeta =~ m/(.)<(.)>/;
166	my $fromnamemeta=$1;
167	my $fromaddrmeta=$2;
168	if (!defined($fromaddrmeta)) {
169	$fromaddrmeta=$frommeta;
170	}
171	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
172
173	if (defined($fromnameneta)) {
174	$fromnamemeta =~ s/\"//g;
175	$fromnamemeta =~ s/(.*) /$1/; # Remove trailing space
176	}
177	else {
178	$fromnamemeta = $fromaddrmeta;
179	}
180	# if name is an address
181	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
182	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
183
184	# Escape < and > in the whole From field;
185	$frommeta =~ s/</</g; $frommeta =~ s/>/>/g;
186	$raw{"From"}=$frommeta;
187
188	# Process Date information
189	if ($raw{"Date"} !~ /No Date/) {
190	$raw{"DateText"} = $raw{"Date"};
191
192	# Convert the date text to internal date format
193	$value = $raw{"Date"};
194	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
195	if ($year < 100) { $year += 1900; }
196	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
197
198	} else {
199	# We have not extracted a date
200	$raw{"DateText"} = "Unknown.";
201	$raw{"Date"} = "19000000";
202	}
203
204
205	# Add extracted metadata to document object
206	foreach my $name (keys %raw) {
207	$value = $raw{$name};
208	if ($value) {
209	$value = &text_into_html($value);
210	} else {
211	$value = "No $name field";
212	}
213	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
214	}
215
216	my $mimetype="text/plain";
217	my $mimeinfo="";
218	# Do MIME and encoding stuff
219	if ($Headers =~ /^content\-type:\s([\w\/\-]+)\s\;?\s([^\s]+)\s$/mi)
220	{
221	$mimetype=$1;
222	$mimetype =~ tr/[A-Z]/[a-z]/;
223	$mimeinfo=$2;
224	}
225
226	if ($mimetype ne "text/plain") {
227	$$textref=text_from_mime_message($mimetype,$mimeinfo,$$textref);
228	} # end of not text/plain
229
230
231	# Add "All headers" metadata
232	$Headers = &text_into_html($Headers);
233	$Headers = "No headers" unless ($Headers =~ /\w/);
234	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
235
236	# Add text to document object
237	if ($mimetype eq "text/plain") {
238	$$textref = &text_into_html($$textref);
239	}
240	$$textref = "No message" unless ($$textref =~ /\w/);
241	$doc_obj->add_utf8_text($cursection, $$textref);
242
243	return 1;
244	}
245
246
247	# Convert a text string into HTML.
248	#
249	# The HTML is going to be inserted into a GML file, so
250	# we have to be careful not to use symbols like ">",
251	# which ocurs frequently in email messages (and use
252	# &gt instead.
253	#
254	# This function also turns links and email addresses into hyperlinks,
255	# and replaces carriage returns with <BR> tags (and multiple carriage
256	# returns with <P> tags).
257
258
259	sub text_into_html {
260	my ($text) = @_;
261
262	# Convert problem characters into HTML symbols
263	$text =~ s/&/&/go;
264	$text =~ s/</</go;
265	$text =~ s/>/>/go;
266	$text =~ s/\"/"/go;
267
268	# convert email addresses and URIs into links
269	# don't markup email addresses for now
270	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
271
272	# assume hostnames are \.\w\d\- only, then might have a trailing '/.*'
273	# URI can't finish with a '.'
274	$text =~ s/((http\|ftp\|https):\/\/[\w\d\-]+(\.[\w\d\-]+)\/?((&\|\.)?[\w\d\?\=\-_\/~]+))/<a href=\"$1\">$1<\/a>/g;
275
276
277	# Clean up whitespace and convert \n charaters to <BR> or <P>
278	$text =~ s/ +/ /go;
279	$text =~ s/\s*$//o;
280	$text =~ s/^\s*//o;
281	$text =~ s/\n/\n<BR>/go;
282	$text =~ s/<BR>\s*<BR>/<P>/go;
283
284	return $text;
285	}
286
287
288
289
290	#Process a MIME message.
291	# the textref we are given DOES NOT include the header.
292	sub text_from_mime_message {
293	my ($mimetype,$mimeinfo,$text)=(@_);
294
295	# Check for multiparts - $mimeinfo will be a boundary
296	if ($mimetype =~ /multipart/) {
297	$boundary="";
298	if ($mimeinfo =~ /boundary="?([^\s]+?)"?\s*$/im) {
299	$boundary=$1;
300	}
301	# parts start with "--$boundary"
302	# message ends with "--$boundary--"
303	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
304	# that perl might want to interpolate.
305
306	$boundary=~s/\\/\\\\/g;
307	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
308	my @message_parts = split("\r?\n\-\-$boundary", $text);
309	# remove first "part" and last "part" (final --)
310	shift @message_parts;
311	my $last=pop @message_parts;
312	# make sure it is only -- and whitespace
313	if ($last !~ /^\-\-\s*$/ms) {
314	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
315	}
316	foreach my $message_part (@message_parts) {
317	# remove the leading newline left from split.
318	$message_part=~s/^\r?\n//;
319	}
320	if ($mimetype eq "multipart/alternative") {
321	# check for an HTML version first, then TEXT, otherwise use first.
322	my $part_text="";
323	foreach my $message_part (@message_parts) {
324	if ($message_part =~ m@\scontent\-type:\stext/html@mis)
325	{
326	# Use the HTML version
327	$part_text=text_from_part($message_part);
328	$mimetype="text/html";
329	last;
330	}
331	}
332	if ($part_text eq "") { # try getting a text part instead
333	foreach my $message_part (@message_parts) {
334	if ($message_part =~ m@^content\-type:\s*text/plain@mis)
335	{
336	# Use the plain version
337	$part_text=text_from_part($message_part);
338	$mimetype="text/plain";
339	last;
340	}
341	}
342	}
343	if ($part_text eq "") { # use first part
344	$part_text=text_from_part(shift @message_parts);
345	}
346	if ($part_text eq "") { # we couldn't get anything!!!
347	# or it was an empty message...
348	# do nothing...
349	print $outhandle "EMAILPlug: no text - empty body?\n";
350	} else {
351	$text=$part_text;
352	}
353	} elsif ($mimetype eq "multipart/mixed" \|\|
354	$mimetype eq "multipart/digest") {
355	$text="";
356	foreach my $message_part (@message_parts) {
357	my $part_header=$message_part;
358	$part_header=~s/\r?\n\r?\n(.*)$//sg;
359	my $part_body=$1;
360	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
361	my $part_content_type="";
362	my $part_content_info="";
363	if ($mimetype eq "multipart/digest") {
364	# default type - RTFRFC!!
365	$part_content_type="message/rfc822";
366	}
367	if ($part_header =~ m@^content\-type:\s([\w+/\-]+)\s\;?\s*([^\s]+)@mi) {
368	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
369	$part_content_info=$2;
370	}
371	my $filename="";
372	if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
373	$filename=$1;
374	}
375
376	# disposition - either inline or attachment.
377	# NOT CURRENTLY USED - we display all text types instead...
378	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
379
380	# add <<attachment>> to each part except the first...
381	if ($text ne "") {
382	$text.="<p><hr><strong><<attachment>></>";
383	# add part info header
384	$text.="<br>Type: $part_content_type<br>\n";
385	if ($filename ne "") {
386	$text.="Filename: $filename\n";
387	}
388	$text.="</strong></p>\n";
389	}
390
391	if ($part_content_type =~ m@text/@)
392	{
393	my $part_text=text_from_part($message_part);
394	if ($part_content_type !~ m@text/(ht\|x)ml@) {
395	$part_text=text_into_html($part_text);
396	}
397	if ($part_text eq "") {
398	$part_text='<<empty message>>';
399	}
400	$text.=$part_text;
401	} elsif ($part_content_type =~ m@message/rfc822@) {
402	# This is a forwarded message
403	my $message_part_headers=$part_body;
404	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
405	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
406	my $message_part_body=$1;
407
408	my $rfc822_formatted_body=""; # put result in here
409	if ($message_part_headers =~
410	/^content\-type:\s([\w\/\-]+)\s\;?\s?([^\s]+)?\s$/ims)
411	{
412	# The message header uses MIME flags
413	my $message_content_type=$1;
414	my $message_content_info=$2;
415	if (!defined($message_content_info)) {
416	$message_content_info="";
417	}
418	$message_content_type =~ tr/A-Z/a-z/;
419	if ($message_content_type =~ /multipart/) {
420	$rfc822_formatted_body=
421	text_from_mime_message($message_content_type,
422	$message_content_info,
423	$message_part_body);
424	} else {
425	$message_part_body=text_from_part($part_body);
426	$rfc822_formatted_body=text_into_html($message_part_body);
427	}
428	} else {
429	# message doesn't use MIME flags
430	$rfc822_formatted_body=text_into_html($message_part_body);
431	}
432	# Add the returned text to the output
433	# don't put all the headers...
434	$message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
435	$text.=text_into_html($message_part_headers);
436	$text.="<p>\n";
437	$text.=$rfc822_formatted_body;
438	# end of message/rfc822
439	} elsif ($part_content_type =~ /multipart/) {
440	# recurse again
441	$tmptext=text_from_mime_message($part_content_type,
442	$part_content_info,
443	$part_body);
444	$text.=$tmptext;
445	} elsif ($text eq "") {
446	# we can't do anything with this part, but if it's the first
447	# part then make sure it is mentioned..
448
449	$text.="<p><hr><strong><<attachment>></>";
450	# add part info header
451	$text.="<br>Type: $part_content_type<br>\n";
452	if ($filename ne "") {
453	$text.="Filename: $filename\n";
454	}
455	$text.="</strong></p>\n";
456	}
457	} # foreach message part.
458	} else {
459	# we can't handle this multipart type (not mixed or alternative)
460	# the RFC also mentions "parallel".
461	}
462	} # end of multipart
463	return $text;
464	}
465
466
467
468
469
470
471	# Process a MIME part. Return "" if we can't decode it.
472	sub text_from_part {
473	my $text=shift;
474	my $part_header=$text;
475	$part_header =~ s/\r?\n\r?\n(.*)$//s;
476	$text=$1;
477	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
478	$part_header =~ /content\-type:\s*([\w\/]+)/is;
479	my $type=$1;
480	my $encoding="";
481	if ($part_header =~ /^content\-transfer\-encoding:\s*([^s]+)/mis) {
482	$encoding=$1; $encoding=~tr/A-Z/a-z/;
483	}
484	# Content-Transfer-Encoding is per-part
485	if ($encoding ne "") {
486	if ($encoding =~ /quoted\-printable/) {
487	$text=qp_decode($text);
488	} elsif ($encoding =~ /base64/) {
489	$text=base64_decode($text);
490	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
491	# rfc2045 also allows binary, which we ignore (for now).
492	# maybe this shouldn't go to stderr, but anyway...
493	print STDERR "EMAILPlug: unknown encoding: $encoding\n";
494	return "";
495	}
496	}
497
498	if ($type eq "text/html") {
499	# only get stuff between <body> tags, or <html> tags.
500	$text =~ s/^.?<(html\|HTML)[^>]>//s;
501	$text =~ s/<\/(html\|HTML)>.*$//s;
502
503	$text =~ s/^.?<(body\|BODY)[^>]>//s;
504	$text =~ s/<\/(body\|BODY)>.*$//s;
505	}
506	elsif ($type eq "text/xml") {
507	$text=~s/</</g;$text=~s/>/>/g;
508	$text="<pre>\n$text\n</pre>\n";
509	}
510	return $text;
511	}
512
513
514	# decode quoted-printable text
515	sub qp_decode {
516	my $text=shift;
517
518	my @lines=split('\n', $text);
519
520	# if a line ends with "=\s*", it is a soft line break, otherwise
521	# keep in any newline characters.
522	foreach my $line (@lines) {
523	if ($line =~ s/=\s*$//) {}
524	else {$line.="\n";}
525
526	if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
527	my @hexcode_segments=split('=',$line);
528	shift @hexcode_segments;
529	my @hexcodes;
530	foreach my $hexcode (@hexcode_segments) {
531	$hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
532	chomp($hexcode); # just in case...
533	my $char=chr (hex "0x$hexcode");
534	$line =~ s/=$hexcode/$char/g;
535	}
536	}
537	}
538	$text= join('', @lines);
539	return $text;
540	}
541
542	# decode base64 text. This is fairly slow (since it's interpreted perl rather
543	# than compiled XS stuff like in the ::MIME modules, but this is more portable
544	# for us at least).
545	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
546	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
547	# from each byte.
548
549
550	sub base64_decode {
551	my $enc_text = shift;
552	# A=>0, B=>1, ..., '+'=>62, '/'=>63
553	# also '=' is used for padding at the end, but we remove it anyway.
554	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
555	# map each MIME char into it's value, for more efficient lookup.
556	my %index;
557	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
558	# remove all non-base64 chars. eval to get variable in transliteration...
559	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
560	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
561	my $decoded="";
562	while (length ($enc_text)>3)
563	{
564	my $fourchars=substr($enc_text,0,4,"");
565	my @chars=(split '',$fourchars);
566	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
567	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
568	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
569	}
570	# if there are any input chars left, there are either
571	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
572	my @chars=(split '',$enc_text);
573	if (length($enc_text)) {
574	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
575	}
576	if (length($enc_text)==3) {
577	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
578	}
579	return $decoded;
580	}
581
582
583	# Perl packages have to return true if they are run.
584	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: