Context Navigation

EMAILPlug.pm@ 2680

Last change on this file since 2680 was 2680, checked in by jrm21, 23 years ago

we escape 'and' chars in headers so greenstone doesn't try to expand it

as metadata.

fixed up FromName and FromAddr if the From: field isn't in the "Name" <addr>

format.

MIME boundaries are allowed spaces in them (didn't read RFC properly...)
Body is now correctly interpreted if in quoted-printable or base64 and

content-type is text/* (not multipart).

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 18.7 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata:
42	# $Headers All the header content
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header - this will be stored as Creator
46	# $FromName Name of sender (where available)
47	# $FromAddr E-mail address of sender
48	# $DateText Date: header
49	# $Date Date: header in GSDL format (eg: 19990924)
50	#
51	#
52	# John McPherson - June/July 2001
53	# added (basic) MIME support and quoted-printable and base64 decodings.
54	# Minor fixes for names that are actually email addresses (ie <...> was lost)
55	#
56	# See: * RFC 822 - ARPA Internet Text Messages
57	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
58	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
59	# * RFC 1806 - Content Dispositions (ie inline/attachment)
60	package EMAILPlug;
61
62	use SplitPlug;
63
64	use sorttools;
65	use util;
66
67
68	# EMAILPlug is a sub-class of SplitPlug.
69
70	sub BEGIN {
71	@ISA = ('SplitPlug');
72	}
73
74	# Create a new EMAILPlug object with which to parse a file.
75	# Accomplished by creating a new BasPlug and using bless to
76	# turn it into an EMAILPlug.
77
78	sub new {
79	my ($class) = @_;
80	my $self = new BasPlug ("EMAILPlug", @_);
81	# make sure we don't run textcat (defaults to "auto");
82	$self->{'input_encoding'}="ascii"; # this might not be good enough...
83	return bless $self, $class;
84	}
85
86	sub get_default_process_exp {
87	my $self = shift (@_);
88	# mbx/email for mailbox file format, \d+ for maildir (each message is
89	# in a separate file, with a unique number for filename)
90	return q@([\\/]\d+\|\.(mbx\|email))$@;
91	}
92
93	# This plugin splits the mbox mail files at lines starting with From<sp>
94	sub get_default_split_exp {
95	return q^\nFrom .*\n^;
96	}
97
98
99	# do plugin specific processing of doc_obj
100	sub process {
101
102	my $self = shift (@_);
103	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
104	my $outhandle = $self->{'outhandle'};
105
106	# Check that we're dealing with a valid mail file
107	return undef unless (($$textref =~ /From:/m) \|\| ($$textref =~ /To:/m));
108
109	# slightly more strict validity check, to prevent us from matching
110	# .so.x files ...
111	return undef unless (($$textref =~ /^From /) \|\|
112	($$textref =~ /^[-A-Za-z]{2,100}:/m));
113
114	print $outhandle "EMAILPlug: processing $file\n"
115	if $self->{'verbosity'} > 1;
116
117	my $cursection = $doc_obj->get_top_section();
118
119	#
120	# Parse the document's text and extract metadata
121	#
122
123	# Protect backslashes
124	$$textref =~ s@\\@\\\\@g;
125
126	# Separate header from body of message
127	my $Headers = $$textref;
128	$Headers =~ s/\r?\n\r?\n(.*)$//s;
129	$$textref = $1;
130	# escape [] so it isn't re-interpreted as metadata
131	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
132
133
134	# Unfold headers - see rfc822
135	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
136	# Extract basic metadata from header
137	my @headers = ("From", "To", "Subject", "Date");
138	my %raw;
139	foreach my $name (@headers) {
140	$raw{$name} = "No $name value";
141	}
142
143	# Examine each line of the headers
144	my ($line, $name, $value);
145	my @parts;
146	foreach $line (split(/\n/, $Headers)) {
147
148	# Ignore lines with no content or which begin with whitespace
149	next unless ($line =~ /:/);
150	next if ($line =~ /^\s/);
151
152	# Find out what metadata is on this line
153	@parts = split(/:/, $line);
154	$name = shift @parts;
155	# uppercase the first character according to the current locale
156	$name=~s/(.+)/\u$1/;
157	next unless $name;
158	next unless ($raw{$name});
159
160	# Find the value of that metadata
161	$value = join(":", @parts);
162	$value =~ s/^\s+//;
163	$value =~ s/\s+$//;
164
165	# Store the metadata
166	$raw{$name} = $value;
167	}
168
169	# Extract the name and e-mail address from the From metadata
170	$frommeta = $raw{"From"};
171	my $fromnamemeta;
172	my $fromaddrmeta;
173
174	$frommeta =~ s/\s*$//; # Remove trailing space, if any
175
176	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
177	$fromnamemeta=$1;
178	$fromaddrmeta=$2;
179	} elsif ($frommeta =~ m/(.+@.+)\s+$(.*)$/) {
180	$fromnamemeta=$2;
181	$fromaddrmeta=$1;
182	}
183	if (!defined($fromaddrmeta)) {
184	$fromaddrmeta=$frommeta;
185	}
186	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
187	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
188
189	if (defined($fromnamemeta)) {
190	$fromnamemeta =~ s/\"//g;
191	}
192	else {
193	$fromnamemeta = $fromaddrmeta;
194	}
195	# if name is an address
196	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
197	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
198
199	# Escape < and > in the whole From field;
200	$frommeta =~ s/</</g; $frommeta =~ s/>/>/g;
201	$raw{"From"}=$frommeta;
202
203	# Process Date information
204	if ($raw{"Date"} !~ /No Date/) {
205	$raw{"DateText"} = $raw{"Date"};
206
207	# Convert the date text to internal date format
208	$value = $raw{"Date"};
209	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
210	if ($year < 100) { $year += 1900; }
211	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
212
213	} else {
214	# We have not extracted a date
215	$raw{"DateText"} = "Unknown.";
216	$raw{"Date"} = "19000000";
217	}
218
219
220	# Add extracted metadata to document object
221	foreach my $name (keys %raw) {
222	$value = $raw{$name};
223	if ($value) {
224	$value = &text_into_html($value);
225	} else {
226	$value = "No $name field";
227	}
228	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
229	}
230
231	my $mimetype="text/plain";
232	my $mimeinfo="";
233	# Do MIME and encoding stuff
234	if ($Headers =~ /^content\-type:\s([\w\/\-]+)\s\;?\s(.+?)\s$/mi)
235	{
236	$mimetype=$1;
237	$mimetype =~ tr/[A-Z]/[a-z]/;
238	$mimeinfo=$2;
239	}
240
241	my $transfer_encoding="7bit";
242	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
243	$transfer_encoding=$1;
244	}
245	if ($mimetype ne "text/plain") {
246	$$textref=text_from_mime_message($mimetype,$mimeinfo,$$textref,
247	$outhandle);
248	} elsif ($transfer_encoding =~ /quoted\-printable/) {
249	$$textref=qp_decode($$textref);
250	} elsif ($transfer_encoding =~ /base64/) {
251	$$textref=base64_decode($$textref);
252	}
253
254
255	# Add "All headers" metadata
256	$Headers = &text_into_html($Headers);
257	$Headers = "No headers" unless ($Headers =~ /\w/);
258	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
259
260	# Add text to document object
261	if ($mimetype eq "text/plain") {
262	$$textref = &text_into_html($$textref);
263	}
264	$$textref = "No message" unless ($$textref =~ /\w/);
265	$doc_obj->add_utf8_text($cursection, $$textref);
266
267	return 1;
268	}
269
270
271	# Convert a text string into HTML.
272	#
273	# The HTML is going to be inserted into a GML file, so
274	# we have to be careful not to use symbols like ">",
275	# which ocurs frequently in email messages (and use
276	# &gt instead.
277	#
278	# This function also turns links and email addresses into hyperlinks,
279	# and replaces carriage returns with <BR> tags (and multiple carriage
280	# returns with <P> tags).
281
282
283	sub text_into_html {
284	my ($text) = @_;
285
286	# Convert problem characters into HTML symbols
287	$text =~ s/&/&/go;
288	$text =~ s/</</go;
289	$text =~ s/>/>/go;
290	$text =~ s/\"/"/go;
291
292	# convert email addresses and URIs into links
293	# don't markup email addresses for now
294	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
295
296	# assume hostnames are \.\w\d\- only, then might have a trailing '/.*'
297	# URI can't finish with a '.'
298	$text =~ s/((http\|ftp\|https):\/\/[\w\d\-]+(\.[\w\d\-]+)\/?((&\|\.)?[\w\d\?\=\-_\/~]+))/<a href=\"$1\">$1<\/a>/g;
299
300
301	# Clean up whitespace and convert \n charaters to <BR> or <P>
302	$text =~ s/ +/ /go;
303	$text =~ s/\s*$//o;
304	$text =~ s/^\s*//o;
305	$text =~ s/\n/\n<BR>/go;
306	$text =~ s/<BR>\s*<BR>/<P>/go;
307
308	return $text;
309	}
310
311
312
313
314	#Process a MIME message.
315	# the textref we are given DOES NOT include the header.
316	sub text_from_mime_message {
317	my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
318
319	# Check for multiparts - $mimeinfo will be a boundary
320	if ($mimetype =~ /multipart/) {
321	$boundary="";
322	if ($mimeinfo =~ /boundary=\"?([^"]+)\"?\s*$/im) {
323	$boundary=$1;
324	}
325	# parts start with "--$boundary"
326	# message ends with "--$boundary--"
327	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
328	# that perl might want to interpolate. Also allows spaces...
329
330	$boundary=~s/\\/\\\\/g;
331	$boundary=~s/([\?\+\.\:\/\'])/\\$1/g;
332	my @message_parts = split("\r?\n\-\-$boundary", $text);
333	# remove first "part" and last "part" (final --)
334	shift @message_parts;
335	my $last=pop @message_parts;
336	# if our boundaries are a bit dodgy and we only found 1 part...
337	if (!defined($last)) {$last="";}
338	# make sure it is only -- and whitespace
339	if ($last !~ /^\-\-\s*$/ms) {
340	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
341	}
342	foreach my $message_part (@message_parts) {
343	# remove the leading newline left from split.
344	$message_part=~s/^\r?\n//;
345	}
346	if ($mimetype eq "multipart/alternative") {
347	# check for an HTML version first, then TEXT, otherwise use first.
348	my $part_text="";
349	foreach my $message_part (@message_parts) {
350	if ($message_part =~ m@\scontent\-type:\stext/html@mis)
351	{
352	# Use the HTML version
353	$part_text=text_from_part($message_part);
354	$mimetype="text/html";
355	last;
356	}
357	}
358	if ($part_text eq "") { # try getting a text part instead
359	foreach my $message_part (@message_parts) {
360	if ($message_part =~ m@^content\-type:\s*text/plain@mis)
361	{
362	# Use the plain version
363	$part_text=text_from_part($message_part);
364	$mimetype="text/plain";
365	last;
366	}
367	}
368	}
369	if ($part_text eq "") { # use first part
370	$part_text=text_from_part(shift @message_parts);
371	}
372	if ($part_text eq "") { # we couldn't get anything!!!
373	# or it was an empty message...
374	# do nothing...
375	print $outhandle "EMAILPlug: no text - empty body?\n";
376	} else {
377	$text=$part_text;
378	}
379	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related)@) {
380	$text="";
381	foreach my $message_part (@message_parts) {
382	my $part_header=$message_part;
383	my $part_body;
384	if ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
385	$part_body=$1;
386	} else {
387	# no header... use defaults
388	$part_body=$message_part;
389	$part_header="Content-type: text/plain; charset=us-ascii";
390	}
391	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
392	my $part_content_type="";
393	my $part_content_info="";
394	if ($mimetype eq "multipart/digest") {
395	# default type - RTFRFC!!
396	$part_content_type="message/rfc822";
397	}
398	if ($part_header =~ m@^content\-type:\s([\w+/\-]+)\s\;?\s([^\s])@mi) {
399	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
400	$part_content_info=$2;
401	}
402	my $filename="";
403	if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
404	$filename=$1;
405	}
406
407	# disposition - either inline or attachment.
408	# NOT CURRENTLY USED - we display all text types instead...
409	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
410
411	# add <<attachment>> to each part except the first...
412	if ($text ne "") {
413	$text.="<p><hr><strong><<attachment>></>";
414	# add part info header
415	$text.="<br>Type: $part_content_type<br>\n";
416	if ($filename ne "") {
417	$text.="Filename: $filename\n";
418	}
419	$text.="</strong></p>\n";
420	}
421
422	if ($part_content_type =~ m@text/@)
423	{
424	my $part_text=text_from_part($message_part);
425	if ($part_content_type !~ m@text/(ht\|x)ml@) {
426	$part_text=text_into_html($part_text);
427	}
428	if ($part_text eq "") {
429	$part_text='<<empty message>>';
430	}
431	$text.=$part_text;
432	} elsif ($part_content_type =~ m@message/rfc822@) {
433	# This is a forwarded message
434	my $message_part_headers=$part_body;
435	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
436	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
437	my $message_part_body=$1;
438
439	my $rfc822_formatted_body=""; # put result in here
440	if ($message_part_headers =~
441	/^content\-type:\s([\w\/\-]+)\s\;?\s?([^\s]+)?\s$/ims)
442	{
443	# The message header uses MIME flags
444	my $message_content_type=$1;
445	my $message_content_info=$2;
446	if (!defined($message_content_info)) {
447	$message_content_info="";
448	}
449	$message_content_type =~ tr/A-Z/a-z/;
450	if ($message_content_type =~ /multipart/) {
451	$rfc822_formatted_body=
452	text_from_mime_message($message_content_type,
453	$message_content_info,
454	$message_part_body,
455	$outhandle);
456	} else {
457	$message_part_body=text_from_part($part_body);
458	$rfc822_formatted_body=text_into_html($message_part_body);
459	}
460	} else {
461	# message doesn't use MIME flags
462	$rfc822_formatted_body=text_into_html($message_part_body);
463	}
464	# Add the returned text to the output
465	# don't put all the headers...
466	$message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
467	$text.=text_into_html($message_part_headers);
468	$text.="<p>\n";
469	$text.=$rfc822_formatted_body;
470	# end of message/rfc822
471	} elsif ($part_content_type =~ /multipart/) {
472	# recurse again
473	$tmptext=text_from_mime_message($part_content_type,
474	$part_content_info,
475	$part_body,
476	$outhandle);
477	$text.=$tmptext;
478	} elsif ($text eq "") {
479	# we can't do anything with this part, but if it's the first
480	# part then make sure it is mentioned..
481
482	$text.="<p><hr><strong><<attachment>></>";
483	# add part info header
484	$text.="<br>Type: $part_content_type<br>\n";
485	if ($filename ne "") {
486	$text.="Filename: $filename\n";
487	}
488	$text.="</strong></p>\n";
489	}
490	} # foreach message part.
491	} else {
492	# we can't handle this multipart type (not mixed or alternative)
493	# the RFC also mentions "parallel".
494	}
495	} # end of multipart
496	return $text;
497	}
498
499
500
501
502
503
504	# Process a MIME part. Return "" if we can't decode it.
505	sub text_from_part {
506	my $text=shift;
507	my $part_header=$text;
508	$part_header =~ s/\r?\n\r?\n(.*)$//s;
509	$text=$1; if (!defined($text)) {$text="";}
510	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
511	$part_header =~ /content\-type:\s*([\w\/]+)/is;
512	my $type=$1; if (!defined($type)) {$type="";}
513	my $encoding="";
514	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
515	$encoding=$1; $encoding=~tr/A-Z/a-z/;
516	}
517	# Content-Transfer-Encoding is per-part
518	if ($encoding ne "") {
519	if ($encoding =~ /quoted\-printable/) {
520	$text=qp_decode($text);
521	} elsif ($encoding =~ /base64/) {
522	$text=base64_decode($text);
523	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
524	# rfc2045 also allows binary, which we ignore (for now).
525	# maybe this shouldn't go to stderr, but anyway...
526	print STDERR "EMAILPlug: unknown encoding: $encoding\n";
527	return "";
528	}
529	}
530
531	if ($type eq "text/html") {
532	# only get stuff between <body> tags, or <html> tags.
533	$text =~ s/^.?<(html\|HTML)[^>]>//s;
534	$text =~ s/<\/(html\|HTML)>.*$//s;
535
536	$text =~ s/^.?<(body\|BODY)[^>]>//s;
537	$text =~ s/<\/(body\|BODY)>.*$//s;
538	}
539	elsif ($type eq "text/xml") {
540	$text=~s/</</g;$text=~s/>/>/g;
541	$text="<pre>\n$text\n</pre>\n";
542	}
543	return $text;
544	}
545
546
547	# decode quoted-printable text
548	sub qp_decode {
549	my $text=shift;
550
551	my @lines=split('\n', $text);
552
553	# if a line ends with "=\s*", it is a soft line break, otherwise
554	# keep in any newline characters.
555	foreach my $line (@lines) {
556	if ($line =~ s/=\s*$//) {}
557	else {$line.="\n";}
558
559	if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
560	my @hexcode_segments=split('=',$line);
561	shift @hexcode_segments;
562	my @hexcodes;
563	foreach my $hexcode (@hexcode_segments) {
564	$hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
565	chomp($hexcode); # just in case...
566	my $char=chr (hex "0x$hexcode");
567	$line =~ s/=$hexcode/$char/g;
568	}
569	}
570	}
571	$text= join('', @lines);
572	return $text;
573	}
574
575	# decode base64 text. This is fairly slow (since it's interpreted perl rather
576	# than compiled XS stuff like in the ::MIME modules, but this is more portable
577	# for us at least).
578	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
579	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
580	# from each encoded byte.
581
582
583	sub base64_decode {
584	my $enc_text = shift;
585	# A=>0, B=>1, ..., '+'=>62, '/'=>63
586	# also '=' is used for padding at the end, but we remove it anyway.
587	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
588	# map each MIME char into it's value, for more efficient lookup.
589	my %index;
590	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
591	# remove all non-base64 chars. eval to get variable in transliteration...
592	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
593	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
594	my $decoded="";
595	while (length ($enc_text)>3)
596	{
597	my $fourchars=substr($enc_text,0,4,"");
598	my @chars=(split '',$fourchars);
599	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
600	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
601	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
602	}
603	# if there are any input chars left, there are either
604	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
605	my @chars=(split '',$enc_text);
606	if (length($enc_text)) {
607	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
608	}
609	if (length($enc_text)==3) {
610	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
611	}
612	return $decoded;
613	}
614
615
616	# Perl packages have to return true if they are run.
617	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 2680

Download in other formats: