Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 3143

Last change on this file since 3143 was 3143, checked in by jrm21, 22 years ago
Minor tweak for badly formatted dates. We now use a window, so anything less than 20 is assumed to be 2000 - 2020, and anything over 20 is assumed to be 1920. (I got spam dated "22 May 01" or some such). Previously we were just adding 1900, which meant 1901 in this case.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 25.8 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999-2001 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata (not Dublin Core!):
42	# $Headers All the header content
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header
46	# $FromName Name of sender (where available)
47	# $FromAddr E-mail address of sender
48	# $DateText Date: header
49	# $Date Date: header in GSDL format (eg: 19990924)
50	#
51	# $Title made up of Subject, Date and Sender (for default formatting)
52	#
53	#
54	# John McPherson - June/July 2001
55	# added (basic) MIME support and quoted-printable and base64 decodings.
56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
57	#
58	# See: * RFC 822 - ARPA Internet Text Messages
59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61	# * RFC 2047 - MIME (part 3) Message Header Extensions
62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
63	package EMAILPlug;
64
65	use SplitPlug;
66
67	use unicode;
68
69	use sorttools;
70	use util;
71
72
73	# EMAILPlug is a sub-class of SplitPlug.
74
75	sub BEGIN {
76	@ISA = ('SplitPlug');
77	}
78
79	# Create a new EMAILPlug object with which to parse a file.
80	# Accomplished by creating a new BasPlug and using bless to
81	# turn it into an EMAILPlug.
82
83	sub new {
84	my ($class) = @_;
85	my $self = new BasPlug ("EMAILPlug", @_);
86	# this might not actually be true at read-time, but after processing
87	# it should all be utf8.
88	$self->{'input_encoding'}="utf8";
89	return bless $self, $class;
90	}
91
92	sub get_default_process_exp {
93	my $self = shift (@_);
94	# mbx/email for mailbox file format, \d+ for maildir (each message is
95	# in a separate file, with a unique number for filename)
96	# mozilla and IE will save individual mbx format files with a ".eml" ext.
97	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
98	}
99
100	# This plugin splits the mbox mail files at lines starting with From<sp>
101	# It is supposed to be "\n\nFrom ", but this isn't always used.
102	sub get_default_split_exp {
103	return q^\nFrom .*\n^;
104	}
105
106
107	# do plugin specific processing of doc_obj
108	sub process {
109
110	my $self = shift (@_);
111	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
112	my $outhandle = $self->{'outhandle'};
113
114	# Check that we're dealing with a valid mail file
115	# mbox message files start with "From "
116	# maildir messages usually start with Return-Path and Delivered-To
117	# mh is very similar to maildir
118	my $startoffile=substr($$textref,0,256);
119	if (($startoffile !~ /^(From )/) &&
120	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
121	return undef;
122	}
123
124
125	print $outhandle "EMAILPlug: processing $file\n"
126	if $self->{'verbosity'} > 1;
127
128	my $cursection = $doc_obj->get_top_section();
129
130	#
131	# Parse the document's text and extract metadata
132	#
133
134	# Protect backslashes
135	$$textref =~ s@\\@\\\\@g;
136
137	# Separate header from body of message
138	my $Headers = $$textref;
139	$Headers =~ s/\r?\n\r?\n(.*)$//s;
140	$$textref = $1;
141
142	# Unfold headers - see rfc822
143	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
144	# Extract basic metadata from header
145	my @headers = ("From", "To", "Subject", "Date");
146	my %raw;
147	foreach my $name (@headers) {
148	$raw{$name} = "No $name value";
149	}
150
151	# Get a default encoding for the header - RFC says should be ascii...
152	my $default_heading_encoding="iso_8859_1";
153
154	# We don't know what character set is the user's default...
155	# We could use textcat to guess... for now we'll look at mime content-type
156	# if ($Headers =~ /([[:^ascii:]])/) {
157	# }
158	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
159	$default_header_encoding=$1;
160	$default_header_encoding =~ s@\-@_@g;
161	$default_header_encoding =~ tr/A-Z/a-z/;
162	}
163
164
165	# Examine each line of the headers
166	my ($line, $name, $value);
167	my @parts;
168	foreach $line (split(/\n/, $Headers)) {
169
170	# Ignore lines with no content or which begin with whitespace
171	next unless ($line =~ /:/);
172	next if ($line =~ /^\s/);
173
174	# Find out what metadata is on this line
175	@parts = split(/:/, $line);
176	$name = shift @parts;
177	# get fieldname in canonical form - first cap, then lower case.
178	$name =~ tr/A-Z/a-z/;
179	# uppercase the first character according to the current locale
180	$name=~s/(.+)/\u$1/;
181	next unless $name;
182	next unless ($raw{$name});
183
184	# Find the value of that metadata
185	$value = join(":", @parts);
186	$value =~ s/^\s+//;
187	$value =~ s/\s+$//;
188
189	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
190	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
191	my $original_value=$value;
192	my $encoded=$value;
193	$value="";
194	# this isn't quite right yet regarding spaces between encoded-texts
195	# (see examples, section 8. of rfc).
196	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=\s*//i) {
197	my ($charset, $encoding, $data)=($2,$3,$4);
198	my ($decoded_data);
199	$value.="$1"; # any leading chars
200	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
201	chomp $data;
202	$encoding =~ tr/BQ/bq/;
203	if ($encoding eq "q") { # quoted printable
204	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
205	$decoded_data=qp_decode($data);
206	} else { # base 64
207	$decoded_data=base64_decode($data);
208	}
209	$self->convert2unicode($charset, \$decoded_data);
210	$value .= $decoded_data;
211	} # end of while loop
212
213	# get any trailing characters
214	$self->convert2unicode($default_header_encoding, \$encoded);
215	$value.=$encoded;
216
217	if ($value =~ /^\s*$/) { # we couldn't extract anything...
218	$self->convert2unicode($default_header_encoding,
219	\$original_value);
220	$value=original_value;
221	}
222	} # end of if =?...?=
223
224	# In the absense of other charset information, assume the
225	# header is the default (usually "iso_8859_1") and convert it to unicode.
226	else {
227	$self->convert2unicode($default_header_encoding, \$value);
228	}
229
230	# Store the metadata
231	$raw{$name} = $value;
232	}
233
234	# Extract the name and e-mail address from the From metadata
235	$frommeta = $raw{"From"};
236	my $fromnamemeta;
237	my $fromaddrmeta;
238
239	$frommeta =~ s/\s*$//; # Remove trailing space, if any
240
241	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
242	$fromnamemeta=$1;
243	$fromaddrmeta=$2;
244	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
245	$fromnamemeta=$2;
246	$fromaddrmeta=$1;
247	}
248	if (!defined($fromaddrmeta)) {
249	$fromaddrmeta=$frommeta;
250	}
251	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
252	# minor attempt to prevent spam-bots from harvesting addresses...
253	$fromaddrmeta=~s/@/@/;
254	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
255
256	if (defined($fromnamemeta)) {
257	$fromnamemeta =~ s/\"//g;
258	}
259	else {
260	$fromnamemeta = $fromaddrmeta;
261	}
262	# if name is an address
263	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
264	$fromnamemeta=~s/@/&#64\;/;
265	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
266
267	$raw{"From"}=$frommeta;
268
269	# Process Date information
270	if ($raw{"Date"} !~ /No Date/) {
271	$raw{"DateText"} = $raw{"Date"};
272
273	# Convert the date text to internal date format
274	$value = $raw{"Date"};
275	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
276	# make some assumptions about the year formatting...
277	# some (old) software thinks 2001 is 101, some think 2001 is 01
278	if ($year < 20) { $year += 2000; } # assume not really 1920...
279	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
280	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
281
282	} else {
283	# We have not extracted a date
284	$raw{"DateText"} = "Unknown.";
285	$raw{"Date"} = "19000000";
286	}
287
288	# Add extracted metadata to document object
289	foreach my $name (keys %raw) {
290	$value = $raw{$name};
291	if ($value) {
292	# assume subject, etc headers have no special HTML meaning.
293	$value = &text_into_html($value);
294	# escape [] so it isn't re-interpreted as metadata
295	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
296	} else {
297	$value = "No $name field";
298	}
299	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
300	}
301
302	my $mimetype="text/plain";
303	my $mimeinfo="";
304	my $charset = $default_header_encoding;
305	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
306	# more than one parameter given to Content-type.
307	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
308	if ($Headers =~ /^content\-type:\s([\w\/\-]+)\s(\;\s.)\s*$/mi)
309	{
310	$mimetype=$1;
311	$mimetype =~ tr/[A-Z]/[a-z]/;
312
313	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
314	$mimetype = "text/plain";
315	}
316
317	$mimeinfo=$2;
318	if (!defined $mimeinfo) {
319	$mimeinfo="";
320	} else { # strip leading and trailing stuff
321	$mimeinfo =~ s/^\;\s*//;
322	$mimeinfo =~ s/\s*$//;
323	}
324	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
325	$charset = $1;
326	}
327	}
328
329	my $transfer_encoding="7bit";
330	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
331	$transfer_encoding=$1;
332	}
333	if ($mimetype eq "text/html") {
334	$$textref= $self->text_from_part("$Headers\n$$textref");
335	} elsif ($mimetype ne "text/plain") {
336	$$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
337	$outhandle);
338	} elsif ($transfer_encoding =~ /quoted\-printable/) {
339	$$textref=qp_decode($$textref);
340	$self->convert2unicode($charset, $textref);
341	} elsif ($transfer_encoding =~ /base64/) {
342	$$textref=base64_decode($$textref);
343	$self->convert2unicode($charset, $textref);
344	} else {
345	$self->convert2unicode($charset, $textref);
346	}
347
348
349	# Add "All headers" metadata
350	$Headers = &text_into_html($Headers);
351
352	$Headers = "No headers" unless ($Headers =~ /\w/);
353	$Headers =~ s/@/&#64\;/g;
354	# escape [] so it isn't re-interpreted as metadata
355	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
356	$self->convert2unicode($charset, \$Headers);
357	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
358
359
360	# Add Title metadata
361	my $Title = text_into_html($raw{'Subject'});
362	$Title .= "<br>From: " . text_into_html($raw{'From'});
363	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
364	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
365
366	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
367
368
369	# Add text to document object
370	if ($mimetype eq "text/plain") {
371	$$textref = &text_into_html($$textref);
372	}
373	$$textref = "No message" unless ($$textref =~ /\w/);
374	$doc_obj->add_utf8_text($cursection, $$textref);
375
376	return 1;
377	}
378
379
380	# Convert a text string into HTML.
381	#
382	# The HTML is going to be inserted into a GML file, so
383	# we have to be careful not to use symbols like ">",
384	# which ocurs frequently in email messages (and use
385	# &gt instead.
386	#
387	# This function also turns links and email addresses into hyperlinks,
388	# and replaces carriage returns with <BR> tags (and multiple carriage
389	# returns with <P> tags).
390
391
392	sub text_into_html {
393	my ($text) = @_;
394
395	# Convert problem characters into HTML symbols
396	$text =~ s/&/&/g;
397	$text =~ s/</</g;
398	$text =~ s/>/>/g;
399	$text =~ s/\"/"/g;
400
401	# convert email addresses and URIs into links
402	# don't markup email addresses for now
403	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
404
405	# try to munge email addresses a little bit...
406	$text =~ s/@/@/;
407	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
408	# assume URI doesn't finish with a '.'
409	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.)?[\w\?\=\-_/~]+))@<a href=\"$1\">$1<\/a>@g;
410
411
412	# Clean up whitespace and convert \n charaters to <BR> or <P>
413	$text =~ s/ +/ /g;
414	$text =~ s/\s*$//g;
415	$text =~ s/^\s*//g;
416	$text =~ s/\n/\n<br>/g;
417	$text =~ s/<br>\s*<br>/<p>/gi;
418
419	return $text;
420	}
421
422
423
424
425	#Process a MIME message.
426	# the textref we are given DOES NOT include the header.
427	sub text_from_mime_message {
428	my $self = shift(@_);
429	my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
430
431	# Check for multiparts - $mimeinfo will be a boundary
432	if ($mimetype =~ /multipart/) {
433	$boundary="";
434	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
435	$boundary=$1;
436	if ($boundary =~ m@^\"@) {
437	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
438	}
439	} else {
440	print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
441	}
442	# parts start with "--$boundary"
443	# message ends with "--$boundary--"
444	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
445	# that perl might want to interpolate. Also allows spaces...
446	$boundary=~s/\\/\\\\/g;
447	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
448	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
449	# remove first "part" and last "part" (final --)
450	shift @message_parts;
451	my $last=pop @message_parts;
452	# if our boundaries are a bit dodgy and we only found 1 part...
453	if (!defined($last)) {$last="";}
454	# make sure it is only -- and whitespace
455	if ($last !~ /^\-\-\s*$/ms) {
456	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
457	}
458	foreach my $message_part (@message_parts) {
459	# remove the leading newline left from split.
460	$message_part=~s/^\r?\n//;
461	}
462	if ($mimetype eq "multipart/alternative") {
463	# check for an HTML version first, then TEXT, otherwise use first.
464	my $part_text="";
465	foreach my $message_part (@message_parts) {
466	if ($message_part =~ m@\scontent\-type:\stext/html@mis)
467	{
468	# Use the HTML version
469	$part_text= $self->text_from_part($message_part);
470	$mimetype="text/html";
471	last;
472	}
473	}
474	if ($part_text eq "") { # try getting a text part instead
475	foreach my $message_part (@message_parts) {
476	if ($message_part =~ m@^content\-type:\s*text/plain@mis)
477	{
478	# Use the plain version
479	$part_text= $self->text_from_part($message_part);
480	if ($part_text =~/[^\s]/) {
481	$part_text="<pre>".$part_text."</pre>";
482	}
483	$mimetype="text/plain";
484	last;
485	}
486	}
487	}
488	if ($part_text eq "") { # use first part
489	$part_text= $self->text_from_part(shift @message_parts);
490	}
491	if ($part_text eq "") { # we couldn't get anything!!!
492	# or it was an empty message...
493	# do nothing...
494	print $outhandle "EMAILPlug: no text - empty body?\n";
495	} else {
496	$text=$part_text;
497	}
498	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related)@) {
499	$text="";
500	foreach my $message_part (@message_parts) {
501	my $part_header=$message_part;
502	my $part_body;
503	if ($message_part=~ /^\s*\n/) {
504	# no header... use defaults
505	$part_body=$message_part;
506	$part_header="Content-type: text/plain; charset=us-ascii";
507	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
508	$part_body=$1;
509	} else {
510	# something's gone wrong...
511	$part_header="";
512	$part_body=$message_part;
513	}
514
515	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
516	my $part_content_type="";
517	my $part_content_info="";
518	if ($mimetype eq "multipart/digest") {
519	# default type - RTFRFC!!
520	$part_content_type="message/rfc822";
521	}
522	if ($part_header =~ m@^content\-type:\s([\w+/\-]+)\s\;?\s(.?)\s*$@mi) {
523	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
524	$part_content_info=$2;
525	}
526	my $filename="";
527	if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
528	$filename=$1;
529	}
530
531	# disposition - either inline or attachment.
532	# NOT CURRENTLY USED - we display all text types instead...
533	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
534
535	# add <<attachment>> to each part except the first...
536	if ($text ne "") {
537	$text.="\n<p><hr><strong><<attachment>></>";
538	# add part info header
539	$text.="<br>Type: $part_content_type<br>\n";
540	if ($filename ne "") {
541	$text.="Filename: $filename\n";
542	}
543	$text.="</strong></p>\n";
544	}
545
546	if ($part_content_type =~ m@text/@)
547	{
548	my $part_text= $self->text_from_part($message_part);
549	if ($part_content_type !~ m@text/(ht\|x)ml@) {
550	$part_text=text_into_html($part_text);
551	}
552	if ($part_text eq "") {
553	$part_text='<<empty message>>';
554	}
555	$text.=$part_text;
556	} elsif ($part_content_type =~ m@message/rfc822@) {
557	# This is a forwarded message
558	my $message_part_headers=$part_body;
559	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
560	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
561	my $message_part_body=$1;
562
563	my $rfc822_formatted_body=""; # put result in here
564	if ($message_part_headers =~
565	/^content\-type:\s([\w\/\-]+)\s\;?\s(.?)\s*$/ims)
566	{
567	# The message header uses MIME flags
568	my $message_content_type=$1;
569	my $message_content_info=$2;
570	if (!defined($message_content_info)) {
571	$message_content_info="";
572	}
573	$message_content_type =~ tr/A-Z/a-z/;
574	if ($message_content_type =~ /multipart/) {
575	$rfc822_formatted_body=
576	$self->text_from_mime_message($message_content_type,
577	$message_content_info,
578	$message_part_body,
579	$outhandle);
580	} else {
581	$message_part_body= $self->text_from_part($part_body);
582	$rfc822_formatted_body=text_into_html($message_part_body);
583	}
584	} else {
585	# message doesn't use MIME flags
586	$rfc822_formatted_body=text_into_html($message_part_body);
587	}
588	# Add the returned text to the output
589	# don't put all the headers...
590	$message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
591	$text.=text_into_html($message_part_headers);
592	$text.="<p>\n";
593	$text.=$rfc822_formatted_body;
594	# end of message/rfc822
595	} elsif ($part_content_type =~ /multipart/) {
596	# recurse again
597
598	$tmptext= $self->text_from_mime_message($part_content_type,
599	$part_content_info,
600	$part_body,
601	$outhandle);
602	$text.=$tmptext;
603	} elsif ($text eq "") {
604	# we can't do anything with this part, but if it's the first
605	# part then make sure it is mentioned..
606
607	$text.="\n<p><hr><strong><<attachment>></>";
608	# add part info header
609	$text.="<br>Type: $part_content_type<br>\n";
610	if ($filename ne "") {
611	$text.="Filename: $filename\n";
612	}
613	$text.="</strong></p>\n";
614	}
615	} # foreach message part.
616	} else {
617	# we can't handle this multipart type (not mixed or alternative)
618	# the RFC also mentions "parallel".
619	}
620	} # end of ($mimetype !~ multipart)
621	elsif ($mimetype =~ m@message/rfc822@) {
622	my $msg_header = $text;
623	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
624	$text = $1;
625
626	if ($msg_header =~ /^content\-type:\s([\w\/\-]+)\s\;?\s(.+?)\s$/mi)
627	{
628	$mimetype=$1;
629	$mimetype =~ tr/[A-Z]/[a-z]/;
630	$mimeinfo=$2;
631	if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
632	$charset = $1;
633	}
634	my $msg_text;
635	if ($mimetype =~ m@multipart/@) {
636	$msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
637	$text, $outhandle);
638	} else {$msg_text=$text;}
639
640	my $brief_header="";
641	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1<br>";}
642	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1<br>";}
643	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1<br>";}
644	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1<br>";}
645	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1<br>";}
646	$text= "\n<b><<attached message>></b><br>";
647	$text.= "<table><tr><td width=\"5%\"> </td>\n";
648	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
649	. "</td></tr></table>";
650	}
651	} else {
652	# we don't do any processing of the content.
653	}
654
655	return $text;
656	}
657
658
659
660
661
662
663	# Process a MIME part. Return "" if we can't decode it.
664	sub text_from_part {
665	my $self = shift;
666	my $text = shift \|\| '';
667	my $part_header = $text;
668
669	# check for empty part header (leading blank line)
670	if ($text =~ /^\s*\r?\n/) {
671	$part_header="Content-type: text/plain; charset=us-ascii";
672	} else {
673	$part_header =~ s/\r?\n\r?\n(.*)$//s;
674	$text=$1; if (!defined($text)) {$text="";}
675	}
676	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
677	$part_header =~ /content\-type:\s([\w\/]+).?charset=\"?([^\;\"\s]+)\"?/is;
678	my $type=$1;
679	my $charset=$2;
680	if (!defined($type)) {$type="";}
681	if (!defined($charset)) {$charset="ascii";}
682	my $encoding="";
683	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
684	$encoding=$1; $encoding=~tr/A-Z/a-z/;
685	}
686	# Content-Transfer-Encoding is per-part
687	if ($encoding ne "") {
688	if ($encoding =~ /quoted\-printable/) {
689	$text=qp_decode($text);
690	} elsif ($encoding =~ /base64/) {
691	$text=base64_decode($text);
692	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
693	# rfc2045 also allows binary, which we ignore (for now).
694	# maybe this shouldn't go to stderr, but anyway...
695	print STDERR "EMAILPlug: unknown encoding: $encoding\n";
696	return "";
697	}
698	}
699	if ($type eq "text/html") {
700	# only get stuff between <body> tags, or <html> tags.
701	$text =~ s@^.<html[^>]>@@is;
702	$text =~ s@</html>.*$@@is;
703	$text =~ s/^.?<body[^>]>//si;
704	$text =~ s/<\/body>.*$//si;
705	}
706	elsif ($type eq "text/xml") {
707	$text=~s/</</g;$text=~s/>/>/g;
708	$text="<pre>\n$text\n</pre>\n";
709	}
710	# convert to unicode
711	$self->convert2unicode($charset, \$text);
712	return $text;
713	}
714
715
716	# decode quoted-printable text
717	sub qp_decode {
718	my $text=shift;
719
720	my @lines=split('\n', $text);
721
722	# if a line ends with "=\s*", it is a soft line break, otherwise
723	# keep in any newline characters.
724	foreach my $line (@lines) {
725	if ($line !~ s/=\s*$//) {$line.="\n";}
726
727	if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
728	my @hexcode_segments=split('=',$line);
729	shift @hexcode_segments;
730	my @hexcodes;
731	foreach my $hexcode (@hexcode_segments) {
732	$hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
733	chomp($hexcode); # just in case...
734	my $char=chr (hex "0x$hexcode");
735	$line =~ s/=$hexcode/$char/g;
736	}
737	}
738	}
739	$text= join('', @lines);
740	return $text;
741	}
742
743	# decode base64 text. This is fairly slow (since it's interpreted perl rather
744	# than compiled XS stuff like in the ::MIME modules, but this is more portable
745	# for us at least).
746	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
747	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
748	# from each encoded byte.
749
750
751	sub base64_decode {
752	my $enc_text = shift;
753	# A=>0, B=>1, ..., '+'=>62, '/'=>63
754	# also '=' is used for padding at the end, but we remove it anyway.
755	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
756	# map each MIME char into it's value, for more efficient lookup.
757	my %index;
758	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
759	# remove all non-base64 chars. eval to get variable in transliteration...
760	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
761	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
762	my $decoded="";
763	while (length ($enc_text)>3)
764	{
765	my $fourchars=substr($enc_text,0,4,"");
766	my @chars=(split '',$fourchars);
767	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
768	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
769	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
770	}
771	# if there are any input chars left, there are either
772	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
773	my @chars=(split '',$enc_text);
774	if (length($enc_text)) {
775	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
776	}
777	if (length($enc_text)==3) {
778	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
779	}
780	return $decoded;
781	}
782
783	sub convert2unicode {
784	my $self = shift(@_);
785	my ($charset, $textref) = @_;
786
787	# first get our character encoding name in the right form.
788	$charset = "iso_8859_1" unless defined $charset;
789	$charset=~tr/A-Z/a-z/;
790	$charset=~s/\-/_/g;
791	$charset=~s/gb2312/gb/;
792	# assumes EUC-KR, not ISO-2022 !?
793	$charset=~s/ks_c_5601_1987/korean/;
794
795	# It appears that we can't always trust ascii text so we'll treat it
796	# as iso-8859-1 (letting characters above 0x80 through without
797	# converting them to utf-8 will result in invalid XML documents
798	# which can't be parsed at build time).
799	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
800
801	if ($charset eq "iso_8859_1") {
802	# test if the mailer lied, and it has win1252 chars in it...
803	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
804	if ($$textref =~ m/[\x80-\x9f]/) {
805	my $outhandle = $self->{'outhandle'};
806	print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
807	print $outhandle "codepage 1252 detected.\n";
808	$charset = "windows_1252";
809	}
810	}
811	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
812	}
813
814
815	# Perl packages have to return true if they are run.
816	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: