Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 3143

Last change on this file since 3143 was 3143, checked in by jrm21, 22 years ago
Minor tweak for badly formatted dates. We now use a window, so anything less than 20 is assumed to be 2000 - 2020, and anything over 20 is assumed to be 1920. (I got spam dated "22 May 01" or some such). Previously we were just adding 1900, which meant 1901 in this case.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 25.8 KB

Rev	Line
[638]	1	###########################################################################
	2	#
	3	# EMAILPlug.pm - a plugin for parsing email files
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[2717]	9	# Copyright (C) 1999-2001 New Zealand Digital Library Project
[638]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27
[1206]	28
	29	# EMAILPlug
[638]	30	#
[1206]	31	# by Gordon Paynter ([email protected])
[638]	32	#
[1206]	33	# Email plug reads email files. These are named with a simple
[2096]	34	# number (i.e. as they appear in maildir folders) or with the
	35	# extension .mbx (for mbox mail file format)
[638]	36	#
	37	# Document text:
[1206]	38	# The document text consists of all the text
	39	# after the first blank line in the document.
[638]	40	#
[2730]	41	# Metadata (not Dublin Core!):
[1206]	42	# $Headers All the header content
[638]	43	# $Subject Subject: header
	44	# $To To: header
[2730]	45	# $From From: header
[2630]	46	# $FromName Name of sender (where available)
	47	# $FromAddr E-mail address of sender
[638]	48	# $DateText Date: header
	49	# $Date Date: header in GSDL format (eg: 19990924)
[2630]	50	#
[2918]	51	# $Title made up of Subject, Date and Sender (for default formatting)
[2630]	52	#
[2918]	53	#
[2630]	54	# John McPherson - June/July 2001
	55	# added (basic) MIME support and quoted-printable and base64 decodings.
	56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
	57	#
	58	# See: * RFC 822 - ARPA Internet Text Messages
	59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
	60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
[2730]	61	# * RFC 2047 - MIME (part 3) Message Header Extensions
[2630]	62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
[638]	63	package EMAILPlug;
	64
[1895]	65	use SplitPlug;
	66
[2730]	67	use unicode;
	68
[638]	69	use sorttools;
	70	use util;
	71
	72
[1895]	73	# EMAILPlug is a sub-class of SplitPlug.
[638]	74
# EMAILPlug inherits from SplitPlug; the inheritance list is set up at
# compile time so that method resolution works as soon as the package loads.
BEGIN {
    @ISA = qw(SplitPlug);
}
	78
	79	# Create a new EMAILPlug object with which to parse a file.
[1206]	80	# Accomplished by creating a new BasPlug and using bless to
[638]	81	# turn it into an EMAILPlug.
	82
	83	sub new {
	84	my ($class) = @_;
[1244]	85	my $self = new BasPlug ("EMAILPlug", @_);
[2730]	86	# this might not actually be true at read-time, but after processing
	87	# it should all be utf8.
	88	$self->{'input_encoding'}="utf8";
[638]	89	return bless $self, $class;
	90	}
	91
# Return the default regular expression (as a string) that selects the files
# this plugin will try to process:
#   * a path whose last component is purely numeric (maildir/MH: one message
#     per file, named with a unique number), or
#   * a file ending in .mbx, .email or .eml (mozilla and IE save individual
#     messages with a ".eml" extension).
sub get_default_process_exp {
    my $self = shift (@_);

    # Inside q@...@ a doubled backslash collapses to one, so four are needed
    # for the character class to contain both "\" and "/" as path separators.
    return q@([\\\\/]\d+|\.(mbx|email|eml))$@;
}
	99
[1895]	100	# This plugin splits the mbox mail files at lines starting with From<sp>
[3111]	101	# It is supposed to be "\n\nFrom ", but this isn't always used.
# Return the default expression (as a string) used to split an mbox file into
# individual messages: any line that starts with "From ".
sub get_default_split_exp {
    my $mbox_separator = q{\nFrom .*\n};
    return $mbox_separator;
}
	105
	106
[1244]	107	# do plugin specific processing of doc_obj
	108	sub process {
[2630]	109
[638]	110	my $self = shift (@_);
[1244]	111	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
[1424]	112	my $outhandle = $self->{'outhandle'};
	113
[1244]	114	# Check that we're dealing with a valid mail file
[3111]	115	# mbox message files start with "From "
	116	# maildir messages usually start with Return-Path and Delivered-To
	117	# mh is very similar to maildir
	118	my $startoffile=substr($$textref,0,256);
	119	if (($startoffile !~ /^(From )/) &&
	120	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
	121	return undef;
	122	}
[638]	123
[1757]	124
[1424]	125	print $outhandle "EMAILPlug: processing $file\n"
[1244]	126	if $self->{'verbosity'} > 1;
[1206]	127
[1244]	128	my $cursection = $doc_obj->get_top_section();
[638]	129
[1206]	130	#
	131	# Parse the document's text and extract metadata
	132	#
[638]	133
[2652]	134	# Protect backslashes
	135	$$textref =~ s@\\@\\\\@g;
	136
[1206]	137	# Separate header from body of message
[1244]	138	my $Headers = $$textref;
[2630]	139	$Headers =~ s/\r?\n\r?\n(.*)$//s;
	140	$$textref = $1;
[2779]	141
[2630]	142	# Unfold headers - see rfc822
	143	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
[1206]	144	# Extract basic metadata from header
	145	my @headers = ("From", "To", "Subject", "Date");
	146	my %raw;
[1658]	147	foreach my $name (@headers) {
	148	$raw{$name} = "No $name value";
	149	}
[1206]	150
[3132]	151	# Get a default encoding for the header - RFC says should be ascii...
	152	my $default_heading_encoding="iso_8859_1";
	153
	154	# We don't know what character set is the user's default...
	155	# We could use textcat to guess... for now we'll look at mime content-type
	156	# if ($Headers =~ /([[:^ascii:]])/) {
	157	# }
	158	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
	159	$default_header_encoding=$1;
	160	$default_header_encoding =~ s@\-@_@g;
	161	$default_header_encoding =~ tr/A-Z/a-z/;
	162	}
	163
	164
[1658]	165	# Examine each line of the headers
	166	my ($line, $name, $value);
	167	my @parts;
	168	foreach $line (split(/\n/, $Headers)) {
	169
	170	# Ignore lines with no content or which begin with whitespace
	171	next unless ($line =~ /:/);
	172	next if ($line =~ /^\s/);
	173
	174	# Find out what metadata is on this line
	175	@parts = split(/:/, $line);
	176	$name = shift @parts;
[3136]	177	# get fieldname in canonical form - first cap, then lower case.
[3134]	178	$name =~ tr/A-Z/a-z/;
[3136]	179	# uppercase the first character according to the current locale
[2630]	180	$name=~s/(.+)/\u$1/;
[1658]	181	next unless $name;
	182	next unless ($raw{$name});
	183
	184	# Find the value of that metadata
	185	$value = join(":", @parts);
	186	$value =~ s/^\s+//;
	187	$value =~ s/\s+$//;
	188
[2730]	189	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
[3132]	190	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
[2730]	191	my $original_value=$value;
	192	my $encoded=$value;
	193	$value="";
[2733]	194	# this isn't quite right yet regarding spaces between encoded-texts
	195	# (see examples, section 8. of rfc).
[2730]	196	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=\s*//i) {
	197	my ($charset, $encoding, $data)=($2,$3,$4);
[2847]	198	my ($decoded_data);
[2730]	199	$value.="$1"; # any leading chars
	200	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
	201	chomp $data;
	202	$encoding =~ tr/BQ/bq/;
	203	if ($encoding eq "q") { # quoted printable
[2733]	204	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
[2730]	205	$decoded_data=qp_decode($data);
	206	} else { # base 64
	207	$decoded_data=base64_decode($data);
	208	}
[2847]	209	$self->convert2unicode($charset, \$decoded_data);
	210	$value .= $decoded_data;
	211	} # end of while loop
[3132]	212
[2847]	213	# get any trailing characters
[3132]	214	$self->convert2unicode($default_header_encoding, \$encoded);
[2847]	215	$value.=$encoded;
	216
[2730]	217	if ($value =~ /^\s*$/) { # we couldn't extract anything...
[3132]	218	$self->convert2unicode($default_header_encoding,
	219	\$original_value);
[2847]	220	$value=original_value;
[2730]	221	}
[3136]	222	} # end of if =?...?=
	223
	224	# In the absense of other charset information, assume the
	225	# header is the default (usually "iso_8859_1") and convert it to unicode.
	226	else {
[3132]	227	$self->convert2unicode($default_header_encoding, \$value);
	228	}
[2730]	229
[1658]	230	# Store the metadata
[1206]	231	$raw{$name} = $value;
	232	}
	233
[2630]	234	# Extract the name and e-mail address from the From metadata
	235	$frommeta = $raw{"From"};
[2680]	236	my $fromnamemeta;
	237	my $fromaddrmeta;
	238
	239	$frommeta =~ s/\s*$//; # Remove trailing space, if any
	240
	241	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
	242	$fromnamemeta=$1;
	243	$fromaddrmeta=$2;
	244	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
	245	$fromnamemeta=$2;
	246	$fromaddrmeta=$1;
	247	}
[2630]	248	if (!defined($fromaddrmeta)) {
	249	$fromaddrmeta=$frommeta;
	250	}
[2680]	251	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
[2717]	252	# minor attempt to prevent spam-bots from harvesting addresses...
	253	$fromaddrmeta=~s/@/@/;
[2630]	254	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
	255
[2662]	256	if (defined($fromnamemeta)) {
[2630]	257	$fromnamemeta =~ s/\"//g;
	258	}
	259	else {
	260	$fromnamemeta = $fromaddrmeta;
	261	}
	262	# if name is an address
	263	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
[2717]	264	$fromnamemeta=~s/@/&#64\;/;
[2630]	265	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
	266
	267	$raw{"From"}=$frommeta;
	268
[1206]	269	# Process Date information
[1658]	270	if ($raw{"Date"} !~ /No Date/) {
[1206]	271	$raw{"DateText"} = $raw{"Date"};
[638]	272
[1206]	273	# Convert the date text to internal date format
	274	$value = $raw{"Date"};
	275	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
[3143]	276	# make some assumptions about the year formatting...
	277	# some (old) software thinks 2001 is 101, some think 2001 is 01
	278	if ($year < 20) { $year += 2000; } # assume not really 1920...
	279	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
[1206]	280	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
[638]	281
[1206]	282	} else {
	283	# We have not extracted a date
	284	$raw{"DateText"} = "Unknown.";
	285	$raw{"Date"} = "19000000";
[638]	286	}
	287
[1244]	288	# Add extracted metadata to document object
[1206]	289	foreach my $name (keys %raw) {
	290	$value = $raw{$name};
	291	if ($value) {
[2730]	292	# assume subject, etc headers have no special HTML meaning.
[1206]	293	$value = &text_into_html($value);
[2730]	294	# escape [] so it isn't re-interpreted as metadata
	295	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
[1206]	296	} else {
	297	$value = "No $name field";
	298	}
[1244]	299	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[1206]	300	}
[638]	301
[2630]	302	my $mimetype="text/plain";
	303	my $mimeinfo="";
[3132]	304	my $charset = $default_header_encoding;
[3073]	305	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
	306	# more than one parameter given to Content-type.
	307	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
	308	if ($Headers =~ /^content\-type:\s([\w\/\-]+)\s(\;\s.)\s*$/mi)
[2630]	309	{
	310	$mimetype=$1;
	311	$mimetype =~ tr/[A-Z]/[a-z]/;
[3073]	312
	313	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
	314	$mimetype = "text/plain";
	315	}
	316
[2630]	317	$mimeinfo=$2;
[3073]	318	if (!defined $mimeinfo) {
	319	$mimeinfo="";
	320	} else { # strip leading and trailing stuff
	321	$mimeinfo =~ s/^\;\s*//;
	322	$mimeinfo =~ s/\s*$//;
	323	}
	324	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
[2847]	325	$charset = $1;
	326	}
[2630]	327	}
	328
[2680]	329	my $transfer_encoding="7bit";
	330	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
	331	$transfer_encoding=$1;
	332	}
[2886]	333	if ($mimetype eq "text/html") {
	334	$$textref= $self->text_from_part("$Headers\n$$textref");
	335	} elsif ($mimetype ne "text/plain") {
[2847]	336	$$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
	337	$outhandle);
[2680]	338	} elsif ($transfer_encoding =~ /quoted\-printable/) {
	339	$$textref=qp_decode($$textref);
[2886]	340	$self->convert2unicode($charset, $textref);
[2680]	341	} elsif ($transfer_encoding =~ /base64/) {
	342	$$textref=base64_decode($$textref);
[2886]	343	$self->convert2unicode($charset, $textref);
[2847]	344	} else {
[2886]	345	$self->convert2unicode($charset, $textref);
[2680]	346	}
[2630]	347
	348
[1206]	349	# Add "All headers" metadata
	350	$Headers = &text_into_html($Headers);
[2754]	351
[1206]	352	$Headers = "No headers" unless ($Headers =~ /\w/);
[2717]	353	$Headers =~ s/@/&#64\;/g;
[2730]	354	# escape [] so it isn't re-interpreted as metadata
	355	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
[3134]	356	$self->convert2unicode($charset, \$Headers);
[1244]	357	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
[638]	358
[2918]	359
	360	# Add Title metadata
	361	my $Title = text_into_html($raw{'Subject'});
	362	$Title .= "<br>From: " . text_into_html($raw{'From'});
	363	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
[3073]	364	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
[2918]	365
	366	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
	367
	368
[1244]	369	# Add text to document object
[2630]	370	if ($mimetype eq "text/plain") {
	371	$$textref = &text_into_html($$textref);
	372	}
[1244]	373	$$textref = "No message" unless ($$textref =~ /\w/);
	374	$doc_obj->add_utf8_text($cursection, $$textref);
[638]	375
[1244]	376	return 1;
[638]	377	}
	378
	379
	380	# Convert a text string into HTML.
	381	#
	382	# The HTML is going to be inserted into a GML file, so
	383	# we have to be careful not to use symbols like ">",
	384	# which occur frequently in email messages (and use
	385	# "&gt;" instead).
	386	#
	387	# This function also turns links and email addresses into hyperlinks,
	388	# and replaces carriage returns with <BR> tags (and multiple carriage
	389	# returns with <P> tags).
	390
[1206]	391
# Convert a plain text string into HTML.
#
#   $text - the plain text (undef is treated as the empty string)
#
# Returns the text with &, <, > and " replaced by entities, every "@"
# replaced by &#64; (to hinder address harvesting), http/https/ftp URIs
# turned into links, runs of spaces collapsed, leading/trailing whitespace
# removed, and newlines marked with <br> (blank lines with <p>).
sub text_into_html {
    my ($text) = @_;
    $text = "" unless defined $text;

    # Convert problem characters into HTML symbols ("&" must go first so the
    # entities produced below are not escaped again).
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/\"/&quot;/g;

    # don't markup email addresses as links for now, but
    # try to munge them a little bit...
    $text =~ s/\@/&#64;/g;

    # convert URIs into links
    # assume hostnames are \.\w\- only, then might have a trailing '/.*'
    # assume URI doesn't finish with a '.'
    $text =~ s{((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.)?[\w\?\=\-_/~]+)*)}{<a href="$1">$1</a>}g;

    # Clean up whitespace and convert \n characters to <BR> or <P>
    $text =~ s/ +/ /g;
    $text =~ s/\s*$//g;
    $text =~ s/^\s*//g;
    $text =~ s/\n/\n<br>/g;
    $text =~ s/<br>\s*<br>/<p>/gi;

    return $text;
}
	421
	422
[2630]	423
	424
	425	#Process a MIME message.
	426	# the textref we are given DOES NOT include the header.
	427	sub text_from_mime_message {
[2847]	428	my $self = shift(@_);
[2680]	429	my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
[2630]	430
	431	# Check for multiparts - $mimeinfo will be a boundary
	432	if ($mimetype =~ /multipart/) {
	433	$boundary="";
[2732]	434	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
[2630]	435	$boundary=$1;
[2732]	436	if ($boundary =~ m@^\"@) {
	437	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
	438	}
	439	} else {
	440	print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
[2630]	441	}
	442	# parts start with "--$boundary"
	443	# message ends with "--$boundary--"
	444	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
[2680]	445	# that perl might want to interpolate. Also allows spaces...
[2630]	446	$boundary=~s/\\/\\\\/g;
	447	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
[2681]	448	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
[2630]	449	# remove first "part" and last "part" (final --)
	450	shift @message_parts;
	451	my $last=pop @message_parts;
[2680]	452	# if our boundaries are a bit dodgy and we only found 1 part...
	453	if (!defined($last)) {$last="";}
[2630]	454	# make sure it is only -- and whitespace
	455	if ($last !~ /^\-\-\s*$/ms) {
	456	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
	457	}
	458	foreach my $message_part (@message_parts) {
	459	# remove the leading newline left from split.
	460	$message_part=~s/^\r?\n//;
	461	}
	462	if ($mimetype eq "multipart/alternative") {
	463	# check for an HTML version first, then TEXT, otherwise use first.
	464	my $part_text="";
	465	foreach my $message_part (@message_parts) {
	466	if ($message_part =~ m@\scontent\-type:\stext/html@mis)
	467	{
	468	# Use the HTML version
[2847]	469	$part_text= $self->text_from_part($message_part);
[2630]	470	$mimetype="text/html";
	471	last;
	472	}
	473	}
	474	if ($part_text eq "") { # try getting a text part instead
	475	foreach my $message_part (@message_parts) {
	476	if ($message_part =~ m@^content\-type:\s*text/plain@mis)
	477	{
	478	# Use the plain version
[2847]	479	$part_text= $self->text_from_part($message_part);
[2732]	480	if ($part_text =~/[^\s]/) {
	481	$part_text="<pre>".$part_text."</pre>";
	482	}
[2630]	483	$mimetype="text/plain";
	484	last;
	485	}
	486	}
	487	}
	488	if ($part_text eq "") { # use first part
[2847]	489	$part_text= $self->text_from_part(shift @message_parts);
[2630]	490	}
	491	if ($part_text eq "") { # we couldn't get anything!!!
	492	# or it was an empty message...
	493	# do nothing...
	494	print $outhandle "EMAILPlug: no text - empty body?\n";
	495	} else {
	496	$text=$part_text;
	497	}
[2652]	498	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related)@) {
[2630]	499	$text="";
	500	foreach my $message_part (@message_parts) {
	501	my $part_header=$message_part;
[2680]	502	my $part_body;
[2681]	503	if ($message_part=~ /^\s*\n/) {
	504	# no header... use defaults
	505	$part_body=$message_part;
	506	$part_header="Content-type: text/plain; charset=us-ascii";
	507	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
[2680]	508	$part_body=$1;
	509	} else {
[2681]	510	# something's gone wrong...
	511	$part_header="";
[2680]	512	$part_body=$message_part;
	513	}
[2681]	514
[2630]	515	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
	516	my $part_content_type="";
	517	my $part_content_info="";
	518	if ($mimetype eq "multipart/digest") {
	519	# default type - RTFRFC!!
	520	$part_content_type="message/rfc822";
	521	}
[2781]	522	if ($part_header =~ m@^content\-type:\s([\w+/\-]+)\s\;?\s(.?)\s*$@mi) {
[2630]	523	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
	524	$part_content_info=$2;
	525	}
	526	my $filename="";
	527	if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
	528	$filename=$1;
	529	}
	530
	531	# disposition - either inline or attachment.
	532	# NOT CURRENTLY USED - we display all text types instead...
	533	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
	534
	535	# add <<attachment>> to each part except the first...
	536	if ($text ne "") {
[2730]	537	$text.="\n<p><hr><strong><<attachment>></>";
[2630]	538	# add part info header
	539	$text.="<br>Type: $part_content_type<br>\n";
	540	if ($filename ne "") {
	541	$text.="Filename: $filename\n";
	542	}
	543	$text.="</strong></p>\n";
	544	}
	545
	546	if ($part_content_type =~ m@text/@)
	547	{
[2847]	548	my $part_text= $self->text_from_part($message_part);
[2630]	549	if ($part_content_type !~ m@text/(ht\|x)ml@) {
	550	$part_text=text_into_html($part_text);
	551	}
	552	if ($part_text eq "") {
	553	$part_text='<<empty message>>';
	554	}
	555	$text.=$part_text;
	556	} elsif ($part_content_type =~ m@message/rfc822@) {
	557	# This is a forwarded message
	558	my $message_part_headers=$part_body;
	559	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
	560	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
	561	my $message_part_body=$1;
	562
	563	my $rfc822_formatted_body=""; # put result in here
	564	if ($message_part_headers =~
[2779]	565	/^content\-type:\s([\w\/\-]+)\s\;?\s(.?)\s*$/ims)
[2630]	566	{
	567	# The message header uses MIME flags
	568	my $message_content_type=$1;
	569	my $message_content_info=$2;
	570	if (!defined($message_content_info)) {
	571	$message_content_info="";
	572	}
	573	$message_content_type =~ tr/A-Z/a-z/;
	574	if ($message_content_type =~ /multipart/) {
	575	$rfc822_formatted_body=
[2847]	576	$self->text_from_mime_message($message_content_type,
	577	$message_content_info,
	578	$message_part_body,
	579	$outhandle);
[2630]	580	} else {
[2847]	581	$message_part_body= $self->text_from_part($part_body);
[2630]	582	$rfc822_formatted_body=text_into_html($message_part_body);
	583	}
	584	} else {
	585	# message doesn't use MIME flags
	586	$rfc822_formatted_body=text_into_html($message_part_body);
	587	}
	588	# Add the returned text to the output
	589	# don't put all the headers...
	590	$message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
	591	$text.=text_into_html($message_part_headers);
	592	$text.="<p>\n";
	593	$text.=$rfc822_formatted_body;
	594	# end of message/rfc822
	595	} elsif ($part_content_type =~ /multipart/) {
	596	# recurse again
[2781]	597
[2847]	598	$tmptext= $self->text_from_mime_message($part_content_type,
	599	$part_content_info,
	600	$part_body,
	601	$outhandle);
[2630]	602	$text.=$tmptext;
	603	} elsif ($text eq "") {
	604	# we can't do anything with this part, but if it's the first
	605	# part then make sure it is mentioned..
	606
[2730]	607	$text.="\n<p><hr><strong><<attachment>></>";
[2630]	608	# add part info header
	609	$text.="<br>Type: $part_content_type<br>\n";
	610	if ($filename ne "") {
	611	$text.="Filename: $filename\n";
	612	}
	613	$text.="</strong></p>\n";
	614	}
	615	} # foreach message part.
	616	} else {
	617	# we can't handle this multipart type (not mixed or alternative)
	618	# the RFC also mentions "parallel".
	619	}
[2886]	620	} # end of ($mimetype !~ multipart)
[2918]	621	elsif ($mimetype =~ m@message/rfc822@) {
	622	my $msg_header = $text;
	623	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
	624	$text = $1;
	625
	626	if ($msg_header =~ /^content\-type:\s([\w\/\-]+)\s\;?\s(.+?)\s$/mi)
	627	{
	628	$mimetype=$1;
	629	$mimetype =~ tr/[A-Z]/[a-z]/;
	630	$mimeinfo=$2;
	631	if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
	632	$charset = $1;
	633	}
	634	my $msg_text;
	635	if ($mimetype =~ m@multipart/@) {
	636	$msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
	637	$text, $outhandle);
	638	} else {$msg_text=$text;}
	639
	640	my $brief_header="";
	641	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1<br>";}
	642	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1<br>";}
	643	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1<br>";}
	644	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1<br>";}
	645	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1<br>";}
	646	$text= "\n<b><<attached message>></b><br>";
	647	$text.= "<table><tr><td width=\"5%\"> </td>\n";
	648	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
	649	. "</td></tr></table>";
	650	}
	651	} else {
[2886]	652	# we don't do any processing of the content.
	653	}
	654
[2630]	655	return $text;
	656	}
	657
	658
	659
	660
	661
	662
	663	# Process a MIME part. Return "" if we can't decode it.
	664	sub text_from_part {
[3132]	665	my $self = shift;
[3136]	666	my $text = shift \|\| '';
	667	my $part_header = $text;
	668
[2681]	669	# check for empty part header (leading blank line)
	670	if ($text =~ /^\s*\r?\n/) {
	671	$part_header="Content-type: text/plain; charset=us-ascii";
	672	} else {
	673	$part_header =~ s/\r?\n\r?\n(.*)$//s;
	674	$text=$1; if (!defined($text)) {$text="";}
	675	}
[2630]	676	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
[2730]	677	$part_header =~ /content\-type:\s([\w\/]+).?charset=\"?([^\;\"\s]+)\"?/is;
	678	my $type=$1;
	679	my $charset=$2;
	680	if (!defined($type)) {$type="";}
	681	if (!defined($charset)) {$charset="ascii";}
[2630]	682	my $encoding="";
[2638]	683	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
[2630]	684	$encoding=$1; $encoding=~tr/A-Z/a-z/;
	685	}
	686	# Content-Transfer-Encoding is per-part
	687	if ($encoding ne "") {
	688	if ($encoding =~ /quoted\-printable/) {
	689	$text=qp_decode($text);
	690	} elsif ($encoding =~ /base64/) {
	691	$text=base64_decode($text);
	692	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
	693	# rfc2045 also allows binary, which we ignore (for now).
	694	# maybe this shouldn't go to stderr, but anyway...
	695	print STDERR "EMAILPlug: unknown encoding: $encoding\n";
	696	return "";
	697	}
	698	}
	699	if ($type eq "text/html") {
	700	# only get stuff between <body> tags, or <html> tags.
[2730]	701	$text =~ s@^.<html[^>]>@@is;
	702	$text =~ s@</html>.*$@@is;
	703	$text =~ s/^.?<body[^>]>//si;
	704	$text =~ s/<\/body>.*$//si;
[2630]	705	}
	706	elsif ($type eq "text/xml") {
	707	$text=~s/</</g;$text=~s/>/>/g;
	708	$text="<pre>\n$text\n</pre>\n";
	709	}
[2730]	710	# convert to unicode
[2847]	711	$self->convert2unicode($charset, \$text);
[2630]	712	return $text;
	713	}
	714
	715
	716	# decode quoted-printable text
	717	sub qp_decode {
	718	my $text=shift;
	719
	720	my @lines=split('\n', $text);
	721
	722	# if a line ends with "=\s*", it is a soft line break, otherwise
	723	# keep in any newline characters.
	724	foreach my $line (@lines) {
[2886]	725	if ($line !~ s/=\s*$//) {$line.="\n";}
[2630]	726
	727	if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
	728	my @hexcode_segments=split('=',$line);
	729	shift @hexcode_segments;
	730	my @hexcodes;
	731	foreach my $hexcode (@hexcode_segments) {
	732	$hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
	733	chomp($hexcode); # just in case...
	734	my $char=chr (hex "0x$hexcode");
	735	$line =~ s/=$hexcode/$char/g;
	736	}
	737	}
	738	}
	739	$text= join('', @lines);
	740	return $text;
	741	}
	742
	743	# decode base64 text. This is fairly slow (since it's interpreted perl rather
	744	# than compiled XS stuff like in the ::MIME modules, but this is more portable
	745	# for us at least).
	746	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
	747	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
[2638]	748	# from each encoded byte.
[2630]	749
	750
	751	sub base64_decode {
	752	my $enc_text = shift;
	753	# A=>0, B=>1, ..., '+'=>62, '/'=>63
	754	# also '=' is used for padding at the end, but we remove it anyway.
	755	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	756	# map each MIME char into it's value, for more efficient lookup.
	757	my %index;
	758	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
	759	# remove all non-base64 chars. eval to get variable in transliteration...
	760	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
	761	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
	762	my $decoded="";
	763	while (length ($enc_text)>3)
	764	{
	765	my $fourchars=substr($enc_text,0,4,"");
	766	my @chars=(split '',$fourchars);
	767	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
	768	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	769	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
	770	}
	771	# if there are any input chars left, there are either
	772	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
	773	my @chars=(split '',$enc_text);
	774	if (length($enc_text)) {
	775	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
	776	}
	777	if (length($enc_text)==3) {
	778	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	779	}
	780	return $decoded;
	781	}
	782
# Convert text from the given character set to utf8, in place.
#
#   $charset - character set name as found in the mail (any case, "-" or
#              "_"); undef or "" means iso_8859_1
#   $textref - reference to the text, which is replaced by its utf8 form
#
# Returns the converted text.
sub convert2unicode {
    my $self = shift(@_);
    my ($charset, $textref) = @_;

    # first get our character encoding name in the right form.
    $charset = "iso_8859_1" unless (defined $charset && $charset ne "");
    $charset =~ tr/A-Z/a-z/;
    $charset =~ s/\-/_/g;
    $charset =~ s/gb2312/gb/;
    # assumes EUC-KR, not ISO-2022 !?
    $charset =~ s/ks_c_5601_1987/korean/;

    # It appears that we can't always trust ascii text so we'll treat it
    # as iso-8859-1 (letting characters above 0x80 through without
    # converting them to utf-8 will result in invalid XML documents
    # which can't be parsed at build time).
    if ($charset eq "us_ascii" || $charset eq "ascii") {
        $charset = "iso_8859_1";
    }

    if ($charset eq "iso_8859_1") {
        # test if the mailer lied, and it has win1252 chars in it...
        # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
        if ($$textref =~ m/[\x80-\x9f]/) {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
            print $outhandle "codepage 1252 detected.\n";
            $charset = "windows_1252";
        }
    }

    $$textref = unicode::unicode2utf8(unicode::convert2unicode($charset, $textref));
    return $$textref;
}
	813
	814
[1206]	815	# Perl packages have to return true if they are run.
	816	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: