Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 3111

Last change on this file since 3111 was 3111, checked in by jrm21, 22 years ago

Allow .eml extension (IE and mozilla default to this for individual files).

Tidy up the checks for testing for valid mail file format.

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 25.0 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999-2001 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata (not Dublin Core!):
42	# $Headers All the header content
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header
46	# $FromName Name of sender (where available)
47	# $FromAddr E-mail address of sender
48	# $DateText Date: header
49	# $Date Date: header in GSDL format (eg: 19990924)
50	#
51	# $Title made up of Subject, Date and Sender (for default formatting)
52	#
53	#
54	# John McPherson - June/July 2001
55	# added (basic) MIME support and quoted-printable and base64 decodings.
56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
57	#
58	# See: * RFC 822 - ARPA Internet Text Messages
59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61	# * RFC 2047 - MIME (part 3) Message Header Extensions
62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
63	package EMAILPlug;
64
65	use SplitPlug;
66
67	use unicode;
68
69	use sorttools;
70	use util;
71
72
73	# EMAILPlug is a sub-class of SplitPlug.
74
75	sub BEGIN {
76	@ISA = ('SplitPlug');
77	}
78
79	# Create a new EMAILPlug object with which to parse a file.
80	# Accomplished by creating a new BasPlug and using bless to
81	# turn it into an EMAILPlug.
82
83	sub new {
84	my ($class) = @_;
85	my $self = new BasPlug ("EMAILPlug", @_);
86	# this might not actually be true at read-time, but after processing
87	# it should all be utf8.
88	$self->{'input_encoding'}="utf8";
89	return bless $self, $class;
90	}
91
92	sub get_default_process_exp {
93	my $self = shift (@_);
94	# mbx/email for mailbox file format, \d+ for maildir (each message is
95	# in a separate file, with a unique number for filename)
96	# mozilla and IE will save individual mbx format files with a ".eml" ext.
97	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
98	}
99
100	# This plugin splits the mbox mail files at lines starting with From<sp>
101	# It is supposed to be "\n\nFrom ", but this isn't always used.
102	sub get_default_split_exp {
103	return q^\nFrom .*\n^;
104	}
105
106
107	# do plugin specific processing of doc_obj
108	sub process {
109
110	my $self = shift (@_);
111	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
112	my $outhandle = $self->{'outhandle'};
113
114	# Check that we're dealing with a valid mail file
115	# mbox message files start with "From "
116	# maildir messages usually start with Return-Path and Delivered-To
117	# mh is very similar to maildir
118	my $startoffile=substr($$textref,0,256);
119	if (($startoffile !~ /^(From )/) &&
120	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
121	return undef;
122	}
123
124
125	print $outhandle "EMAILPlug: processing $file\n"
126	if $self->{'verbosity'} > 1;
127
128	my $cursection = $doc_obj->get_top_section();
129
130	#
131	# Parse the document's text and extract metadata
132	#
133
134	# Protect backslashes
135	$$textref =~ s@\\@\\\\@g;
136
137	# Separate header from body of message
138	my $Headers = $$textref;
139	$Headers =~ s/\r?\n\r?\n(.*)$//s;
140	$$textref = $1;
141
142	# See if headers include non-ascii - RFC says whole header should be ascii.
143	# not yet implemented, as we don't know what character set is the
144	# user's default... We can do textcat to guess, or we can just choose
145	# one of the charset fields later in the document (if there are any...)
146	# if ($Headers =~ /([[:^ascii:]])/) {
147	# }
148
149	# Unfold headers - see rfc822
150	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
151	# Extract basic metadata from header
152	my @headers = ("From", "To", "Subject", "Date");
153	my %raw;
154	foreach my $name (@headers) {
155	$raw{$name} = "No $name value";
156	}
157
158	# Examine each line of the headers
159	my ($line, $name, $value);
160	my @parts;
161	foreach $line (split(/\n/, $Headers)) {
162
163	# Ignore lines with no content or which begin with whitespace
164	next unless ($line =~ /:/);
165	next if ($line =~ /^\s/);
166
167	# Find out what metadata is on this line
168	@parts = split(/:/, $line);
169	$name = shift @parts;
170	# uppercase the first character according to the current locale
171	$name=~s/(.+)/\u$1/;
172	next unless $name;
173	next unless ($raw{$name});
174
175	# Find the value of that metadata
176	$value = join(":", @parts);
177	$value =~ s/^\s+//;
178	$value =~ s/\s+$//;
179
180	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
181	if ($value =~ /=\?/) {
182	my $original_value=$value;
183	my $encoded=$value;
184	$value="";
185	# this isn't quite right yet regarding spaces between encoded-texts
186	# (see examples, section 8. of rfc).
187	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=\s*//i) {
188	my ($charset, $encoding, $data)=($2,$3,$4);
189	my ($decoded_data);
190	$value.="$1"; # any leading chars
191	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
192	chomp $data;
193	$encoding =~ tr/BQ/bq/;
194	if ($encoding eq "q") { # quoted printable
195	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
196	$decoded_data=qp_decode($data);
197	} else { # base 64
198	$decoded_data=base64_decode($data);
199	}
200	$self->convert2unicode($charset, \$decoded_data);
201	$value .= $decoded_data;
202	} # end of while loop
203
204	# get any trailing characters
205	$self->convert2unicode("iso_8859_1", \$encoded);
206	$value.=$encoded;
207
208	if ($value =~ /^\s*$/) { # we couldn't extract anything...
209	$self->convert2unicode("iso_8859_1", \$original_value);
210	$value=original_value;
211	}
212	} # end of if =?...?=
213
214	# Store the metadata
215	$raw{$name} = $value;
216	}
217
218	# Extract the name and e-mail address from the From metadata
219	$frommeta = $raw{"From"};
220	my $fromnamemeta;
221	my $fromaddrmeta;
222
223	$frommeta =~ s/\s*$//; # Remove trailing space, if any
224
225	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
226	$fromnamemeta=$1;
227	$fromaddrmeta=$2;
228	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
229	$fromnamemeta=$2;
230	$fromaddrmeta=$1;
231	}
232	if (!defined($fromaddrmeta)) {
233	$fromaddrmeta=$frommeta;
234	}
235	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
236	# minor attempt to prevent spam-bots from harvesting addresses...
237	$fromaddrmeta=~s/@/@/;
238	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
239
240	if (defined($fromnamemeta)) {
241	$fromnamemeta =~ s/\"//g;
242	}
243	else {
244	$fromnamemeta = $fromaddrmeta;
245	}
246	# if name is an address
247	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
248	$fromnamemeta=~s/@/&#64\;/;
249	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
250
251	$raw{"From"}=$frommeta;
252
253	# Process Date information
254	if ($raw{"Date"} !~ /No Date/) {
255	$raw{"DateText"} = $raw{"Date"};
256
257	# Convert the date text to internal date format
258	$value = $raw{"Date"};
259	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
260	if ($year < 100) { $year += 1900; }
261	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
262
263	} else {
264	# We have not extracted a date
265	$raw{"DateText"} = "Unknown.";
266	$raw{"Date"} = "19000000";
267	}
268
269	# Add extracted metadata to document object
270	foreach my $name (keys %raw) {
271	$value = $raw{$name};
272	if ($value) {
273	# assume subject, etc headers have no special HTML meaning.
274	$value = &text_into_html($value);
275	# escape [] so it isn't re-interpreted as metadata
276	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
277	} else {
278	$value = "No $name field";
279	}
280	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
281	}
282
283	my $mimetype="text/plain";
284	my $mimeinfo="";
285	my $charset = "iso_8859_1";
286	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
287	# more than one parameter given to Content-type.
288	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
289	if ($Headers =~ /^content\-type:\s([\w\/\-]+)\s(\;\s.)\s*$/mi)
290	{
291	$mimetype=$1;
292	$mimetype =~ tr/[A-Z]/[a-z]/;
293
294	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
295	$mimetype = "text/plain";
296	}
297
298	$mimeinfo=$2;
299	if (!defined $mimeinfo) {
300	$mimeinfo="";
301	} else { # strip leading and trailing stuff
302	$mimeinfo =~ s/^\;\s*//;
303	$mimeinfo =~ s/\s*$//;
304	}
305	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
306	$charset = $1;
307	}
308	}
309
310	my $transfer_encoding="7bit";
311	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
312	$transfer_encoding=$1;
313	}
314	if ($mimetype eq "text/html") {
315	$$textref= $self->text_from_part("$Headers\n$$textref");
316	} elsif ($mimetype ne "text/plain") {
317	$$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
318	$outhandle);
319	} elsif ($transfer_encoding =~ /quoted\-printable/) {
320	$$textref=qp_decode($$textref);
321	$self->convert2unicode($charset, $textref);
322	} elsif ($transfer_encoding =~ /base64/) {
323	$$textref=base64_decode($$textref);
324	$self->convert2unicode($charset, $textref);
325	} else {
326	$self->convert2unicode($charset, $textref);
327	}
328
329
330	# Add "All headers" metadata
331	$Headers = &text_into_html($Headers);
332
333	$Headers = "No headers" unless ($Headers =~ /\w/);
334	$Headers =~ s/@/&#64\;/g;
335	# escape [] so it isn't re-interpreted as metadata
336	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
337
338	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
339
340
341	# Add Title metadata
342	my $Title = text_into_html($raw{'Subject'});
343	$Title .= "<br>From: " . text_into_html($raw{'From'});
344	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
345	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
346
347	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
348
349
350	# Add text to document object
351	if ($mimetype eq "text/plain") {
352	$$textref = &text_into_html($$textref);
353	}
354	$$textref = "No message" unless ($$textref =~ /\w/);
355	$doc_obj->add_utf8_text($cursection, $$textref);
356
357	return 1;
358	}
359
360
361	# Convert a text string into HTML.
362	#
363	# The HTML is going to be inserted into a GML file, so
364	# we have to be careful not to use symbols like ">",
365	# which ocurs frequently in email messages (and use
366	# &gt instead.
367	#
368	# This function also turns links and email addresses into hyperlinks,
369	# and replaces carriage returns with <BR> tags (and multiple carriage
370	# returns with <P> tags).
371
372
373	sub text_into_html {
374	my ($text) = @_;
375
376	# Convert problem characters into HTML symbols
377	$text =~ s/&/&/go;
378	$text =~ s/</</go;
379	$text =~ s/>/>/go;
380	$text =~ s/\"/"/go;
381
382	# convert email addresses and URIs into links
383	# don't markup email addresses for now
384	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
385
386	# try to munge email addresses a little bit...
387	$text =~ s/@/@/;
388	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
389	# assume URI doesn't finish with a '.'
390	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.)?[\w\?\=\-_/~]+))@<a href=\"$1\">$1<\/a>@g;
391
392
393	# Clean up whitespace and convert \n charaters to <BR> or <P>
394	$text =~ s/ +/ /go;
395	$text =~ s/\s*$//o;
396	$text =~ s/^\s*//o;
397	$text =~ s/\n/\n<BR>/go;
398	$text =~ s/<BR>\s*<BR>/<P>/go;
399
400	return $text;
401	}
402
403
404
405
406	#Process a MIME message.
407	# the textref we are given DOES NOT include the header.
408	sub text_from_mime_message {
409	my $self = shift(@_);
410	my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
411
412	# Check for multiparts - $mimeinfo will be a boundary
413	if ($mimetype =~ /multipart/) {
414	$boundary="";
415	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
416	$boundary=$1;
417	if ($boundary =~ m@^\"@) {
418	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
419	}
420	} else {
421	print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
422	}
423	# parts start with "--$boundary"
424	# message ends with "--$boundary--"
425	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
426	# that perl might want to interpolate. Also allows spaces...
427	$boundary=~s/\\/\\\\/g;
428	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
429	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
430	# remove first "part" and last "part" (final --)
431	shift @message_parts;
432	my $last=pop @message_parts;
433	# if our boundaries are a bit dodgy and we only found 1 part...
434	if (!defined($last)) {$last="";}
435	# make sure it is only -- and whitespace
436	if ($last !~ /^\-\-\s*$/ms) {
437	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
438	}
439	foreach my $message_part (@message_parts) {
440	# remove the leading newline left from split.
441	$message_part=~s/^\r?\n//;
442	}
443	if ($mimetype eq "multipart/alternative") {
444	# check for an HTML version first, then TEXT, otherwise use first.
445	my $part_text="";
446	foreach my $message_part (@message_parts) {
447	if ($message_part =~ m@\scontent\-type:\stext/html@mis)
448	{
449	# Use the HTML version
450	$part_text= $self->text_from_part($message_part);
451	$mimetype="text/html";
452	last;
453	}
454	}
455	if ($part_text eq "") { # try getting a text part instead
456	foreach my $message_part (@message_parts) {
457	if ($message_part =~ m@^content\-type:\s*text/plain@mis)
458	{
459	# Use the plain version
460	$part_text= $self->text_from_part($message_part);
461	if ($part_text =~/[^\s]/) {
462	$part_text="<pre>".$part_text."</pre>";
463	}
464	$mimetype="text/plain";
465	last;
466	}
467	}
468	}
469	if ($part_text eq "") { # use first part
470	$part_text= $self->text_from_part(shift @message_parts);
471	}
472	if ($part_text eq "") { # we couldn't get anything!!!
473	# or it was an empty message...
474	# do nothing...
475	print $outhandle "EMAILPlug: no text - empty body?\n";
476	} else {
477	$text=$part_text;
478	}
479	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related)@) {
480	$text="";
481	foreach my $message_part (@message_parts) {
482	my $part_header=$message_part;
483	my $part_body;
484	if ($message_part=~ /^\s*\n/) {
485	# no header... use defaults
486	$part_body=$message_part;
487	$part_header="Content-type: text/plain; charset=us-ascii";
488	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
489	$part_body=$1;
490	} else {
491	# something's gone wrong...
492	$part_header="";
493	$part_body=$message_part;
494	}
495
496	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
497	my $part_content_type="";
498	my $part_content_info="";
499	if ($mimetype eq "multipart/digest") {
500	# default type - RTFRFC!!
501	$part_content_type="message/rfc822";
502	}
503	if ($part_header =~ m@^content\-type:\s([\w+/\-]+)\s\;?\s(.?)\s*$@mi) {
504	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
505	$part_content_info=$2;
506	}
507	my $filename="";
508	if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
509	$filename=$1;
510	}
511
512	# disposition - either inline or attachment.
513	# NOT CURRENTLY USED - we display all text types instead...
514	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
515
516	# add <<attachment>> to each part except the first...
517	if ($text ne "") {
518	$text.="\n<p><hr><strong><<attachment>></>";
519	# add part info header
520	$text.="<br>Type: $part_content_type<br>\n";
521	if ($filename ne "") {
522	$text.="Filename: $filename\n";
523	}
524	$text.="</strong></p>\n";
525	}
526
527	if ($part_content_type =~ m@text/@)
528	{
529	my $part_text= $self->text_from_part($message_part);
530	if ($part_content_type !~ m@text/(ht\|x)ml@) {
531	$part_text=text_into_html($part_text);
532	}
533	if ($part_text eq "") {
534	$part_text='<<empty message>>';
535	}
536	$text.=$part_text;
537	} elsif ($part_content_type =~ m@message/rfc822@) {
538	# This is a forwarded message
539	my $message_part_headers=$part_body;
540	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
541	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
542	my $message_part_body=$1;
543
544	my $rfc822_formatted_body=""; # put result in here
545	if ($message_part_headers =~
546	/^content\-type:\s([\w\/\-]+)\s\;?\s(.?)\s*$/ims)
547	{
548	# The message header uses MIME flags
549	my $message_content_type=$1;
550	my $message_content_info=$2;
551	if (!defined($message_content_info)) {
552	$message_content_info="";
553	}
554	$message_content_type =~ tr/A-Z/a-z/;
555	if ($message_content_type =~ /multipart/) {
556	$rfc822_formatted_body=
557	$self->text_from_mime_message($message_content_type,
558	$message_content_info,
559	$message_part_body,
560	$outhandle);
561	} else {
562	$message_part_body= $self->text_from_part($part_body);
563	$rfc822_formatted_body=text_into_html($message_part_body);
564	}
565	} else {
566	# message doesn't use MIME flags
567	$rfc822_formatted_body=text_into_html($message_part_body);
568	}
569	# Add the returned text to the output
570	# don't put all the headers...
571	$message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
572	$text.=text_into_html($message_part_headers);
573	$text.="<p>\n";
574	$text.=$rfc822_formatted_body;
575	# end of message/rfc822
576	} elsif ($part_content_type =~ /multipart/) {
577	# recurse again
578
579	$tmptext= $self->text_from_mime_message($part_content_type,
580	$part_content_info,
581	$part_body,
582	$outhandle);
583	$text.=$tmptext;
584	} elsif ($text eq "") {
585	# we can't do anything with this part, but if it's the first
586	# part then make sure it is mentioned..
587
588	$text.="\n<p><hr><strong><<attachment>></>";
589	# add part info header
590	$text.="<br>Type: $part_content_type<br>\n";
591	if ($filename ne "") {
592	$text.="Filename: $filename\n";
593	}
594	$text.="</strong></p>\n";
595	}
596	} # foreach message part.
597	} else {
598	# we can't handle this multipart type (not mixed or alternative)
599	# the RFC also mentions "parallel".
600	}
601	} # end of ($mimetype !~ multipart)
602	elsif ($mimetype =~ m@message/rfc822@) {
603	my $msg_header = $text;
604	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
605	$text = $1;
606
607	if ($msg_header =~ /^content\-type:\s([\w\/\-]+)\s\;?\s(.+?)\s$/mi)
608	{
609	$mimetype=$1;
610	$mimetype =~ tr/[A-Z]/[a-z]/;
611	$mimeinfo=$2;
612	if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
613	$charset = $1;
614	}
615	my $msg_text;
616	if ($mimetype =~ m@multipart/@) {
617	$msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
618	$text, $outhandle);
619	} else {$msg_text=$text;}
620
621	my $brief_header="";
622	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1<br>";}
623	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1<br>";}
624	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1<br>";}
625	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1<br>";}
626	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1<br>";}
627	$text= "\n<b><<attached message>></b><br>";
628	$text.= "<table><tr><td width=\"5%\"> </td>\n";
629	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
630	. "</td></tr></table>";
631	}
632	} else {
633	# we don't do any processing of the content.
634	}
635
636	return $text;
637	}
638
639
640
641
642
643
644	# Process a MIME part. Return "" if we can't decode it.
645	sub text_from_part {
646	my $self = shift(@_);
647	my $text=shift;
648	my $part_header=$text;
649	# check for empty part header (leading blank line)
650	if ($text =~ /^\s*\r?\n/) {
651	$part_header="Content-type: text/plain; charset=us-ascii";
652	} else {
653	$part_header =~ s/\r?\n\r?\n(.*)$//s;
654	$text=$1; if (!defined($text)) {$text="";}
655	}
656	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
657	$part_header =~ /content\-type:\s([\w\/]+).?charset=\"?([^\;\"\s]+)\"?/is;
658	my $type=$1;
659	my $charset=$2;
660	if (!defined($type)) {$type="";}
661	if (!defined($charset)) {$charset="ascii";}
662	my $encoding="";
663	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
664	$encoding=$1; $encoding=~tr/A-Z/a-z/;
665	}
666	# Content-Transfer-Encoding is per-part
667	if ($encoding ne "") {
668	if ($encoding =~ /quoted\-printable/) {
669	$text=qp_decode($text);
670	} elsif ($encoding =~ /base64/) {
671	$text=base64_decode($text);
672	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
673	# rfc2045 also allows binary, which we ignore (for now).
674	# maybe this shouldn't go to stderr, but anyway...
675	print STDERR "EMAILPlug: unknown encoding: $encoding\n";
676	return "";
677	}
678	}
679	if ($type eq "text/html") {
680	# only get stuff between <body> tags, or <html> tags.
681	$text =~ s@^.<html[^>]>@@is;
682	$text =~ s@</html>.*$@@is;
683	$text =~ s/^.?<body[^>]>//si;
684	$text =~ s/<\/body>.*$//si;
685	}
686	elsif ($type eq "text/xml") {
687	$text=~s/</</g;$text=~s/>/>/g;
688	$text="<pre>\n$text\n</pre>\n";
689	}
690	# convert to unicode
691	$self->convert2unicode($charset, \$text);
692	return $text;
693	}
694
695
696	# decode quoted-printable text
697	sub qp_decode {
698	my $text=shift;
699
700	my @lines=split('\n', $text);
701
702	# if a line ends with "=\s*", it is a soft line break, otherwise
703	# keep in any newline characters.
704	foreach my $line (@lines) {
705	if ($line !~ s/=\s*$//) {$line.="\n";}
706
707	if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
708	my @hexcode_segments=split('=',$line);
709	shift @hexcode_segments;
710	my @hexcodes;
711	foreach my $hexcode (@hexcode_segments) {
712	$hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
713	chomp($hexcode); # just in case...
714	my $char=chr (hex "0x$hexcode");
715	$line =~ s/=$hexcode/$char/g;
716	}
717	}
718	}
719	$text= join('', @lines);
720	return $text;
721	}
722
723	# decode base64 text. This is fairly slow (since it's interpreted perl rather
724	# than compiled XS stuff like in the ::MIME modules, but this is more portable
725	# for us at least).
726	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
727	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
728	# from each encoded byte.
729
730
731	sub base64_decode {
732	my $enc_text = shift;
733	# A=>0, B=>1, ..., '+'=>62, '/'=>63
734	# also '=' is used for padding at the end, but we remove it anyway.
735	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
736	# map each MIME char into it's value, for more efficient lookup.
737	my %index;
738	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
739	# remove all non-base64 chars. eval to get variable in transliteration...
740	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
741	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
742	my $decoded="";
743	while (length ($enc_text)>3)
744	{
745	my $fourchars=substr($enc_text,0,4,"");
746	my @chars=(split '',$fourchars);
747	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
748	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
749	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
750	}
751	# if there are any input chars left, there are either
752	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
753	my @chars=(split '',$enc_text);
754	if (length($enc_text)) {
755	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
756	}
757	if (length($enc_text)==3) {
758	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
759	}
760	return $decoded;
761	}
762
763	sub convert2unicode {
764	my $self = shift(@_);
765	my ($charset, $textref) = @_;
766
767	# first get our character encoding name in the right form.
768	$charset = "iso_8859_1" unless defined $charset;
769	$charset=~tr/A-Z/a-z/;
770	$charset=~s/\-/_/g;
771	$charset=~s/gb2312/gb/;
772	# assumes EUC-KR, not ISO-2022 !?
773	$charset=~s/ks_c_5601_1987/korean/;
774
775	# It appears that we can't always trust ascii text so we'll treat it
776	# as iso-8859-1 (letting characters above 0x80 through without
777	# converting them to utf-8 will result in invalid XML documents
778	# which can't be parsed at build time).
779	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
780
781	if ($charset eq "iso_8859_1") {
782	# test if the mailer lied, and it has win1252 chars in it...
783	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
784	if ($$textref =~ m/[\x80-\x9f]/) {
785	my $outhandle = $self->{'outhandle'};
786	print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
787	print $outhandle "codepage 1252 detected.\n";
788	$charset = "windows_1252";
789	}
790	}
791	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
792	}
793
794
795	# Perl packages have to return true if they are run.
796	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: