Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 3039

Last change on this file since 3039 was 2918, checked in by jrm21, 22 years ago
Add [Title] metadata so that the default format strings will show something other than "Untitled". These titles currently have html tags in them, which might possibly cause problems (eg this becomes the title for the browser window).
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 23.9 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999-2001 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata (not Dublin Core!):
42	# $Headers All the header content
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header
46	# $FromName Name of sender (where available)
47	# $FromAddr E-mail address of sender
48	# $DateText Date: header
49	# $Date Date: header in GSDL format (eg: 19990924)
50	#
51	# $Title made up of Subject, Date and Sender (for default formatting)
52	#
53	#
54	# John McPherson - June/July 2001
55	# added (basic) MIME support and quoted-printable and base64 decodings.
56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
57	#
58	# See: * RFC 822 - ARPA Internet Text Messages
59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61	# * RFC 2047 - MIME (part 3) Message Header Extensions
62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
63	package EMAILPlug;
64
65	use SplitPlug;
66
67	use unicode;
68
69	use sorttools;
70	use util;
71
72
73	# EMAILPlug is a sub-class of SplitPlug.
74
75	sub BEGIN {
76	@ISA = ('SplitPlug');
77	}
78
79	# Create a new EMAILPlug object with which to parse a file.
80	# Accomplished by creating a new BasPlug and using bless to
81	# turn it into an EMAILPlug.
82
83	sub new {
84	my ($class) = @_;
85	my $self = new BasPlug ("EMAILPlug", @_);
86	# this might not actually be true at read-time, but after processing
87	# it should all be utf8.
88	$self->{'input_encoding'}="utf8";
89	return bless $self, $class;
90	}
91
92	sub get_default_process_exp {
93	my $self = shift (@_);
94	# mbx/email for mailbox file format, \d+ for maildir (each message is
95	# in a separate file, with a unique number for filename)
96	return q@([\\/]\d+\|\.(mbx\|email))$@;
97	}
98
99	# This plugin splits the mbox mail files at lines starting with From<sp>
100	sub get_default_split_exp {
101	return q^\nFrom .*\n^;
102	}
103
104
105	# do plugin specific processing of doc_obj
106	sub process {
107
108	my $self = shift (@_);
109	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
110	my $outhandle = $self->{'outhandle'};
111
112	# Check that we're dealing with a valid mail file
113	return undef unless (($$textref =~ /From:/m) \|\| ($$textref =~ /To:/m));
114
115	# slightly more strict validity check, to prevent us from matching
116	# .so.x files ...
117	return undef unless (($$textref =~ /^From /) \|\|
118	($$textref =~ /^[-A-Za-z]{2,100}:/m));
119
120	print $outhandle "EMAILPlug: processing $file\n"
121	if $self->{'verbosity'} > 1;
122
123	my $cursection = $doc_obj->get_top_section();
124
125	#
126	# Parse the document's text and extract metadata
127	#
128
129	# Protect backslashes
130	$$textref =~ s@\\@\\\\@g;
131
132	# Separate header from body of message
133	my $Headers = $$textref;
134	$Headers =~ s/\r?\n\r?\n(.*)$//s;
135	$$textref = $1;
136
137	# See if headers include non-ascii - RFC says whole header should be ascii.
138	# not yet implemented, as we don't know what character set is the
139	# user's default... We can do textcat to guess, or we can just choose
140	# one of the charset fields later in the document (if there are any...)
141	# if ($Headers =~ /([[:^ascii:]])/) {
142	# }
143
144	# Unfold headers - see rfc822
145	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
146	# Extract basic metadata from header
147	my @headers = ("From", "To", "Subject", "Date");
148	my %raw;
149	foreach my $name (@headers) {
150	$raw{$name} = "No $name value";
151	}
152
153	# Examine each line of the headers
154	my ($line, $name, $value);
155	my @parts;
156	foreach $line (split(/\n/, $Headers)) {
157
158	# Ignore lines with no content or which begin with whitespace
159	next unless ($line =~ /:/);
160	next if ($line =~ /^\s/);
161
162	# Find out what metadata is on this line
163	@parts = split(/:/, $line);
164	$name = shift @parts;
165	# uppercase the first character according to the current locale
166	$name=~s/(.+)/\u$1/;
167	next unless $name;
168	next unless ($raw{$name});
169
170	# Find the value of that metadata
171	$value = join(":", @parts);
172	$value =~ s/^\s+//;
173	$value =~ s/\s+$//;
174
175	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
176	if ($value =~ /=\?/) {
177	my $original_value=$value;
178	my $encoded=$value;
179	$value="";
180	# this isn't quite right yet regarding spaces between encoded-texts
181	# (see examples, section 8. of rfc).
182	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=\s*//i) {
183	my ($charset, $encoding, $data)=($2,$3,$4);
184	my ($decoded_data);
185	$value.="$1"; # any leading chars
186	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
187	chomp $data;
188	$encoding =~ tr/BQ/bq/;
189	if ($encoding eq "q") { # quoted printable
190	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
191	$decoded_data=qp_decode($data);
192	} else { # base 64
193	$decoded_data=base64_decode($data);
194	}
195	$self->convert2unicode($charset, \$decoded_data);
196	$value .= $decoded_data;
197	} # end of while loop
198
199	# get any trailing characters
200	$self->convert2unicode("iso_8859_1", \$encoded);
201	$value.=$encoded;
202
203	if ($value =~ /^\s*$/) { # we couldn't extract anything...
204	$self->convert2unicode("iso_8859_1", \$original_value);
205	$value=original_value;
206	}
207	} # end of if =?...?=
208
209	# Store the metadata
210	$raw{$name} = $value;
211	}
212
213	# Extract the name and e-mail address from the From metadata
214	$frommeta = $raw{"From"};
215	my $fromnamemeta;
216	my $fromaddrmeta;
217
218	$frommeta =~ s/\s*$//; # Remove trailing space, if any
219
220	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
221	$fromnamemeta=$1;
222	$fromaddrmeta=$2;
223	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
224	$fromnamemeta=$2;
225	$fromaddrmeta=$1;
226	}
227	if (!defined($fromaddrmeta)) {
228	$fromaddrmeta=$frommeta;
229	}
230	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
231	# minor attempt to prevent spam-bots from harvesting addresses...
232	$fromaddrmeta=~s/@/@/;
233	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
234
235	if (defined($fromnamemeta)) {
236	$fromnamemeta =~ s/\"//g;
237	}
238	else {
239	$fromnamemeta = $fromaddrmeta;
240	}
241	# if name is an address
242	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
243	$fromnamemeta=~s/@/&#64\;/;
244	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
245
246	$raw{"From"}=$frommeta;
247
248	# Process Date information
249	if ($raw{"Date"} !~ /No Date/) {
250	$raw{"DateText"} = $raw{"Date"};
251
252	# Convert the date text to internal date format
253	$value = $raw{"Date"};
254	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
255	if ($year < 100) { $year += 1900; }
256	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
257
258	} else {
259	# We have not extracted a date
260	$raw{"DateText"} = "Unknown.";
261	$raw{"Date"} = "19000000";
262	}
263
264	# Add extracted metadata to document object
265	foreach my $name (keys %raw) {
266	$value = $raw{$name};
267	if ($value) {
268	# assume subject, etc headers have no special HTML meaning.
269	$value = &text_into_html($value);
270	# escape [] so it isn't re-interpreted as metadata
271	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
272	} else {
273	$value = "No $name field";
274	}
275	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
276	}
277
278	my $mimetype="text/plain";
279	my $mimeinfo="";
280	my $charset = "iso_8859_1";
281	# Do MIME and encoding stuff
282	if ($Headers =~ /^content\-type:\s([\w\/\-]+)\s\;?\s(.+?)\s$/mi)
283	{
284	$mimetype=$1;
285	$mimetype =~ tr/[A-Z]/[a-z]/;
286	$mimeinfo=$2;
287	if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
288	$charset = $1;
289	}
290	}
291
292	my $transfer_encoding="7bit";
293	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
294	$transfer_encoding=$1;
295	}
296	if ($mimetype eq "text/html") {
297	$$textref= $self->text_from_part("$Headers\n$$textref");
298	} elsif ($mimetype ne "text/plain") {
299	$$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
300	$outhandle);
301	} elsif ($transfer_encoding =~ /quoted\-printable/) {
302	$$textref=qp_decode($$textref);
303	$self->convert2unicode($charset, $textref);
304	} elsif ($transfer_encoding =~ /base64/) {
305	$$textref=base64_decode($$textref);
306	$self->convert2unicode($charset, $textref);
307	} else {
308	$self->convert2unicode($charset, $textref);
309	}
310
311
312	# Add "All headers" metadata
313	$Headers = &text_into_html($Headers);
314
315	$Headers = "No headers" unless ($Headers =~ /\w/);
316	$Headers =~ s/@/&#64\;/g;
317	# escape [] so it isn't re-interpreted as metadata
318	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
319
320	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
321
322
323	# Add Title metadata
324	my $Title = text_into_html($raw{'Subject'});
325	$Title .= "<br>From: " . text_into_html($raw{'From'});
326	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
327
328	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
329
330
331	# Add text to document object
332	if ($mimetype eq "text/plain") {
333	$$textref = &text_into_html($$textref);
334	}
335	$$textref = "No message" unless ($$textref =~ /\w/);
336	$doc_obj->add_utf8_text($cursection, $$textref);
337
338	return 1;
339	}
340
341
342	# Convert a text string into HTML.
343	#
344	# The HTML is going to be inserted into a GML file, so
345	# we have to be careful not to use symbols like ">",
346	# which ocurs frequently in email messages (and use
347	# &gt instead.
348	#
349	# This function also turns links and email addresses into hyperlinks,
350	# and replaces carriage returns with <BR> tags (and multiple carriage
351	# returns with <P> tags).
352
353
354	sub text_into_html {
355	my ($text) = @_;
356
357	# Convert problem characters into HTML symbols
358	$text =~ s/&/&/go;
359	$text =~ s/</</go;
360	$text =~ s/>/>/go;
361	$text =~ s/\"/"/go;
362
363	# convert email addresses and URIs into links
364	# don't markup email addresses for now
365	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
366
367	# try to munge email addresses a little bit...
368	$text =~ s/@/@/;
369	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
370	# assume URI doesn't finish with a '.'
371	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.)?[\w\?\=\-_/~]+))@<a href=\"$1\">$1<\/a>@g;
372
373
374	# Clean up whitespace and convert \n charaters to <BR> or <P>
375	$text =~ s/ +/ /go;
376	$text =~ s/\s*$//o;
377	$text =~ s/^\s*//o;
378	$text =~ s/\n/\n<BR>/go;
379	$text =~ s/<BR>\s*<BR>/<P>/go;
380
381	return $text;
382	}
383
384
385
386
387	#Process a MIME message.
388	# the textref we are given DOES NOT include the header.
389	sub text_from_mime_message {
390	my $self = shift(@_);
391	my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
392
393	# Check for multiparts - $mimeinfo will be a boundary
394	if ($mimetype =~ /multipart/) {
395	$boundary="";
396	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
397	$boundary=$1;
398	if ($boundary =~ m@^\"@) {
399	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
400	}
401	} else {
402	print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
403	}
404	# parts start with "--$boundary"
405	# message ends with "--$boundary--"
406	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
407	# that perl might want to interpolate. Also allows spaces...
408	$boundary=~s/\\/\\\\/g;
409	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
410	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
411	# remove first "part" and last "part" (final --)
412	shift @message_parts;
413	my $last=pop @message_parts;
414	# if our boundaries are a bit dodgy and we only found 1 part...
415	if (!defined($last)) {$last="";}
416	# make sure it is only -- and whitespace
417	if ($last !~ /^\-\-\s*$/ms) {
418	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
419	}
420	foreach my $message_part (@message_parts) {
421	# remove the leading newline left from split.
422	$message_part=~s/^\r?\n//;
423	}
424	if ($mimetype eq "multipart/alternative") {
425	# check for an HTML version first, then TEXT, otherwise use first.
426	my $part_text="";
427	foreach my $message_part (@message_parts) {
428	if ($message_part =~ m@\scontent\-type:\stext/html@mis)
429	{
430	# Use the HTML version
431	$part_text= $self->text_from_part($message_part);
432	$mimetype="text/html";
433	last;
434	}
435	}
436	if ($part_text eq "") { # try getting a text part instead
437	foreach my $message_part (@message_parts) {
438	if ($message_part =~ m@^content\-type:\s*text/plain@mis)
439	{
440	# Use the plain version
441	$part_text= $self->text_from_part($message_part);
442	if ($part_text =~/[^\s]/) {
443	$part_text="<pre>".$part_text."</pre>";
444	}
445	$mimetype="text/plain";
446	last;
447	}
448	}
449	}
450	if ($part_text eq "") { # use first part
451	$part_text= $self->text_from_part(shift @message_parts);
452	}
453	if ($part_text eq "") { # we couldn't get anything!!!
454	# or it was an empty message...
455	# do nothing...
456	print $outhandle "EMAILPlug: no text - empty body?\n";
457	} else {
458	$text=$part_text;
459	}
460	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related)@) {
461	$text="";
462	foreach my $message_part (@message_parts) {
463	my $part_header=$message_part;
464	my $part_body;
465	if ($message_part=~ /^\s*\n/) {
466	# no header... use defaults
467	$part_body=$message_part;
468	$part_header="Content-type: text/plain; charset=us-ascii";
469	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
470	$part_body=$1;
471	} else {
472	# something's gone wrong...
473	$part_header="";
474	$part_body=$message_part;
475	}
476
477	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
478	my $part_content_type="";
479	my $part_content_info="";
480	if ($mimetype eq "multipart/digest") {
481	# default type - RTFRFC!!
482	$part_content_type="message/rfc822";
483	}
484	if ($part_header =~ m@^content\-type:\s([\w+/\-]+)\s\;?\s(.?)\s*$@mi) {
485	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
486	$part_content_info=$2;
487	}
488	my $filename="";
489	if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
490	$filename=$1;
491	}
492
493	# disposition - either inline or attachment.
494	# NOT CURRENTLY USED - we display all text types instead...
495	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
496
497	# add <<attachment>> to each part except the first...
498	if ($text ne "") {
499	$text.="\n<p><hr><strong><<attachment>></>";
500	# add part info header
501	$text.="<br>Type: $part_content_type<br>\n";
502	if ($filename ne "") {
503	$text.="Filename: $filename\n";
504	}
505	$text.="</strong></p>\n";
506	}
507
508	if ($part_content_type =~ m@text/@)
509	{
510	my $part_text= $self->text_from_part($message_part);
511	if ($part_content_type !~ m@text/(ht\|x)ml@) {
512	$part_text=text_into_html($part_text);
513	}
514	if ($part_text eq "") {
515	$part_text='<<empty message>>';
516	}
517	$text.=$part_text;
518	} elsif ($part_content_type =~ m@message/rfc822@) {
519	# This is a forwarded message
520	my $message_part_headers=$part_body;
521	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
522	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
523	my $message_part_body=$1;
524
525	my $rfc822_formatted_body=""; # put result in here
526	if ($message_part_headers =~
527	/^content\-type:\s([\w\/\-]+)\s\;?\s(.?)\s*$/ims)
528	{
529	# The message header uses MIME flags
530	my $message_content_type=$1;
531	my $message_content_info=$2;
532	if (!defined($message_content_info)) {
533	$message_content_info="";
534	}
535	$message_content_type =~ tr/A-Z/a-z/;
536	if ($message_content_type =~ /multipart/) {
537	$rfc822_formatted_body=
538	$self->text_from_mime_message($message_content_type,
539	$message_content_info,
540	$message_part_body,
541	$outhandle);
542	} else {
543	$message_part_body= $self->text_from_part($part_body);
544	$rfc822_formatted_body=text_into_html($message_part_body);
545	}
546	} else {
547	# message doesn't use MIME flags
548	$rfc822_formatted_body=text_into_html($message_part_body);
549	}
550	# Add the returned text to the output
551	# don't put all the headers...
552	$message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
553	$text.=text_into_html($message_part_headers);
554	$text.="<p>\n";
555	$text.=$rfc822_formatted_body;
556	# end of message/rfc822
557	} elsif ($part_content_type =~ /multipart/) {
558	# recurse again
559
560	$tmptext= $self->text_from_mime_message($part_content_type,
561	$part_content_info,
562	$part_body,
563	$outhandle);
564	$text.=$tmptext;
565	} elsif ($text eq "") {
566	# we can't do anything with this part, but if it's the first
567	# part then make sure it is mentioned..
568
569	$text.="\n<p><hr><strong><<attachment>></>";
570	# add part info header
571	$text.="<br>Type: $part_content_type<br>\n";
572	if ($filename ne "") {
573	$text.="Filename: $filename\n";
574	}
575	$text.="</strong></p>\n";
576	}
577	} # foreach message part.
578	} else {
579	# we can't handle this multipart type (not mixed or alternative)
580	# the RFC also mentions "parallel".
581	}
582	} # end of ($mimetype !~ multipart)
583	elsif ($mimetype =~ m@message/rfc822@) {
584	my $msg_header = $text;
585	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
586	$text = $1;
587
588	if ($msg_header =~ /^content\-type:\s([\w\/\-]+)\s\;?\s(.+?)\s$/mi)
589	{
590	$mimetype=$1;
591	$mimetype =~ tr/[A-Z]/[a-z]/;
592	$mimeinfo=$2;
593	if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
594	$charset = $1;
595	}
596	my $msg_text;
597	if ($mimetype =~ m@multipart/@) {
598	$msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
599	$text, $outhandle);
600	} else {$msg_text=$text;}
601
602	my $brief_header="";
603	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1<br>";}
604	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1<br>";}
605	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1<br>";}
606	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1<br>";}
607	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1<br>";}
608	$text= "\n<b><<attached message>></b><br>";
609	$text.= "<table><tr><td width=\"5%\"> </td>\n";
610	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
611	. "</td></tr></table>";
612	}
613	} else {
614	# we don't do any processing of the content.
615	}
616
617	return $text;
618	}
619
620
621
622
623
624
625	# Process a MIME part. Return "" if we can't decode it.
626	sub text_from_part {
627	my $self = shift(@_);
628	my $text=shift;
629	my $part_header=$text;
630	# check for empty part header (leading blank line)
631	if ($text =~ /^\s*\r?\n/) {
632	$part_header="Content-type: text/plain; charset=us-ascii";
633	} else {
634	$part_header =~ s/\r?\n\r?\n(.*)$//s;
635	$text=$1; if (!defined($text)) {$text="";}
636	}
637	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
638	$part_header =~ /content\-type:\s([\w\/]+).?charset=\"?([^\;\"\s]+)\"?/is;
639	my $type=$1;
640	my $charset=$2;
641	if (!defined($type)) {$type="";}
642	if (!defined($charset)) {$charset="ascii";}
643	my $encoding="";
644	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
645	$encoding=$1; $encoding=~tr/A-Z/a-z/;
646	}
647	# Content-Transfer-Encoding is per-part
648	if ($encoding ne "") {
649	if ($encoding =~ /quoted\-printable/) {
650	$text=qp_decode($text);
651	} elsif ($encoding =~ /base64/) {
652	$text=base64_decode($text);
653	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
654	# rfc2045 also allows binary, which we ignore (for now).
655	# maybe this shouldn't go to stderr, but anyway...
656	print STDERR "EMAILPlug: unknown encoding: $encoding\n";
657	return "";
658	}
659	}
660	if ($type eq "text/html") {
661	# only get stuff between <body> tags, or <html> tags.
662	$text =~ s@^.<html[^>]>@@is;
663	$text =~ s@</html>.*$@@is;
664	$text =~ s/^.?<body[^>]>//si;
665	$text =~ s/<\/body>.*$//si;
666	}
667	elsif ($type eq "text/xml") {
668	$text=~s/</</g;$text=~s/>/>/g;
669	$text="<pre>\n$text\n</pre>\n";
670	}
671	# convert to unicode
672	$self->convert2unicode($charset, \$text);
673	return $text;
674	}
675
676
677	# decode quoted-printable text
678	sub qp_decode {
679	my $text=shift;
680
681	my @lines=split('\n', $text);
682
683	# if a line ends with "=\s*", it is a soft line break, otherwise
684	# keep in any newline characters.
685	foreach my $line (@lines) {
686	if ($line !~ s/=\s*$//) {$line.="\n";}
687
688	if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
689	my @hexcode_segments=split('=',$line);
690	shift @hexcode_segments;
691	my @hexcodes;
692	foreach my $hexcode (@hexcode_segments) {
693	$hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
694	chomp($hexcode); # just in case...
695	my $char=chr (hex "0x$hexcode");
696	$line =~ s/=$hexcode/$char/g;
697	}
698	}
699	}
700	$text= join('', @lines);
701	return $text;
702	}
703
704	# decode base64 text. This is fairly slow (since it's interpreted perl rather
705	# than compiled XS stuff like in the ::MIME modules, but this is more portable
706	# for us at least).
707	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
708	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
709	# from each encoded byte.
710
711
712	sub base64_decode {
713	my $enc_text = shift;
714	# A=>0, B=>1, ..., '+'=>62, '/'=>63
715	# also '=' is used for padding at the end, but we remove it anyway.
716	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
717	# map each MIME char into it's value, for more efficient lookup.
718	my %index;
719	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
720	# remove all non-base64 chars. eval to get variable in transliteration...
721	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
722	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
723	my $decoded="";
724	while (length ($enc_text)>3)
725	{
726	my $fourchars=substr($enc_text,0,4,"");
727	my @chars=(split '',$fourchars);
728	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
729	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
730	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
731	}
732	# if there are any input chars left, there are either
733	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
734	my @chars=(split '',$enc_text);
735	if (length($enc_text)) {
736	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
737	}
738	if (length($enc_text)==3) {
739	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
740	}
741	return $decoded;
742	}
743
744	sub convert2unicode {
745	my $self = shift(@_);
746	my ($charset, $textref) = @_;
747
748	# first get our character encoding name in the right form.
749	$charset = "iso_8859_1" unless defined $charset;
750	$charset=~tr/A-Z/a-z/;
751	$charset=~s/\-/_/g;
752	$charset=~s/gb2312/gb/;
753	# assumes EUC-KR, not ISO-2022 !?
754	$charset=~s/ks_c_5601_1987/korean/;
755
756	# It appears that we can't always trust ascii text so we'll treat it
757	# as iso-8859-1 (letting characters above 0x80 through without
758	# converting them to utf-8 will result in invalid XML documents
759	# which can't be parsed at build time).
760	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
761
762	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
763	}
764
765
766	# Perl packages have to return true if they are run.
767	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: