Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 3136

Last change on this file since 3136 was 3136, checked in by paynter, 22 years ago
Reconciled John's version of my changes to EMAILPlug with my version of my changes. It turns out they differed only in formatting and documentation.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 25.6 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999-2001 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata (not Dublin Core!):
42	# $Headers All the header content
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header
46	# $FromName Name of sender (where available)
47	# $FromAddr E-mail address of sender
48	# $DateText Date: header
49	# $Date Date: header in GSDL format (eg: 19990924)
50	#
51	# $Title made up of Subject, Date and Sender (for default formatting)
52	#
53	#
54	# John McPherson - June/July 2001
55	# added (basic) MIME support and quoted-printable and base64 decodings.
56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
57	#
58	# See: * RFC 822 - ARPA Internet Text Messages
59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61	# * RFC 2047 - MIME (part 3) Message Header Extensions
62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
63	package EMAILPlug;
64
65	use SplitPlug;
66
67	use unicode;
68
69	use sorttools;
70	use util;
71
72
73	# EMAILPlug is a sub-class of SplitPlug.
74
75	sub BEGIN {
76	@ISA = ('SplitPlug');
77	}
78
79	# Create a new EMAILPlug object with which to parse a file.
80	# Accomplished by creating a new BasPlug and using bless to
81	# turn it into an EMAILPlug.
82
83	sub new {
84	my ($class) = @_;
85	my $self = new BasPlug ("EMAILPlug", @_);
86	# this might not actually be true at read-time, but after processing
87	# it should all be utf8.
88	$self->{'input_encoding'}="utf8";
89	return bless $self, $class;
90	}
91
92	sub get_default_process_exp {
93	my $self = shift (@_);
94	# mbx/email for mailbox file format, \d+ for maildir (each message is
95	# in a separate file, with a unique number for filename)
96	# mozilla and IE will save individual mbx format files with a ".eml" ext.
97	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
98	}
99
100	# This plugin splits the mbox mail files at lines starting with From<sp>
101	# It is supposed to be "\n\nFrom ", but this isn't always used.
102	sub get_default_split_exp {
103	return q^\nFrom .*\n^;
104	}
105
106
107	# do plugin specific processing of doc_obj
108	sub process {
109
110	my $self = shift (@_);
111	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
112	my $outhandle = $self->{'outhandle'};
113
114	# Check that we're dealing with a valid mail file
115	# mbox message files start with "From "
116	# maildir messages usually start with Return-Path and Delivered-To
117	# mh is very similar to maildir
118	my $startoffile=substr($$textref,0,256);
119	if (($startoffile !~ /^(From )/) &&
120	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
121	return undef;
122	}
123
124
125	print $outhandle "EMAILPlug: processing $file\n"
126	if $self->{'verbosity'} > 1;
127
128	my $cursection = $doc_obj->get_top_section();
129
130	#
131	# Parse the document's text and extract metadata
132	#
133
134	# Protect backslashes
135	$$textref =~ s@\\@\\\\@g;
136
137	# Separate header from body of message
138	my $Headers = $$textref;
139	$Headers =~ s/\r?\n\r?\n(.*)$//s;
140	$$textref = $1;
141
142	# Unfold headers - see rfc822
143	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
144	# Extract basic metadata from header
145	my @headers = ("From", "To", "Subject", "Date");
146	my %raw;
147	foreach my $name (@headers) {
148	$raw{$name} = "No $name value";
149	}
150
151	# Get a default encoding for the header - RFC says should be ascii...
152	my $default_heading_encoding="iso_8859_1";
153
154	# We don't know what character set is the user's default...
155	# We could use textcat to guess... for now we'll look at mime content-type
156	# if ($Headers =~ /([[:^ascii:]])/) {
157	# }
158	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
159	$default_header_encoding=$1;
160	$default_header_encoding =~ s@\-@_@g;
161	$default_header_encoding =~ tr/A-Z/a-z/;
162	}
163
164
165	# Examine each line of the headers
166	my ($line, $name, $value);
167	my @parts;
168	foreach $line (split(/\n/, $Headers)) {
169
170	# Ignore lines with no content or which begin with whitespace
171	next unless ($line =~ /:/);
172	next if ($line =~ /^\s/);
173
174	# Find out what metadata is on this line
175	@parts = split(/:/, $line);
176	$name = shift @parts;
177	# get fieldname in canonical form - first cap, then lower case.
178	$name =~ tr/A-Z/a-z/;
179	# uppercase the first character according to the current locale
180	$name=~s/(.+)/\u$1/;
181	next unless $name;
182	next unless ($raw{$name});
183
184	# Find the value of that metadata
185	$value = join(":", @parts);
186	$value =~ s/^\s+//;
187	$value =~ s/\s+$//;
188
189	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
190	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
191	my $original_value=$value;
192	my $encoded=$value;
193	$value="";
194	# this isn't quite right yet regarding spaces between encoded-texts
195	# (see examples, section 8. of rfc).
196	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=\s*//i) {
197	my ($charset, $encoding, $data)=($2,$3,$4);
198	my ($decoded_data);
199	$value.="$1"; # any leading chars
200	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
201	chomp $data;
202	$encoding =~ tr/BQ/bq/;
203	if ($encoding eq "q") { # quoted printable
204	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
205	$decoded_data=qp_decode($data);
206	} else { # base 64
207	$decoded_data=base64_decode($data);
208	}
209	$self->convert2unicode($charset, \$decoded_data);
210	$value .= $decoded_data;
211	} # end of while loop
212
213	# get any trailing characters
214	$self->convert2unicode($default_header_encoding, \$encoded);
215	$value.=$encoded;
216
217	if ($value =~ /^\s*$/) { # we couldn't extract anything...
218	$self->convert2unicode($default_header_encoding,
219	\$original_value);
220	$value=original_value;
221	}
222	} # end of if =?...?=
223
224	# In the absense of other charset information, assume the
225	# header is the default (usually "iso_8859_1") and convert it to unicode.
226	else {
227	$self->convert2unicode($default_header_encoding, \$value);
228	}
229
230	# Store the metadata
231	$raw{$name} = $value;
232	}
233
234	# Extract the name and e-mail address from the From metadata
235	$frommeta = $raw{"From"};
236	my $fromnamemeta;
237	my $fromaddrmeta;
238
239	$frommeta =~ s/\s*$//; # Remove trailing space, if any
240
241	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
242	$fromnamemeta=$1;
243	$fromaddrmeta=$2;
244	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
245	$fromnamemeta=$2;
246	$fromaddrmeta=$1;
247	}
248	if (!defined($fromaddrmeta)) {
249	$fromaddrmeta=$frommeta;
250	}
251	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
252	# minor attempt to prevent spam-bots from harvesting addresses...
253	$fromaddrmeta=~s/@/@/;
254	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
255
256	if (defined($fromnamemeta)) {
257	$fromnamemeta =~ s/\"//g;
258	}
259	else {
260	$fromnamemeta = $fromaddrmeta;
261	}
262	# if name is an address
263	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
264	$fromnamemeta=~s/@/&#64\;/;
265	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
266
267	$raw{"From"}=$frommeta;
268
269	# Process Date information
270	if ($raw{"Date"} !~ /No Date/) {
271	$raw{"DateText"} = $raw{"Date"};
272
273	# Convert the date text to internal date format
274	$value = $raw{"Date"};
275	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
276	if ($year < 100) { $year += 1900; }
277	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
278
279	} else {
280	# We have not extracted a date
281	$raw{"DateText"} = "Unknown.";
282	$raw{"Date"} = "19000000";
283	}
284
285	# Add extracted metadata to document object
286	foreach my $name (keys %raw) {
287	$value = $raw{$name};
288	if ($value) {
289	# assume subject, etc headers have no special HTML meaning.
290	$value = &text_into_html($value);
291	# escape [] so it isn't re-interpreted as metadata
292	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
293	} else {
294	$value = "No $name field";
295	}
296	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
297	}
298
299	my $mimetype="text/plain";
300	my $mimeinfo="";
301	my $charset = $default_header_encoding;
302	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
303	# more than one parameter given to Content-type.
304	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
305	if ($Headers =~ /^content\-type:\s([\w\/\-]+)\s(\;\s.)\s*$/mi)
306	{
307	$mimetype=$1;
308	$mimetype =~ tr/[A-Z]/[a-z]/;
309
310	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
311	$mimetype = "text/plain";
312	}
313
314	$mimeinfo=$2;
315	if (!defined $mimeinfo) {
316	$mimeinfo="";
317	} else { # strip leading and trailing stuff
318	$mimeinfo =~ s/^\;\s*//;
319	$mimeinfo =~ s/\s*$//;
320	}
321	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
322	$charset = $1;
323	}
324	}
325
326	my $transfer_encoding="7bit";
327	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
328	$transfer_encoding=$1;
329	}
330	if ($mimetype eq "text/html") {
331	$$textref= $self->text_from_part("$Headers\n$$textref");
332	} elsif ($mimetype ne "text/plain") {
333	$$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
334	$outhandle);
335	} elsif ($transfer_encoding =~ /quoted\-printable/) {
336	$$textref=qp_decode($$textref);
337	$self->convert2unicode($charset, $textref);
338	} elsif ($transfer_encoding =~ /base64/) {
339	$$textref=base64_decode($$textref);
340	$self->convert2unicode($charset, $textref);
341	} else {
342	$self->convert2unicode($charset, $textref);
343	}
344
345
346	# Add "All headers" metadata
347	$Headers = &text_into_html($Headers);
348
349	$Headers = "No headers" unless ($Headers =~ /\w/);
350	$Headers =~ s/@/&#64\;/g;
351	# escape [] so it isn't re-interpreted as metadata
352	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
353	$self->convert2unicode($charset, \$Headers);
354	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
355
356
357	# Add Title metadata
358	my $Title = text_into_html($raw{'Subject'});
359	$Title .= "<br>From: " . text_into_html($raw{'From'});
360	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
361	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
362
363	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
364
365
366	# Add text to document object
367	if ($mimetype eq "text/plain") {
368	$$textref = &text_into_html($$textref);
369	}
370	$$textref = "No message" unless ($$textref =~ /\w/);
371	$doc_obj->add_utf8_text($cursection, $$textref);
372
373	return 1;
374	}
375
376
377	# Convert a text string into HTML.
378	#
379	# The HTML is going to be inserted into a GML file, so
380	# we have to be careful not to use symbols like ">",
381	# which ocurs frequently in email messages (and use
382	# &gt instead.
383	#
384	# This function also turns links and email addresses into hyperlinks,
385	# and replaces carriage returns with <BR> tags (and multiple carriage
386	# returns with <P> tags).
387
388
389	sub text_into_html {
390	my ($text) = @_;
391
392	# Convert problem characters into HTML symbols
393	$text =~ s/&/&/g;
394	$text =~ s/</</g;
395	$text =~ s/>/>/g;
396	$text =~ s/\"/"/g;
397
398	# convert email addresses and URIs into links
399	# don't markup email addresses for now
400	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
401
402	# try to munge email addresses a little bit...
403	$text =~ s/@/@/;
404	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
405	# assume URI doesn't finish with a '.'
406	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.)?[\w\?\=\-_/~]+))@<a href=\"$1\">$1<\/a>@g;
407
408
409	# Clean up whitespace and convert \n charaters to <BR> or <P>
410	$text =~ s/ +/ /g;
411	$text =~ s/\s*$//g;
412	$text =~ s/^\s*//g;
413	$text =~ s/\n/\n<br>/g;
414	$text =~ s/<br>\s*<br>/<p>/gi;
415
416	return $text;
417	}
418
419
420
421
422	#Process a MIME message.
423	# the textref we are given DOES NOT include the header.
424	sub text_from_mime_message {
425	my $self = shift(@_);
426	my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
427
428	# Check for multiparts - $mimeinfo will be a boundary
429	if ($mimetype =~ /multipart/) {
430	$boundary="";
431	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
432	$boundary=$1;
433	if ($boundary =~ m@^\"@) {
434	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
435	}
436	} else {
437	print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
438	}
439	# parts start with "--$boundary"
440	# message ends with "--$boundary--"
441	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
442	# that perl might want to interpolate. Also allows spaces...
443	$boundary=~s/\\/\\\\/g;
444	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
445	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
446	# remove first "part" and last "part" (final --)
447	shift @message_parts;
448	my $last=pop @message_parts;
449	# if our boundaries are a bit dodgy and we only found 1 part...
450	if (!defined($last)) {$last="";}
451	# make sure it is only -- and whitespace
452	if ($last !~ /^\-\-\s*$/ms) {
453	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
454	}
455	foreach my $message_part (@message_parts) {
456	# remove the leading newline left from split.
457	$message_part=~s/^\r?\n//;
458	}
459	if ($mimetype eq "multipart/alternative") {
460	# check for an HTML version first, then TEXT, otherwise use first.
461	my $part_text="";
462	foreach my $message_part (@message_parts) {
463	if ($message_part =~ m@\scontent\-type:\stext/html@mis)
464	{
465	# Use the HTML version
466	$part_text= $self->text_from_part($message_part);
467	$mimetype="text/html";
468	last;
469	}
470	}
471	if ($part_text eq "") { # try getting a text part instead
472	foreach my $message_part (@message_parts) {
473	if ($message_part =~ m@^content\-type:\s*text/plain@mis)
474	{
475	# Use the plain version
476	$part_text= $self->text_from_part($message_part);
477	if ($part_text =~/[^\s]/) {
478	$part_text="<pre>".$part_text."</pre>";
479	}
480	$mimetype="text/plain";
481	last;
482	}
483	}
484	}
485	if ($part_text eq "") { # use first part
486	$part_text= $self->text_from_part(shift @message_parts);
487	}
488	if ($part_text eq "") { # we couldn't get anything!!!
489	# or it was an empty message...
490	# do nothing...
491	print $outhandle "EMAILPlug: no text - empty body?\n";
492	} else {
493	$text=$part_text;
494	}
495	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related)@) {
496	$text="";
497	foreach my $message_part (@message_parts) {
498	my $part_header=$message_part;
499	my $part_body;
500	if ($message_part=~ /^\s*\n/) {
501	# no header... use defaults
502	$part_body=$message_part;
503	$part_header="Content-type: text/plain; charset=us-ascii";
504	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
505	$part_body=$1;
506	} else {
507	# something's gone wrong...
508	$part_header="";
509	$part_body=$message_part;
510	}
511
512	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
513	my $part_content_type="";
514	my $part_content_info="";
515	if ($mimetype eq "multipart/digest") {
516	# default type - RTFRFC!!
517	$part_content_type="message/rfc822";
518	}
519	if ($part_header =~ m@^content\-type:\s([\w+/\-]+)\s\;?\s(.?)\s*$@mi) {
520	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
521	$part_content_info=$2;
522	}
523	my $filename="";
524	if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
525	$filename=$1;
526	}
527
528	# disposition - either inline or attachment.
529	# NOT CURRENTLY USED - we display all text types instead...
530	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
531
532	# add <<attachment>> to each part except the first...
533	if ($text ne "") {
534	$text.="\n<p><hr><strong><<attachment>></>";
535	# add part info header
536	$text.="<br>Type: $part_content_type<br>\n";
537	if ($filename ne "") {
538	$text.="Filename: $filename\n";
539	}
540	$text.="</strong></p>\n";
541	}
542
543	if ($part_content_type =~ m@text/@)
544	{
545	my $part_text= $self->text_from_part($message_part);
546	if ($part_content_type !~ m@text/(ht\|x)ml@) {
547	$part_text=text_into_html($part_text);
548	}
549	if ($part_text eq "") {
550	$part_text='<<empty message>>';
551	}
552	$text.=$part_text;
553	} elsif ($part_content_type =~ m@message/rfc822@) {
554	# This is a forwarded message
555	my $message_part_headers=$part_body;
556	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
557	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
558	my $message_part_body=$1;
559
560	my $rfc822_formatted_body=""; # put result in here
561	if ($message_part_headers =~
562	/^content\-type:\s([\w\/\-]+)\s\;?\s(.?)\s*$/ims)
563	{
564	# The message header uses MIME flags
565	my $message_content_type=$1;
566	my $message_content_info=$2;
567	if (!defined($message_content_info)) {
568	$message_content_info="";
569	}
570	$message_content_type =~ tr/A-Z/a-z/;
571	if ($message_content_type =~ /multipart/) {
572	$rfc822_formatted_body=
573	$self->text_from_mime_message($message_content_type,
574	$message_content_info,
575	$message_part_body,
576	$outhandle);
577	} else {
578	$message_part_body= $self->text_from_part($part_body);
579	$rfc822_formatted_body=text_into_html($message_part_body);
580	}
581	} else {
582	# message doesn't use MIME flags
583	$rfc822_formatted_body=text_into_html($message_part_body);
584	}
585	# Add the returned text to the output
586	# don't put all the headers...
587	$message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
588	$text.=text_into_html($message_part_headers);
589	$text.="<p>\n";
590	$text.=$rfc822_formatted_body;
591	# end of message/rfc822
592	} elsif ($part_content_type =~ /multipart/) {
593	# recurse again
594
595	$tmptext= $self->text_from_mime_message($part_content_type,
596	$part_content_info,
597	$part_body,
598	$outhandle);
599	$text.=$tmptext;
600	} elsif ($text eq "") {
601	# we can't do anything with this part, but if it's the first
602	# part then make sure it is mentioned..
603
604	$text.="\n<p><hr><strong><<attachment>></>";
605	# add part info header
606	$text.="<br>Type: $part_content_type<br>\n";
607	if ($filename ne "") {
608	$text.="Filename: $filename\n";
609	}
610	$text.="</strong></p>\n";
611	}
612	} # foreach message part.
613	} else {
614	# we can't handle this multipart type (not mixed or alternative)
615	# the RFC also mentions "parallel".
616	}
617	} # end of ($mimetype !~ multipart)
618	elsif ($mimetype =~ m@message/rfc822@) {
619	my $msg_header = $text;
620	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
621	$text = $1;
622
623	if ($msg_header =~ /^content\-type:\s([\w\/\-]+)\s\;?\s(.+?)\s$/mi)
624	{
625	$mimetype=$1;
626	$mimetype =~ tr/[A-Z]/[a-z]/;
627	$mimeinfo=$2;
628	if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
629	$charset = $1;
630	}
631	my $msg_text;
632	if ($mimetype =~ m@multipart/@) {
633	$msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
634	$text, $outhandle);
635	} else {$msg_text=$text;}
636
637	my $brief_header="";
638	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1<br>";}
639	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1<br>";}
640	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1<br>";}
641	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1<br>";}
642	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1<br>";}
643	$text= "\n<b><<attached message>></b><br>";
644	$text.= "<table><tr><td width=\"5%\"> </td>\n";
645	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
646	. "</td></tr></table>";
647	}
648	} else {
649	# we don't do any processing of the content.
650	}
651
652	return $text;
653	}
654
655
656
657
658
659
660	# Process a MIME part. Return "" if we can't decode it.
661	sub text_from_part {
662	my $self = shift;
663	my $text = shift \|\| '';
664	my $part_header = $text;
665
666	# check for empty part header (leading blank line)
667	if ($text =~ /^\s*\r?\n/) {
668	$part_header="Content-type: text/plain; charset=us-ascii";
669	} else {
670	$part_header =~ s/\r?\n\r?\n(.*)$//s;
671	$text=$1; if (!defined($text)) {$text="";}
672	}
673	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
674	$part_header =~ /content\-type:\s([\w\/]+).?charset=\"?([^\;\"\s]+)\"?/is;
675	my $type=$1;
676	my $charset=$2;
677	if (!defined($type)) {$type="";}
678	if (!defined($charset)) {$charset="ascii";}
679	my $encoding="";
680	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
681	$encoding=$1; $encoding=~tr/A-Z/a-z/;
682	}
683	# Content-Transfer-Encoding is per-part
684	if ($encoding ne "") {
685	if ($encoding =~ /quoted\-printable/) {
686	$text=qp_decode($text);
687	} elsif ($encoding =~ /base64/) {
688	$text=base64_decode($text);
689	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
690	# rfc2045 also allows binary, which we ignore (for now).
691	# maybe this shouldn't go to stderr, but anyway...
692	print STDERR "EMAILPlug: unknown encoding: $encoding\n";
693	return "";
694	}
695	}
696	if ($type eq "text/html") {
697	# only get stuff between <body> tags, or <html> tags.
698	$text =~ s@^.<html[^>]>@@is;
699	$text =~ s@</html>.*$@@is;
700	$text =~ s/^.?<body[^>]>//si;
701	$text =~ s/<\/body>.*$//si;
702	}
703	elsif ($type eq "text/xml") {
704	$text=~s/</</g;$text=~s/>/>/g;
705	$text="<pre>\n$text\n</pre>\n";
706	}
707	# convert to unicode
708	$self->convert2unicode($charset, \$text);
709	return $text;
710	}
711
712
713	# decode quoted-printable text
714	sub qp_decode {
715	my $text=shift;
716
717	my @lines=split('\n', $text);
718
719	# if a line ends with "=\s*", it is a soft line break, otherwise
720	# keep in any newline characters.
721	foreach my $line (@lines) {
722	if ($line !~ s/=\s*$//) {$line.="\n";}
723
724	if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
725	my @hexcode_segments=split('=',$line);
726	shift @hexcode_segments;
727	my @hexcodes;
728	foreach my $hexcode (@hexcode_segments) {
729	$hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
730	chomp($hexcode); # just in case...
731	my $char=chr (hex "0x$hexcode");
732	$line =~ s/=$hexcode/$char/g;
733	}
734	}
735	}
736	$text= join('', @lines);
737	return $text;
738	}
739
740	# decode base64 text. This is fairly slow (since it's interpreted perl rather
741	# than compiled XS stuff like in the ::MIME modules, but this is more portable
742	# for us at least).
743	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
744	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
745	# from each encoded byte.
746
747
748	sub base64_decode {
749	my $enc_text = shift;
750	# A=>0, B=>1, ..., '+'=>62, '/'=>63
751	# also '=' is used for padding at the end, but we remove it anyway.
752	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
753	# map each MIME char into it's value, for more efficient lookup.
754	my %index;
755	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
756	# remove all non-base64 chars. eval to get variable in transliteration...
757	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
758	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
759	my $decoded="";
760	while (length ($enc_text)>3)
761	{
762	my $fourchars=substr($enc_text,0,4,"");
763	my @chars=(split '',$fourchars);
764	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
765	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
766	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
767	}
768	# if there are any input chars left, there are either
769	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
770	my @chars=(split '',$enc_text);
771	if (length($enc_text)) {
772	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
773	}
774	if (length($enc_text)==3) {
775	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
776	}
777	return $decoded;
778	}
779
780	sub convert2unicode {
781	my $self = shift(@_);
782	my ($charset, $textref) = @_;
783
784	# first get our character encoding name in the right form.
785	$charset = "iso_8859_1" unless defined $charset;
786	$charset=~tr/A-Z/a-z/;
787	$charset=~s/\-/_/g;
788	$charset=~s/gb2312/gb/;
789	# assumes EUC-KR, not ISO-2022 !?
790	$charset=~s/ks_c_5601_1987/korean/;
791
792	# It appears that we can't always trust ascii text so we'll treat it
793	# as iso-8859-1 (letting characters above 0x80 through without
794	# converting them to utf-8 will result in invalid XML documents
795	# which can't be parsed at build time).
796	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
797
798	if ($charset eq "iso_8859_1") {
799	# test if the mailer lied, and it has win1252 chars in it...
800	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
801	if ($$textref =~ m/[\x80-\x9f]/) {
802	my $outhandle = $self->{'outhandle'};
803	print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
804	print $outhandle "codepage 1252 detected.\n";
805	$charset = "windows_1252";
806	}
807	}
808	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
809	}
810
811
812	# Perl packages have to return true if they are run.
813	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: