Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 7243

Last change on this file since 7243 was 6916, checked in by jrm21, 20 years ago
Don't store Headers metadata by default (it's quite wasteful of space), unless -headers option is given to EMAILPlug in the collect.cfg file.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 31.2 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999-2002 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata (not Dublin Core!):
42	# $Headers All the header content (optional, not stored by default)
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header
46	# $FromName Name of sender (where available)
47	# $FromAddr E-mail address of sender
48	# $DateText Date: header
49	# $Date Date: header in GSDL format (eg: 19990924)
50	#
51	# $Title made up of Subject, Date and Sender (for default formatting)
52	#
53	#
54	# John McPherson - June/July 2001
55	# added (basic) MIME support and quoted-printable and base64 decodings.
56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
57	#
58	# See: * RFC 822 - ARPA Internet Text Messages
59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61	# * RFC 2047 - MIME (part 3) Message Header Extensions
62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
63
64	# 12/05/02 Added usage datastructure - John Thompson
65
66	package EMAILPlug;
67
68	use SplitPlug;
69
70	use unicode;
71
72	use sorttools;
73	use util;
74
75
76	# EMAILPlug is a sub-class of SplitPlug.
77
78	sub BEGIN {
79	@ISA = ('SplitPlug');
80	}
81
82	use strict;
83	no strict "refs"; # so we can use a variable as a filehandle for print $out
84
85	my $arguments =
86	[ { 'name' => "process_exp",
87	'desc' => "{BasPlug.process_exp}",
88	'type' => "regexp",
89	'reqd' => "no",
90	'deft' => &get_default_process_exp() },
91	{ 'name' => "no_attachments",
92	'desc' => "{EMAILPlug.no_attachments}",
93	'type' => "flag",
94	'reqd' => "no" },
95	{ 'name' => "headers",
96	'desc' => "{EMAILPlug.headers}",
97	'type' => "flag",
98	'reqd' => "no" },
99	{ 'name' => "split_exp",
100	'desc' => "{EMAILPlug.split_exp}",
101	'type' => "regexp",
102	'reqd' => "no",
103	'deft' => &get_default_split_exp() }
104	];
105
106	my $options = { 'name' => "EMAILPlug",
107	'desc' => "{EMAILPlug.desc}",
108	'abstract' => "no",
109	'inherits' => "yes",
110	'args' => $arguments };
111
112	# Create a new EMAILPlug object with which to parse a file.
113	# Accomplished by creating a new BasPlug and using bless to
114	# turn it into an EMAILPlug.
115
116	sub new {
117	my ($class) = @_;
118	my $self = new BasPlug ($class, @_);
119	$self->{'plugin_type'} = "EMAILPlug";
120	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
121	my $option_list = $self->{'option_list'};
122	push( @{$option_list}, $options );
123
124	if (!parsargv::parse(\@_,
125	q^split_exp/.*/^, \$self->{'split_exp'},
126	q^no_attachments^, \$self->{'ignore_attachments'},
127	q^headers^, \$self->{'header_metadata'},
128	"allow_extra_options")) {
129	print STDERR "\nIncorrect options passed to $class.";
130	print STDERR "\nCheck your collect.cfg configuration file\n";
131	$self->print_txt_usage(""); # Use default resource bundle
132	die "\n";
133	}
134	$self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
135
136	# this might not actually be true at read-time, but after processing
137	# it should all be utf8.
138	$self->{'input_encoding'}="utf8";
139	return bless $self, $class;
140	}
141
142	sub get_default_process_exp {
143	my $self = shift (@_);
144	# mbx/email for mailbox file format, \d+ for maildir (each message is
145	# in a separate file, with a unique number for filename)
146	# mozilla and IE will save individual mbx format files with a ".eml" ext.
147	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
148	}
149
150	# This plugin splits the mbox mail files at lines starting with From<sp>
151	# It is supposed to be "\n\nFrom ", but this isn't always used.
152	sub get_default_split_exp {
153	return q^\nFrom .*\n^;
154
155	}
156
157
158	# do plugin specific processing of doc_obj
159	sub process {
160
161	my $self = shift (@_);
162	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
163	my $outhandle = $self->{'outhandle'};
164
165	# Check that we're dealing with a valid mail file
166	# mbox message files start with "From "
167	# maildir messages usually start with Return-Path and Delivered-To
168	# mh is very similar to maildir
169	my $startoffile=substr($$textref,0,256);
170	if (($startoffile !~ /^(From )/) &&
171	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
172	return undef;
173	}
174
175
176	print STDERR "<Processing n='$file' p='EMAILPlug'>\n" if ($gli);
177
178	print $outhandle "EMAILPlug: processing $file\n"
179	if $self->{'verbosity'} > 1;
180
181	my $cursection = $doc_obj->get_top_section();
182
183	#
184	# Parse the document's text and extract metadata
185	#
186
187	# Protect backslashes
188	$$textref =~ s@\\@\\\\@g;
189
190	# Separate header from body of message
191	my $Headers = $$textref;
192	$Headers =~ s/\r?\n\r?\n(.*)$//s;
193	$$textref = $1;
194	$Headers .= "\n";
195
196	# Unfold headers - see rfc822
197	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
198	# Extract basic metadata from header
199	my @headers = ("From", "To", "Subject", "Date");
200	my %raw;
201	foreach my $name (@headers) {
202	$raw{$name} = "No $name value";
203	}
204
205	# Get a default encoding for the header - RFC says should be ascii...
206	my $default_header_encoding="iso_8859_1";
207
208	# We don't know what character set is the user's default...
209	# We could use textcat to guess... for now we'll look at mime content-type
210	# if ($Headers =~ /([[:^ascii:]])/) {
211	# }
212	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
213	$default_header_encoding=$1;
214	$default_header_encoding =~ s@\-@_@g;
215	$default_header_encoding =~ tr/A-Z/a-z/;
216	}
217
218
219	# Examine each line of the headers
220	my ($line, $name, $value);
221	my @parts;
222	foreach $line (split(/\n/, $Headers)) {
223
224	# Ignore lines with no content or which begin with whitespace
225	next unless ($line =~ /:/);
226	next if ($line =~ /^\s/);
227
228	# Find out what metadata is on this line
229	@parts = split(/:/, $line);
230	$name = shift @parts;
231	# get fieldname in canonical form - first cap, then lower case.
232	$name =~ tr/A-Z/a-z/;
233	# uppercase the first character according to the current locale
234	$name=~s/(.+)/\u$1/;
235	next unless $name;
236	next unless ($raw{$name});
237
238	# Find the value of that metadata
239	$value = join(":", @parts);
240	$value =~ s/^\s+//;
241	$value =~ s/\s+$//;
242	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
243	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
244	my $original_value=$value;
245	my $encoded=$value;
246	$value="";
247	# we should ignore spaces between consecutive encoded-texts
248	$encoded =~ s@\?=\s+=\?@\?==\?@g;
249	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=//i) {
250	my ($charset, $encoding, $data)=($2,$3,$4);
251	my ($decoded_data);
252	$value.="$1"; # any leading chars
253	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
254	chomp $data;
255	$encoding =~ tr/BQ/bq/;
256	if ($encoding eq "q") { # quoted printable
257	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
258	$decoded_data=qp_decode($data);
259	# qp_decode adds \n, which is default for body text
260	chomp($decoded_data);
261	} else { # base 64
262	$decoded_data=base64_decode($data);
263	}
264	$self->convert2unicode($charset, \$decoded_data);
265	$value .= $decoded_data;
266	} # end of while loop
267
268	# get any trailing characters
269	$self->convert2unicode($default_header_encoding, \$encoded);
270	$value.=$encoded;
271
272	if ($value =~ /^\s*$/) { # we couldn't extract anything...
273	$self->convert2unicode($default_header_encoding,
274	\$original_value);
275	$value=$original_value;
276	}
277	} # end of if =?...?=
278
279	# In the absense of other charset information, assume the
280	# header is the default (usually "iso_8859_1") and convert to unicode.
281	else {
282	$self->convert2unicode($default_header_encoding, \$value);
283	}
284
285	# Store the metadata
286	$value =~ s@_@\\_@g; # protect against GS macro language
287	$raw{$name} = $value;
288	}
289
290	# Extract the name and e-mail address from the From metadata
291	my $frommeta = $raw{"From"};
292	my $fromnamemeta;
293	my $fromaddrmeta;
294
295	$frommeta =~ s/\s*$//; # Remove trailing space, if any
296
297	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
298	$fromnamemeta=$1;
299	$fromaddrmeta=$2;
300	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
301	$fromnamemeta=$2;
302	$fromaddrmeta=$1;
303	}
304	if (!defined($fromaddrmeta)) {
305	$fromaddrmeta=$frommeta;
306	}
307	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
308	# minor attempt to prevent spam-bots from harvesting addresses...
309	$fromaddrmeta=~s/@/@/;
310
311	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
312
313	if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
314	$fromnamemeta =~ s/\"//g; # remove quotes
315	$fromnamemeta =~ s/\s+$//; # remove trailing whitespace
316	}
317	else {
318	$fromnamemeta = $fromaddrmeta;
319	}
320	# if name is an address
321	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
322	$fromnamemeta=~s/@/&#64\;/;
323	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
324
325	$raw{"From"}=$frommeta;
326
327	# Process Date information
328	if ($raw{"Date"} !~ /No Date/) {
329	$raw{"DateText"} = $raw{"Date"};
330
331	# Convert the date text to internal date format
332	$value = $raw{"Date"};
333	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
334	# make some assumptions about the year formatting...
335	# some (old) software thinks 2001 is 101, some think 2001 is 01
336	if ($year < 20) { $year += 2000; } # assume not really 1920...
337	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
338	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
339
340	} else {
341	# We have not extracted a date
342	$raw{"DateText"} = "Unknown.";
343	$raw{"Date"} = "19000000";
344	}
345
346	# Add extracted metadata to document object
347	foreach my $name (keys %raw) {
348	$value = $raw{$name};
349	if ($value) {
350	# assume subject, etc headers have no special HTML meaning.
351	$value = &text_into_html($value);
352	# escape [] so it isn't re-interpreted as metadata
353	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
354	} else {
355	$value = "No $name field";
356	}
357	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
358	}
359
360	my $mimetype="text/plain";
361	my $mimeinfo="";
362	my $charset = $default_header_encoding;
363	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
364	# more than one parameter given to Content-type.
365	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
366	if ($Headers =~ m@^content\-type:\s([\w\.\-/]+)\s(\;\s.+)?\s$@mi)
367	{
368	$mimetype=$1;
369	$mimetype =~ tr/[A-Z]/[a-z]/;
370
371	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
372	$mimetype = "text/plain";
373	}
374
375	$mimeinfo=$2;
376	if (!defined $mimeinfo) {
377	$mimeinfo="";
378	} else { # strip leading and trailing stuff
379	$mimeinfo =~ s/^\;\s*//;
380	$mimeinfo =~ s/\s*$//;
381	}
382	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
383	$charset = $1;
384	}
385	}
386
387	my $transfer_encoding="7bit";
388	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
389	$transfer_encoding=$1;
390	}
391
392	if ($mimetype eq "text/html") {
393	$$textref= $self->text_from_part("$Headers\n$$textref");
394	} elsif ($mimetype ne "text/plain") {
395	$self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
396	$$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$$textref);
397	} else { # mimetype eq text/plain
398	$$textref = &text_into_html($$textref);
399	$$textref =~ s@_@\\_@g; # protect against GS macro language
400
401	if ($transfer_encoding =~ /quoted\-printable/) {
402	$$textref=qp_decode($$textref);
403	} elsif ($transfer_encoding =~ /base64/) {
404	$$textref=base64_decode($$textref);
405	}
406	$self->convert2unicode($charset, $textref);
407	}
408
409
410	if ($self->{'header_metadata'} && $self->{'header_metadata'} == 1) {
411	# Add "All headers" metadata
412	$Headers = &text_into_html($Headers);
413
414	$Headers = "No headers" unless ($Headers =~ /\w/);
415	$Headers =~ s/@/&#64\;/g;
416	# escape [] so it isn't re-interpreted as metadata
417	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
418	$self->convert2unicode($charset, \$Headers);
419
420	$Headers =~ s@_@\\_@g; # protect against GS macro language
421	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
422	}
423
424
425	# Add Title metadata
426	my $Title = text_into_html($raw{'Subject'});
427	$Title .= "<br>From: " . text_into_html($raw{'From'});
428	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
429	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
430
431	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
432
433
434	# Add text to document object
435	$$textref = "No message" unless ($$textref =~ /\w/);
436
437	$doc_obj->add_utf8_text($cursection, $$textref);
438
439	return 1;
440	}
441
442
443	# Convert a text string into HTML.
444	#
445	# The HTML is going to be inserted into a GML file, so
446	# we have to be careful not to use symbols like ">",
447	# which ocurs frequently in email messages (and use
448	# &gt instead.
449	#
450	# This function also turns links and email addresses into hyperlinks,
451	# and replaces carriage returns with <BR> tags (and multiple carriage
452	# returns with <P> tags).
453
454
455	sub text_into_html {
456	my ($text) = @_;
457
458	# Convert problem characters into HTML symbols
459	$text =~ s/&/&/g;
460	$text =~ s/</</g;
461	$text =~ s/>/>/g;
462	$text =~ s/\"/"/g;
463
464	# convert email addresses and URIs into links
465	# don't markup email addresses for now
466	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
467
468	# try to munge email addresses a little bit...
469	$text =~ s/@/@/;
470	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
471	# assume URI doesn't finish with a '.'
472	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.)?[\w\?\=\-_/~]+)(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@g;
473
474
475	# Clean up whitespace and convert \n charaters to <BR> or <P>
476	$text =~ s/ +/ /g;
477	$text =~ s/\s*$//g;
478	$text =~ s/^\s*//g;
479	$text =~ s/\n/\n<br>/g;
480	$text =~ s/<br>\s*<br>/<p>/gi;
481
482	return $text;
483	}
484
485
486
487
488	#Process a MIME message.
489	# the textref we are given DOES NOT include the header.
490	sub text_from_mime_message {
491	my $self = shift(@_);
492	my ($mimetype,$mimeinfo,$text)=(@_);
493	my $outhandle=$self->{'outhandle'};
494	# Check for multiparts - $mimeinfo will be a boundary
495	if ($mimetype =~ /multipart/) {
496	my $boundary="";
497	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
498	$boundary=$1;
499	if ($boundary =~ m@^\"@) {
500	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
501	}
502	} else {
503	print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
504	}
505	# parts start with "--$boundary"
506	# message ends with "--$boundary--"
507	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
508	# that perl might want to interpolate. Also allows spaces...
509	$boundary=~s/\\/\\\\/g;
510	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
511	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
512	# remove first "part" and last "part" (final --)
513	shift @message_parts;
514	my $last=pop @message_parts;
515	# if our boundaries are a bit dodgy and we only found 1 part...
516	if (!defined($last)) {$last="";}
517	# make sure it is only -- and whitespace
518	if ($last !~ /^\-\-\s*$/ms) {
519	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
520	}
521	foreach my $message_part (@message_parts) {
522	# remove the leading newline left from split.
523	$message_part=~s/^\r?\n//;
524	}
525	if ($mimetype eq "multipart/alternative") {
526	# check for an HTML version first, then TEXT, otherwise use first.
527	my $part_text="";
528	foreach my $message_part (@message_parts) {
529	if ($message_part =~ m@\scontent\-type:\stext/html@mis)
530	{
531	# Use the HTML version
532	$part_text= $self->text_from_part($message_part);
533	$mimetype="text/html";
534	last;
535	}
536	}
537	if ($part_text eq "") { # try getting a text part instead
538	foreach my $message_part (@message_parts) {
539	if ($message_part =~ m@^content\-type:\s*text/plain@mis)
540	{
541	# Use the plain version
542	$part_text= $self->text_from_part($message_part);
543	if ($part_text =~/[^\s]/) {
544	$part_text = text_into_html($part_text);
545	}
546	$mimetype="text/plain";
547	last;
548	}
549	}
550	}
551	if ($part_text eq "") { #use first part (no html/text part found)
552	$part_text = $self->text_from_part(shift @message_parts);
553	$part_text = text_into_html($part_text);
554	}
555	if ($part_text eq "") { # we couldn't get anything!!!
556	# or it was an empty message...
557	# do nothing...
558	print $outhandle "EMAILPlug: no text - empty body?\n";
559	} else {
560	$text = $part_text;
561	}
562	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related\|signed)@) {
563	$text = "";
564	# signed is for PGP/GPG messages... the last part is a hash
565	if ($mimetype =~ m@multipart/signed@) {
566	pop @message_parts;
567	}
568	my $is_first_part=1;
569	foreach my $message_part (@message_parts) {
570	if ($is_first_part && $text ne "") {$is_first_part=0;}
571
572	if ($mimetype eq "multipart/digest") {
573	# default type - RTFRFC!! Set if not already set
574	$message_part =~ m@^(.*)\n\r?\n@s;
575	my $part_header=$1;
576	if ($part_header !~ m@^content-type@mi) {
577	$message_part="Content-type: message/rfc822\n"
578	. $message_part; # prepend default type
579	}
580	}
581
582	$text .= $self->process_multipart_part($message_part,
583	$is_first_part);
584	} # foreach message part.
585	} else {
586	# we can't handle this multipart type (not mixed or alternative)
587	# the RFC also mentions "parallel".
588	}
589	} # end of ($mimetype =~ multipart)
590	elsif ($mimetype =~ m@message/rfc822@) {
591	my $msg_header = $text;
592	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
593	$text = $1;
594
595	if ($msg_header =~ /^content\-type:\s([\w\.\-\/]+)\s\;?\s(.+?)\s$/mi)
596	{
597	$mimetype=$1;
598	$mimetype =~ tr/[A-Z]/[a-z]/;
599	$mimeinfo=$2;
600	#if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
601	# $charset = $1;
602	#}
603	my $msg_text;
604	if ($mimetype =~ m@multipart/@) {
605	$msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
606	$text);
607	} else {$msg_text=text_from_part($text);}
608
609	my $brief_header=text_into_html(get_brief_headers($msg_header));
610	$text= "\n<b><<attached message>></b><br>";
611	$text.= "<table><tr><td width=\"5%\"> </td>\n";
612	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
613	. "</td></tr></table>";
614	}
615	} else {
616	# we don't do any processing of the content.
617	}
618
619	return $text;
620	}
621
622
623
624
625	sub process_multipart_part {
626	my $self = shift;
627	my $message_part = shift;
628	my $is_first_part = shift;
629
630	my $return_text="";
631	my $part_header=$message_part;
632	my $part_body;
633	if ($message_part=~ /^\s*\n/) {
634	# no header... use defaults
635	$part_body=$message_part;
636	$part_header="Content-type: text/plain; charset=us-ascii";
637	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
638	$part_body=$1;
639	} else {
640	# something's gone wrong...
641	$part_header="";
642	$part_body=$message_part;
643	}
644
645	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
646	my $part_content_type="";
647	my $part_content_info="";
648
649	if ($part_header =~ m@^content\-type:\s([\w\.\-/]+)\s(\;.*)?$@mi) {
650	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
651	$part_content_info=$2;
652	if (!defined($part_content_info)) {
653	$part_content_info="";
654	} else {
655	$part_content_info =~ s/^\;\s*//;
656	$part_content_info =~ s/\s*$//;
657	}
658	}
659	my $filename="";
660	if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
661	$filename=$1;
662	$filename =~ s@\r?\s*$@@; # remove trailing space, if any
663	}
664
665	# disposition - either inline or attachment.
666	# NOT CURRENTLY USED - we display all text types instead...
667	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
668
669	# add <<attachment>> to each part except the first...
670	if (!$is_first_part) {
671	$return_text.="\n<p><hr><strong><<attachment>>";
672	# add part info header
673	my $header_text="<br>Type: $part_content_type<br>\n";
674	if ($filename ne "") {
675	$header_text.="Filename: $filename\n";
676	}
677	$header_text =~ s@_@\\_@g;
678	$return_text.=$header_text . "</strong></p>\n<p>\n";
679	}
680
681	if ($part_content_type =~ m@text/@)
682	{
683	my $part_text= $self->text_from_part($message_part);
684	if ($part_content_type !~ m@text/(ht\|x)ml@) {
685	$part_text = text_into_html($part_text);
686	}
687	if ($part_text eq "") {
688	$part_text = ' ';
689	}
690	$return_text .= $part_text;
691	} elsif ($part_content_type =~ m@message/rfc822@) {
692	# This is a forwarded message
693	my $message_part_headers=$part_body;
694	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
695	my $message_part_body=$1;
696	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
697
698	my $rfc822_formatted_body=""; # put result in here
699	if ($message_part_headers =~
700	/^content\-type:\s([\w\.\-\/]+)\s(\;.*)?$/ims)
701	{
702	# The message header uses MIME flags
703	my $message_content_type=$1;
704	my $message_content_info=$2;
705	if (!defined($message_content_info)) {
706	$message_content_info="";
707	} else {
708	$message_content_info =~ s/^\;\s*//;
709	$message_content_info =~ s/\s*$//;
710	}
711	$message_content_type =~ tr/A-Z/a-z/;
712	if ($message_content_type =~ /multipart/) {
713	$rfc822_formatted_body=
714	$self->text_from_mime_message($message_content_type,
715	$message_content_info,
716	$message_part_body);
717	} else {
718	$message_part_body= $self->text_from_part($part_body);
719	$rfc822_formatted_body=text_into_html($message_part_body);
720	}
721	} else {
722	# message doesn't use MIME flags
723	$rfc822_formatted_body=text_into_html($message_part_body);
724	$rfc822_formatted_body =~ s@_@\\_@g;
725	}
726	# Add the returned text to the output
727	# don't put all the headers...
728	# $message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
729	my $brief_headers=get_brief_headers($message_part_headers);
730	$return_text.=text_into_html($brief_headers);
731	$return_text.="</p><p>\n";
732	$return_text.=$rfc822_formatted_body;
733	$return_text.="</p>\n";
734	# end of message/rfc822
735	} elsif ($part_content_type =~ /multipart/) {
736	# recurse again
737
738	my $tmptext= $self->text_from_mime_message($part_content_type,
739	$part_content_info,
740	$part_body);
741	$return_text.=$tmptext;
742	} else {
743	# this part isn't text/* or another message...
744	if ($is_first_part) {
745	# this is the first part of a multipart, or only part!
746	$return_text="\n<p><hr><strong><<attachment>>";
747	# add part info header
748	my $header_text="<br>Type: $part_content_type<br>\n";
749	$header_text.="Filename: $filename</strong></p>\n<p>\n";
750	$header_text =~ s@_@\\_@g;
751	$return_text.=$header_text;
752	}
753
754	# save attachment by default
755	if (!$self->{'ignore_attachments'}
756	&& $filename ne "") { # this part has a file...
757	my $encoding="8bit";
758	if ($part_header =~
759	/^content-transfer-encoding:\s*(\w+)/mi ) {
760	$encoding=$1; $encoding =~ tr/A-Z/a-z/;
761	}
762	my $tmpdir=$ENV{'GSDLHOME'} . "/tmp";
763	my $save_filename=$filename;
764
765	# make sure we don't clobber files with same name;
766	# need to keep state between .mbx files
767	my $assoc_files=$self->{'assoc_filenames'};
768	if ($assoc_files->{$filename}) { # it's been set...
769	$assoc_files->{$filename}++;
770	$filename =~ m/(.+)\.(\w+)$/;
771	my ($filestem, $ext)=($1,$2);
772	$save_filename="${filestem}_"
773	. $assoc_files->{$filename} . ".$ext";
774	} else { # first file with this name
775	$assoc_files->{$filename}=1;
776	}
777	open (SAVE, ">$tmpdir/$save_filename") \|\|
778	warn "EMAILPlug: Can't save attachment as $tmpdir/$save_filename: $!";
779	my $part_text = $message_part;
780	$part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
781	if ($encoding eq "base64") {
782	print SAVE base64_decode($part_text);
783	} elsif ($encoding eq "quoted-printable") {
784	print SAVE qp_decode($part_text);
785	} else { # 7bit, 8bit, binary, etc...
786	print SAVE $part_text;
787	}
788	close SAVE;
789	my $doc_obj=$self->{'doc_obj'};
790	$doc_obj->associate_file("$tmpdir/$save_filename",
791	"$save_filename",
792	$part_content_type # mimetype
793	);
794	# clean up tmp area...
795	# Can't do this as it hasn't been copied/linked yet!!!
796	# &util::rm("$tmpdir/$save_filename");
797	my $outhandle=$self->{'outhandle'};
798	print $outhandle "EMAILPlug: saving attachment \"$filename\"\n"; #
799
800	# be nice if "download" was a translatable macro :(
801	$return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
802	} # end of save attachment
803	} # end of !text/message part
804
805
806	return $return_text;
807	}
808
809
810	# Return only the "important" headers from a set of message headers
811	sub get_brief_headers {
812	my $msg_header = shift;
813	my $brief_header = "";
814
815	# Order matters!
816	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
817	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
818	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
819	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
820	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
821
822	return $brief_header;
823	}
824
825
826	# Process a MIME part. Return "" if we can't decode it.
827	# should only be called for parts with type "text/*" ?
828	sub text_from_part {
829	my $self = shift;
830	my $text = shift \|\| '';
831	my $part_header = $text;
832
833	# check for empty part header (leading blank line)
834	if ($text =~ /^\s*\r?\n/) {
835	$part_header="Content-type: text/plain; charset=us-ascii";
836	} else {
837	$part_header =~ s/\r?\n\r?\n(.*)$//s;
838	$text=$1; if (!defined($text)) {$text="";}
839	}
840	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
841	$part_header =~ /content\-type:\s([\w\.\-\/]+).?charset=\"?([^\;\"\s]+)\"?/is;
842	my $type=$1;
843	my $charset=$2;
844	if (!defined($type)) {$type="";}
845	if (!defined($charset)) {$charset="ascii";}
846	my $encoding="";
847	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
848	$encoding=$1; $encoding=~tr/A-Z/a-z/;
849	}
850	# Content-Transfer-Encoding is per-part
851	if ($encoding ne "") {
852	if ($encoding =~ /quoted\-printable/) {
853	$text=qp_decode($text);
854	} elsif ($encoding =~ /base64/) {
855	$text=base64_decode($text);
856	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
857	# rfc2045 also allows binary, which we ignore (for now).
858	my $outhandle=$self->{'outhandle'};
859	print $outhandle "EMAILPlug: unknown transfer encoding: $encoding\n";
860	return "";
861	}
862	}
863	if ($type eq "text/html") {
864	# only get stuff between <body> tags, or <html> tags.
865	$text =~ s@^.<html[^>]>@@is;
866	$text =~ s@</html>.*$@@is;
867	$text =~ s/^.?<body[^>]>//si;
868	$text =~ s/<\/body>.*$//si;
869	}
870	elsif ($type eq "text/xml") {
871	$text=~s/</</g;$text=~s/>/>/g;
872	$text="<pre>\n$text\n</pre>\n";
873	}
874	# convert to unicode
875	$self->convert2unicode($charset, \$text);
876
877	$text =~ s@_@\\_@g; # protect against GS macro language
878	return $text;
879	}
880
881
882
883
884	# decode quoted-printable text
885	sub qp_decode {
886	my $text=shift;
887
888	# if a line ends with "=\s*", it is a soft line break, otherwise
889	# keep in any newline characters.
890
891	$text =~ s/=\s*\r?\n//mg;
892	$text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
893	return $text;
894	}
895
896	# decode base64 text. This is fairly slow (since it's interpreted perl rather
897	# than compiled XS stuff like in the ::MIME modules, but this is more portable
898	# for us at least).
899	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
900	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
901	# from each encoded byte.
902
903
904	sub base64_decode {
905	my $enc_text = shift;
906	# A=>0, B=>1, ..., '+'=>62, '/'=>63
907	# also '=' is used for padding at the end, but we remove it anyway.
908	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
909	# map each MIME char into it's value, for more efficient lookup.
910	my %index;
911	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
912	# remove all non-base64 chars. eval to get variable in transliteration...
913	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
914	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
915	my $decoded="";
916	while (length ($enc_text)>3)
917	{
918	my $fourchars=substr($enc_text,0,4,"");
919	my @chars=(split '',$fourchars);
920	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
921	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
922	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
923	}
924	# if there are any input chars left, there are either
925	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
926	my @chars=(split '',$enc_text);
927	if (length($enc_text)) {
928	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
929	}
930	if (length($enc_text)==3) {
931	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
932	}
933	return $decoded;
934	}
935
936	sub convert2unicode {
937	my $self = shift(@_);
938	my ($charset, $textref) = @_;
939
940	# first get our character encoding name in the right form.
941	$charset = "iso_8859_1" unless defined $charset;
942	$charset=~tr/A-Z/a-z/;
943	$charset=~s/\-/_/g;
944	$charset=~s/gb2312/gb/;
945	# assumes EUC-KR, not ISO-2022 !?
946	$charset=~s/ks_c_5601_1987/korean/;
947
948	if ($charset eq "utf_8" \|\| !$$textref) {
949	# nothing to do!
950	return;
951	}
952
953	# It appears that we can't always trust ascii text so we'll treat it
954	# as iso-8859-1 (letting characters above 0x80 through without
955	# converting them to utf-8 will result in invalid XML documents
956	# which can't be parsed at build time).
957	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
958
959	if ($charset eq "iso_8859_1") {
960	# test if the mailer lied, and it has win1252 chars in it...
961	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
962	if ($$textref =~ m/[\x80-\x9f]/) {
963	my $outhandle = $self->{'outhandle'};
964	print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
965	print $outhandle "codepage 1252 detected.\n";
966	$charset = "windows_1252";
967	}
968	}
969	my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
970
971	if ($utf8_text ne "") {
972	$$textref=$utf8_text;
973	} else {
974	# we didn't get any text... unsupported encoding perhaps? Or it is
975	# empty anyway. We'll try to continue, assuming 8859-1. We could strip
976	# characters out here if this causes problems...
977	my $outhandle=$self->{'outhandle'};
978	print $outhandle "EMAILPlug: falling back to iso-8859-1\n";
979	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
980
981	}
982	}
983
984
985	# Perl packages have to return true if they are run.
986	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: