Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 3721

Last change on this file since 3721 was 3721, checked in by jrm21, 21 years ago
bug where some text/plain messages weren't having < > & properly escaped. Also some minor code formatting changes for consistency.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 31.0 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999-2002 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata (not Dublin Core!):
42	# $Headers All the header content
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header
46	# $FromName Name of sender (where available)
47	# $FromAddr E-mail address of sender
48	# $DateText Date: header
49	# $Date Date: header in GSDL format (eg: 19990924)
50	#
51	# $Title made up of Subject, Date and Sender (for default formatting)
52	#
53	#
54	# John McPherson - June/July 2001
55	# added (basic) MIME support and quoted-printable and base64 decodings.
56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
57	#
58	# See: * RFC 822 - ARPA Internet Text Messages
59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61	# * RFC 2047 - MIME (part 3) Message Header Extensions
62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
63
64	# 12/05/02 Added usage datastructure - John Thompson
65
66	package EMAILPlug;
67
68	use SplitPlug;
69
70	use unicode;
71
72	use sorttools;
73	use util;
74
75
76	# EMAILPlug is a sub-class of SplitPlug.
77
78	sub BEGIN {
79	@ISA = ('SplitPlug');
80	}
81
82	sub print_usage {
83	print STDERR "\n usage: plugin EMAILPlug [options]\n\n";
84	print STDERR " options:\n";
85	print STDERR " -split_exp A perl regular expression used to split files\n";
86	print STDERR " containing many messages into individual documents.\n\n";
87	print STDERR " -no_attachments Do not save message attachments.\n\n";
88
89	}
90
91	my $arguments =
92	[ { 'name' => "process_exp",
93	'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
94	'type' => "string",
95	'reqd' => "no",
96	'deft' => q@([\\/]\d+\|\.(mbx\|email\|eml))$@
97	},
98	{ 'name' => "no_attachments",
99	'desc' => "Do not save message attachments.",
100	'type' => "flag",
101	'reqd' => "no"
102	},
103	{ 'name' => "block_exp",
104	'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
105	'type' => "string",
106	'reqd' => "no",
107	'deft' => q^^}
108	];
109
110	my $options =
111	{ 'name' => "EMAILPlug",
112	'desc' => "Email plug reads email files. These are named with a simple number (i.e. as they appear in maildir folders) or with the extension .mbx (for mbox mail file format).\nDocument text: The document text consists of all the text after the first blank line in the document.\nMetadata (not Dublin Core!):\n\t\$Headers All the header content\n\t\$Subject Subject: header\n\t\$To To: header\n\t\$From From: header\n\t\$FromName Name of sender (where available)\n\t\$FromAddr E-mail address of sender\n\t\$DateText Date: header\n\t\$Date Date: header in GSDL format (eg: 19990924)",
113	'inherits' => "Yes",
114	'args' => $arguments };
115
116	# Create a new EMAILPlug object with which to parse a file.
117	# Accomplished by creating a new BasPlug and using bless to
118	# turn it into an EMAILPlug.
119
120	sub new {
121	my ($class) = @_;
122	my $self = new BasPlug ($class, @_);
123
124	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
125	my $option_list = $self->{'option_list'};
126	push( @{$option_list}, $options );
127
128	if (!parsargv::parse(\@_,
129	q^split_exp/.*/^, \$self->{'split_exp'},
130	q^no_attachments^, \$self->{'ignore_attachments'},
131	"allow_extra_options")) {
132	print STDERR "\nIncorrect options passed to $class.";
133	print STDERR "\nCheck your collect.cfg configuration file\n";
134	die "\n";
135	}
136	$self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
137
138	# this might not actually be true at read-time, but after processing
139	# it should all be utf8.
140	$self->{'input_encoding'}="utf8";
141	return bless $self, $class;
142	}
143
144	sub get_default_process_exp {
145	my $self = shift (@_);
146	# mbx/email for mailbox file format, \d+ for maildir (each message is
147	# in a separate file, with a unique number for filename)
148	# mozilla and IE will save individual mbx format files with a ".eml" ext.
149	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
150	}
151
152	# This plugin splits the mbox mail files at lines starting with From<sp>
153	# It is supposed to be "\n\nFrom ", but this isn't always used.
154	sub get_default_split_exp {
155	return q^\nFrom .*\n^;
156
157	}
158
159
160	# do plugin specific processing of doc_obj
161	sub process {
162
163	my $self = shift (@_);
164	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
165	my $outhandle = $self->{'outhandle'};
166
167	# Check that we're dealing with a valid mail file
168	# mbox message files start with "From "
169	# maildir messages usually start with Return-Path and Delivered-To
170	# mh is very similar to maildir
171	my $startoffile=substr($$textref,0,256);
172	if (($startoffile !~ /^(From )/) &&
173	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
174	return undef;
175	}
176
177
178	print $outhandle "EMAILPlug: processing $file\n"
179	if $self->{'verbosity'} > 1;
180
181	my $cursection = $doc_obj->get_top_section();
182
183	#
184	# Parse the document's text and extract metadata
185	#
186
187	# Protect backslashes
188	$$textref =~ s@\\@\\\\@g;
189
190	# Separate header from body of message
191	my $Headers = $$textref;
192	$Headers =~ s/\r?\n\r?\n(.*)$//s;
193	$$textref = $1;
194
195	# Unfold headers - see rfc822
196	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
197	# Extract basic metadata from header
198	my @headers = ("From", "To", "Subject", "Date");
199	my %raw;
200	foreach my $name (@headers) {
201	$raw{$name} = "No $name value";
202	}
203
204	# Get a default encoding for the header - RFC says should be ascii...
205	my $default_heading_encoding="iso_8859_1";
206
207	# We don't know what character set is the user's default...
208	# We could use textcat to guess... for now we'll look at mime content-type
209	# if ($Headers =~ /([[:^ascii:]])/) {
210	# }
211	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
212	$default_header_encoding=$1;
213	$default_header_encoding =~ s@\-@_@g;
214	$default_header_encoding =~ tr/A-Z/a-z/;
215	}
216
217
218	# Examine each line of the headers
219	my ($line, $name, $value);
220	my @parts;
221	foreach $line (split(/\n/, $Headers)) {
222
223	# Ignore lines with no content or which begin with whitespace
224	next unless ($line =~ /:/);
225	next if ($line =~ /^\s/);
226
227	# Find out what metadata is on this line
228	@parts = split(/:/, $line);
229	$name = shift @parts;
230	# get fieldname in canonical form - first cap, then lower case.
231	$name =~ tr/A-Z/a-z/;
232	# uppercase the first character according to the current locale
233	$name=~s/(.+)/\u$1/;
234	next unless $name;
235	next unless ($raw{$name});
236
237	# Find the value of that metadata
238	$value = join(":", @parts);
239	$value =~ s/^\s+//;
240	$value =~ s/\s+$//;
241	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
242	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
243	my $original_value=$value;
244	my $encoded=$value;
245	$value="";
246	# we should ignore spaces between consecutive encoded-texts
247	$encoded =~ s@\?=\s+=\?@\?==\?@g;
248	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=//i) {
249	my ($charset, $encoding, $data)=($2,$3,$4);
250	my ($decoded_data);
251	$value.="$1"; # any leading chars
252	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
253	chomp $data;
254	$encoding =~ tr/BQ/bq/;
255	if ($encoding eq "q") { # quoted printable
256	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
257	$decoded_data=qp_decode($data);
258	# qp_decode adds \n, which is default for body text
259	chomp($decoded_data);
260	} else { # base 64
261	$decoded_data=base64_decode($data);
262	}
263	$self->convert2unicode($charset, \$decoded_data);
264	$value .= $decoded_data;
265	} # end of while loop
266
267	# get any trailing characters
268	$self->convert2unicode($default_header_encoding, \$encoded);
269	$value.=$encoded;
270
271	if ($value =~ /^\s*$/) { # we couldn't extract anything...
272	$self->convert2unicode($default_header_encoding,
273	\$original_value);
274	$value=original_value;
275	}
276	} # end of if =?...?=
277
278	# In the absense of other charset information, assume the
279	# header is the default (usually "iso_8859_1") and convert it to unicode.
280	else {
281	$self->convert2unicode($default_header_encoding, \$value);
282	}
283
284	# Store the metadata
285	$raw{$name} = $value;
286	}
287
288	# Extract the name and e-mail address from the From metadata
289	$frommeta = $raw{"From"};
290	my $fromnamemeta;
291	my $fromaddrmeta;
292
293	$frommeta =~ s/\s*$//; # Remove trailing space, if any
294
295	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
296	$fromnamemeta=$1;
297	$fromaddrmeta=$2;
298	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
299	$fromnamemeta=$2;
300	$fromaddrmeta=$1;
301	}
302	if (!defined($fromaddrmeta)) {
303	$fromaddrmeta=$frommeta;
304	}
305	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
306	# minor attempt to prevent spam-bots from harvesting addresses...
307	$fromaddrmeta=~s/@/@/;
308	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
309
310	if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
311	$fromnamemeta =~ s/\"//g; # remove quotes
312	$fromnamemeta =~ s/\s+$//; # remove trailing whitespace
313	}
314	else {
315	$fromnamemeta = $fromaddrmeta;
316	}
317	# if name is an address
318	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
319	$fromnamemeta=~s/@/&#64\;/;
320	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
321
322	$raw{"From"}=$frommeta;
323
324	# Process Date information
325	if ($raw{"Date"} !~ /No Date/) {
326	$raw{"DateText"} = $raw{"Date"};
327
328	# Convert the date text to internal date format
329	$value = $raw{"Date"};
330	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
331	# make some assumptions about the year formatting...
332	# some (old) software thinks 2001 is 101, some think 2001 is 01
333	if ($year < 20) { $year += 2000; } # assume not really 1920...
334	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
335	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
336
337	} else {
338	# We have not extracted a date
339	$raw{"DateText"} = "Unknown.";
340	$raw{"Date"} = "19000000";
341	}
342
343	# Add extracted metadata to document object
344	foreach my $name (keys %raw) {
345	$value = $raw{$name};
346	if ($value) {
347	# assume subject, etc headers have no special HTML meaning.
348	$value = &text_into_html($value);
349	# escape [] so it isn't re-interpreted as metadata
350	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
351	} else {
352	$value = "No $name field";
353	}
354	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
355	}
356
357	my $mimetype="text/plain";
358	my $mimeinfo="";
359	my $charset = $default_header_encoding;
360	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
361	# more than one parameter given to Content-type.
362	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
363	if ($Headers =~ m@^content\-type:\s([\w\.\-/]+)\s(\;\s.+)?\s$@mi)
364	{
365	$mimetype=$1;
366	$mimetype =~ tr/[A-Z]/[a-z]/;
367
368	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
369	$mimetype = "text/plain";
370	}
371
372	$mimeinfo=$2;
373	if (!defined $mimeinfo) {
374	$mimeinfo="";
375	} else { # strip leading and trailing stuff
376	$mimeinfo =~ s/^\;\s*//;
377	$mimeinfo =~ s/\s*$//;
378	}
379	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
380	$charset = $1;
381	}
382	}
383
384	my $transfer_encoding="7bit";
385	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
386	$transfer_encoding=$1;
387	}
388	if ($mimetype eq "text/html") {
389	$$textref= $self->text_from_part("$Headers\n$$textref");
390	} elsif ($mimetype ne "text/plain") {
391	$self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
392	$$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
393	$outhandle);
394	} elsif ($transfer_encoding =~ /quoted\-printable/) {
395	$$textref=qp_decode($$textref);
396	$self->convert2unicode($charset, $textref);
397	} elsif ($transfer_encoding =~ /base64/) {
398	$$textref=base64_decode($$textref);
399	$self->convert2unicode($charset, $textref);
400	} else {
401	$self->convert2unicode($charset, $textref);
402	}
403
404
405	# Add "All headers" metadata
406	$Headers = &text_into_html($Headers);
407
408	$Headers = "No headers" unless ($Headers =~ /\w/);
409	$Headers =~ s/@/&#64\;/g;
410	# escape [] so it isn't re-interpreted as metadata
411	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
412	$self->convert2unicode($charset, \$Headers);
413	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
414
415
416	# Add Title metadata
417	my $Title = text_into_html($raw{'Subject'});
418	$Title .= "<br>From: " . text_into_html($raw{'From'});
419	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
420	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
421
422	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
423
424
425	# Add text to document object
426	if ($mimetype eq "text/plain") {
427	$$textref = &text_into_html($$textref);
428	}
429	$$textref = "No message" unless ($$textref =~ /\w/);
430	$doc_obj->add_utf8_text($cursection, $$textref);
431
432	return 1;
433	}
434
435
436	# Convert a text string into HTML.
437	#
438	# The HTML is going to be inserted into a GML file, so
439	# we have to be careful not to use symbols like ">",
440	# which ocurs frequently in email messages (and use
441	# &gt instead.
442	#
443	# This function also turns links and email addresses into hyperlinks,
444	# and replaces carriage returns with <BR> tags (and multiple carriage
445	# returns with <P> tags).
446
447
448	sub text_into_html {
449	my ($text) = @_;
450
451	# Convert problem characters into HTML symbols
452	$text =~ s/&/&/g;
453	$text =~ s/</</g;
454	$text =~ s/>/>/g;
455	$text =~ s/\"/"/g;
456
457	$text =~ s@_@\\_@g; # protect against greenstone macros...
458
459	# convert email addresses and URIs into links
460	# don't markup email addresses for now
461	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
462
463	# try to munge email addresses a little bit...
464	$text =~ s/@/@/;
465	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
466	# assume URI doesn't finish with a '.'
467	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.)?[\w\?\=\-_/~]+)(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@g;
468
469
470	# Clean up whitespace and convert \n charaters to <BR> or <P>
471	$text =~ s/ +/ /g;
472	$text =~ s/\s*$//g;
473	$text =~ s/^\s*//g;
474	$text =~ s/\n/\n<br>/g;
475	$text =~ s/<br>\s*<br>/<p>/gi;
476
477	return $text;
478	}
479
480
481
482
483	#Process a MIME message.
484	# the textref we are given DOES NOT include the header.
485	sub text_from_mime_message {
486	my $self = shift(@_);
487	my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
488
489	# Check for multiparts - $mimeinfo will be a boundary
490	if ($mimetype =~ /multipart/) {
491	$boundary="";
492	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
493	$boundary=$1;
494	if ($boundary =~ m@^\"@) {
495	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
496	}
497	} else {
498	print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
499	}
500	# parts start with "--$boundary"
501	# message ends with "--$boundary--"
502	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
503	# that perl might want to interpolate. Also allows spaces...
504	$boundary=~s/\\/\\\\/g;
505	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
506	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
507	# remove first "part" and last "part" (final --)
508	shift @message_parts;
509	my $last=pop @message_parts;
510	# if our boundaries are a bit dodgy and we only found 1 part...
511	if (!defined($last)) {$last="";}
512	# make sure it is only -- and whitespace
513	if ($last !~ /^\-\-\s*$/ms) {
514	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
515	}
516	foreach my $message_part (@message_parts) {
517	# remove the leading newline left from split.
518	$message_part=~s/^\r?\n//;
519	}
520	if ($mimetype eq "multipart/alternative") {
521	# check for an HTML version first, then TEXT, otherwise use first.
522	my $part_text="";
523	foreach my $message_part (@message_parts) {
524	if ($message_part =~ m@\scontent\-type:\stext/html@mis)
525	{
526	# Use the HTML version
527	$part_text= $self->text_from_part($message_part);
528	$mimetype="text/html";
529	last;
530	}
531	}
532	if ($part_text eq "") { # try getting a text part instead
533	foreach my $message_part (@message_parts) {
534	if ($message_part =~ m@^content\-type:\s*text/plain@mis)
535	{
536	# Use the plain version
537	$part_text= $self->text_from_part($message_part);
538	if ($part_text =~/[^\s]/) {
539	$part_text = text_into_html($part_text);
540	}
541	$mimetype="text/plain";
542	last;
543	}
544	}
545	}
546	if ($part_text eq "") { #use first part (no html/text part found)
547	$part_text = $self->text_from_part(shift @message_parts);
548	$part_text = text_into_html($part_text);
549	}
550	if ($part_text eq "") { # we couldn't get anything!!!
551	# or it was an empty message...
552	# do nothing...
553	print $outhandle "EMAILPlug: no text - empty body?\n";
554	} else {
555	$text = $part_text;
556	}
557	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related\|signed)@) {
558	$text = "";
559	# signed is for PGP/GPG messages... the last part is a hash
560	if ($mimetype =~ m@multipart/signed@) {
561	pop @message_parts;
562	}
563	foreach my $message_part (@message_parts) {
564	my $part_header=$message_part;
565	my $part_body;
566	if ($message_part=~ /^\s*\n/) {
567	# no header... use defaults
568	$part_body=$message_part;
569	$part_header="Content-type: text/plain; charset=us-ascii";
570	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
571	$part_body=$1;
572	} else {
573	# something's gone wrong...
574	$part_header="";
575	$part_body=$message_part;
576	}
577
578	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
579	my $part_content_type="";
580	my $part_content_info="";
581	if ($mimetype eq "multipart/digest") {
582	# default type - RTFRFC!!
583	$part_content_type="message/rfc822";
584	}
585	if ($part_header =~ m@^content\-type:\s([\w\.\-/]+)\s\;?\s(.?)\s*$@mi) {
586	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
587	$part_content_info=$2;
588	}
589	my $filename="";
590	if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
591	$filename=$1;
592	}
593
594	# disposition - either inline or attachment.
595	# NOT CURRENTLY USED - we display all text types instead...
596	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
597
598	# add <<attachment>> to each part except the first...
599	if ($text ne "") {
600	$text.="\n<p><hr><strong><<attachment>></>";
601	# add part info header
602	$text.="<br>Type: $part_content_type<br>\n";
603	if ($filename ne "") {
604	$text.="Filename: $filename\n";
605	}
606	$text.="</strong></p>\n";
607	}
608	if ($part_content_type =~ m@text/@)
609	{
610	my $part_text= $self->text_from_part($message_part);
611	if ($part_content_type !~ m@text/(ht\|x)ml@) {
612	$part_text = text_into_html($part_text);
613	}
614	if ($part_text eq "") {
615	$part_text = '<<empty message>>';
616	}
617	$text .= $part_text;
618	} elsif ($part_content_type =~ m@message/rfc822@) {
619	# This is a forwarded message
620	my $message_part_headers=$part_body;
621	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
622	my $message_part_body=$1;
623	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
624
625	my $rfc822_formatted_body=""; # put result in here
626	if ($message_part_headers =~
627	/^content\-type:\s([\w\.\-\/]+)\s\;?\s(.?)\s*$/ims)
628	{
629	# The message header uses MIME flags
630	my $message_content_type=$1;
631	my $message_content_info=$2;
632	if (!defined($message_content_info)) {
633	$message_content_info="";
634	}
635	$message_content_type =~ tr/A-Z/a-z/;
636	if ($message_content_type =~ /multipart/) {
637	$rfc822_formatted_body=
638	$self->text_from_mime_message($message_content_type,
639	$message_content_info,
640	$message_part_body,
641	$outhandle);
642	} else {
643	$message_part_body= $self->text_from_part($part_body);
644	$rfc822_formatted_body=text_into_html($message_part_body);
645	}
646	} else {
647	# message doesn't use MIME flags
648	$rfc822_formatted_body=text_into_html($message_part_body);
649	}
650	# Add the returned text to the output
651	# don't put all the headers...
652	# $message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
653	my $brief_headers=get_brief_headers($message_part_headers);
654	$text.=text_into_html($brief_headers);
655	$text.="<p>\n";
656	$text.=$rfc822_formatted_body;
657	$text.="</p>\n";
658	# end of message/rfc822
659	} elsif ($part_content_type =~ /multipart/) {
660	# recurse again
661
662	$tmptext= $self->text_from_mime_message($part_content_type,
663	$part_content_info,
664	$part_body,
665	$outhandle);
666	$text.=$tmptext;
667	} else {
668	# this part isn't text/* or another message...
669	if ($text eq "") {
670	# this is the first part of a multipart, or only part!
671	$text="\n<p><hr><strong><<attachment>></>";
672	# add part info header
673	$text.="<br>Type: $part_content_type<br>\n";
674	$text.="Filename: $filename</strong></p>\n";
675	}
676
677	# save attachment by default
678	if (!$self->{'ignore_attachments'}
679	&& $filename ne "") { # this part has a file...
680	my $encoding="8bit";
681	if ($part_header =~
682	/^content-transfer-encoding:\s*(\w+)/mi ) {
683	$encoding=$1; $encoding =~ tr/A-Z/a-z/;
684	}
685	my $tmpdir=$ENV{'GSDLHOME'} . "/tmp";
686	my $save_filename="$filename";
687
688	# make sure we don't clobber files with same name;
689	# need to keep state between .mbx files
690	my $assoc_files=$self->{'assoc_filenames'};
691	if ($assoc_files{$filename}) { # it's been set...
692	$assoc_files{$filename}++;
693	$filename =~ m/(.+)\.(\w+)$/;
694	my ($filestem, $ext)=($1,$2);
695	$save_filename="${filestem}_"
696	. $assoc_files{$filename} . ".$ext";
697	} else { # first file with this name
698	$assoc_files{$filename}=1;
699	}
700	open (SAVE, ">$tmpdir/$save_filename") \|\|
701	warn "EMAILPlug: Can't save attachment as $tmpdir/$save_filename: $!";
702	$part_text = $message_part;
703	$part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
704	if ($encoding eq "base64") {
705	print SAVE base64_decode($part_text);
706	} elsif ($encoding eq "quoted-printable") {
707	print SAVE qp_decode($part_text);
708	} else { # 7bit, 8bit, binary, etc...
709	print SAVE $part_text;
710	}
711	close SAVE;
712	my $doc_obj=$self->{'doc_obj'};
713	$doc_obj->associate_file("$tmpdir/$save_filename",
714	"$save_filename",
715	$part_content_type # mimetype
716	);
717	# clean up tmp area...
718	# Can't do this as it hasn't been copied/linked yet!!!
719	# &util::rm("$tmpdir/$save_filename");
720	print $outhandle "EMAILPlug: saving attachment \"$filename\"\n"; #
721
722	# be nice if "download" was a translatable macro :(
723	$text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
724	} # end of save attachment
725	} # end of !text/message part
726	} # foreach message part.
727	} else {
728	# we can't handle this multipart type (not mixed or alternative)
729	# the RFC also mentions "parallel".
730	}
731	} # end of ($mimetype =~ multipart)
732	elsif ($mimetype =~ m@message/rfc822@) {
733	my $msg_header = $text;
734	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
735	$text = $1;
736
737	if ($msg_header =~ /^content\-type:\s([\w\.\-\/]+)\s\;?\s(.+?)\s$/mi)
738	{
739	$mimetype=$1;
740	$mimetype =~ tr/[A-Z]/[a-z]/;
741	$mimeinfo=$2;
742	if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
743	$charset = $1;
744	}
745	my $msg_text;
746	if ($mimetype =~ m@multipart/@) {
747	$msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
748	$text, $outhandle);
749	} else {$msg_text=$text;}
750
751	my $brief_header=text_into_html(get_brief_headers($msg_header));
752	$text= "\n<b><<attached message>></b><br>";
753	$text.= "<table><tr><td width=\"5%\"> </td>\n";
754	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
755	. "</td></tr></table>";
756	}
757	} else {
758	# we don't do any processing of the content.
759	}
760
761	return $text;
762	}
763
764
765	# Return only the "important" headers from a set of message headers
766	sub get_brief_headers {
767	my $msg_header = shift;
768	my $brief_header = "";
769
770	# Order matters!
771	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
772	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
773	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
774	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
775	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
776
777	return $brief_header;
778	}
779
780
781	# Process a MIME part. Return "" if we can't decode it.
782	sub text_from_part {
783	my $self = shift;
784	my $text = shift \|\| '';
785	my $part_header = $text;
786
787	# check for empty part header (leading blank line)
788	if ($text =~ /^\s*\r?\n/) {
789	$part_header="Content-type: text/plain; charset=us-ascii";
790	} else {
791	$part_header =~ s/\r?\n\r?\n(.*)$//s;
792	$text=$1; if (!defined($text)) {$text="";}
793	}
794	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
795	$part_header =~ /content\-type:\s([\w\.\-\/]+).?charset=\"?([^\;\"\s]+)\"?/is;
796	my $type=$1;
797	my $charset=$2;
798	if (!defined($type)) {$type="";}
799	if (!defined($charset)) {$charset="ascii";}
800	my $encoding="";
801	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
802	$encoding=$1; $encoding=~tr/A-Z/a-z/;
803	}
804	# Content-Transfer-Encoding is per-part
805	if ($encoding ne "") {
806	if ($encoding =~ /quoted\-printable/) {
807	$text=qp_decode($text);
808	} elsif ($encoding =~ /base64/) {
809	$text=base64_decode($text);
810	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
811	# rfc2045 also allows binary, which we ignore (for now).
812	my $outhandle=$self->{'outhandle'};
813	print $outhandle "EMAILPlug: unknown transfer encoding: $encoding\n";
814	return "";
815	}
816	}
817	if ($type eq "text/html") {
818	# only get stuff between <body> tags, or <html> tags.
819	$text =~ s@^.<html[^>]>@@is;
820	$text =~ s@</html>.*$@@is;
821	$text =~ s/^.?<body[^>]>//si;
822	$text =~ s/<\/body>.*$//si;
823	}
824	elsif ($type eq "text/xml") {
825	$text=~s/</</g;$text=~s/>/>/g;
826	$text="<pre>\n$text\n</pre>\n";
827	}
828	# convert to unicode
829	$self->convert2unicode($charset, \$text);
830	return $text;
831	}
832
833
834	# decode quoted-printable text
835	sub qp_decode {
836	my $text=shift;
837
838	# if a line ends with "=\s*", it is a soft line break, otherwise
839	# keep in any newline characters.
840
841	$text =~ s/=\s*\r?\n//mg;
842	$text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
843	return $text;
844	}
845
846	# decode base64 text. This is fairly slow (since it's interpreted perl rather
847	# than compiled XS stuff like in the ::MIME modules, but this is more portable
848	# for us at least).
849	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
850	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
851	# from each encoded byte.
852
853
854	sub base64_decode {
855	my $enc_text = shift;
856	# A=>0, B=>1, ..., '+'=>62, '/'=>63
857	# also '=' is used for padding at the end, but we remove it anyway.
858	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
859	# map each MIME char into it's value, for more efficient lookup.
860	my %index;
861	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
862	# remove all non-base64 chars. eval to get variable in transliteration...
863	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
864	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
865	my $decoded="";
866	while (length ($enc_text)>3)
867	{
868	my $fourchars=substr($enc_text,0,4,"");
869	my @chars=(split '',$fourchars);
870	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
871	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
872	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
873	}
874	# if there are any input chars left, there are either
875	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
876	my @chars=(split '',$enc_text);
877	if (length($enc_text)) {
878	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
879	}
880	if (length($enc_text)==3) {
881	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
882	}
883	return $decoded;
884	}
885
886	sub convert2unicode {
887	my $self = shift(@_);
888	my ($charset, $textref) = @_;
889
890	# first get our character encoding name in the right form.
891	$charset = "iso_8859_1" unless defined $charset;
892	$charset=~tr/A-Z/a-z/;
893	$charset=~s/\-/_/g;
894	$charset=~s/gb2312/gb/;
895	# assumes EUC-KR, not ISO-2022 !?
896	$charset=~s/ks_c_5601_1987/korean/;
897
898	if ($charset eq "utf_8") {
899	# nothing to do!
900	return;
901	}
902
903	# It appears that we can't always trust ascii text so we'll treat it
904	# as iso-8859-1 (letting characters above 0x80 through without
905	# converting them to utf-8 will result in invalid XML documents
906	# which can't be parsed at build time).
907	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
908
909	if ($charset eq "iso_8859_1") {
910	# test if the mailer lied, and it has win1252 chars in it...
911	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
912	if ($$textref =~ m/[\x80-\x9f]/) {
913	my $outhandle = $self->{'outhandle'};
914	print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
915	print $outhandle "codepage 1252 detected.\n";
916	$charset = "windows_1252";
917	}
918	}
919	my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
920
921	if ($utf8_text ne "") {
922	$$textref=$utf8_text;
923	} else {
924	# we didn't get any text... unsupported encoding perhaps? Or it is
925	# empty anyway. We'll try to continue, assuming 8859-1. We could strip
926	# characters out here if this causes problems...
927	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
928
929	}
930	}
931
932
933	# Perl packages have to return true if they are run.
934	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: