Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 7703

Last change on this file since 7703 was 7703, checked in by jrm21, 20 years ago

1) use the email's message ID instead of document hash for Identifier.

2) if a message claims to be utf8, actually check it for bad chars.

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 34.1 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999-2002 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata (not Dublin Core!):
42	# $Headers All the header content (optional, not stored by default)
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header
46	# $FromName Name of sender (where available)
47	# $FromAddr E-mail address of sender
48	# $DateText Date: header
49	# $Date Date: header in GSDL format (eg: 19990924)
50	#
51	# $Title made up of Subject, Date and Sender (for default formatting)
52	#
53	#
54	# John McPherson - June/July 2001
55	# added (basic) MIME support and quoted-printable and base64 decodings.
56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
57	#
58	# See: * RFC 822 - ARPA Internet Text Messages
59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61	# * RFC 2047 - MIME (part 3) Message Header Extensions
62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
63
64	# 12/05/02 Added usage datastructure - John Thompson
65
66	package EMAILPlug;
67
68	use SplitPlug;
69
70	use unicode;
71
72	use sorttools;
73	use util;
74
75
76	# EMAILPlug is a sub-class of SplitPlug.
77
78	sub BEGIN {
79	@ISA = ('SplitPlug');
80	}
81
82	use strict;
83	no strict "refs"; # so we can use a variable as a filehandle for print $out
84
85	my $arguments =
86	[ { 'name' => "process_exp",
87	'desc' => "{BasPlug.process_exp}",
88	'type' => "regexp",
89	'reqd' => "no",
90	'deft' => &get_default_process_exp() },
91	{ 'name' => "no_attachments",
92	'desc' => "{EMAILPlug.no_attachments}",
93	'type' => "flag",
94	'reqd' => "no" },
95	{ 'name' => "headers",
96	'desc' => "{EMAILPlug.headers}",
97	'type' => "flag",
98	'reqd' => "no" },
99	{ 'name' => "split_exp",
100	'desc' => "{EMAILPlug.split_exp}",
101	'type' => "regexp",
102	'reqd' => "no",
103	'deft' => &get_default_split_exp() }
104	];
105
106	my $options = { 'name' => "EMAILPlug",
107	'desc' => "{EMAILPlug.desc}",
108	'abstract' => "no",
109	'inherits' => "yes",
110	'args' => $arguments };
111
112	# Create a new EMAILPlug object with which to parse a file.
113	# Accomplished by creating a new BasPlug and using bless to
114	# turn it into an EMAILPlug.
115
116	sub new {
117	my ($class) = @_;
118	my $self = new BasPlug ($class, @_);
119	$self->{'plugin_type'} = "EMAILPlug";
120	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
121	my $option_list = $self->{'option_list'};
122	push( @{$option_list}, $options );
123
124	if (!parsargv::parse(\@_,
125	q^split_exp/.*/^, \$self->{'split_exp'},
126	q^no_attachments^, \$self->{'ignore_attachments'},
127	q^headers^, \$self->{'header_metadata'},
128	"allow_extra_options")) {
129	print STDERR "\nIncorrect options passed to $class.";
130	print STDERR "\nCheck your collect.cfg configuration file\n";
131	$self->print_txt_usage(""); # Use default resource bundle
132	die "\n";
133	}
134	$self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
135
136	# this might not actually be true at read-time, but after processing
137	# it should all be utf8.
138	$self->{'input_encoding'}="utf8";
139	return bless $self, $class;
140	}
141
142	sub get_default_process_exp {
143	my $self = shift (@_);
144	# mbx/email for mailbox file format, \d+ for maildir (each message is
145	# in a separate file, with a unique number for filename)
146	# mozilla and IE will save individual mbx format files with a ".eml" ext.
147	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
148	}
149
150	# This plugin splits the mbox mail files at lines starting with From<sp>
151	# It is supposed to be "\n\nFrom ", but this isn't always used.
152	sub get_default_split_exp {
153	return q^\nFrom .*\n^;
154
155	}
156
157
158	# do plugin specific processing of doc_obj
159	sub process {
160
161	my $self = shift (@_);
162	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
163	my $outhandle = $self->{'outhandle'};
164
165	# Check that we're dealing with a valid mail file
166	# mbox message files start with "From "
167	# maildir messages usually start with Return-Path and Delivered-To
168	# mh is very similar to maildir
169	my $startoffile=substr($$textref,0,256);
170	if (($startoffile !~ /^(From )/) &&
171	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
172	return undef;
173	}
174
175
176	print STDERR "<Processing n='$file' p='EMAILPlug'>\n" if ($gli);
177
178	print $outhandle "EMAILPlug: processing $file\n"
179	if $self->{'verbosity'} > 1;
180
181	my $cursection = $doc_obj->get_top_section();
182
183	#
184	# Parse the document's text and extract metadata
185	#
186
187	# Protect backslashes
188	$$textref =~ s@\\@\\\\@g;
189
190	# Separate header from body of message
191	my $Headers = $$textref;
192	$Headers =~ s/\r?\n\r?\n(.*)$//s;
193	$$textref = $1;
194	$Headers .= "\n";
195
196	# Unfold headers - see rfc822
197	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
198	# Extract basic metadata from header
199	my @headers = ("From", "To", "Subject", "Date");
200	my %raw;
201	foreach my $name (@headers) {
202	$raw{$name} = "No $name value";
203	}
204
205	# Get a default encoding for the header - RFC says should be ascii...
206	my $default_header_encoding="iso_8859_1";
207
208	# We don't know what character set is the user's default...
209	# We could use textcat to guess... for now we'll look at mime content-type
210	# if ($Headers =~ /([[:^ascii:]])/) {
211	# }
212	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
213	$default_header_encoding=$1;
214	$default_header_encoding =~ s@\-@_@g;
215	$default_header_encoding =~ tr/A-Z/a-z/;
216	}
217
218
219	# Examine each line of the headers
220	my ($line, $name, $value);
221	my @parts;
222	foreach $line (split(/\n/, $Headers)) {
223
224	# Ignore lines with no content or which begin with whitespace
225	next unless ($line =~ /:/);
226	next if ($line =~ /^\s/);
227
228	# Find out what metadata is on this line
229	@parts = split(/:/, $line);
230	$name = shift @parts;
231	# get fieldname in canonical form - first cap, then lower case.
232	$name =~ tr/A-Z/a-z/;
233	# uppercase the first character according to the current locale
234	$name=~s/(.+)/\u$1/;
235	next unless $name;
236	next unless ($raw{$name});
237
238	# Find the value of that metadata
239	$value = join(":", @parts);
240	$value =~ s/^\s+//;
241	$value =~ s/\s+$//;
242	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
243	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
244	my $original_value=$value;
245	my $encoded=$value;
246	$value="";
247	# we should ignore spaces between consecutive encoded-texts
248	$encoded =~ s@\?=\s+=\?@\?==\?@g;
249	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=//i) {
250	my ($charset, $encoding, $data)=($2,$3,$4);
251	my ($decoded_data);
252	$value.="$1"; # any leading chars
253	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
254	chomp $data;
255	$encoding =~ tr/BQ/bq/;
256	if ($encoding eq "q") { # quoted printable
257	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
258	$decoded_data=qp_decode($data);
259	# qp_decode adds \n, which is default for body text
260	chomp($decoded_data);
261	} else { # base 64
262	$decoded_data=base64_decode($data);
263	}
264	$self->convert2unicode($charset, \$decoded_data);
265	$value .= $decoded_data;
266	} # end of while loop
267
268	# get any trailing characters
269	$self->convert2unicode($default_header_encoding, \$encoded);
270	$value.=$encoded;
271
272	if ($value =~ /^\s*$/) { # we couldn't extract anything...
273	$self->convert2unicode($default_header_encoding,
274	\$original_value);
275	$value=$original_value;
276	}
277	} # end of if =?...?=
278
279	# In the absense of other charset information, assume the
280	# header is the default (usually "iso_8859_1") and convert to unicode.
281	else {
282	$self->convert2unicode($default_header_encoding, \$value);
283	}
284
285	# Store the metadata
286	$value =~ s@_@\\_@g; # protect against GS macro language
287	$raw{$name} = $value;
288	}
289
290	# Extract the name and e-mail address from the From metadata
291	my $frommeta = $raw{"From"};
292	my $fromnamemeta;
293	my $fromaddrmeta;
294
295	$frommeta =~ s/\s*$//; # Remove trailing space, if any
296
297	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
298	$fromnamemeta=$1;
299	$fromaddrmeta=$2;
300	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
301	$fromnamemeta=$2;
302	$fromaddrmeta=$1;
303	}
304	if (!defined($fromaddrmeta)) {
305	$fromaddrmeta=$frommeta;
306	}
307	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
308	# minor attempt to prevent spam-bots from harvesting addresses...
309	$fromaddrmeta=~s/@/@/;
310
311	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
312
313	if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
314	$fromnamemeta =~ s/\"//g; # remove quotes
315	$fromnamemeta =~ s/\s+$//; # remove trailing whitespace
316	}
317	else {
318	$fromnamemeta = $fromaddrmeta;
319	}
320	# if name is an address
321	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
322	$fromnamemeta=~s/@/&#64\;/;
323	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
324
325	$raw{"From"}=$frommeta;
326
327	# Process Date information
328	if ($raw{"Date"} !~ /No Date/) {
329	$raw{"DateText"} = $raw{"Date"};
330
331	# Convert the date text to internal date format
332	$value = $raw{"Date"};
333	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
334	# make some assumptions about the year formatting...
335	# some (old) software thinks 2001 is 101, some think 2001 is 01
336	if ($year < 20) { $year += 2000; } # assume not really 1920...
337	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
338	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
339
340	} else {
341	# We have not extracted a date
342	$raw{"DateText"} = "Unknown.";
343	$raw{"Date"} = "19000000";
344	}
345
346	# Add extracted metadata to document object
347	foreach my $name (keys %raw) {
348	$value = $raw{$name};
349	if ($value) {
350	# assume subject, etc headers have no special HTML meaning.
351	$value = &text_into_html($value);
352	# escape [] so it isn't re-interpreted as metadata
353	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
354	} else {
355	$value = "No $name field";
356	}
357	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
358	}
359
360
361	# extract a message ID from the headers, if there is one, and we'll use
362	# that as the greenstone doc ID. Having a predictable ID means we can
363	# link to other messages, eg from In-Reply-To or References headers...
364	if ($Headers =~ m@^Message-ID:(.+)$@mi) {
365	my $id=escape_msg_id($1);
366	$doc_obj->{'msgid'}=$id;
367	}
368	# link to another message, if this is a reply
369	if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
370	my $id=escape_msg_id($1);
371	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
372	} elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
373	# References can have multiple, get the last one
374	my $id=escape_msg_id($1);
375	# not necessarily in-reply-to, but same thread...
376	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
377	}
378
379
380
381	my $mimetype="text/plain";
382	my $mimeinfo="";
383	my $charset = $default_header_encoding;
384	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
385	# more than one parameter given to Content-type.
386	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
387	if ($Headers =~ m@^content\-type:\s([\w\.\-/]+)\s(\;\s.+)?\s$@mi)
388	{
389	$mimetype=$1;
390	$mimetype =~ tr/[A-Z]/[a-z]/;
391
392	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
393	$mimetype = "text/plain";
394	}
395
396	$mimeinfo=$2;
397	if (!defined $mimeinfo) {
398	$mimeinfo="";
399	} else { # strip leading and trailing stuff
400	$mimeinfo =~ s/^\;\s*//;
401	$mimeinfo =~ s/\s*$//;
402	}
403	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
404	$charset = $1;
405	}
406	}
407
408	my $transfer_encoding="7bit";
409	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
410	$transfer_encoding=$1;
411	}
412
413	if ($mimetype eq "text/html") {
414	$$textref= $self->text_from_part("$Headers\n$$textref");
415	} elsif ($mimetype ne "text/plain") {
416	$self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
417	$$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$$textref);
418	} else { # mimetype eq text/plain
419	$$textref = &text_into_html($$textref);
420	$$textref =~ s@_@\\_@g; # protect against GS macro language
421
422	if ($transfer_encoding =~ /quoted\-printable/) {
423	$$textref=qp_decode($$textref);
424	} elsif ($transfer_encoding =~ /base64/) {
425	$$textref=base64_decode($$textref);
426	}
427	$self->convert2unicode($charset, $textref);
428	}
429
430
431	if ($self->{'header_metadata'} && $self->{'header_metadata'} == 1) {
432	# Add "All headers" metadata
433	$Headers = &text_into_html($Headers);
434
435	$Headers = "No headers" unless ($Headers =~ /\w/);
436	$Headers =~ s/@/&#64\;/g;
437	# escape [] so it isn't re-interpreted as metadata
438	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
439	$self->convert2unicode($charset, \$Headers);
440
441	$Headers =~ s@_@\\_@g; # protect against GS macro language
442	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
443	}
444
445
446	# Add Title metadata
447	my $Title = text_into_html($raw{'Subject'});
448	$Title .= "<br>From: " . text_into_html($fromnamemeta);
449	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
450	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
451
452	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
453
454
455	# Add text to document object
456	$$textref = "No message" unless ($$textref =~ /\w/);
457
458	$doc_obj->add_utf8_text($cursection, $$textref);
459
460	return 1;
461	}
462
463
464	# Convert a text string into HTML.
465	#
466	# The HTML is going to be inserted into a GML file, so
467	# we have to be careful not to use symbols like ">",
468	# which ocurs frequently in email messages (and use
469	# &gt instead.
470	#
471	# This function also turns links and email addresses into hyperlinks,
472	# and replaces carriage returns with <BR> tags (and multiple carriage
473	# returns with <P> tags).
474
475
476	sub text_into_html {
477	my ($text) = @_;
478
479	# Convert problem characters into HTML symbols
480	$text =~ s/&/&/g;
481	$text =~ s/</</g;
482	$text =~ s/>/>/g;
483	$text =~ s/\"/"/g;
484
485	# convert email addresses and URIs into links
486	# don't markup email addresses for now
487	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
488
489	# try to munge email addresses a little bit...
490	$text =~ s/@/@/;
491	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
492	# assume URI doesn't finish with a '.'
493	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.)?[\w\?\=\-_/~]+)(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@g;
494
495
496	# Clean up whitespace and convert \n charaters to <BR> or <P>
497	$text =~ s/ +/ /g;
498	$text =~ s/\s*$//g;
499	$text =~ s/^\s*//g;
500	$text =~ s/\n/\n<br>/g;
501	$text =~ s/<br>\s*<br>/<p>/gi;
502
503	return $text;
504	}
505
506
507
508
509	#Process a MIME message.
510	# the textref we are given DOES NOT include the header.
511	sub text_from_mime_message {
512	my $self = shift(@_);
513	my ($mimetype,$mimeinfo,$text)=(@_);
514	my $outhandle=$self->{'outhandle'};
515	# Check for multiparts - $mimeinfo will be a boundary
516	if ($mimetype =~ /multipart/) {
517	my $boundary="";
518	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
519	$boundary=$1;
520	if ($boundary =~ m@^\"@) {
521	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
522	}
523	} else {
524	print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
525	}
526	# parts start with "--$boundary"
527	# message ends with "--$boundary--"
528	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
529	# that perl might want to interpolate. Also allows spaces...
530	$boundary=~s/\\/\\\\/g;
531	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
532	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
533	# remove first "part" and last "part" (final --)
534	shift @message_parts;
535	my $last=pop @message_parts;
536	# if our boundaries are a bit dodgy and we only found 1 part...
537	if (!defined($last)) {$last="";}
538	# make sure it is only -- and whitespace
539	if ($last !~ /^\-\-\s*$/ms) {
540	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
541	}
542	foreach my $message_part (@message_parts) {
543	# remove the leading newline left from split.
544	$message_part=~s/^\r?\n//;
545	}
546	if ($mimetype eq "multipart/alternative") {
547	# check for an HTML version first, then TEXT, otherwise use first.
548	my $part_text="";
549	foreach my $message_part (@message_parts) {
550	if ($message_part =~ m@\scontent\-type:\stext/html@mis)
551	{
552	# Use the HTML version
553	$part_text= $self->text_from_part($message_part);
554	$mimetype="text/html";
555	last;
556	}
557	}
558	if ($part_text eq "") { # try getting a text part instead
559	foreach my $message_part (@message_parts) {
560	if ($message_part =~ m@^content\-type:\s*text/plain@mis)
561	{
562	# Use the plain version
563	$part_text= $self->text_from_part($message_part);
564	if ($part_text =~/[^\s]/) {
565	$part_text = text_into_html($part_text);
566	}
567	$mimetype="text/plain";
568	last;
569	}
570	}
571	}
572	if ($part_text eq "") { #use first part (no html/text part found)
573	$part_text = $self->text_from_part(shift @message_parts);
574	$part_text = text_into_html($part_text);
575	}
576	if ($part_text eq "") { # we couldn't get anything!!!
577	# or it was an empty message...
578	# do nothing...
579	print $outhandle "EMAILPlug: no text - empty body?\n";
580	} else {
581	$text = $part_text;
582	}
583	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related\|signed)@) {
584	$text = "";
585	# signed is for PGP/GPG messages... the last part is a hash
586	if ($mimetype =~ m@multipart/signed@) {
587	pop @message_parts;
588	}
589	my $is_first_part=1;
590	foreach my $message_part (@message_parts) {
591	if ($is_first_part && $text ne "") {$is_first_part=0;}
592
593	if ($mimetype eq "multipart/digest") {
594	# default type - RTFRFC!! Set if not already set
595	$message_part =~ m@^(.*)\n\r?\n@s;
596	my $part_header=$1;
597	if ($part_header !~ m@^content-type@mi) {
598	$message_part="Content-type: message/rfc822\n"
599	. $message_part; # prepend default type
600	}
601	}
602
603	$text .= $self->process_multipart_part($message_part,
604	$is_first_part);
605	} # foreach message part.
606	} else {
607	# we can't handle this multipart type (not mixed or alternative)
608	# the RFC also mentions "parallel".
609	}
610	} # end of ($mimetype =~ multipart)
611	elsif ($mimetype =~ m@message/rfc822@) {
612	my $msg_header = $text;
613	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
614	$text = $1;
615
616	if ($msg_header =~ /^content\-type:\s([\w\.\-\/]+)\s\;?\s(.+?)\s$/mi)
617	{
618	$mimetype=$1;
619	$mimetype =~ tr/[A-Z]/[a-z]/;
620	$mimeinfo=$2;
621	#if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
622	# $charset = $1;
623	#}
624	my $msg_text;
625	if ($mimetype =~ m@multipart/@) {
626	$msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
627	$text);
628	} else {$msg_text=text_from_part($text);}
629
630	my $brief_header=text_into_html(get_brief_headers($msg_header));
631	$text= "\n<b><<attached message>></b><br>";
632	$text.= "<table><tr><td width=\"5%\"> </td>\n";
633	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
634	. "</td></tr></table>";
635	}
636	} else {
637	# we don't do any processing of the content.
638	}
639
640	return $text;
641	}
642
643
644
645	# used for turning a message id into a more friendly string for greenstone
646	sub escape_msg_id {
647	#msgid
648	my $id=shift;
649	chomp $id; $id =~ s!\s!!g; # remove spaces
650	$id =~ s![<>\[\]]!!g; # remove [ ] < and >
651	$id =~ s![_&]!-!g; # replace symbols that might cause problems
652	$id =~ s!@!-!g; # replace @ symbol, to avoid spambots
653	return $id;
654	}
655
656
657
658	sub process_multipart_part {
659	my $self = shift;
660	my $message_part = shift;
661	my $is_first_part = shift;
662
663	my $return_text="";
664	my $part_header=$message_part;
665	my $part_body;
666	if ($message_part=~ /^\s*\n/) {
667	# no header... use defaults
668	$part_body=$message_part;
669	$part_header="Content-type: text/plain; charset=us-ascii";
670	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
671	$part_body=$1;
672	} else {
673	# something's gone wrong...
674	$part_header="";
675	$part_body=$message_part;
676	}
677
678	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
679	my $part_content_type="";
680	my $part_content_info="";
681
682	if ($part_header =~ m@^content\-type:\s([\w\.\-/]+)\s(\;.*)?$@mi) {
683	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
684	$part_content_info=$2;
685	if (!defined($part_content_info)) {
686	$part_content_info="";
687	} else {
688	$part_content_info =~ s/^\;\s*//;
689	$part_content_info =~ s/\s*$//;
690	}
691	}
692	my $filename="";
693	if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
694	$filename=$1;
695	$filename =~ s@\r?\s*$@@; # remove trailing space, if any
696	}
697
698	# disposition - either inline or attachment.
699	# NOT CURRENTLY USED - we display all text types instead...
700	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
701
702	# add <<attachment>> to each part except the first...
703	if (!$is_first_part) {
704	$return_text.="\n<p><hr><strong><<attachment>>";
705	# add part info header
706	my $header_text="<br>Type: $part_content_type<br>\n";
707	if ($filename ne "") {
708	$header_text.="Filename: $filename\n";
709	}
710	$header_text =~ s@_@\\_@g;
711	$return_text.=$header_text . "</strong></p>\n<p>\n";
712	}
713
714	if ($part_content_type =~ m@text/@)
715	{
716	my $part_text= $self->text_from_part($message_part);
717	if ($part_content_type !~ m@text/(ht\|x)ml@) {
718	$part_text = text_into_html($part_text);
719	}
720	if ($part_text eq "") {
721	$part_text = ' ';
722	}
723	$return_text .= $part_text;
724	} elsif ($part_content_type =~ m@message/rfc822@) {
725	# This is a forwarded message
726	my $message_part_headers=$part_body;
727	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
728	my $message_part_body=$1;
729	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
730
731	my $rfc822_formatted_body=""; # put result in here
732	if ($message_part_headers =~
733	/^content\-type:\s([\w\.\-\/]+)\s(\;.*)?$/ims)
734	{
735	# The message header uses MIME flags
736	my $message_content_type=$1;
737	my $message_content_info=$2;
738	if (!defined($message_content_info)) {
739	$message_content_info="";
740	} else {
741	$message_content_info =~ s/^\;\s*//;
742	$message_content_info =~ s/\s*$//;
743	}
744	$message_content_type =~ tr/A-Z/a-z/;
745	if ($message_content_type =~ /multipart/) {
746	$rfc822_formatted_body=
747	$self->text_from_mime_message($message_content_type,
748	$message_content_info,
749	$message_part_body);
750	} else {
751	$message_part_body= $self->text_from_part($part_body);
752	$rfc822_formatted_body=text_into_html($message_part_body);
753	}
754	} else {
755	# message doesn't use MIME flags
756	$rfc822_formatted_body=text_into_html($message_part_body);
757	$rfc822_formatted_body =~ s@_@\\_@g;
758	}
759	# Add the returned text to the output
760	# don't put all the headers...
761	# $message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
762	my $brief_headers=get_brief_headers($message_part_headers);
763	$return_text.=text_into_html($brief_headers);
764	$return_text.="</p><p>\n";
765	$return_text.=$rfc822_formatted_body;
766	$return_text.="</p>\n";
767	# end of message/rfc822
768	} elsif ($part_content_type =~ /multipart/) {
769	# recurse again
770
771	my $tmptext= $self->text_from_mime_message($part_content_type,
772	$part_content_info,
773	$part_body);
774	$return_text.=$tmptext;
775	} else {
776	# this part isn't text/* or another message...
777	if ($is_first_part) {
778	# this is the first part of a multipart, or only part!
779	$return_text="\n<p><hr><strong><<attachment>>";
780	# add part info header
781	my $header_text="<br>Type: $part_content_type<br>\n";
782	$header_text.="Filename: $filename</strong></p>\n<p>\n";
783	$header_text =~ s@_@\\_@g;
784	$return_text.=$header_text;
785	}
786
787	# save attachment by default
788	if (!$self->{'ignore_attachments'}
789	&& $filename ne "") { # this part has a file...
790	my $encoding="8bit";
791	if ($part_header =~
792	/^content-transfer-encoding:\s*(\w+)/mi ) {
793	$encoding=$1; $encoding =~ tr/A-Z/a-z/;
794	}
795	my $tmpdir=$ENV{'GSDLHOME'} . "/tmp";
796	my $save_filename=$filename;
797
798	# make sure we don't clobber files with same name;
799	# need to keep state between .mbx files
800	my $assoc_files=$self->{'assoc_filenames'};
801	if ($assoc_files->{$filename}) { # it's been set...
802	$assoc_files->{$filename}++;
803	$filename =~ m/(.+)\.(\w+)$/;
804	my ($filestem, $ext)=($1,$2);
805	$save_filename="${filestem}_"
806	. $assoc_files->{$filename} . ".$ext";
807	} else { # first file with this name
808	$assoc_files->{$filename}=1;
809	}
810	open (SAVE, ">$tmpdir/$save_filename") \|\|
811	warn "EMAILPlug: Can't save attachment as $tmpdir/$save_filename: $!";
812	my $part_text = $message_part;
813	$part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
814	if ($encoding eq "base64") {
815	print SAVE base64_decode($part_text);
816	} elsif ($encoding eq "quoted-printable") {
817	print SAVE qp_decode($part_text);
818	} else { # 7bit, 8bit, binary, etc...
819	print SAVE $part_text;
820	}
821	close SAVE;
822	my $doc_obj=$self->{'doc_obj'};
823	$doc_obj->associate_file("$tmpdir/$save_filename",
824	"$save_filename",
825	$part_content_type # mimetype
826	);
827	# clean up tmp area...
828	# Can't do this as it hasn't been copied/linked yet!!!
829	# &util::rm("$tmpdir/$save_filename");
830	my $outhandle=$self->{'outhandle'};
831	print $outhandle "EMAILPlug: saving attachment \"$filename\"\n"; #
832
833	# be nice if "download" was a translatable macro :(
834	$return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
835	} # end of save attachment
836	} # end of !text/message part
837
838
839	return $return_text;
840	}
841
842
843	# Return only the "important" headers from a set of message headers
844	sub get_brief_headers {
845	my $msg_header = shift;
846	my $brief_header = "";
847
848	# Order matters!
849	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
850	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
851	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
852	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
853	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
854
855	return $brief_header;
856	}
857
858
859	# Process a MIME part. Return "" if we can't decode it.
860	# should only be called for parts with type "text/*" ?
861	sub text_from_part {
862	my $self = shift;
863	my $text = shift \|\| '';
864	my $part_header = $text;
865
866	# check for empty part header (leading blank line)
867	if ($text =~ /^\s*\r?\n/) {
868	$part_header="Content-type: text/plain; charset=us-ascii";
869	} else {
870	$part_header =~ s/\r?\n\r?\n(.*)$//s;
871	$text=$1; if (!defined($text)) {$text="";}
872	}
873	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
874	$part_header =~ /content\-type:\s([\w\.\-\/]+).?charset=\"?([^\;\"\s]+)\"?/is;
875	my $type=$1;
876	my $charset=$2;
877	if (!defined($type)) {$type="";}
878	if (!defined($charset)) {$charset="ascii";}
879	my $encoding="";
880	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
881	$encoding=$1; $encoding=~tr/A-Z/a-z/;
882	}
883	# Content-Transfer-Encoding is per-part
884	if ($encoding ne "") {
885	if ($encoding =~ /quoted\-printable/) {
886	$text=qp_decode($text);
887	} elsif ($encoding =~ /base64/) {
888	$text=base64_decode($text);
889	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
890	# rfc2045 also allows binary, which we ignore (for now).
891	my $outhandle=$self->{'outhandle'};
892	print $outhandle "EMAILPlug: unknown transfer encoding: $encoding\n";
893	return "";
894	}
895	}
896	if ($type eq "text/html") {
897	# only get stuff between <body> tags, or <html> tags.
898	$text =~ s@^.<html[^>]>@@is;
899	$text =~ s@</html>.*$@@is;
900	$text =~ s/^.?<body[^>]>//si;
901	$text =~ s/<\/body>.*$//si;
902	}
903	elsif ($type eq "text/xml") {
904	$text=~s/</</g;$text=~s/>/>/g;
905	$text="<pre>\n$text\n</pre>\n";
906	}
907	# convert to unicode
908	$self->convert2unicode($charset, \$text);
909
910	$text =~ s@_@\\_@g; # protect against GS macro language
911	return $text;
912	}
913
914
915
916
917	# decode quoted-printable text
918	sub qp_decode {
919	my $text=shift;
920
921	# if a line ends with "=\s*", it is a soft line break, otherwise
922	# keep in any newline characters.
923
924	$text =~ s/=\s*\r?\n//mg;
925	$text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
926	return $text;
927	}
928
929	# decode base64 text. This is fairly slow (since it's interpreted perl rather
930	# than compiled XS stuff like in the ::MIME modules, but this is more portable
931	# for us at least).
932	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
933	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
934	# from each encoded byte.
935
936
937	sub base64_decode {
938	my $enc_text = shift;
939	# A=>0, B=>1, ..., '+'=>62, '/'=>63
940	# also '=' is used for padding at the end, but we remove it anyway.
941	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
942	# map each MIME char into it's value, for more efficient lookup.
943	my %index;
944	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
945	# remove all non-base64 chars. eval to get variable in transliteration...
946	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
947	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
948	my $decoded="";
949	while (length ($enc_text)>3)
950	{
951	my $fourchars=substr($enc_text,0,4,"");
952	my @chars=(split '',$fourchars);
953	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
954	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
955	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
956	}
957	# if there are any input chars left, there are either
958	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
959	my @chars=(split '',$enc_text);
960	if (length($enc_text)) {
961	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
962	}
963	if (length($enc_text)==3) {
964	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
965	}
966	return $decoded;
967	}
968
969	sub convert2unicode {
970	my $self = shift(@_);
971	my ($charset, $textref) = @_;
972
973	if (!$$textref) {
974	# nothing to do!
975	return;
976	}
977
978	# first get our character encoding name in the right form.
979	$charset = "iso_8859_1" unless defined $charset;
980	$charset =~ tr/A-Z/a-z/; # lowercase
981	$charset =~ s/\-/_/g;
982	$charset =~ s/gb2312/gb/;
983	# assumes EUC-KR, not ISO-2022 !?
984	$charset =~ s/^ks_c_5601_1987/korean/;
985	if ($charset eq 'utf_8') {$charset='utf8'}
986
987	my $outhandle = $self->{'outhandle'};
988
989	if ($charset eq "utf8") {
990	# no conversion needed, but lets check that it's valid utf8
991	# see utf-8 manpage for valid ranges
992	$$textref =~ m/^/g; # to set \G
993	my $badbytesfound=0;
994	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
995	my $highbytes=$1;
996	my $highbyteslength=length($highbytes);
997	# replace any non utf8 complaint bytes
998	$highbytes =~ /^/g; # set pos()
999	while ($highbytes =~
1000	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
1001	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
1002	[\xf0-\xf7][\x80-\xbf]{3} # 4 byte
1003	[\xf8-\xfb][\x80-\xbf]{4} # 5 byte
1004	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
1005	)*([\x80-\xff])? !xg
1006	) {
1007	my $badbyte=$1;
1008	if (!defined $badbyte) {next} # hit end of string
1009	my $pos=pos($highbytes);
1010	substr($highbytes, $pos-1, 1, "\xc2\x80");
1011	# update the position to continue searching (for \G)
1012	pos($highbytes) = $pos+1; # set to just after the \x80
1013	$badbytesfound=1;
1014	}
1015	if ($badbytesfound==1) {
1016	# claims to be utf8, but it isn't!
1017	print $outhandle "EMAILPlug: Headers claim utf-8 but bad bytes "
1018	. "detected and removed.\n";
1019
1020	my $replength=length($highbytes);
1021	my $textpos=pos($$textref);
1022	# replace bad bytes with good bytes
1023	substr( $$textref, $textpos-$replength,
1024	$replength, $highbytes);
1025	# update the position to continue searching (for \G)
1026	pos($$textref)=$textpos+($replength-$highbyteslength);
1027	}
1028	}
1029	return;
1030	}
1031
1032	# It appears that we can't always trust ascii text so we'll treat it
1033	# as iso-8859-1 (letting characters above 0x80 through without
1034	# converting them to utf-8 will result in invalid XML documents
1035	# which can't be parsed at build time).
1036	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
1037
1038	if ($charset eq "iso_8859_1") {
1039	# test if the mailer lied, and it has win1252 chars in it...
1040	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
1041	if ($$textref =~ m/[\x80-\x9f]/) {
1042	print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
1043	print $outhandle "codepage 1252 detected.\n";
1044	$charset = "windows_1252";
1045	}
1046	}
1047	my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
1048
1049	if ($utf8_text ne "") {
1050	$$textref=$utf8_text;
1051	} else {
1052	# we didn't get any text... unsupported encoding perhaps? Or it is
1053	# empty anyway. We'll try to continue, assuming 8859-1. We could strip
1054	# characters out here if this causes problems...
1055	my $outhandle=$self->{'outhandle'};
1056	print $outhandle "EMAILPlug: falling back to iso-8859-1\n";
1057	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
1058
1059	}
1060	}
1061
1062
1063	sub set_OID {
1064	my $self = shift (@_);
1065	my ($doc_obj, $id, $segment_number) = @_;
1066
1067	if ( exists $doc_obj->{'msgid'} ) {
1068	$doc_obj->set_OID($doc_obj->{'msgid'});
1069	} else {
1070	$doc_obj->set_OID("$id\_$segment_number");
1071	}
1072	}
1073
1074
1075	# Perl packages have to return true if they are run.
1076	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: