Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 7703

Last change on this file since 7703 was 7703, checked in by jrm21, 20 years ago

1) use the email's message ID instead of document hash for Identifier.

2) if a message claims to be utf8, actually check it for bad chars.

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 34.1 KB

Rev	Line
[638]	1	###########################################################################
	2	#
	3	# EMAILPlug.pm - a plugin for parsing email files
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[3630]	9	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[638]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27
[1206]	28
	29	# EMAILPlug
[638]	30	#
[1206]	31	# by Gordon Paynter ([email protected])
[638]	32	#
[1206]	33	# Email plug reads email files. These are named with a simple
[2096]	34	# number (i.e. as they appear in maildir folders) or with the
	35	# extension .mbx (for mbox mail file format)
[638]	36	#
	37	# Document text:
[1206]	38	# The document text consists of all the text
	39	# after the first blank line in the document.
[638]	40	#
[2730]	41	# Metadata (not Dublin Core!):
[6916]	42	# $Headers All the header content (optional, not stored by default)
[638]	43	# $Subject Subject: header
	44	# $To To: header
[2730]	45	# $From From: header
[2630]	46	# $FromName Name of sender (where available)
	47	# $FromAddr E-mail address of sender
[638]	48	# $DateText Date: header
	49	# $Date Date: header in GSDL format (eg: 19990924)
[2630]	50	#
[2918]	51	# $Title made up of Subject, Date and Sender (for default formatting)
[2630]	52	#
[2918]	53	#
[2630]	54	# John McPherson - June/July 2001
	55	# added (basic) MIME support and quoted-printable and base64 decodings.
	56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
	57	#
	58	# See: * RFC 822 - ARPA Internet Text Messages
	59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
	60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
[2730]	61	# * RFC 2047 - MIME (part 3) Message Header Extensions
[2630]	62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
[3540]	63
	64	# 12/05/02 Added usage datastructure - John Thompson
	65
[638]	66	package EMAILPlug;
	67
[1895]	68	use SplitPlug;
	69
[2730]	70	use unicode;
	71
[638]	72	use sorttools;
	73	use util;
	74
	75
[1895]	76	# EMAILPlug is a sub-class of SplitPlug.
[638]	77
[1206]	78	sub BEGIN {
[1895]	79	@ISA = ('SplitPlug');
[638]	80	}
	81
[6062]	82	use strict;
	83	no strict "refs"; # so we can use a variable as a filehandle for print $out
[3524]	84
[3540]	85	my $arguments =
[4744]	86	[ { 'name' => "process_exp",
[4873]	87	'desc' => "{BasPlug.process_exp}",
[6408]	88	'type' => "regexp",
[3540]	89	'reqd' => "no",
[4744]	90	'deft' => &get_default_process_exp() },
	91	{ 'name' => "no_attachments",
[4873]	92	'desc' => "{EMAILPlug.no_attachments}",
[3630]	93	'type' => "flag",
[4744]	94	'reqd' => "no" },
[6916]	95	{ 'name' => "headers",
	96	'desc' => "{EMAILPlug.headers}",
	97	'type' => "flag",
	98	'reqd' => "no" },
[4744]	99	{ 'name' => "split_exp",
[4873]	100	'desc' => "{EMAILPlug.split_exp}",
[6408]	101	'type' => "regexp",
	102	'reqd' => "no",
	103	'deft' => &get_default_split_exp() }
	104	];
[3540]	105
[4744]	106	my $options = { 'name' => "EMAILPlug",
[5680]	107	'desc' => "{EMAILPlug.desc}",
[6408]	108	'abstract' => "no",
	109	'inherits' => "yes",
[4744]	110	'args' => $arguments };
[3540]	111
[638]	112	# Create a new EMAILPlug object with which to parse a file.
[1206]	113	# Accomplished by creating a new BasPlug and using bless to
[638]	114	# turn it into an EMAILPlug.
	115
	116	sub new {
	117	my ($class) = @_;
[3523]	118	my $self = new BasPlug ($class, @_);
[5924]	119	$self->{'plugin_type'} = "EMAILPlug";
[3540]	120	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
	121	my $option_list = $self->{'option_list'};
	122	push( @{$option_list}, $options );
	123
[3523]	124	if (!parsargv::parse(\@_,
	125	q^split_exp/.*/^, \$self->{'split_exp'},
[3630]	126	q^no_attachments^, \$self->{'ignore_attachments'},
[6916]	127	q^headers^, \$self->{'header_metadata'},
[3523]	128	"allow_extra_options")) {
	129	print STDERR "\nIncorrect options passed to $class.";
	130	print STDERR "\nCheck your collect.cfg configuration file\n";
[4873]	131	$self->print_txt_usage(""); # Use default resource bundle
[3523]	132	die "\n";
	133	}
[3630]	134	$self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
	135
[2730]	136	# this might not actually be true at read-time, but after processing
	137	# it should all be utf8.
	138	$self->{'input_encoding'}="utf8";
[638]	139	return bless $self, $class;
	140	}
	141
[1244]	142	sub get_default_process_exp {
	143	my $self = shift (@_);
[2096]	144	# mbx/email for mailbox file format, \d+ for maildir (each message is
	145	# in a separate file, with a unique number for filename)
[3111]	146	# mozilla and IE will save individual mbx format files with a ".eml" ext.
	147	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
[638]	148	}
	149
[1895]	150	# This plugin splits the mbox mail files at lines starting with From<sp>
[3111]	151	# It is supposed to be "\n\nFrom ", but this isn't always used.
[1895]	152	sub get_default_split_exp {
[2096]	153	return q^\nFrom .*\n^;
[3523]	154
[1895]	155	}
	156
	157
[1244]	158	# do plugin specific processing of doc_obj
	159	sub process {
[2630]	160
[638]	161	my $self = shift (@_);
[6332]	162	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1424]	163	my $outhandle = $self->{'outhandle'};
	164
[1244]	165	# Check that we're dealing with a valid mail file
[3111]	166	# mbox message files start with "From "
	167	# maildir messages usually start with Return-Path and Delivered-To
	168	# mh is very similar to maildir
	169	my $startoffile=substr($$textref,0,256);
	170	if (($startoffile !~ /^(From )/) &&
	171	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
	172	return undef;
	173	}
[638]	174
[1757]	175
[6332]	176	print STDERR "<Processing n='$file' p='EMAILPlug'>\n" if ($gli);
	177
[1424]	178	print $outhandle "EMAILPlug: processing $file\n"
[1244]	179	if $self->{'verbosity'} > 1;
[1206]	180
[1244]	181	my $cursection = $doc_obj->get_top_section();
[638]	182
[1206]	183	#
	184	# Parse the document's text and extract metadata
	185	#
[638]	186
[2652]	187	# Protect backslashes
	188	$$textref =~ s@\\@\\\\@g;
	189
[1206]	190	# Separate header from body of message
[1244]	191	my $Headers = $$textref;
[2630]	192	$Headers =~ s/\r?\n\r?\n(.*)$//s;
	193	$$textref = $1;
[4089]	194	$Headers .= "\n";
[2779]	195
[2630]	196	# Unfold headers - see rfc822
	197	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
[1206]	198	# Extract basic metadata from header
	199	my @headers = ("From", "To", "Subject", "Date");
	200	my %raw;
[1658]	201	foreach my $name (@headers) {
	202	$raw{$name} = "No $name value";
	203	}
[1206]	204
[3132]	205	# Get a default encoding for the header - RFC says should be ascii...
[6062]	206	my $default_header_encoding="iso_8859_1";
[3132]	207
	208	# We don't know what character set is the user's default...
	209	# We could use textcat to guess... for now we'll look at mime content-type
	210	# if ($Headers =~ /([[:^ascii:]])/) {
	211	# }
	212	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
	213	$default_header_encoding=$1;
	214	$default_header_encoding =~ s@\-@_@g;
	215	$default_header_encoding =~ tr/A-Z/a-z/;
	216	}
	217
	218
[1658]	219	# Examine each line of the headers
	220	my ($line, $name, $value);
	221	my @parts;
	222	foreach $line (split(/\n/, $Headers)) {
	223
	224	# Ignore lines with no content or which begin with whitespace
	225	next unless ($line =~ /:/);
	226	next if ($line =~ /^\s/);
	227
	228	# Find out what metadata is on this line
	229	@parts = split(/:/, $line);
	230	$name = shift @parts;
[3136]	231	# get fieldname in canonical form - first cap, then lower case.
[3134]	232	$name =~ tr/A-Z/a-z/;
[3136]	233	# uppercase the first character according to the current locale
[2630]	234	$name=~s/(.+)/\u$1/;
[1658]	235	next unless $name;
	236	next unless ($raw{$name});
	237
	238	# Find the value of that metadata
	239	$value = join(":", @parts);
	240	$value =~ s/^\s+//;
	241	$value =~ s/\s+$//;
[2730]	242	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
[3132]	243	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
[2730]	244	my $original_value=$value;
	245	my $encoded=$value;
	246	$value="";
[3215]	247	# we should ignore spaces between consecutive encoded-texts
	248	$encoded =~ s@\?=\s+=\?@\?==\?@g;
	249	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=//i) {
[2730]	250	my ($charset, $encoding, $data)=($2,$3,$4);
[2847]	251	my ($decoded_data);
[2730]	252	$value.="$1"; # any leading chars
	253	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
	254	chomp $data;
	255	$encoding =~ tr/BQ/bq/;
	256	if ($encoding eq "q") { # quoted printable
[2733]	257	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
[2730]	258	$decoded_data=qp_decode($data);
[3215]	259	# qp_decode adds \n, which is default for body text
	260	chomp($decoded_data);
[2730]	261	} else { # base 64
	262	$decoded_data=base64_decode($data);
	263	}
[2847]	264	$self->convert2unicode($charset, \$decoded_data);
	265	$value .= $decoded_data;
	266	} # end of while loop
[3132]	267
[2847]	268	# get any trailing characters
[3132]	269	$self->convert2unicode($default_header_encoding, \$encoded);
[2847]	270	$value.=$encoded;
	271
[2730]	272	if ($value =~ /^\s*$/) { # we couldn't extract anything...
[3132]	273	$self->convert2unicode($default_header_encoding,
	274	\$original_value);
[6062]	275	$value=$original_value;
[2730]	276	}
[3136]	277	} # end of if =?...?=
	278
	279	# In the absense of other charset information, assume the
[6062]	280	# header is the default (usually "iso_8859_1") and convert to unicode.
[3136]	281	else {
[3132]	282	$self->convert2unicode($default_header_encoding, \$value);
	283	}
[2730]	284
[1658]	285	# Store the metadata
[6062]	286	$value =~ s@_@\\_@g; # protect against GS macro language
[1206]	287	$raw{$name} = $value;
	288	}
	289
[2630]	290	# Extract the name and e-mail address from the From metadata
[6062]	291	my $frommeta = $raw{"From"};
[2680]	292	my $fromnamemeta;
	293	my $fromaddrmeta;
	294
	295	$frommeta =~ s/\s*$//; # Remove trailing space, if any
	296
	297	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
	298	$fromnamemeta=$1;
	299	$fromaddrmeta=$2;
	300	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
	301	$fromnamemeta=$2;
	302	$fromaddrmeta=$1;
	303	}
[2630]	304	if (!defined($fromaddrmeta)) {
	305	$fromaddrmeta=$frommeta;
	306	}
[2680]	307	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
[2717]	308	# minor attempt to prevent spam-bots from harvesting addresses...
	309	$fromaddrmeta=~s/@/@/;
[6062]	310
[2630]	311	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
	312
[3328]	313	if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
[3215]	314	$fromnamemeta =~ s/\"//g; # remove quotes
	315	$fromnamemeta =~ s/\s+$//; # remove trailing whitespace
[2630]	316	}
	317	else {
	318	$fromnamemeta = $fromaddrmeta;
	319	}
	320	# if name is an address
	321	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
[2717]	322	$fromnamemeta=~s/@/&#64\;/;
[2630]	323	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
	324
	325	$raw{"From"}=$frommeta;
	326
[1206]	327	# Process Date information
[1658]	328	if ($raw{"Date"} !~ /No Date/) {
[1206]	329	$raw{"DateText"} = $raw{"Date"};
[638]	330
[1206]	331	# Convert the date text to internal date format
	332	$value = $raw{"Date"};
	333	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
[3143]	334	# make some assumptions about the year formatting...
	335	# some (old) software thinks 2001 is 101, some think 2001 is 01
	336	if ($year < 20) { $year += 2000; } # assume not really 1920...
	337	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
[1206]	338	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
[638]	339
[1206]	340	} else {
	341	# We have not extracted a date
	342	$raw{"DateText"} = "Unknown.";
	343	$raw{"Date"} = "19000000";
[638]	344	}
	345
[1244]	346	# Add extracted metadata to document object
[1206]	347	foreach my $name (keys %raw) {
	348	$value = $raw{$name};
	349	if ($value) {
[2730]	350	# assume subject, etc headers have no special HTML meaning.
[1206]	351	$value = &text_into_html($value);
[2730]	352	# escape [] so it isn't re-interpreted as metadata
	353	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
[1206]	354	} else {
	355	$value = "No $name field";
	356	}
[1244]	357	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[1206]	358	}
[638]	359
[7703]	360
	361	# extract a message ID from the headers, if there is one, and we'll use
	362	# that as the greenstone doc ID. Having a predictable ID means we can
	363	# link to other messages, eg from In-Reply-To or References headers...
	364	if ($Headers =~ m@^Message-ID:(.+)$@mi) {
	365	my $id=escape_msg_id($1);
	366	$doc_obj->{'msgid'}=$id;
	367	}
	368	# link to another message, if this is a reply
	369	if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
	370	my $id=escape_msg_id($1);
	371	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
	372	} elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
	373	# References can have multiple, get the last one
	374	my $id=escape_msg_id($1);
	375	# not necessarily in-reply-to, but same thread...
	376	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
	377	}
	378
	379
	380
[2630]	381	my $mimetype="text/plain";
	382	my $mimeinfo="";
[3132]	383	my $charset = $default_header_encoding;
[3073]	384	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
	385	# more than one parameter given to Content-type.
	386	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
[3630]	387	if ($Headers =~ m@^content\-type:\s([\w\.\-/]+)\s(\;\s.+)?\s$@mi)
[2630]	388	{
	389	$mimetype=$1;
	390	$mimetype =~ tr/[A-Z]/[a-z]/;
[3073]	391
	392	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
	393	$mimetype = "text/plain";
	394	}
	395
[2630]	396	$mimeinfo=$2;
[3073]	397	if (!defined $mimeinfo) {
	398	$mimeinfo="";
	399	} else { # strip leading and trailing stuff
	400	$mimeinfo =~ s/^\;\s*//;
	401	$mimeinfo =~ s/\s*$//;
	402	}
	403	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
[2847]	404	$charset = $1;
	405	}
[2630]	406	}
	407
[2680]	408	my $transfer_encoding="7bit";
	409	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
	410	$transfer_encoding=$1;
	411	}
[6062]	412
[2886]	413	if ($mimetype eq "text/html") {
	414	$$textref= $self->text_from_part("$Headers\n$$textref");
	415	} elsif ($mimetype ne "text/plain") {
[3630]	416	$self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
[6062]	417	$$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$$textref);
	418	} else { # mimetype eq text/plain
	419	$$textref = &text_into_html($$textref);
	420	$$textref =~ s@_@\\_@g; # protect against GS macro language
	421
	422	if ($transfer_encoding =~ /quoted\-printable/) {
	423	$$textref=qp_decode($$textref);
	424	} elsif ($transfer_encoding =~ /base64/) {
	425	$$textref=base64_decode($$textref);
	426	}
[2886]	427	$self->convert2unicode($charset, $textref);
[2680]	428	}
[6062]	429
[2630]	430
[6916]	431	if ($self->{'header_metadata'} && $self->{'header_metadata'} == 1) {
	432	# Add "All headers" metadata
	433	$Headers = &text_into_html($Headers);
[2630]	434
[6916]	435	$Headers = "No headers" unless ($Headers =~ /\w/);
	436	$Headers =~ s/@/&#64\;/g;
	437	# escape [] so it isn't re-interpreted as metadata
	438	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
	439	$self->convert2unicode($charset, \$Headers);
[2754]	440
[6916]	441	$Headers =~ s@_@\\_@g; # protect against GS macro language
	442	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
	443	}
[6062]	444
	445
[2918]	446	# Add Title metadata
	447	my $Title = text_into_html($raw{'Subject'});
[7703]	448	$Title .= "<br>From: " . text_into_html($fromnamemeta);
[2918]	449	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
[3073]	450	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
[2918]	451
	452	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
	453
	454
[1244]	455	# Add text to document object
	456	$$textref = "No message" unless ($$textref =~ /\w/);
[6062]	457
[1244]	458	$doc_obj->add_utf8_text($cursection, $$textref);
[638]	459
[1244]	460	return 1;
[638]	461	}
	462
	463
	464	# Convert a text string into HTML.
	465	#
	466	# The HTML is going to be inserted into a GML file, so
	467	# we have to be careful not to use symbols like ">",
	468	# which ocurs frequently in email messages (and use
	469	# &gt instead.
	470	#
	471	# This function also turns links and email addresses into hyperlinks,
	472	# and replaces carriage returns with <BR> tags (and multiple carriage
	473	# returns with <P> tags).
	474
[1206]	475
[638]	476	sub text_into_html {
	477	my ($text) = @_;
	478
[1244]	479	# Convert problem characters into HTML symbols
[3132]	480	$text =~ s/&/&/g;
	481	$text =~ s/</</g;
	482	$text =~ s/>/>/g;
	483	$text =~ s/\"/"/g;
[638]	484
[2630]	485	# convert email addresses and URIs into links
	486	# don't markup email addresses for now
	487	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[638]	488
[2918]	489	# try to munge email addresses a little bit...
	490	$text =~ s/@/@/;
[2730]	491	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
	492	# assume URI doesn't finish with a '.'
[3398]	493	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.)?[\w\?\=\-_/~]+)(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@g;
[2630]	494
	495
[638]	496	# Clean up whitespace and convert \n charaters to <BR> or <P>
[3132]	497	$text =~ s/ +/ /g;
	498	$text =~ s/\s*$//g;
	499	$text =~ s/^\s*//g;
	500	$text =~ s/\n/\n<br>/g;
	501	$text =~ s/<br>\s*<br>/<p>/gi;
[638]	502
	503	return $text;
	504	}
	505
	506
[2630]	507
	508
	509	#Process a MIME message.
	510	# the textref we are given DOES NOT include the header.
	511	sub text_from_mime_message {
[2847]	512	my $self = shift(@_);
[6062]	513	my ($mimetype,$mimeinfo,$text)=(@_);
	514	my $outhandle=$self->{'outhandle'};
[2630]	515	# Check for multiparts - $mimeinfo will be a boundary
	516	if ($mimetype =~ /multipart/) {
[6062]	517	my $boundary="";
[2732]	518	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
[2630]	519	$boundary=$1;
[2732]	520	if ($boundary =~ m@^\"@) {
	521	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
	522	}
	523	} else {
	524	print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
[2630]	525	}
	526	# parts start with "--$boundary"
	527	# message ends with "--$boundary--"
	528	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
[2680]	529	# that perl might want to interpolate. Also allows spaces...
[2630]	530	$boundary=~s/\\/\\\\/g;
	531	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
[2681]	532	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
[2630]	533	# remove first "part" and last "part" (final --)
	534	shift @message_parts;
	535	my $last=pop @message_parts;
[2680]	536	# if our boundaries are a bit dodgy and we only found 1 part...
	537	if (!defined($last)) {$last="";}
[2630]	538	# make sure it is only -- and whitespace
	539	if ($last !~ /^\-\-\s*$/ms) {
	540	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
	541	}
	542	foreach my $message_part (@message_parts) {
	543	# remove the leading newline left from split.
	544	$message_part=~s/^\r?\n//;
	545	}
	546	if ($mimetype eq "multipart/alternative") {
	547	# check for an HTML version first, then TEXT, otherwise use first.
	548	my $part_text="";
	549	foreach my $message_part (@message_parts) {
	550	if ($message_part =~ m@\scontent\-type:\stext/html@mis)
	551	{
	552	# Use the HTML version
[2847]	553	$part_text= $self->text_from_part($message_part);
[2630]	554	$mimetype="text/html";
	555	last;
	556	}
	557	}
	558	if ($part_text eq "") { # try getting a text part instead
	559	foreach my $message_part (@message_parts) {
	560	if ($message_part =~ m@^content\-type:\s*text/plain@mis)
	561	{
	562	# Use the plain version
[2847]	563	$part_text= $self->text_from_part($message_part);
[2732]	564	if ($part_text =~/[^\s]/) {
[3721]	565	$part_text = text_into_html($part_text);
[2732]	566	}
[2630]	567	$mimetype="text/plain";
	568	last;
	569	}
	570	}
	571	}
[3721]	572	if ($part_text eq "") { #use first part (no html/text part found)
	573	$part_text = $self->text_from_part(shift @message_parts);
	574	$part_text = text_into_html($part_text);
[2630]	575	}
	576	if ($part_text eq "") { # we couldn't get anything!!!
	577	# or it was an empty message...
	578	# do nothing...
	579	print $outhandle "EMAILPlug: no text - empty body?\n";
	580	} else {
[3721]	581	$text = $part_text;
[2630]	582	}
[3352]	583	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related\|signed)@) {
[3721]	584	$text = "";
[3352]	585	# signed is for PGP/GPG messages... the last part is a hash
	586	if ($mimetype =~ m@multipart/signed@) {
	587	pop @message_parts;
	588	}
[6062]	589	my $is_first_part=1;
[2630]	590	foreach my $message_part (@message_parts) {
[6062]	591	if ($is_first_part && $text ne "") {$is_first_part=0;}
	592
[2630]	593	if ($mimetype eq "multipart/digest") {
[6062]	594	# default type - RTFRFC!! Set if not already set
	595	$message_part =~ m@^(.*)\n\r?\n@s;
	596	my $part_header=$1;
	597	if ($part_header !~ m@^content-type@mi) {
	598	$message_part="Content-type: message/rfc822\n"
	599	. $message_part; # prepend default type
[4224]	600	}
[2630]	601	}
	602
[6062]	603	$text .= $self->process_multipart_part($message_part,
	604	$is_first_part);
[2630]	605	} # foreach message part.
	606	} else {
	607	# we can't handle this multipart type (not mixed or alternative)
	608	# the RFC also mentions "parallel".
	609	}
[3627]	610	} # end of ($mimetype =~ multipart)
[2918]	611	elsif ($mimetype =~ m@message/rfc822@) {
	612	my $msg_header = $text;
	613	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
	614	$text = $1;
	615
[3630]	616	if ($msg_header =~ /^content\-type:\s([\w\.\-\/]+)\s\;?\s(.+?)\s$/mi)
[2918]	617	{
	618	$mimetype=$1;
	619	$mimetype =~ tr/[A-Z]/[a-z]/;
	620	$mimeinfo=$2;
[6062]	621	#if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
	622	# $charset = $1;
	623	#}
[2918]	624	my $msg_text;
	625	if ($mimetype =~ m@multipart/@) {
	626	$msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
[6062]	627	$text);
	628	} else {$msg_text=text_from_part($text);}
[2918]	629
[3630]	630	my $brief_header=text_into_html(get_brief_headers($msg_header));
[2918]	631	$text= "\n<b><<attached message>></b><br>";
	632	$text.= "<table><tr><td width=\"5%\"> </td>\n";
	633	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
	634	. "</td></tr></table>";
	635	}
	636	} else {
[2886]	637	# we don't do any processing of the content.
	638	}
	639
[2630]	640	return $text;
	641	}
	642
	643
[6062]	644
[7703]	645	# used for turning a message id into a more friendly string for greenstone
	646	sub escape_msg_id {
	647	#msgid
	648	my $id=shift;
	649	chomp $id; $id =~ s!\s!!g; # remove spaces
	650	$id =~ s![<>\[\]]!!g; # remove [ ] < and >
	651	$id =~ s![_&]!-!g; # replace symbols that might cause problems
	652	$id =~ s!@!-!g; # replace @ symbol, to avoid spambots
	653	return $id;
	654	}
[6062]	655
[7703]	656
	657
[6062]	658	sub process_multipart_part {
	659	my $self = shift;
	660	my $message_part = shift;
	661	my $is_first_part = shift;
	662
	663	my $return_text="";
	664	my $part_header=$message_part;
	665	my $part_body;
	666	if ($message_part=~ /^\s*\n/) {
	667	# no header... use defaults
	668	$part_body=$message_part;
	669	$part_header="Content-type: text/plain; charset=us-ascii";
	670	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
	671	$part_body=$1;
	672	} else {
	673	# something's gone wrong...
	674	$part_header="";
	675	$part_body=$message_part;
	676	}
	677
	678	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
	679	my $part_content_type="";
	680	my $part_content_info="";
	681
	682	if ($part_header =~ m@^content\-type:\s([\w\.\-/]+)\s(\;.*)?$@mi) {
	683	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
	684	$part_content_info=$2;
	685	if (!defined($part_content_info)) {
	686	$part_content_info="";
	687	} else {
	688	$part_content_info =~ s/^\;\s*//;
	689	$part_content_info =~ s/\s*$//;
	690	}
	691	}
	692	my $filename="";
[6079]	693	if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
[6062]	694	$filename=$1;
[6079]	695	$filename =~ s@\r?\s*$@@; # remove trailing space, if any
[6062]	696	}
	697
	698	# disposition - either inline or attachment.
	699	# NOT CURRENTLY USED - we display all text types instead...
	700	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
	701
	702	# add <<attachment>> to each part except the first...
	703	if (!$is_first_part) {
	704	$return_text.="\n<p><hr><strong><<attachment>>";
	705	# add part info header
	706	my $header_text="<br>Type: $part_content_type<br>\n";
	707	if ($filename ne "") {
	708	$header_text.="Filename: $filename\n";
	709	}
	710	$header_text =~ s@_@\\_@g;
	711	$return_text.=$header_text . "</strong></p>\n<p>\n";
	712	}
	713
	714	if ($part_content_type =~ m@text/@)
	715	{
	716	my $part_text= $self->text_from_part($message_part);
	717	if ($part_content_type !~ m@text/(ht\|x)ml@) {
	718	$part_text = text_into_html($part_text);
	719	}
	720	if ($part_text eq "") {
	721	$part_text = ' ';
	722	}
	723	$return_text .= $part_text;
	724	} elsif ($part_content_type =~ m@message/rfc822@) {
	725	# This is a forwarded message
	726	my $message_part_headers=$part_body;
	727	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
	728	my $message_part_body=$1;
	729	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
	730
	731	my $rfc822_formatted_body=""; # put result in here
	732	if ($message_part_headers =~
	733	/^content\-type:\s([\w\.\-\/]+)\s(\;.*)?$/ims)
	734	{
	735	# The message header uses MIME flags
	736	my $message_content_type=$1;
	737	my $message_content_info=$2;
	738	if (!defined($message_content_info)) {
	739	$message_content_info="";
	740	} else {
	741	$message_content_info =~ s/^\;\s*//;
	742	$message_content_info =~ s/\s*$//;
	743	}
	744	$message_content_type =~ tr/A-Z/a-z/;
	745	if ($message_content_type =~ /multipart/) {
	746	$rfc822_formatted_body=
	747	$self->text_from_mime_message($message_content_type,
	748	$message_content_info,
	749	$message_part_body);
	750	} else {
	751	$message_part_body= $self->text_from_part($part_body);
	752	$rfc822_formatted_body=text_into_html($message_part_body);
	753	}
	754	} else {
	755	# message doesn't use MIME flags
	756	$rfc822_formatted_body=text_into_html($message_part_body);
	757	$rfc822_formatted_body =~ s@_@\\_@g;
	758	}
	759	# Add the returned text to the output
	760	# don't put all the headers...
	761	# $message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
	762	my $brief_headers=get_brief_headers($message_part_headers);
	763	$return_text.=text_into_html($brief_headers);
	764	$return_text.="</p><p>\n";
	765	$return_text.=$rfc822_formatted_body;
	766	$return_text.="</p>\n";
	767	# end of message/rfc822
	768	} elsif ($part_content_type =~ /multipart/) {
	769	# recurse again
	770
	771	my $tmptext= $self->text_from_mime_message($part_content_type,
	772	$part_content_info,
	773	$part_body);
	774	$return_text.=$tmptext;
	775	} else {
	776	# this part isn't text/* or another message...
	777	if ($is_first_part) {
	778	# this is the first part of a multipart, or only part!
	779	$return_text="\n<p><hr><strong><<attachment>>";
	780	# add part info header
	781	my $header_text="<br>Type: $part_content_type<br>\n";
	782	$header_text.="Filename: $filename</strong></p>\n<p>\n";
	783	$header_text =~ s@_@\\_@g;
	784	$return_text.=$header_text;
	785	}
	786
	787	# save attachment by default
	788	if (!$self->{'ignore_attachments'}
	789	&& $filename ne "") { # this part has a file...
	790	my $encoding="8bit";
	791	if ($part_header =~
	792	/^content-transfer-encoding:\s*(\w+)/mi ) {
	793	$encoding=$1; $encoding =~ tr/A-Z/a-z/;
	794	}
	795	my $tmpdir=$ENV{'GSDLHOME'} . "/tmp";
	796	my $save_filename=$filename;
	797
	798	# make sure we don't clobber files with same name;
	799	# need to keep state between .mbx files
	800	my $assoc_files=$self->{'assoc_filenames'};
	801	if ($assoc_files->{$filename}) { # it's been set...
	802	$assoc_files->{$filename}++;
	803	$filename =~ m/(.+)\.(\w+)$/;
	804	my ($filestem, $ext)=($1,$2);
	805	$save_filename="${filestem}_"
	806	. $assoc_files->{$filename} . ".$ext";
	807	} else { # first file with this name
	808	$assoc_files->{$filename}=1;
	809	}
	810	open (SAVE, ">$tmpdir/$save_filename") \|\|
	811	warn "EMAILPlug: Can't save attachment as $tmpdir/$save_filename: $!";
	812	my $part_text = $message_part;
	813	$part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
	814	if ($encoding eq "base64") {
	815	print SAVE base64_decode($part_text);
	816	} elsif ($encoding eq "quoted-printable") {
	817	print SAVE qp_decode($part_text);
	818	} else { # 7bit, 8bit, binary, etc...
	819	print SAVE $part_text;
	820	}
	821	close SAVE;
	822	my $doc_obj=$self->{'doc_obj'};
	823	$doc_obj->associate_file("$tmpdir/$save_filename",
	824	"$save_filename",
	825	$part_content_type # mimetype
	826	);
	827	# clean up tmp area...
	828	# Can't do this as it hasn't been copied/linked yet!!!
	829	# &util::rm("$tmpdir/$save_filename");
	830	my $outhandle=$self->{'outhandle'};
	831	print $outhandle "EMAILPlug: saving attachment \"$filename\"\n"; #
	832
	833	# be nice if "download" was a translatable macro :(
	834	$return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
	835	} # end of save attachment
	836	} # end of !text/message part
	837
	838
	839	return $return_text;
	840	}
	841
	842
[3630]	843	# Return only the "important" headers from a set of message headers
	844	sub get_brief_headers {
	845	my $msg_header = shift;
	846	my $brief_header = "";
[2630]	847
[3630]	848	# Order matters!
	849	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
	850	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
	851	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
	852	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
	853	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
[2630]	854
[3630]	855	return $brief_header;
	856	}
[2630]	857
	858
	859	# Process a MIME part. Return "" if we can't decode it.
[6062]	860	# should only be called for parts with type "text/*" ?
[2630]	861	sub text_from_part {
[3132]	862	my $self = shift;
[3136]	863	my $text = shift \|\| '';
	864	my $part_header = $text;
	865
[2681]	866	# check for empty part header (leading blank line)
	867	if ($text =~ /^\s*\r?\n/) {
	868	$part_header="Content-type: text/plain; charset=us-ascii";
	869	} else {
	870	$part_header =~ s/\r?\n\r?\n(.*)$//s;
	871	$text=$1; if (!defined($text)) {$text="";}
	872	}
[2630]	873	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
[3630]	874	$part_header =~ /content\-type:\s([\w\.\-\/]+).?charset=\"?([^\;\"\s]+)\"?/is;
[2730]	875	my $type=$1;
	876	my $charset=$2;
	877	if (!defined($type)) {$type="";}
	878	if (!defined($charset)) {$charset="ascii";}
[2630]	879	my $encoding="";
[2638]	880	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
[2630]	881	$encoding=$1; $encoding=~tr/A-Z/a-z/;
	882	}
	883	# Content-Transfer-Encoding is per-part
	884	if ($encoding ne "") {
	885	if ($encoding =~ /quoted\-printable/) {
	886	$text=qp_decode($text);
	887	} elsif ($encoding =~ /base64/) {
	888	$text=base64_decode($text);
	889	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
	890	# rfc2045 also allows binary, which we ignore (for now).
[3630]	891	my $outhandle=$self->{'outhandle'};
	892	print $outhandle "EMAILPlug: unknown transfer encoding: $encoding\n";
[2630]	893	return "";
	894	}
	895	}
	896	if ($type eq "text/html") {
	897	# only get stuff between <body> tags, or <html> tags.
[2730]	898	$text =~ s@^.<html[^>]>@@is;
	899	$text =~ s@</html>.*$@@is;
	900	$text =~ s/^.?<body[^>]>//si;
	901	$text =~ s/<\/body>.*$//si;
[2630]	902	}
	903	elsif ($type eq "text/xml") {
	904	$text=~s/</</g;$text=~s/>/>/g;
	905	$text="<pre>\n$text\n</pre>\n";
	906	}
[2730]	907	# convert to unicode
[2847]	908	$self->convert2unicode($charset, \$text);
[6062]	909
	910	$text =~ s@_@\\_@g; # protect against GS macro language
[2630]	911	return $text;
	912	}
	913
	914
[6062]	915
	916
[2630]	917	# decode quoted-printable text
	918	sub qp_decode {
	919	my $text=shift;
	920
	921	# if a line ends with "=\s*", it is a soft line break, otherwise
	922	# keep in any newline characters.
	923
[3627]	924	$text =~ s/=\s*\r?\n//mg;
	925	$text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
[2630]	926	return $text;
	927	}
	928
	929	# decode base64 text. This is fairly slow (since it's interpreted perl rather
	930	# than compiled XS stuff like in the ::MIME modules, but this is more portable
	931	# for us at least).
	932	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
	933	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
[2638]	934	# from each encoded byte.
[2630]	935
	936
	937	sub base64_decode {
	938	my $enc_text = shift;
	939	# A=>0, B=>1, ..., '+'=>62, '/'=>63
	940	# also '=' is used for padding at the end, but we remove it anyway.
	941	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	942	# map each MIME char into it's value, for more efficient lookup.
	943	my %index;
	944	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
	945	# remove all non-base64 chars. eval to get variable in transliteration...
	946	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
	947	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
	948	my $decoded="";
	949	while (length ($enc_text)>3)
	950	{
	951	my $fourchars=substr($enc_text,0,4,"");
	952	my @chars=(split '',$fourchars);
	953	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
	954	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	955	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
	956	}
	957	# if there are any input chars left, there are either
	958	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
	959	my @chars=(split '',$enc_text);
	960	if (length($enc_text)) {
	961	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
	962	}
	963	if (length($enc_text)==3) {
	964	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	965	}
	966	return $decoded;
	967	}
	968
[2847]	969	sub convert2unicode {
	970	my $self = shift(@_);
	971	my ($charset, $textref) = @_;
[2630]	972
[7703]	973	if (!$$textref) {
	974	# nothing to do!
	975	return;
	976	}
	977
[2847]	978	# first get our character encoding name in the right form.
	979	$charset = "iso_8859_1" unless defined $charset;
[7703]	980	$charset =~ tr/A-Z/a-z/; # lowercase
	981	$charset =~ s/\-/_/g;
	982	$charset =~ s/gb2312/gb/;
[2847]	983	# assumes EUC-KR, not ISO-2022 !?
[7703]	984	$charset =~ s/^ks_c_5601_1987/korean/;
	985	if ($charset eq 'utf_8') {$charset='utf8'}
[2847]	986
[7703]	987	my $outhandle = $self->{'outhandle'};
	988
	989	if ($charset eq "utf8") {
	990	# no conversion needed, but lets check that it's valid utf8
	991	# see utf-8 manpage for valid ranges
	992	$$textref =~ m/^/g; # to set \G
	993	my $badbytesfound=0;
	994	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
	995	my $highbytes=$1;
	996	my $highbyteslength=length($highbytes);
	997	# replace any non utf8 complaint bytes
	998	$highbytes =~ /^/g; # set pos()
	999	while ($highbytes =~
	1000	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
	1001	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
	1002	[\xf0-\xf7][\x80-\xbf]{3} # 4 byte
	1003	[\xf8-\xfb][\x80-\xbf]{4} # 5 byte
	1004	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
	1005	)*([\x80-\xff])? !xg
	1006	) {
	1007	my $badbyte=$1;
	1008	if (!defined $badbyte) {next} # hit end of string
	1009	my $pos=pos($highbytes);
	1010	substr($highbytes, $pos-1, 1, "\xc2\x80");
	1011	# update the position to continue searching (for \G)
	1012	pos($highbytes) = $pos+1; # set to just after the \x80
	1013	$badbytesfound=1;
	1014	}
	1015	if ($badbytesfound==1) {
	1016	# claims to be utf8, but it isn't!
	1017	print $outhandle "EMAILPlug: Headers claim utf-8 but bad bytes "
	1018	. "detected and removed.\n";
	1019
	1020	my $replength=length($highbytes);
	1021	my $textpos=pos($$textref);
	1022	# replace bad bytes with good bytes
	1023	substr( $$textref, $textpos-$replength,
	1024	$replength, $highbytes);
	1025	# update the position to continue searching (for \G)
	1026	pos($$textref)=$textpos+($replength-$highbyteslength);
	1027	}
	1028	}
[3206]	1029	return;
	1030	}
	1031
[2847]	1032	# It appears that we can't always trust ascii text so we'll treat it
	1033	# as iso-8859-1 (letting characters above 0x80 through without
	1034	# converting them to utf-8 will result in invalid XML documents
	1035	# which can't be parsed at build time).
	1036	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
	1037
[3073]	1038	if ($charset eq "iso_8859_1") {
	1039	# test if the mailer lied, and it has win1252 chars in it...
	1040	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
	1041	if ($$textref =~ m/[\x80-\x9f]/) {
	1042	print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
	1043	print $outhandle "codepage 1252 detected.\n";
	1044	$charset = "windows_1252";
	1045	}
	1046	}
[3351]	1047	my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
	1048
	1049	if ($utf8_text ne "") {
	1050	$$textref=$utf8_text;
	1051	} else {
	1052	# we didn't get any text... unsupported encoding perhaps? Or it is
	1053	# empty anyway. We'll try to continue, assuming 8859-1. We could strip
	1054	# characters out here if this causes problems...
[3726]	1055	my $outhandle=$self->{'outhandle'};
	1056	print $outhandle "EMAILPlug: falling back to iso-8859-1\n";
[3351]	1057	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
	1058
	1059	}
[2847]	1060	}
	1061
	1062
[7703]	1063	sub set_OID {
	1064	my $self = shift (@_);
	1065	my ($doc_obj, $id, $segment_number) = @_;
	1066
	1067	if ( exists $doc_obj->{'msgid'} ) {
	1068	$doc_obj->set_OID($doc_obj->{'msgid'});
	1069	} else {
	1070	$doc_obj->set_OID("$id\_$segment_number");
	1071	}
	1072	}
	1073
	1074
[1206]	1075	# Perl packages have to return true if they are run.
	1076	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: