Context Navigation

source: gsdl/trunk/perllib/plugins/EmailPlugin.pm@ 17739

Last change on this file since 17739 was 17053, checked in by kjdon, 16 years ago
made the default OIDtype auto, add message_id at end instead of start of list
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 38.0 KB

Rev	Line
[638]	1	###########################################################################
	2	#
[15872]	3	# EmailPlugin.pm - a plugin for parsing email files
[638]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[3630]	9	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[638]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27
[1206]	28
[15872]	29	# EmailPlugin
[638]	30	#
[1206]	31	# by Gordon Paynter ([email protected])
[638]	32	#
[1206]	33	# Email plug reads email files. These are named with a simple
[2096]	34	# number (i.e. as they appear in maildir folders) or with the
	35	# extension .mbx (for mbox mail file format)
[638]	36	#
	37	# Document text:
[1206]	38	# The document text consists of all the text
	39	# after the first blank line in the document.
[638]	40	#
[2730]	41	# Metadata (not Dublin Core!):
[6916]	42	# $Headers All the header content (optional, not stored by default)
[638]	43	# $Subject Subject: header
	44	# $To To: header
[2730]	45	# $From From: header
[2630]	46	# $FromName Name of sender (where available)
	47	# $FromAddr E-mail address of sender
[638]	48	# $DateText Date: header
	49	# $Date Date: header in GSDL format (eg: 19990924)
[2630]	50	#
[2918]	51	# $Title made up of Subject, Date and Sender (for default formatting)
[2630]	52	#
[2918]	53	#
[2630]	54	# John McPherson - June/July 2001
	55	# added (basic) MIME support and quoted-printable and base64 decodings.
	56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
	57	#
	58	# See: * RFC 822 - ARPA Internet Text Messages
	59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
	60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
[2730]	61	# * RFC 2047 - MIME (part 3) Message Header Extensions
[2630]	62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
[3540]	63
[16341]	64
[15872]	65	package EmailPlugin;
[3540]	66
[7830]	67	use strict;
	68	no strict "refs"; # so we can use a variable as a filehandle for print $out
	69
[638]	70
[15872]	71	use SplitTextFile;
[7830]	72	use unicode; # gs conv functions
	73	use gsprintf 'gsprintf'; # translations
[2730]	74
[638]	75	use sorttools;
	76	use util;
	77
[10254]	78	sub BEGIN {
[15872]	79	@EmailPlugin::ISA = ('SplitTextFile');
[10254]	80	}
[638]	81
[17026]	82	my $extended_oidtype_list =
	83	[ {'name' => "message_id",
	84	'desc' => "{EmailPlugin.OIDtype.message_id}" }
	85	];
[638]	86
[17026]	87	# add in all the standard options from BasePlugin
[17053]	88	unshift (@$extended_oidtype_list, @{$BasePlugin::oidtype_list});
[17026]	89
[3540]	90	my $arguments =
[4744]	91	[ { 'name' => "process_exp",
[15872]	92	'desc' => "{BasePlugin.process_exp}",
[6408]	93	'type' => "regexp",
[3540]	94	'reqd' => "no",
[4744]	95	'deft' => &get_default_process_exp() },
	96	{ 'name' => "no_attachments",
[15872]	97	'desc' => "{EmailPlugin.no_attachments}",
[3630]	98	'type' => "flag",
[4744]	99	'reqd' => "no" },
[6916]	100	{ 'name' => "headers",
[15872]	101	'desc' => "{EmailPlugin.headers}",
[6916]	102	'type' => "flag",
	103	'reqd' => "no" },
[17026]	104	{ 'name' => "OIDtype",
	105	'desc' => "{import.OIDtype}",
	106	'type' => "enum",
	107	'list' => $extended_oidtype_list,
[17053]	108	'deft' => "auto",
[17026]	109	'reqd' => "no",
	110	'modegli' => "2" },
	111	{ 'name' => "OIDmetadata",
	112	'desc' => "{import.OIDmetadata}",
	113	'type' => "metadata",
	114	'deft' => "dc.Identifier",
	115	'reqd' => "no",
	116	'modegli' => "2" },
[4744]	117	{ 'name' => "split_exp",
[15872]	118	'desc' => "{EmailPlugin.split_exp}",
[6408]	119	'type' => "regexp",
	120	'reqd' => "no",
	121	'deft' => &get_default_split_exp() }
	122	];
[3540]	123
[15872]	124	my $options = { 'name' => "EmailPlugin",
	125	'desc' => "{EmailPlugin.desc}",
[6408]	126	'abstract' => "no",
	127	'inherits' => "yes",
[4744]	128	'args' => $arguments };
[3540]	129
[638]	130	sub new {
[10218]	131	my ($class) = shift (@_);
	132	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	133	push(@$pluginlist, $class);
[3540]	134
[15872]	135	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	136	push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]	137
[15872]	138	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]	139
[3630]	140	$self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
[16341]	141	$self->{'tmp_file_paths'} = (); # list of tmp files to delete after processing is finished
[3630]	142
[2730]	143	# this might not actually be true at read-time, but after processing
	144	# it should all be utf8.
	145	$self->{'input_encoding'}="utf8";
[638]	146	return bless $self, $class;
	147	}
	148
[1244]	149	sub get_default_process_exp {
	150	my $self = shift (@_);
[2096]	151	# mbx/email for mailbox file format, \d+ for maildir (each message is
	152	# in a separate file, with a unique number for filename)
[3111]	153	# mozilla and IE will save individual mbx format files with a ".eml" ext.
	154	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
[638]	155	}
	156
[1895]	157	# This plugin splits the mbox mail files at lines starting with From<sp>
[3111]	158	# It is supposed to be "\n\nFrom ", but this isn't always used.
[9971]	159	# add \d{4} so that the line ends in a year (in case the text has an
	160	# unescaped "From " at the start of a line).
[1895]	161	sub get_default_split_exp {
[9971]	162	return q^\nFrom .*\d{4}\n^;
[3523]	163
[1895]	164	}
	165
[16677]	166	sub can_process_this_file {
	167	my $self = shift(@_);
	168	my ($filename) = @_;
[1895]	169
[16677]	170	# avoid any confusion between filenames matching \d+ (which are by default
	171	# matched by EmailPlugin) and directories that match \d+ (which should not)
	172
	173	return 0 if (-d $filename);
	174
	175	if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
	176	return 1;
	177	}
	178	return 0;
	179
	180	}
	181
	182
[1244]	183	# do plugin specific processing of doc_obj
	184	sub process {
[2630]	185
[638]	186	my $self = shift (@_);
[6332]	187	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1424]	188	my $outhandle = $self->{'outhandle'};
	189
[1244]	190	# Check that we're dealing with a valid mail file
[3111]	191	# mbox message files start with "From "
	192	# maildir messages usually start with Return-Path and Delivered-To
	193	# mh is very similar to maildir
	194	my $startoffile=substr($$textref,0,256);
	195	if (($startoffile !~ /^(From )/) &&
	196	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
	197	return undef;
	198	}
[638]	199
[1244]	200	my $cursection = $doc_obj->get_top_section();
[638]	201
[1206]	202	#
	203	# Parse the document's text and extract metadata
	204	#
[638]	205
[2652]	206	# Protect backslashes
	207	$$textref =~ s@\\@\\\\@g;
	208
[1206]	209	# Separate header from body of message
[1244]	210	my $Headers = $$textref;
[2630]	211	$Headers =~ s/\r?\n\r?\n(.*)$//s;
	212	$$textref = $1;
[4089]	213	$Headers .= "\n";
[2779]	214
[2630]	215	# Unfold headers - see rfc822
	216	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
[1206]	217	# Extract basic metadata from header
	218	my @headers = ("From", "To", "Subject", "Date");
	219	my %raw;
[1658]	220	foreach my $name (@headers) {
	221	$raw{$name} = "No $name value";
	222	}
[1206]	223
[3132]	224	# Get a default encoding for the header - RFC says should be ascii...
[6062]	225	my $default_header_encoding="iso_8859_1";
[3132]	226
	227	# We don't know what character set is the user's default...
	228	# We could use textcat to guess... for now we'll look at mime content-type
	229	# if ($Headers =~ /([[:^ascii:]])/) {
	230	# }
	231	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
	232	$default_header_encoding=$1;
	233	$default_header_encoding =~ s@\-@_@g;
	234	$default_header_encoding =~ tr/A-Z/a-z/;
	235	}
	236
	237
[1658]	238	# Examine each line of the headers
	239	my ($line, $name, $value);
	240	my @parts;
	241	foreach $line (split(/\n/, $Headers)) {
	242
	243	# Ignore lines with no content or which begin with whitespace
	244	next unless ($line =~ /:/);
	245	next if ($line =~ /^\s/);
	246
	247	# Find out what metadata is on this line
	248	@parts = split(/:/, $line);
	249	$name = shift @parts;
[3136]	250	# get fieldname in canonical form - first cap, then lower case.
[3134]	251	$name =~ tr/A-Z/a-z/;
[3136]	252	# uppercase the first character according to the current locale
[2630]	253	$name=~s/(.+)/\u$1/;
[1658]	254	next unless $name;
	255	next unless ($raw{$name});
	256
	257	# Find the value of that metadata
	258	$value = join(":", @parts);
	259	$value =~ s/^\s+//;
	260	$value =~ s/\s+$//;
[16341]	261	# decode header values, using either =?<charset>?[BQ]?<data>?= (rfc2047) or default_header_encoding
	262	$self->decode_header_value($default_header_encoding, \$value);
[2730]	263
[1658]	264	# Store the metadata
[6062]	265	$value =~ s@_@\\_@g; # protect against GS macro language
[1206]	266	$raw{$name} = $value;
	267	}
	268
[2630]	269	# Extract the name and e-mail address from the From metadata
[6062]	270	my $frommeta = $raw{"From"};
[2680]	271	my $fromnamemeta;
	272	my $fromaddrmeta;
	273
	274	$frommeta =~ s/\s*$//; # Remove trailing space, if any
	275
	276	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
	277	$fromnamemeta=$1;
	278	$fromaddrmeta=$2;
	279	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
	280	$fromnamemeta=$2;
	281	$fromaddrmeta=$1;
	282	}
[2630]	283	if (!defined($fromaddrmeta)) {
	284	$fromaddrmeta=$frommeta;
	285	}
[2680]	286	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
[2717]	287	# minor attempt to prevent spam-bots from harvesting addresses...
	288	$fromaddrmeta=~s/@/@/;
[6062]	289
[2630]	290	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
	291
[3328]	292	if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
[3215]	293	$fromnamemeta =~ s/\"//g; # remove quotes
	294	$fromnamemeta =~ s/\s+$//; # remove trailing whitespace
[2630]	295	}
	296	else {
	297	$fromnamemeta = $fromaddrmeta;
	298	}
	299	# if name is an address
	300	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
[2717]	301	$fromnamemeta=~s/@/&#64\;/;
[2630]	302	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
	303
	304	$raw{"From"}=$frommeta;
	305
[1206]	306	# Process Date information
[1658]	307	if ($raw{"Date"} !~ /No Date/) {
[1206]	308	$raw{"DateText"} = $raw{"Date"};
[638]	309
[1206]	310	# Convert the date text to internal date format
	311	$value = $raw{"Date"};
[15212]	312	# proper mbox format: Tue, 07 Jan 2003 17:27:42 +1300
[1206]	313	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
[15212]	314	if (!defined($day) \|\| !defined($month) \|\| !defined ($year)) {
	315	# try monthly archive format: Wed Apr 23 00:26:08 2008
	316	($month,$day, $year) = $value =~ /([A-Z][a-z][a-z])\s\s?(\d?\d)\s\d\d:\d\d:\d\d\s(\d\d\d\d)/;
	317	}
	318
[3143]	319	# make some assumptions about the year formatting...
	320	# some (old) software thinks 2001 is 101, some think 2001 is 01
	321	if ($year < 20) { $year += 2000; } # assume not really 1920...
	322	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
[1206]	323	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
[638]	324
[1206]	325	} else {
	326	# We have not extracted a date
	327	$raw{"DateText"} = "Unknown.";
	328	$raw{"Date"} = "19000000";
[638]	329	}
	330
[1244]	331	# Add extracted metadata to document object
[1206]	332	foreach my $name (keys %raw) {
	333	$value = $raw{$name};
	334	if ($value) {
[2730]	335	# assume subject, etc headers have no special HTML meaning.
[1206]	336	$value = &text_into_html($value);
[2730]	337	# escape [] so it isn't re-interpreted as metadata
	338	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
[1206]	339	} else {
	340	$value = "No $name field";
	341	}
[1244]	342	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[1206]	343	}
[638]	344
[7703]	345
	346	# extract a message ID from the headers, if there is one, and we'll use
	347	# that as the greenstone doc ID. Having a predictable ID means we can
	348	# link to other messages, eg from In-Reply-To or References headers...
	349	if ($Headers =~ m@^Message-ID:(.+)$@mi) {
	350	my $id=escape_msg_id($1);
	351	$doc_obj->{'msgid'}=$id;
	352	}
	353	# link to another message, if this is a reply
	354	if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
	355	my $id=escape_msg_id($1);
	356	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
	357	} elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
	358	# References can have multiple, get the last one
	359	my $id=escape_msg_id($1);
	360	# not necessarily in-reply-to, but same thread...
	361	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
	362	}
	363
	364
	365
[2630]	366	my $mimetype="text/plain";
	367	my $mimeinfo="";
[3132]	368	my $charset = $default_header_encoding;
[3073]	369	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
	370	# more than one parameter given to Content-type.
	371	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
[3630]	372	if ($Headers =~ m@^content\-type:\s([\w\.\-/]+)\s(\;\s.+)?\s$@mi)
[2630]	373	{
	374	$mimetype=$1;
	375	$mimetype =~ tr/[A-Z]/[a-z]/;
[3073]	376
	377	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
	378	$mimetype = "text/plain";
	379	}
	380
[2630]	381	$mimeinfo=$2;
[3073]	382	if (!defined $mimeinfo) {
	383	$mimeinfo="";
	384	} else { # strip leading and trailing stuff
	385	$mimeinfo =~ s/^\;\s*//;
	386	$mimeinfo =~ s/\s*$//;
	387	}
	388	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
[2847]	389	$charset = $1;
	390	}
[2630]	391	}
	392
[2680]	393	my $transfer_encoding="7bit";
	394	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
	395	$transfer_encoding=$1;
	396	}
[6062]	397
[2886]	398	if ($mimetype eq "text/html") {
[9971]	399	$$textref= $self->text_from_part($$textref, $Headers);
[2886]	400	} elsif ($mimetype ne "text/plain") {
[3630]	401	$self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
[16341]	402	$$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$default_header_encoding,$$textref);
[6062]	403	} else { # mimetype eq text/plain
	404
	405	if ($transfer_encoding =~ /quoted\-printable/) {
	406	$$textref=qp_decode($$textref);
	407	} elsif ($transfer_encoding =~ /base64/) {
	408	$$textref=base64_decode($$textref);
	409	}
[2886]	410	$self->convert2unicode($charset, $textref);
[8904]	411
	412	$$textref = &text_into_html($$textref);
	413	$$textref =~ s@_@\\_@g; # protect against GS macro language
	414
[2680]	415	}
[6062]	416
[2630]	417
[10218]	418	if ($self->{'headers'} && $self->{'headers'} == 1) {
[6916]	419	# Add "All headers" metadata
	420	$Headers = &text_into_html($Headers);
[2630]	421
[6916]	422	$Headers = "No headers" unless ($Headers =~ /\w/);
	423	$Headers =~ s/@/&#64\;/g;
	424	# escape [] so it isn't re-interpreted as metadata
	425	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
	426	$self->convert2unicode($charset, \$Headers);
[2754]	427
[6916]	428	$Headers =~ s@_@\\_@g; # protect against GS macro language
	429	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
	430	}
[6062]	431
	432
[2918]	433	# Add Title metadata
	434	my $Title = text_into_html($raw{'Subject'});
[7703]	435	$Title .= "<br>From: " . text_into_html($fromnamemeta);
[2918]	436	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
[3073]	437	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
[2918]	438
	439	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
	440
[8121]	441	# Add FileFormat metadata
	442	$doc_obj->add_metadata($cursection, "FileFormat", "EMAIL");
[2918]	443
[1244]	444	# Add text to document object
	445	$$textref = "No message" unless ($$textref =~ /\w/);
[6062]	446
[1244]	447	$doc_obj->add_utf8_text($cursection, $$textref);
[638]	448
[1244]	449	return 1;
[638]	450	}
	451
[16341]	452	# delete any temp files that we have created
	453	sub clean_up_after_doc_obj_processing {
	454	my $self = shift(@_);
	455
	456	foreach my $tmp_file_path (@{$self->{'tmp_file_paths'}}) {
	457	if (-e $tmp_file_path) {
	458	&util::rm($tmp_file_path);
	459	}
	460	}
	461
	462	}
[638]	463
	464	# Convert a text string into HTML.
	465	#
	466	# The HTML is going to be inserted into a GML file, so
	467	# we have to be careful not to use symbols like ">",
	468	# which ocurs frequently in email messages (and use
	469	# &gt instead.
	470	#
	471	# This function also turns links and email addresses into hyperlinks,
	472	# and replaces carriage returns with <BR> tags (and multiple carriage
	473	# returns with <P> tags).
	474
[1206]	475
[638]	476	sub text_into_html {
	477	my ($text) = @_;
	478
[1244]	479	# Convert problem characters into HTML symbols
[3132]	480	$text =~ s/&/&/g;
	481	$text =~ s/</</g;
	482	$text =~ s/>/>/g;
	483	$text =~ s/\"/"/g;
[638]	484
[2630]	485	# convert email addresses and URIs into links
	486	# don't markup email addresses for now
	487	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[638]	488
[2918]	489	# try to munge email addresses a little bit...
	490	$text =~ s/@/@/;
[2730]	491	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
	492	# assume URI doesn't finish with a '.'
[10827]	493	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.\|\%[a-f0-9]{2})?[\w\?\=\-_/~]+)(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@gi;
[2630]	494
	495
[638]	496	# Clean up whitespace and convert \n charaters to <BR> or <P>
[3132]	497	$text =~ s/ +/ /g;
	498	$text =~ s/\s*$//g;
	499	$text =~ s/^\s*//g;
	500	$text =~ s/\n/\n<br>/g;
	501	$text =~ s/<br>\s*<br>/<p>/gi;
[638]	502
	503	return $text;
	504	}
	505
	506
[2630]	507
	508
	509	#Process a MIME message.
	510	# the textref we are given DOES NOT include the header.
	511	sub text_from_mime_message {
[2847]	512	my $self = shift(@_);
[16341]	513	my ($mimetype,$mimeinfo,$default_header_encoding,$text)=(@_);
[6062]	514	my $outhandle=$self->{'outhandle'};
[2630]	515	# Check for multiparts - $mimeinfo will be a boundary
	516	if ($mimetype =~ /multipart/) {
[6062]	517	my $boundary="";
[2732]	518	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
[2630]	519	$boundary=$1;
[2732]	520	if ($boundary =~ m@^\"@) {
	521	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
	522	}
	523	} else {
[15872]	524	print $outhandle "EmailPlugin: (warning) couldn't parse MIME boundary\n";
[2630]	525	}
	526	# parts start with "--$boundary"
	527	# message ends with "--$boundary--"
	528	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
[2680]	529	# that perl might want to interpolate. Also allows spaces...
[2630]	530	$boundary=~s/\\/\\\\/g;
	531	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
[2681]	532	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
[2630]	533	# remove first "part" and last "part" (final --)
	534	shift @message_parts;
	535	my $last=pop @message_parts;
[2680]	536	# if our boundaries are a bit dodgy and we only found 1 part...
	537	if (!defined($last)) {$last="";}
[2630]	538	# make sure it is only -- and whitespace
	539	if ($last !~ /^\-\-\s*$/ms) {
[15872]	540	print $outhandle "EmailPlugin: (warning) last part of MIME message isn't empty\n";
[2630]	541	}
	542	foreach my $message_part (@message_parts) {
	543	# remove the leading newline left from split.
	544	$message_part=~s/^\r?\n//;
	545	}
	546	if ($mimetype eq "multipart/alternative") {
	547	# check for an HTML version first, then TEXT, otherwise use first.
	548	my $part_text="";
	549	foreach my $message_part (@message_parts) {
[10839]	550	if ($message_part =~ m@^content\-type:\s*text/html@i)
[2630]	551	{
	552	# Use the HTML version
[10834]	553	$part_text = $self->text_from_part($message_part);
[2630]	554	$mimetype="text/html";
	555	last;
	556	}
	557	}
	558	if ($part_text eq "") { # try getting a text part instead
	559	foreach my $message_part (@message_parts) {
[10839]	560	if ($message_part =~ m@^content\-type:\s*text/plain@i)
[2630]	561	{
	562	# Use the plain version
[10839]	563	$part_text = $self->text_from_part($message_part);
[2732]	564	if ($part_text =~/[^\s]/) {
[3721]	565	$part_text = text_into_html($part_text);
[2732]	566	}
[2630]	567	$mimetype="text/plain";
	568	last;
	569	}
	570	}
	571	}
[3721]	572	if ($part_text eq "") { #use first part (no html/text part found)
	573	$part_text = $self->text_from_part(shift @message_parts);
	574	$part_text = text_into_html($part_text);
[2630]	575	}
	576	if ($part_text eq "") { # we couldn't get anything!!!
	577	# or it was an empty message...
	578	# do nothing...
[16013]	579	gsprintf($outhandle, "{ReadTextFile.empty_file} - empty body?\n");
[2630]	580	} else {
[3721]	581	$text = $part_text;
[2630]	582	}
[3352]	583	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related\|signed)@) {
[3721]	584	$text = "";
[3352]	585	# signed is for PGP/GPG messages... the last part is a hash
	586	if ($mimetype =~ m@multipart/signed@) {
	587	pop @message_parts;
	588	}
[6062]	589	my $is_first_part=1;
[2630]	590	foreach my $message_part (@message_parts) {
[6062]	591	if ($is_first_part && $text ne "") {$is_first_part=0;}
	592
[2630]	593	if ($mimetype eq "multipart/digest") {
[6062]	594	# default type - RTFRFC!! Set if not already set
	595	$message_part =~ m@^(.*)\n\r?\n@s;
	596	my $part_header=$1;
	597	if ($part_header !~ m@^content-type@mi) {
	598	$message_part="Content-type: message/rfc822\n"
	599	. $message_part; # prepend default type
[4224]	600	}
[2630]	601	}
	602
[16341]	603	$text .= $self->process_multipart_part($default_header_encoding,
	604	$message_part,
[6062]	605	$is_first_part);
[2630]	606	} # foreach message part.
	607	} else {
	608	# we can't handle this multipart type (not mixed or alternative)
	609	# the RFC also mentions "parallel".
	610	}
[3627]	611	} # end of ($mimetype =~ multipart)
[2918]	612	elsif ($mimetype =~ m@message/rfc822@) {
	613	my $msg_header = $text;
	614	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
	615	$text = $1;
	616
[3630]	617	if ($msg_header =~ /^content\-type:\s([\w\.\-\/]+)\s\;?\s(.+?)\s$/mi)
[2918]	618	{
	619	$mimetype=$1;
[9971]	620	$mimeinfo=$2;
[2918]	621	$mimetype =~ tr/[A-Z]/[a-z]/;
[9971]	622
[2918]	623	my $msg_text;
	624	if ($mimetype =~ m@multipart/@) {
[16341]	625	$msg_text = $self->text_from_mime_message($mimetype, $mimeinfo,
	626	$default_header_encoding,
[6062]	627	$text);
[9823]	628	} else {
[9971]	629	$msg_text=$self->text_from_part($text,$msg_header);
[9823]	630	}
[2918]	631
[3630]	632	my $brief_header=text_into_html(get_brief_headers($msg_header));
[2918]	633	$text= "\n<b><<attached message>></b><br>";
	634	$text.= "<table><tr><td width=\"5%\"> </td>\n";
	635	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
	636	. "</td></tr></table>";
	637	}
	638	} else {
[2886]	639	# we don't do any processing of the content.
	640	}
	641
[2630]	642	return $text;
	643	}
	644
	645
[6062]	646
[7703]	647	# used for turning a message id into a more friendly string for greenstone
	648	sub escape_msg_id {
	649	#msgid
	650	my $id=shift;
	651	chomp $id; $id =~ s!\s!!g; # remove spaces
	652	$id =~ s![<>\[\]]!!g; # remove [ ] < and >
	653	$id =~ s![_&]!-!g; # replace symbols that might cause problems
[10834]	654	$id =~ s!\.!-!g; # . means section to greenstone doc ids!
[7703]	655	$id =~ s!@!-!g; # replace @ symbol, to avoid spambots
	656	return $id;
	657	}
[6062]	658
[7703]	659
	660
[6062]	661	sub process_multipart_part {
	662	my $self = shift;
[16341]	663	my $default_header_encoding = shift;
[6062]	664	my $message_part = shift;
	665	my $is_first_part = shift;
	666
	667	my $return_text="";
	668	my $part_header=$message_part;
	669	my $part_body;
	670	if ($message_part=~ /^\s*\n/) {
	671	# no header... use defaults
	672	$part_body=$message_part;
	673	$part_header="Content-type: text/plain; charset=us-ascii";
	674	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
	675	$part_body=$1;
	676	} else {
	677	# something's gone wrong...
	678	$part_header="";
	679	$part_body=$message_part;
	680	}
	681
	682	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
	683	my $part_content_type="";
	684	my $part_content_info="";
	685
	686	if ($part_header =~ m@^content\-type:\s([\w\.\-/]+)\s(\;.*)?$@mi) {
	687	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
	688	$part_content_info=$2;
	689	if (!defined($part_content_info)) {
	690	$part_content_info="";
	691	} else {
	692	$part_content_info =~ s/^\;\s*//;
	693	$part_content_info =~ s/\s*$//;
	694	}
	695	}
	696	my $filename="";
[6079]	697	if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
[6062]	698	$filename=$1;
[6079]	699	$filename =~ s@\r?\s*$@@; # remove trailing space, if any
[16341]	700	# decode the filename
	701	$self->decode_header_value($default_header_encoding, \$filename);
	702
[6062]	703	}
	704
	705	# disposition - either inline or attachment.
	706	# NOT CURRENTLY USED - we display all text types instead...
	707	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
	708
	709	# add <<attachment>> to each part except the first...
	710	if (!$is_first_part) {
	711	$return_text.="\n<p><hr><strong><<attachment>>";
	712	# add part info header
[10827]	713	my $header_text = "<br>Type: $part_content_type<br>\n";
[6062]	714	if ($filename ne "") {
[10827]	715	$header_text .= "Filename: $filename\n";
[6062]	716	}
	717	$header_text =~ s@_@\\_@g;
[10827]	718	$return_text .= $header_text . "</strong></p>\n<p>\n";
[6062]	719	}
	720
	721	if ($part_content_type =~ m@text/@)
	722	{
[10827]	723	# $message_part includes the mime part headers
	724	my $part_text = $self->text_from_part($message_part);
[6062]	725	if ($part_content_type !~ m@text/(ht\|x)ml@) {
	726	$part_text = text_into_html($part_text);
	727	}
	728	if ($part_text eq "") {
	729	$part_text = ' ';
	730	}
	731	$return_text .= $part_text;
	732	} elsif ($part_content_type =~ m@message/rfc822@) {
	733	# This is a forwarded message
	734	my $message_part_headers=$part_body;
	735	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
	736	my $message_part_body=$1;
	737	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
	738
	739	my $rfc822_formatted_body=""; # put result in here
	740	if ($message_part_headers =~
	741	/^content\-type:\s([\w\.\-\/]+)\s(\;.*)?$/ims)
	742	{
	743	# The message header uses MIME flags
	744	my $message_content_type=$1;
	745	my $message_content_info=$2;
	746	if (!defined($message_content_info)) {
	747	$message_content_info="";
	748	} else {
	749	$message_content_info =~ s/^\;\s*//;
	750	$message_content_info =~ s/\s*$//;
	751	}
	752	$message_content_type =~ tr/A-Z/a-z/;
	753	if ($message_content_type =~ /multipart/) {
	754	$rfc822_formatted_body=
	755	$self->text_from_mime_message($message_content_type,
	756	$message_content_info,
[16341]	757	$default_header_encoding,
[6062]	758	$message_part_body);
	759	} else {
[9971]	760	$message_part_body=$self->text_from_part($part_body,
	761	$message_part_headers);
[6062]	762	$rfc822_formatted_body=text_into_html($message_part_body);
	763	}
	764	} else {
	765	# message doesn't use MIME flags
	766	$rfc822_formatted_body=text_into_html($message_part_body);
	767	$rfc822_formatted_body =~ s@_@\\_@g;
	768	}
	769	# Add the returned text to the output
	770	# don't put all the headers...
	771	# $message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
	772	my $brief_headers=get_brief_headers($message_part_headers);
	773	$return_text.=text_into_html($brief_headers);
	774	$return_text.="</p><p>\n";
	775	$return_text.=$rfc822_formatted_body;
	776	$return_text.="</p>\n";
	777	# end of message/rfc822
	778	} elsif ($part_content_type =~ /multipart/) {
	779	# recurse again
	780
	781	my $tmptext= $self->text_from_mime_message($part_content_type,
	782	$part_content_info,
[16341]	783	$default_header_encoding,
[6062]	784	$part_body);
	785	$return_text.=$tmptext;
	786	} else {
	787	# this part isn't text/* or another message...
	788	if ($is_first_part) {
	789	# this is the first part of a multipart, or only part!
	790	$return_text="\n<p><hr><strong><<attachment>>";
	791	# add part info header
	792	my $header_text="<br>Type: $part_content_type<br>\n";
	793	$header_text.="Filename: $filename</strong></p>\n<p>\n";
	794	$header_text =~ s@_@\\_@g;
	795	$return_text.=$header_text;
	796	}
	797
	798	# save attachment by default
[10218]	799	if (!$self->{'no_attachments'}
[6062]	800	&& $filename ne "") { # this part has a file...
	801	my $encoding="8bit";
	802	if ($part_header =~
	803	/^content-transfer-encoding:\s*(\w+)/mi ) {
	804	$encoding=$1; $encoding =~ tr/A-Z/a-z/;
	805	}
[16341]	806	my $tmpdir=&util::filename_cat($ENV{'GSDLHOME'}, "tmp");
[6062]	807	my $save_filename=$filename;
	808
	809	# make sure we don't clobber files with same name;
	810	# need to keep state between .mbx files
	811	my $assoc_files=$self->{'assoc_filenames'};
	812	if ($assoc_files->{$filename}) { # it's been set...
	813	$assoc_files->{$filename}++;
	814	$filename =~ m/(.+)\.(\w+)$/;
	815	my ($filestem, $ext)=($1,$2);
	816	$save_filename="${filestem}_"
	817	. $assoc_files->{$filename} . ".$ext";
	818	} else { # first file with this name
	819	$assoc_files->{$filename}=1;
	820	}
[16341]	821	my $tmp_filename = &util::filename_cat($tmpdir, $save_filename);
	822	open (SAVE, ">$tmp_filename") \|\|
	823	warn "EMAILPlug: Can't save attachment as $tmp_filename: $!";
	824	binmode(SAVE); # needed on Windows
[6062]	825	my $part_text = $message_part;
	826	$part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
	827	if ($encoding eq "base64") {
	828	print SAVE base64_decode($part_text);
	829	} elsif ($encoding eq "quoted-printable") {
	830	print SAVE qp_decode($part_text);
	831	} else { # 7bit, 8bit, binary, etc...
	832	print SAVE $part_text;
	833	}
	834	close SAVE;
	835	my $doc_obj=$self->{'doc_obj'};
[16341]	836	$doc_obj->associate_file("$tmp_filename",
[6062]	837	"$save_filename",
	838	$part_content_type # mimetype
	839	);
[16341]	840	# add this file to the list of tmp files for deleting later
	841	push(@{$self->{'tmp_file_paths'}}, $tmp_filename);
	842
[6062]	843	my $outhandle=$self->{'outhandle'};
[15872]	844	print $outhandle "EmailPlugin: saving attachment \"$filename\"\n"; #
[6062]	845
	846	# be nice if "download" was a translatable macro :(
	847	$return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
	848	} # end of save attachment
	849	} # end of !text/message part
	850
	851
	852	return $return_text;
	853	}
	854
	855
[3630]	856	# Return only the "important" headers from a set of message headers
	857	sub get_brief_headers {
	858	my $msg_header = shift;
	859	my $brief_header = "";
[2630]	860
[3630]	861	# Order matters!
	862	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
	863	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
	864	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
	865	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
	866	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
[2630]	867
[3630]	868	return $brief_header;
	869	}
[2630]	870
	871
	872	# Process a MIME part. Return "" if we can't decode it.
[6062]	873	# should only be called for parts with type "text/*" ?
[9971]	874	# Either pass the entire mime part (including the part's header),
	875	# or pass the mime part's text and optionally the part's header.
[2630]	876	sub text_from_part {
[3132]	877	my $self = shift;
[3136]	878	my $text = shift \|\| '';
[9971]	879	my $part_header = shift;
[3136]	880
[10834]	881
[9971]	882	my $type="text/plain"; # default, overridden from part header
[10827]	883	my $charset=undef; # convert2unicode() will guess if necessary
[9971]	884
	885	if (! $part_header) { # no header argument was given. check the body
	886	$part_header = $text;
	887	# check for empty part header (leading blank line)
	888	if ($text =~ /^\s*\r?\n/) {
	889	$part_header="Content-type: text/plain; charset=us-ascii";
	890	} else {
	891	$part_header =~ s/\r?\n\r?\n(.*)$//s;
	892	$text=$1; if (!defined($text)) {$text="";}
	893	}
	894	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
[2681]	895	}
[9971]	896
	897	if ($part_header =~
	898	/content\-type:\s([\w\.\-\/]+).?charset=\"?([^\;\"\s]+)\"?/is) {
	899	$type=$1;
	900	$charset=$2;
	901	}
[2630]	902	my $encoding="";
[2638]	903	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
[2630]	904	$encoding=$1; $encoding=~tr/A-Z/a-z/;
	905	}
	906	# Content-Transfer-Encoding is per-part
	907	if ($encoding ne "") {
	908	if ($encoding =~ /quoted\-printable/) {
	909	$text=qp_decode($text);
	910	} elsif ($encoding =~ /base64/) {
	911	$text=base64_decode($text);
	912	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
	913	# rfc2045 also allows binary, which we ignore (for now).
[3630]	914	my $outhandle=$self->{'outhandle'};
[15872]	915	print $outhandle "EmailPlugin: unknown transfer encoding: $encoding\n";
[2630]	916	return "";
	917	}
	918	}
[10834]	919
[2630]	920	if ($type eq "text/html") {
	921	# only get stuff between <body> tags, or <html> tags.
[2730]	922	$text =~ s@^.<html[^>]>@@is;
	923	$text =~ s@</html>.*$@@is;
	924	$text =~ s/^.?<body[^>]>//si;
	925	$text =~ s/<\/body>.*$//si;
[2630]	926	}
	927	elsif ($type eq "text/xml") {
	928	$text=~s/</</g;$text=~s/>/>/g;
	929	$text="<pre>\n$text\n</pre>\n";
	930	}
[2730]	931	# convert to unicode
[2847]	932	$self->convert2unicode($charset, \$text);
[6062]	933	$text =~ s@_@\\_@g; # protect against GS macro language
[2630]	934	return $text;
	935	}
	936
	937
[6062]	938
	939
[2630]	940	# decode quoted-printable text
	941	sub qp_decode {
	942	my $text=shift;
	943
	944	# if a line ends with "=\s*", it is a soft line break, otherwise
	945	# keep in any newline characters.
	946
[3627]	947	$text =~ s/=\s*\r?\n//mg;
	948	$text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
[2630]	949	return $text;
	950	}
	951
	952	# decode base64 text. This is fairly slow (since it's interpreted perl rather
	953	# than compiled XS stuff like in the ::MIME modules, but this is more portable
	954	# for us at least).
	955	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
	956	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
[2638]	957	# from each encoded byte.
[2630]	958
	959
	960	sub base64_decode {
	961	my $enc_text = shift;
	962	# A=>0, B=>1, ..., '+'=>62, '/'=>63
	963	# also '=' is used for padding at the end, but we remove it anyway.
	964	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	965	# map each MIME char into it's value, for more efficient lookup.
	966	my %index;
	967	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
	968	# remove all non-base64 chars. eval to get variable in transliteration...
	969	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
	970	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
	971	my $decoded="";
	972	while (length ($enc_text)>3)
	973	{
	974	my $fourchars=substr($enc_text,0,4,"");
	975	my @chars=(split '',$fourchars);
	976	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
	977	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	978	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
	979	}
	980	# if there are any input chars left, there are either
	981	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
	982	my @chars=(split '',$enc_text);
	983	if (length($enc_text)) {
	984	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
	985	}
	986	if (length($enc_text)==3) {
	987	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	988	}
	989	return $decoded;
	990	}
	991
[10834]	992	# returns 0 if valid utf-8, 1 if invalid
	993	sub is_utf8 {
	994	my $self = shift;
	995	my $textref = shift;
	996
	997	$$textref =~ m/^/g; # to set \G
	998	my $badbytesfound=0;
	999	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
	1000	my $highbytes=$1;
	1001	my $highbyteslength=length($highbytes);
	1002	# replace any non utf8 complaint bytes
	1003	$highbytes =~ /^/g; # set pos()
	1004	while ($highbytes =~
	1005	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
	1006	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
	1007	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
	1008	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
	1009	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
	1010	)*([\x80-\xff])? !xg
	1011	) {
	1012	my $badbyte=$1;
	1013	if (!defined $badbyte) {next} # hit end of string
	1014	return 1;
	1015	}
	1016	}
	1017	return 0;
	1018	}
	1019
[16341]	1020	# words with non ascii characters in header values must be encoded in the
	1021	# following manner =?<charset>?[BQ]?<data>?= (rfc2047)
[10834]	1022
[16341]	1023	sub decode_header_value {
	1024	my $self = shift(@_);
	1025	my ($default_header_encoding, $textref) = @_;
	1026
	1027	if (!$$textref) {
	1028	# nothing to do!
	1029	return;
	1030	}
	1031	my $value = $$textref;
	1032	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
	1033	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
	1034	my $original_value=$value;
	1035	my $encoded=$value;
	1036	$value="";
	1037	# we should ignore spaces between consecutive encoded-texts
	1038	$encoded =~ s@\?=\s+=\?@\?==\?@g;
	1039	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=//i) {
	1040	my ($charset, $encoding, $data)=($2,$3,$4);
	1041	my ($decoded_data);
	1042	my $leading_chars = "$1";
	1043	$self->convert2unicode($default_header_encoding, \$leading_chars);
	1044	$value.=$leading_chars;
	1045
	1046	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
	1047	chomp $data;
	1048	$encoding =~ tr/BQ/bq/;
	1049	if ($encoding eq "q") { # quoted printable
	1050	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
	1051	$decoded_data=qp_decode($data);
	1052	# qp_decode adds \n, which is default for body text
	1053	chomp($decoded_data);
	1054	} else { # base 64
	1055	$decoded_data=base64_decode($data);
	1056	}
	1057	$self->convert2unicode($charset, \$decoded_data);
	1058	$value .= $decoded_data;
	1059	} # end of while loop
	1060
	1061	# get any trailing characters
	1062	$self->convert2unicode($default_header_encoding, \$encoded);
	1063	$value.=$encoded;
	1064
	1065	if ($value =~ /^\s*$/) { # we couldn't extract anything...
	1066	$self->convert2unicode($default_header_encoding,
	1067	\$original_value);
	1068	$value=$original_value;
	1069	}
	1070	$$textref = $value;
	1071	} # end of if =?...?=
	1072
	1073	# In the absense of other charset information, assume the
	1074	# header is the default (usually "iso_8859_1") and convert to unicode.
	1075	else {
	1076	$self->convert2unicode($default_header_encoding, $textref);
	1077	}
	1078
	1079	}
[10834]	1080
[16341]	1081
	1082
[2847]	1083	sub convert2unicode {
	1084	my $self = shift(@_);
	1085	my ($charset, $textref) = @_;
[2630]	1086
[7703]	1087	if (!$$textref) {
	1088	# nothing to do!
	1089	return;
	1090	}
	1091
[10827]	1092	if (! defined $charset) {
	1093	# check if we have valid utf-8
[10834]	1094	if ($self->is_utf8($textref)) { $charset = "utf8" }
[10827]	1095
	1096	# default to latin
	1097	$charset = "iso_8859_1" if ! defined($charset);
	1098	}
	1099
[2847]	1100	# first get our character encoding name in the right form.
[7703]	1101	$charset =~ tr/A-Z/a-z/; # lowercase
	1102	$charset =~ s/\-/_/g;
[8902]	1103	if ($charset =~ /gb_?2312/) { $charset="gb" }
[2847]	1104	# assumes EUC-KR, not ISO-2022 !?
[7703]	1105	$charset =~ s/^ks_c_5601_1987/korean/;
	1106	if ($charset eq 'utf_8') {$charset='utf8'}
[2847]	1107
[7703]	1108	my $outhandle = $self->{'outhandle'};
	1109
	1110	if ($charset eq "utf8") {
	1111	# no conversion needed, but lets check that it's valid utf8
	1112	# see utf-8 manpage for valid ranges
	1113	$$textref =~ m/^/g; # to set \G
	1114	my $badbytesfound=0;
	1115	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
	1116	my $highbytes=$1;
	1117	my $highbyteslength=length($highbytes);
	1118	# replace any non utf8 complaint bytes
	1119	$highbytes =~ /^/g; # set pos()
	1120	while ($highbytes =~
[8904]	1121	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
[7703]	1122	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
[8904]	1123	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
	1124	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
[7703]	1125	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
	1126	)*([\x80-\xff])? !xg
	1127	) {
	1128	my $badbyte=$1;
	1129	if (!defined $badbyte) {next} # hit end of string
	1130	my $pos=pos($highbytes);
	1131	substr($highbytes, $pos-1, 1, "\xc2\x80");
	1132	# update the position to continue searching (for \G)
	1133	pos($highbytes) = $pos+1; # set to just after the \x80
	1134	$badbytesfound=1;
	1135	}
	1136	if ($badbytesfound==1) {
	1137	# claims to be utf8, but it isn't!
[15872]	1138	print $outhandle "EmailPlugin: Headers claim utf-8 but bad bytes "
[7703]	1139	. "detected and removed.\n";
	1140
	1141	my $replength=length($highbytes);
	1142	my $textpos=pos($$textref);
	1143	# replace bad bytes with good bytes
	1144	substr( $$textref, $textpos-$replength,
	1145	$replength, $highbytes);
	1146	# update the position to continue searching (for \G)
	1147	pos($$textref)=$textpos+($replength-$highbyteslength);
	1148	}
	1149	}
[3206]	1150	return;
	1151	}
	1152
[2847]	1153	# It appears that we can't always trust ascii text so we'll treat it
	1154	# as iso-8859-1 (letting characters above 0x80 through without
	1155	# converting them to utf-8 will result in invalid XML documents
	1156	# which can't be parsed at build time).
	1157	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
	1158
[3073]	1159	if ($charset eq "iso_8859_1") {
	1160	# test if the mailer lied, and it has win1252 chars in it...
	1161	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
	1162	if ($$textref =~ m/[\x80-\x9f]/) {
[15872]	1163	print $outhandle "EmailPlugin: Headers claim ISO charset but MS ";
[3073]	1164	print $outhandle "codepage 1252 detected.\n";
	1165	$charset = "windows_1252";
	1166	}
	1167	}
[3351]	1168	my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
	1169
	1170	if ($utf8_text ne "") {
	1171	$$textref=$utf8_text;
	1172	} else {
	1173	# we didn't get any text... unsupported encoding perhaps? Or it is
	1174	# empty anyway. We'll try to continue, assuming 8859-1. We could strip
	1175	# characters out here if this causes problems...
[3726]	1176	my $outhandle=$self->{'outhandle'};
[15872]	1177	print $outhandle "EmailPlugin: falling back to iso-8859-1\n";
[3351]	1178	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
	1179
	1180	}
[2847]	1181	}
	1182
[17026]	1183	sub get_base_OID {
	1184	my $self = shift(@_);
	1185	my ($doc_obj) = @_;
[2847]	1186
[17026]	1187	## TODO: need a valid id in case there is no message id...
	1188	if ($self->{'OIDtype'} eq "message_id") {
	1189	# temporarily set OIDtype to hash to get a base id
	1190	$self->{'OIDtype'} = "hash_on_ga_xml";
	1191	$self->add_OID($doc_obj);
	1192	my $id = $doc_obj->get_OID();
	1193	$self->{'OIDtype'} = "message_id";
	1194	return $id;
	1195	}
	1196	return $self->SUPER::get_base_OID(@_);
	1197	}
	1198
	1199
	1200	sub add_OID {
[7703]	1201	my $self = shift (@_);
	1202	my ($doc_obj, $id, $segment_number) = @_;
	1203
[17026]	1204	if ($self->{'OIDtype'} eq "message_id" && exists $doc_obj->{'msgid'} ) {
[7703]	1205	$doc_obj->set_OID($doc_obj->{'msgid'});
[17026]	1206	}
	1207	else {
[7703]	1208	$doc_obj->set_OID("$id\_$segment_number");
	1209	}
	1210	}
	1211
	1212
[1206]	1213	# Perl packages have to return true if they are run.
	1214	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: