Context Navigation

source: gsdl/trunk/perllib/plugins/EmailPlugin.pm@ 19283

Last change on this file since 19283 was 19283, checked in by kjdon, 15 years ago
added a new case for From metadata format: username at server (full name) - the current greenstone mail archive files use this
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 38.0 KB

Rev	Line
[638]	1	###########################################################################
	2	#
[15872]	3	# EmailPlugin.pm - a plugin for parsing email files
[638]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[3630]	9	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[638]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27
[1206]	28
[15872]	29	# EmailPlugin
[638]	30	#
[1206]	31	# by Gordon Paynter ([email protected])
[638]	32	#
[1206]	33	# Email plug reads email files. These are named with a simple
[2096]	34	# number (i.e. as they appear in maildir folders) or with the
	35	# extension .mbx (for mbox mail file format)
[638]	36	#
	37	# Document text:
[1206]	38	# The document text consists of all the text
	39	# after the first blank line in the document.
[638]	40	#
[2730]	41	# Metadata (not Dublin Core!):
[6916]	42	# $Headers All the header content (optional, not stored by default)
[638]	43	# $Subject Subject: header
	44	# $To To: header
[2730]	45	# $From From: header
[2630]	46	# $FromName Name of sender (where available)
	47	# $FromAddr E-mail address of sender
[638]	48	# $DateText Date: header
	49	# $Date Date: header in GSDL format (eg: 19990924)
[2630]	50	#
[2918]	51	# $Title made up of Subject, Date and Sender (for default formatting)
[19281]	52	# $InReplyTo Message id of the one this replies to
[2630]	53	#
	54	# John McPherson - June/July 2001
	55	# added (basic) MIME support and quoted-printable and base64 decodings.
	56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
	57	#
	58	# See: * RFC 822 - ARPA Internet Text Messages
	59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
	60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
[2730]	61	# * RFC 2047 - MIME (part 3) Message Header Extensions
[2630]	62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
[3540]	63
[16341]	64
[15872]	65	package EmailPlugin;
[3540]	66
[7830]	67	use strict;
	68	no strict "refs"; # so we can use a variable as a filehandle for print $out
	69
[638]	70
[15872]	71	use SplitTextFile;
[7830]	72	use unicode; # gs conv functions
	73	use gsprintf 'gsprintf'; # translations
[2730]	74
[638]	75	use sorttools;
	76	use util;
	77
[10254]	78	sub BEGIN {
[15872]	79	@EmailPlugin::ISA = ('SplitTextFile');
[10254]	80	}
[638]	81
[17026]	82	my $extended_oidtype_list =
	83	[ {'name' => "message_id",
	84	'desc' => "{EmailPlugin.OIDtype.message_id}" }
	85	];
[638]	86
[17026]	87	# add in all the standard options from BasePlugin
[17053]	88	unshift (@$extended_oidtype_list, @{$BasePlugin::oidtype_list});
[17026]	89
[3540]	90	my $arguments =
[4744]	91	[ { 'name' => "process_exp",
[15872]	92	'desc' => "{BasePlugin.process_exp}",
[6408]	93	'type' => "regexp",
[3540]	94	'reqd' => "no",
[4744]	95	'deft' => &get_default_process_exp() },
	96	{ 'name' => "no_attachments",
[15872]	97	'desc' => "{EmailPlugin.no_attachments}",
[3630]	98	'type' => "flag",
[4744]	99	'reqd' => "no" },
[6916]	100	{ 'name' => "headers",
[15872]	101	'desc' => "{EmailPlugin.headers}",
[6916]	102	'type' => "flag",
	103	'reqd' => "no" },
[17026]	104	{ 'name' => "OIDtype",
	105	'desc' => "{import.OIDtype}",
	106	'type' => "enum",
	107	'list' => $extended_oidtype_list,
[19281]	108	'deft' => "message_id",
[18591]	109	'reqd' => "no" },
[17026]	110	{ 'name' => "OIDmetadata",
	111	'desc' => "{import.OIDmetadata}",
	112	'type' => "metadata",
	113	'deft' => "dc.Identifier",
[18591]	114	'reqd' => "no" },
[4744]	115	{ 'name' => "split_exp",
[15872]	116	'desc' => "{EmailPlugin.split_exp}",
[6408]	117	'type' => "regexp",
	118	'reqd' => "no",
	119	'deft' => &get_default_split_exp() }
	120	];
[3540]	121
[15872]	122	my $options = { 'name' => "EmailPlugin",
	123	'desc' => "{EmailPlugin.desc}",
[6408]	124	'abstract' => "no",
	125	'inherits' => "yes",
[4744]	126	'args' => $arguments };
[3540]	127
[638]	128	sub new {
[10218]	129	my ($class) = shift (@_);
	130	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	131	push(@$pluginlist, $class);
[3540]	132
[15872]	133	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	134	push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]	135
[15872]	136	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]	137
[3630]	138	$self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
[16341]	139	$self->{'tmp_file_paths'} = (); # list of tmp files to delete after processing is finished
[3630]	140
[2730]	141	# this might not actually be true at read-time, but after processing
	142	# it should all be utf8.
	143	$self->{'input_encoding'}="utf8";
[638]	144	return bless $self, $class;
	145	}
	146
[1244]	147	sub get_default_process_exp {
	148	my $self = shift (@_);
[2096]	149	# mbx/email for mailbox file format, \d+ for maildir (each message is
	150	# in a separate file, with a unique number for filename)
[3111]	151	# mozilla and IE will save individual mbx format files with a ".eml" ext.
	152	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
[638]	153	}
	154
[1895]	155	# This plugin splits the mbox mail files at lines starting with From<sp>
[3111]	156	# It is supposed to be "\n\nFrom ", but this isn't always used.
[9971]	157	# add \d{4} so that the line ends in a year (in case the text has an
	158	# unescaped "From " at the start of a line).
[1895]	159	sub get_default_split_exp {
[9971]	160	return q^\nFrom .*\d{4}\n^;
[3523]	161
[1895]	162	}
	163
[16677]	164	sub can_process_this_file {
	165	my $self = shift(@_);
	166	my ($filename) = @_;
[1895]	167
[16677]	168	# avoid any confusion between filenames matching \d+ (which are by default
	169	# matched by EmailPlugin) and directories that match \d+ (which should not)
	170
	171	return 0 if (-d $filename);
	172
	173	if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
	174	return 1;
	175	}
	176	return 0;
	177
	178	}
	179
	180
[1244]	181	# do plugin specific processing of doc_obj
	182	sub process {
[2630]	183
[638]	184	my $self = shift (@_);
[6332]	185	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1424]	186	my $outhandle = $self->{'outhandle'};
	187
[1244]	188	# Check that we're dealing with a valid mail file
[3111]	189	# mbox message files start with "From "
	190	# maildir messages usually start with Return-Path and Delivered-To
	191	# mh is very similar to maildir
	192	my $startoffile=substr($$textref,0,256);
	193	if (($startoffile !~ /^(From )/) &&
	194	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
	195	return undef;
	196	}
[638]	197
[1244]	198	my $cursection = $doc_obj->get_top_section();
[638]	199
[1206]	200	#
	201	# Parse the document's text and extract metadata
	202	#
[638]	203
[2652]	204	# Protect backslashes
	205	$$textref =~ s@\\@\\\\@g;
	206
[1206]	207	# Separate header from body of message
[1244]	208	my $Headers = $$textref;
[2630]	209	$Headers =~ s/\r?\n\r?\n(.*)$//s;
	210	$$textref = $1;
[4089]	211	$Headers .= "\n";
[2779]	212
[2630]	213	# Unfold headers - see rfc822
	214	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
[1206]	215	# Extract basic metadata from header
	216	my @headers = ("From", "To", "Subject", "Date");
	217	my %raw;
[1658]	218	foreach my $name (@headers) {
	219	$raw{$name} = "No $name value";
	220	}
[1206]	221
[3132]	222	# Get a default encoding for the header - RFC says should be ascii...
[6062]	223	my $default_header_encoding="iso_8859_1";
[3132]	224
	225	# We don't know what character set is the user's default...
	226	# We could use textcat to guess... for now we'll look at mime content-type
	227	# if ($Headers =~ /([[:^ascii:]])/) {
	228	# }
	229	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
	230	$default_header_encoding=$1;
	231	$default_header_encoding =~ s@\-@_@g;
	232	$default_header_encoding =~ tr/A-Z/a-z/;
	233	}
	234
	235
[1658]	236	# Examine each line of the headers
	237	my ($line, $name, $value);
	238	my @parts;
	239	foreach $line (split(/\n/, $Headers)) {
	240
	241	# Ignore lines with no content or which begin with whitespace
	242	next unless ($line =~ /:/);
	243	next if ($line =~ /^\s/);
	244
	245	# Find out what metadata is on this line
	246	@parts = split(/:/, $line);
	247	$name = shift @parts;
[3136]	248	# get fieldname in canonical form - first cap, then lower case.
[3134]	249	$name =~ tr/A-Z/a-z/;
[3136]	250	# uppercase the first character according to the current locale
[2630]	251	$name=~s/(.+)/\u$1/;
[1658]	252	next unless $name;
	253	next unless ($raw{$name});
	254
	255	# Find the value of that metadata
	256	$value = join(":", @parts);
	257	$value =~ s/^\s+//;
	258	$value =~ s/\s+$//;
[16341]	259	# decode header values, using either =?<charset>?[BQ]?<data>?= (rfc2047) or default_header_encoding
	260	$self->decode_header_value($default_header_encoding, \$value);
[2730]	261
[1658]	262	# Store the metadata
[6062]	263	$value =~ s@_@\\_@g; # protect against GS macro language
[1206]	264	$raw{$name} = $value;
	265	}
	266
[2630]	267	# Extract the name and e-mail address from the From metadata
[6062]	268	my $frommeta = $raw{"From"};
[2680]	269	my $fromnamemeta;
	270	my $fromaddrmeta;
	271
	272	$frommeta =~ s/\s*$//; # Remove trailing space, if any
	273
	274	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
	275	$fromnamemeta=$1;
	276	$fromaddrmeta=$2;
	277	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
	278	$fromnamemeta=$2;
	279	$fromaddrmeta=$1;
[19283]	280	} elsif ($frommeta =~ m/(.+)\s+at\s+(.+)\s+\((.*)\)/) {
	281	$fromnamemeta=$3;
	282	$fromaddrmeta="$1@$2";
	283	}
	284
[2630]	285	if (!defined($fromaddrmeta)) {
	286	$fromaddrmeta=$frommeta;
	287	}
[2680]	288	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
[2717]	289	# minor attempt to prevent spam-bots from harvesting addresses...
	290	$fromaddrmeta=~s/@/@/;
[6062]	291
[2630]	292	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
	293
[3328]	294	if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
[3215]	295	$fromnamemeta =~ s/\"//g; # remove quotes
	296	$fromnamemeta =~ s/\s+$//; # remove trailing whitespace
[2630]	297	}
	298	else {
	299	$fromnamemeta = $fromaddrmeta;
	300	}
	301	# if name is an address
	302	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
[2717]	303	$fromnamemeta=~s/@/&#64\;/;
[2630]	304	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
	305
	306	$raw{"From"}=$frommeta;
	307
[1206]	308	# Process Date information
[1658]	309	if ($raw{"Date"} !~ /No Date/) {
[1206]	310	$raw{"DateText"} = $raw{"Date"};
[638]	311
[1206]	312	# Convert the date text to internal date format
	313	$value = $raw{"Date"};
[15212]	314	# proper mbox format: Tue, 07 Jan 2003 17:27:42 +1300
[1206]	315	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
[15212]	316	if (!defined($day) \|\| !defined($month) \|\| !defined ($year)) {
	317	# try monthly archive format: Wed Apr 23 00:26:08 2008
	318	($month,$day, $year) = $value =~ /([A-Z][a-z][a-z])\s\s?(\d?\d)\s\d\d:\d\d:\d\d\s(\d\d\d\d)/;
	319	}
	320
[3143]	321	# make some assumptions about the year formatting...
	322	# some (old) software thinks 2001 is 101, some think 2001 is 01
	323	if ($year < 20) { $year += 2000; } # assume not really 1920...
	324	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
[1206]	325	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
[638]	326
[1206]	327	} else {
	328	# We have not extracted a date
	329	$raw{"DateText"} = "Unknown.";
	330	$raw{"Date"} = "19000000";
[638]	331	}
	332
[1244]	333	# Add extracted metadata to document object
[1206]	334	foreach my $name (keys %raw) {
	335	$value = $raw{$name};
	336	if ($value) {
[2730]	337	# assume subject, etc headers have no special HTML meaning.
[1206]	338	$value = &text_into_html($value);
[2730]	339	# escape [] so it isn't re-interpreted as metadata
	340	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
[1206]	341	} else {
	342	$value = "No $name field";
	343	}
[1244]	344	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[1206]	345	}
[638]	346
[7703]	347
	348	# extract a message ID from the headers, if there is one, and we'll use
	349	# that as the greenstone doc ID. Having a predictable ID means we can
	350	# link to other messages, eg from In-Reply-To or References headers...
	351	if ($Headers =~ m@^Message-ID:(.+)$@mi) {
	352	my $id=escape_msg_id($1);
	353	$doc_obj->{'msgid'}=$id;
	354	}
	355	# link to another message, if this is a reply
	356	if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
	357	my $id=escape_msg_id($1);
	358	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
	359	} elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
	360	# References can have multiple, get the last one
	361	my $id=escape_msg_id($1);
	362	# not necessarily in-reply-to, but same thread...
	363	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
	364	}
	365
	366
	367
[2630]	368	my $mimetype="text/plain";
	369	my $mimeinfo="";
[3132]	370	my $charset = $default_header_encoding;
[3073]	371	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
	372	# more than one parameter given to Content-type.
	373	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
[3630]	374	if ($Headers =~ m@^content\-type:\s([\w\.\-/]+)\s(\;\s.+)?\s$@mi)
[2630]	375	{
	376	$mimetype=$1;
	377	$mimetype =~ tr/[A-Z]/[a-z]/;
[3073]	378
	379	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
	380	$mimetype = "text/plain";
	381	}
	382
[2630]	383	$mimeinfo=$2;
[3073]	384	if (!defined $mimeinfo) {
	385	$mimeinfo="";
	386	} else { # strip leading and trailing stuff
	387	$mimeinfo =~ s/^\;\s*//;
	388	$mimeinfo =~ s/\s*$//;
	389	}
	390	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
[2847]	391	$charset = $1;
	392	}
[2630]	393	}
	394
[2680]	395	my $transfer_encoding="7bit";
	396	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
	397	$transfer_encoding=$1;
	398	}
[6062]	399
[2886]	400	if ($mimetype eq "text/html") {
[9971]	401	$$textref= $self->text_from_part($$textref, $Headers);
[2886]	402	} elsif ($mimetype ne "text/plain") {
[3630]	403	$self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
[16341]	404	$$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$default_header_encoding,$$textref);
[6062]	405	} else { # mimetype eq text/plain
	406
	407	if ($transfer_encoding =~ /quoted\-printable/) {
	408	$$textref=qp_decode($$textref);
	409	} elsif ($transfer_encoding =~ /base64/) {
	410	$$textref=base64_decode($$textref);
	411	}
[2886]	412	$self->convert2unicode($charset, $textref);
[8904]	413
	414	$$textref = &text_into_html($$textref);
	415	$$textref =~ s@_@\\_@g; # protect against GS macro language
	416
[2680]	417	}
[6062]	418
[2630]	419
[10218]	420	if ($self->{'headers'} && $self->{'headers'} == 1) {
[6916]	421	# Add "All headers" metadata
	422	$Headers = &text_into_html($Headers);
[2630]	423
[6916]	424	$Headers = "No headers" unless ($Headers =~ /\w/);
	425	$Headers =~ s/@/&#64\;/g;
	426	# escape [] so it isn't re-interpreted as metadata
	427	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
	428	$self->convert2unicode($charset, \$Headers);
[2754]	429
[6916]	430	$Headers =~ s@_@\\_@g; # protect against GS macro language
	431	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
	432	}
[6062]	433
	434
[2918]	435	# Add Title metadata
	436	my $Title = text_into_html($raw{'Subject'});
[7703]	437	$Title .= "<br>From: " . text_into_html($fromnamemeta);
[2918]	438	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
[3073]	439	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
[2918]	440
	441	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
	442
[8121]	443	# Add FileFormat metadata
	444	$doc_obj->add_metadata($cursection, "FileFormat", "EMAIL");
[2918]	445
[1244]	446	# Add text to document object
	447	$$textref = "No message" unless ($$textref =~ /\w/);
[6062]	448
[1244]	449	$doc_obj->add_utf8_text($cursection, $$textref);
[638]	450
[1244]	451	return 1;
[638]	452	}
	453
[16341]	454	# delete any temp files that we have created
	455	sub clean_up_after_doc_obj_processing {
	456	my $self = shift(@_);
	457
	458	foreach my $tmp_file_path (@{$self->{'tmp_file_paths'}}) {
	459	if (-e $tmp_file_path) {
	460	&util::rm($tmp_file_path);
	461	}
	462	}
	463
	464	}
[638]	465
	466	# Convert a text string into HTML.
	467	#
	468	# The HTML is going to be inserted into a GML file, so
	469	# we have to be careful not to use symbols like ">",
	470	# which ocurs frequently in email messages (and use
	471	# &gt instead.
	472	#
	473	# This function also turns links and email addresses into hyperlinks,
	474	# and replaces carriage returns with <BR> tags (and multiple carriage
	475	# returns with <P> tags).
	476
[1206]	477
[638]	478	sub text_into_html {
	479	my ($text) = @_;
	480
[1244]	481	# Convert problem characters into HTML symbols
[3132]	482	$text =~ s/&/&/g;
	483	$text =~ s/</</g;
	484	$text =~ s/>/>/g;
	485	$text =~ s/\"/"/g;
[638]	486
[2630]	487	# convert email addresses and URIs into links
	488	# don't markup email addresses for now
	489	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[638]	490
[2918]	491	# try to munge email addresses a little bit...
	492	$text =~ s/@/@/;
[2730]	493	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
	494	# assume URI doesn't finish with a '.'
[10827]	495	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.\|\%[a-f0-9]{2})?[\w\?\=\-_/~]+)(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@gi;
[2630]	496
	497
[638]	498	# Clean up whitespace and convert \n charaters to <BR> or <P>
[3132]	499	$text =~ s/ +/ /g;
	500	$text =~ s/\s*$//g;
	501	$text =~ s/^\s*//g;
	502	$text =~ s/\n/\n<br>/g;
	503	$text =~ s/<br>\s*<br>/<p>/gi;
[638]	504
	505	return $text;
	506	}
	507
	508
[2630]	509
	510
	511	#Process a MIME message.
	512	# the textref we are given DOES NOT include the header.
	513	sub text_from_mime_message {
[2847]	514	my $self = shift(@_);
[16341]	515	my ($mimetype,$mimeinfo,$default_header_encoding,$text)=(@_);
[6062]	516	my $outhandle=$self->{'outhandle'};
[2630]	517	# Check for multiparts - $mimeinfo will be a boundary
	518	if ($mimetype =~ /multipart/) {
[6062]	519	my $boundary="";
[2732]	520	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
[2630]	521	$boundary=$1;
[2732]	522	if ($boundary =~ m@^\"@) {
	523	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
	524	}
	525	} else {
[15872]	526	print $outhandle "EmailPlugin: (warning) couldn't parse MIME boundary\n";
[2630]	527	}
	528	# parts start with "--$boundary"
	529	# message ends with "--$boundary--"
	530	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
[2680]	531	# that perl might want to interpolate. Also allows spaces...
[2630]	532	$boundary=~s/\\/\\\\/g;
	533	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
[2681]	534	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
[2630]	535	# remove first "part" and last "part" (final --)
	536	shift @message_parts;
	537	my $last=pop @message_parts;
[2680]	538	# if our boundaries are a bit dodgy and we only found 1 part...
	539	if (!defined($last)) {$last="";}
[2630]	540	# make sure it is only -- and whitespace
	541	if ($last !~ /^\-\-\s*$/ms) {
[15872]	542	print $outhandle "EmailPlugin: (warning) last part of MIME message isn't empty\n";
[2630]	543	}
	544	foreach my $message_part (@message_parts) {
	545	# remove the leading newline left from split.
	546	$message_part=~s/^\r?\n//;
	547	}
	548	if ($mimetype eq "multipart/alternative") {
	549	# check for an HTML version first, then TEXT, otherwise use first.
	550	my $part_text="";
	551	foreach my $message_part (@message_parts) {
[10839]	552	if ($message_part =~ m@^content\-type:\s*text/html@i)
[2630]	553	{
	554	# Use the HTML version
[10834]	555	$part_text = $self->text_from_part($message_part);
[2630]	556	$mimetype="text/html";
	557	last;
	558	}
	559	}
	560	if ($part_text eq "") { # try getting a text part instead
	561	foreach my $message_part (@message_parts) {
[10839]	562	if ($message_part =~ m@^content\-type:\s*text/plain@i)
[2630]	563	{
	564	# Use the plain version
[10839]	565	$part_text = $self->text_from_part($message_part);
[2732]	566	if ($part_text =~/[^\s]/) {
[3721]	567	$part_text = text_into_html($part_text);
[2732]	568	}
[2630]	569	$mimetype="text/plain";
	570	last;
	571	}
	572	}
	573	}
[3721]	574	if ($part_text eq "") { #use first part (no html/text part found)
	575	$part_text = $self->text_from_part(shift @message_parts);
	576	$part_text = text_into_html($part_text);
[2630]	577	}
	578	if ($part_text eq "") { # we couldn't get anything!!!
	579	# or it was an empty message...
	580	# do nothing...
[16013]	581	gsprintf($outhandle, "{ReadTextFile.empty_file} - empty body?\n");
[2630]	582	} else {
[3721]	583	$text = $part_text;
[2630]	584	}
[3352]	585	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related\|signed)@) {
[3721]	586	$text = "";
[3352]	587	# signed is for PGP/GPG messages... the last part is a hash
	588	if ($mimetype =~ m@multipart/signed@) {
	589	pop @message_parts;
	590	}
[6062]	591	my $is_first_part=1;
[2630]	592	foreach my $message_part (@message_parts) {
[6062]	593	if ($is_first_part && $text ne "") {$is_first_part=0;}
	594
[2630]	595	if ($mimetype eq "multipart/digest") {
[6062]	596	# default type - RTFRFC!! Set if not already set
	597	$message_part =~ m@^(.*)\n\r?\n@s;
	598	my $part_header=$1;
	599	if ($part_header !~ m@^content-type@mi) {
	600	$message_part="Content-type: message/rfc822\n"
	601	. $message_part; # prepend default type
[4224]	602	}
[2630]	603	}
	604
[16341]	605	$text .= $self->process_multipart_part($default_header_encoding,
	606	$message_part,
[6062]	607	$is_first_part);
[2630]	608	} # foreach message part.
	609	} else {
	610	# we can't handle this multipart type (not mixed or alternative)
	611	# the RFC also mentions "parallel".
	612	}
[3627]	613	} # end of ($mimetype =~ multipart)
[2918]	614	elsif ($mimetype =~ m@message/rfc822@) {
	615	my $msg_header = $text;
	616	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
	617	$text = $1;
	618
[3630]	619	if ($msg_header =~ /^content\-type:\s([\w\.\-\/]+)\s\;?\s(.+?)\s$/mi)
[2918]	620	{
	621	$mimetype=$1;
[9971]	622	$mimeinfo=$2;
[2918]	623	$mimetype =~ tr/[A-Z]/[a-z]/;
[9971]	624
[2918]	625	my $msg_text;
	626	if ($mimetype =~ m@multipart/@) {
[16341]	627	$msg_text = $self->text_from_mime_message($mimetype, $mimeinfo,
	628	$default_header_encoding,
[6062]	629	$text);
[9823]	630	} else {
[9971]	631	$msg_text=$self->text_from_part($text,$msg_header);
[9823]	632	}
[2918]	633
[3630]	634	my $brief_header=text_into_html(get_brief_headers($msg_header));
[2918]	635	$text= "\n<b><<attached message>></b><br>";
	636	$text.= "<table><tr><td width=\"5%\"> </td>\n";
	637	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
	638	. "</td></tr></table>";
	639	}
	640	} else {
[2886]	641	# we don't do any processing of the content.
	642	}
	643
[2630]	644	return $text;
	645	}
	646
	647
[6062]	648
[7703]	649	# used for turning a message id into a more friendly string for greenstone
	650	sub escape_msg_id {
	651	#msgid
	652	my $id=shift;
	653	chomp $id; $id =~ s!\s!!g; # remove spaces
	654	$id =~ s![<>\[\]]!!g; # remove [ ] < and >
	655	$id =~ s![_&]!-!g; # replace symbols that might cause problems
[10834]	656	$id =~ s!\.!-!g; # . means section to greenstone doc ids!
[7703]	657	$id =~ s!@!-!g; # replace @ symbol, to avoid spambots
	658	return $id;
	659	}
[6062]	660
[7703]	661
	662
[6062]	663	sub process_multipart_part {
	664	my $self = shift;
[16341]	665	my $default_header_encoding = shift;
[6062]	666	my $message_part = shift;
	667	my $is_first_part = shift;
	668
	669	my $return_text="";
	670	my $part_header=$message_part;
	671	my $part_body;
	672	if ($message_part=~ /^\s*\n/) {
	673	# no header... use defaults
	674	$part_body=$message_part;
	675	$part_header="Content-type: text/plain; charset=us-ascii";
	676	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
	677	$part_body=$1;
	678	} else {
	679	# something's gone wrong...
	680	$part_header="";
	681	$part_body=$message_part;
	682	}
	683
	684	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
	685	my $part_content_type="";
	686	my $part_content_info="";
	687
	688	if ($part_header =~ m@^content\-type:\s([\w\.\-/]+)\s(\;.*)?$@mi) {
	689	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
	690	$part_content_info=$2;
	691	if (!defined($part_content_info)) {
	692	$part_content_info="";
	693	} else {
	694	$part_content_info =~ s/^\;\s*//;
	695	$part_content_info =~ s/\s*$//;
	696	}
	697	}
	698	my $filename="";
[6079]	699	if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
[6062]	700	$filename=$1;
[6079]	701	$filename =~ s@\r?\s*$@@; # remove trailing space, if any
[16341]	702	# decode the filename
	703	$self->decode_header_value($default_header_encoding, \$filename);
	704
[6062]	705	}
	706
	707	# disposition - either inline or attachment.
	708	# NOT CURRENTLY USED - we display all text types instead...
	709	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
	710
	711	# add <<attachment>> to each part except the first...
	712	if (!$is_first_part) {
	713	$return_text.="\n<p><hr><strong><<attachment>>";
	714	# add part info header
[10827]	715	my $header_text = "<br>Type: $part_content_type<br>\n";
[6062]	716	if ($filename ne "") {
[10827]	717	$header_text .= "Filename: $filename\n";
[6062]	718	}
	719	$header_text =~ s@_@\\_@g;
[10827]	720	$return_text .= $header_text . "</strong></p>\n<p>\n";
[6062]	721	}
	722
	723	if ($part_content_type =~ m@text/@)
	724	{
[10827]	725	# $message_part includes the mime part headers
	726	my $part_text = $self->text_from_part($message_part);
[6062]	727	if ($part_content_type !~ m@text/(ht\|x)ml@) {
	728	$part_text = text_into_html($part_text);
	729	}
	730	if ($part_text eq "") {
	731	$part_text = ' ';
	732	}
	733	$return_text .= $part_text;
	734	} elsif ($part_content_type =~ m@message/rfc822@) {
	735	# This is a forwarded message
	736	my $message_part_headers=$part_body;
	737	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
	738	my $message_part_body=$1;
	739	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
	740
	741	my $rfc822_formatted_body=""; # put result in here
	742	if ($message_part_headers =~
	743	/^content\-type:\s([\w\.\-\/]+)\s(\;.*)?$/ims)
	744	{
	745	# The message header uses MIME flags
	746	my $message_content_type=$1;
	747	my $message_content_info=$2;
	748	if (!defined($message_content_info)) {
	749	$message_content_info="";
	750	} else {
	751	$message_content_info =~ s/^\;\s*//;
	752	$message_content_info =~ s/\s*$//;
	753	}
	754	$message_content_type =~ tr/A-Z/a-z/;
	755	if ($message_content_type =~ /multipart/) {
	756	$rfc822_formatted_body=
	757	$self->text_from_mime_message($message_content_type,
	758	$message_content_info,
[16341]	759	$default_header_encoding,
[6062]	760	$message_part_body);
	761	} else {
[9971]	762	$message_part_body=$self->text_from_part($part_body,
	763	$message_part_headers);
[6062]	764	$rfc822_formatted_body=text_into_html($message_part_body);
	765	}
	766	} else {
	767	# message doesn't use MIME flags
	768	$rfc822_formatted_body=text_into_html($message_part_body);
	769	$rfc822_formatted_body =~ s@_@\\_@g;
	770	}
	771	# Add the returned text to the output
	772	# don't put all the headers...
	773	# $message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
	774	my $brief_headers=get_brief_headers($message_part_headers);
	775	$return_text.=text_into_html($brief_headers);
	776	$return_text.="</p><p>\n";
	777	$return_text.=$rfc822_formatted_body;
	778	$return_text.="</p>\n";
	779	# end of message/rfc822
	780	} elsif ($part_content_type =~ /multipart/) {
	781	# recurse again
	782
	783	my $tmptext= $self->text_from_mime_message($part_content_type,
	784	$part_content_info,
[16341]	785	$default_header_encoding,
[6062]	786	$part_body);
	787	$return_text.=$tmptext;
	788	} else {
	789	# this part isn't text/* or another message...
	790	if ($is_first_part) {
	791	# this is the first part of a multipart, or only part!
	792	$return_text="\n<p><hr><strong><<attachment>>";
	793	# add part info header
	794	my $header_text="<br>Type: $part_content_type<br>\n";
	795	$header_text.="Filename: $filename</strong></p>\n<p>\n";
	796	$header_text =~ s@_@\\_@g;
	797	$return_text.=$header_text;
	798	}
	799
	800	# save attachment by default
[10218]	801	if (!$self->{'no_attachments'}
[6062]	802	&& $filename ne "") { # this part has a file...
	803	my $encoding="8bit";
	804	if ($part_header =~
	805	/^content-transfer-encoding:\s*(\w+)/mi ) {
	806	$encoding=$1; $encoding =~ tr/A-Z/a-z/;
	807	}
[16341]	808	my $tmpdir=&util::filename_cat($ENV{'GSDLHOME'}, "tmp");
[6062]	809	my $save_filename=$filename;
	810
	811	# make sure we don't clobber files with same name;
	812	# need to keep state between .mbx files
	813	my $assoc_files=$self->{'assoc_filenames'};
	814	if ($assoc_files->{$filename}) { # it's been set...
	815	$assoc_files->{$filename}++;
	816	$filename =~ m/(.+)\.(\w+)$/;
	817	my ($filestem, $ext)=($1,$2);
	818	$save_filename="${filestem}_"
	819	. $assoc_files->{$filename} . ".$ext";
	820	} else { # first file with this name
	821	$assoc_files->{$filename}=1;
	822	}
[16341]	823	my $tmp_filename = &util::filename_cat($tmpdir, $save_filename);
	824	open (SAVE, ">$tmp_filename") \|\|
	825	warn "EMAILPlug: Can't save attachment as $tmp_filename: $!";
	826	binmode(SAVE); # needed on Windows
[6062]	827	my $part_text = $message_part;
	828	$part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
	829	if ($encoding eq "base64") {
	830	print SAVE base64_decode($part_text);
	831	} elsif ($encoding eq "quoted-printable") {
	832	print SAVE qp_decode($part_text);
	833	} else { # 7bit, 8bit, binary, etc...
	834	print SAVE $part_text;
	835	}
	836	close SAVE;
	837	my $doc_obj=$self->{'doc_obj'};
[16341]	838	$doc_obj->associate_file("$tmp_filename",
[6062]	839	"$save_filename",
	840	$part_content_type # mimetype
	841	);
[16341]	842	# add this file to the list of tmp files for deleting later
	843	push(@{$self->{'tmp_file_paths'}}, $tmp_filename);
	844
[6062]	845	my $outhandle=$self->{'outhandle'};
[15872]	846	print $outhandle "EmailPlugin: saving attachment \"$filename\"\n"; #
[6062]	847
	848	# be nice if "download" was a translatable macro :(
	849	$return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
	850	} # end of save attachment
	851	} # end of !text/message part
	852
	853
	854	return $return_text;
	855	}
	856
	857
[3630]	858	# Return only the "important" headers from a set of message headers
	859	sub get_brief_headers {
	860	my $msg_header = shift;
	861	my $brief_header = "";
[2630]	862
[3630]	863	# Order matters!
	864	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
	865	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
	866	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
	867	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
	868	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
[2630]	869
[3630]	870	return $brief_header;
	871	}
[2630]	872
	873
	874	# Process a MIME part. Return "" if we can't decode it.
[6062]	875	# should only be called for parts with type "text/*" ?
[9971]	876	# Either pass the entire mime part (including the part's header),
	877	# or pass the mime part's text and optionally the part's header.
[2630]	878	sub text_from_part {
[3132]	879	my $self = shift;
[3136]	880	my $text = shift \|\| '';
[9971]	881	my $part_header = shift;
[3136]	882
[10834]	883
[9971]	884	my $type="text/plain"; # default, overridden from part header
[10827]	885	my $charset=undef; # convert2unicode() will guess if necessary
[9971]	886
	887	if (! $part_header) { # no header argument was given. check the body
	888	$part_header = $text;
	889	# check for empty part header (leading blank line)
	890	if ($text =~ /^\s*\r?\n/) {
	891	$part_header="Content-type: text/plain; charset=us-ascii";
	892	} else {
	893	$part_header =~ s/\r?\n\r?\n(.*)$//s;
	894	$text=$1; if (!defined($text)) {$text="";}
	895	}
	896	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
[2681]	897	}
[9971]	898
	899	if ($part_header =~
	900	/content\-type:\s([\w\.\-\/]+).?charset=\"?([^\;\"\s]+)\"?/is) {
	901	$type=$1;
	902	$charset=$2;
	903	}
[2630]	904	my $encoding="";
[2638]	905	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
[2630]	906	$encoding=$1; $encoding=~tr/A-Z/a-z/;
	907	}
	908	# Content-Transfer-Encoding is per-part
	909	if ($encoding ne "") {
	910	if ($encoding =~ /quoted\-printable/) {
	911	$text=qp_decode($text);
	912	} elsif ($encoding =~ /base64/) {
	913	$text=base64_decode($text);
	914	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
	915	# rfc2045 also allows binary, which we ignore (for now).
[3630]	916	my $outhandle=$self->{'outhandle'};
[15872]	917	print $outhandle "EmailPlugin: unknown transfer encoding: $encoding\n";
[2630]	918	return "";
	919	}
	920	}
[10834]	921
[2630]	922	if ($type eq "text/html") {
	923	# only get stuff between <body> tags, or <html> tags.
[2730]	924	$text =~ s@^.<html[^>]>@@is;
	925	$text =~ s@</html>.*$@@is;
	926	$text =~ s/^.?<body[^>]>//si;
	927	$text =~ s/<\/body>.*$//si;
[2630]	928	}
	929	elsif ($type eq "text/xml") {
	930	$text=~s/</</g;$text=~s/>/>/g;
	931	$text="<pre>\n$text\n</pre>\n";
	932	}
[2730]	933	# convert to unicode
[2847]	934	$self->convert2unicode($charset, \$text);
[6062]	935	$text =~ s@_@\\_@g; # protect against GS macro language
[2630]	936	return $text;
	937	}
	938
	939
[6062]	940
	941
[2630]	942	# decode quoted-printable text
	943	sub qp_decode {
	944	my $text=shift;
	945
	946	# if a line ends with "=\s*", it is a soft line break, otherwise
	947	# keep in any newline characters.
	948
[3627]	949	$text =~ s/=\s*\r?\n//mg;
	950	$text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
[2630]	951	return $text;
	952	}
	953
	954	# decode base64 text. This is fairly slow (since it's interpreted perl rather
	955	# than compiled XS stuff like in the ::MIME modules, but this is more portable
	956	# for us at least).
	957	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
	958	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
[2638]	959	# from each encoded byte.
[2630]	960
	961
	962	sub base64_decode {
	963	my $enc_text = shift;
	964	# A=>0, B=>1, ..., '+'=>62, '/'=>63
	965	# also '=' is used for padding at the end, but we remove it anyway.
	966	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	967	# map each MIME char into it's value, for more efficient lookup.
	968	my %index;
	969	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
	970	# remove all non-base64 chars. eval to get variable in transliteration...
	971	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
	972	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
	973	my $decoded="";
	974	while (length ($enc_text)>3)
	975	{
	976	my $fourchars=substr($enc_text,0,4,"");
	977	my @chars=(split '',$fourchars);
	978	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
	979	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	980	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
	981	}
	982	# if there are any input chars left, there are either
	983	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
	984	my @chars=(split '',$enc_text);
	985	if (length($enc_text)) {
	986	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
	987	}
	988	if (length($enc_text)==3) {
	989	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	990	}
	991	return $decoded;
	992	}
	993
[10834]	994	# returns 0 if valid utf-8, 1 if invalid
	995	sub is_utf8 {
	996	my $self = shift;
	997	my $textref = shift;
	998
	999	$$textref =~ m/^/g; # to set \G
	1000	my $badbytesfound=0;
	1001	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
	1002	my $highbytes=$1;
	1003	my $highbyteslength=length($highbytes);
	1004	# replace any non utf8 complaint bytes
	1005	$highbytes =~ /^/g; # set pos()
	1006	while ($highbytes =~
	1007	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
	1008	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
	1009	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
	1010	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
	1011	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
	1012	)*([\x80-\xff])? !xg
	1013	) {
	1014	my $badbyte=$1;
	1015	if (!defined $badbyte) {next} # hit end of string
	1016	return 1;
	1017	}
	1018	}
	1019	return 0;
	1020	}
	1021
[16341]	1022	# words with non ascii characters in header values must be encoded in the
	1023	# following manner =?<charset>?[BQ]?<data>?= (rfc2047)
[10834]	1024
[16341]	1025	sub decode_header_value {
	1026	my $self = shift(@_);
	1027	my ($default_header_encoding, $textref) = @_;
	1028
	1029	if (!$$textref) {
	1030	# nothing to do!
	1031	return;
	1032	}
	1033	my $value = $$textref;
	1034	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
	1035	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
	1036	my $original_value=$value;
	1037	my $encoded=$value;
	1038	$value="";
	1039	# we should ignore spaces between consecutive encoded-texts
	1040	$encoded =~ s@\?=\s+=\?@\?==\?@g;
	1041	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=//i) {
	1042	my ($charset, $encoding, $data)=($2,$3,$4);
	1043	my ($decoded_data);
	1044	my $leading_chars = "$1";
	1045	$self->convert2unicode($default_header_encoding, \$leading_chars);
	1046	$value.=$leading_chars;
	1047
	1048	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
	1049	chomp $data;
	1050	$encoding =~ tr/BQ/bq/;
	1051	if ($encoding eq "q") { # quoted printable
	1052	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
	1053	$decoded_data=qp_decode($data);
	1054	# qp_decode adds \n, which is default for body text
	1055	chomp($decoded_data);
	1056	} else { # base 64
	1057	$decoded_data=base64_decode($data);
	1058	}
	1059	$self->convert2unicode($charset, \$decoded_data);
	1060	$value .= $decoded_data;
	1061	} # end of while loop
	1062
	1063	# get any trailing characters
	1064	$self->convert2unicode($default_header_encoding, \$encoded);
	1065	$value.=$encoded;
	1066
	1067	if ($value =~ /^\s*$/) { # we couldn't extract anything...
	1068	$self->convert2unicode($default_header_encoding,
	1069	\$original_value);
	1070	$value=$original_value;
	1071	}
	1072	$$textref = $value;
	1073	} # end of if =?...?=
	1074
	1075	# In the absense of other charset information, assume the
	1076	# header is the default (usually "iso_8859_1") and convert to unicode.
	1077	else {
	1078	$self->convert2unicode($default_header_encoding, $textref);
	1079	}
	1080
	1081	}
[10834]	1082
[16341]	1083
	1084
[2847]	1085	sub convert2unicode {
	1086	my $self = shift(@_);
	1087	my ($charset, $textref) = @_;
[2630]	1088
[7703]	1089	if (!$$textref) {
	1090	# nothing to do!
	1091	return;
	1092	}
	1093
[10827]	1094	if (! defined $charset) {
	1095	# check if we have valid utf-8
[10834]	1096	if ($self->is_utf8($textref)) { $charset = "utf8" }
[10827]	1097
	1098	# default to latin
	1099	$charset = "iso_8859_1" if ! defined($charset);
	1100	}
	1101
[2847]	1102	# first get our character encoding name in the right form.
[7703]	1103	$charset =~ tr/A-Z/a-z/; # lowercase
	1104	$charset =~ s/\-/_/g;
[8902]	1105	if ($charset =~ /gb_?2312/) { $charset="gb" }
[2847]	1106	# assumes EUC-KR, not ISO-2022 !?
[7703]	1107	$charset =~ s/^ks_c_5601_1987/korean/;
	1108	if ($charset eq 'utf_8') {$charset='utf8'}
[2847]	1109
[7703]	1110	my $outhandle = $self->{'outhandle'};
	1111
	1112	if ($charset eq "utf8") {
	1113	# no conversion needed, but lets check that it's valid utf8
	1114	# see utf-8 manpage for valid ranges
	1115	$$textref =~ m/^/g; # to set \G
	1116	my $badbytesfound=0;
	1117	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
	1118	my $highbytes=$1;
	1119	my $highbyteslength=length($highbytes);
	1120	# replace any non utf8 complaint bytes
	1121	$highbytes =~ /^/g; # set pos()
	1122	while ($highbytes =~
[8904]	1123	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
[7703]	1124	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
[8904]	1125	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
	1126	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
[7703]	1127	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
	1128	)*([\x80-\xff])? !xg
	1129	) {
	1130	my $badbyte=$1;
	1131	if (!defined $badbyte) {next} # hit end of string
	1132	my $pos=pos($highbytes);
	1133	substr($highbytes, $pos-1, 1, "\xc2\x80");
	1134	# update the position to continue searching (for \G)
	1135	pos($highbytes) = $pos+1; # set to just after the \x80
	1136	$badbytesfound=1;
	1137	}
	1138	if ($badbytesfound==1) {
	1139	# claims to be utf8, but it isn't!
[15872]	1140	print $outhandle "EmailPlugin: Headers claim utf-8 but bad bytes "
[7703]	1141	. "detected and removed.\n";
	1142
	1143	my $replength=length($highbytes);
	1144	my $textpos=pos($$textref);
	1145	# replace bad bytes with good bytes
	1146	substr( $$textref, $textpos-$replength,
	1147	$replength, $highbytes);
	1148	# update the position to continue searching (for \G)
	1149	pos($$textref)=$textpos+($replength-$highbyteslength);
	1150	}
	1151	}
[3206]	1152	return;
	1153	}
	1154
[2847]	1155	# It appears that we can't always trust ascii text so we'll treat it
	1156	# as iso-8859-1 (letting characters above 0x80 through without
	1157	# converting them to utf-8 will result in invalid XML documents
	1158	# which can't be parsed at build time).
	1159	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
	1160
[3073]	1161	if ($charset eq "iso_8859_1") {
	1162	# test if the mailer lied, and it has win1252 chars in it...
	1163	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
	1164	if ($$textref =~ m/[\x80-\x9f]/) {
[15872]	1165	print $outhandle "EmailPlugin: Headers claim ISO charset but MS ";
[3073]	1166	print $outhandle "codepage 1252 detected.\n";
	1167	$charset = "windows_1252";
	1168	}
	1169	}
[3351]	1170	my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
	1171
	1172	if ($utf8_text ne "") {
	1173	$$textref=$utf8_text;
	1174	} else {
	1175	# we didn't get any text... unsupported encoding perhaps? Or it is
	1176	# empty anyway. We'll try to continue, assuming 8859-1. We could strip
	1177	# characters out here if this causes problems...
[3726]	1178	my $outhandle=$self->{'outhandle'};
[15872]	1179	print $outhandle "EmailPlugin: falling back to iso-8859-1\n";
[3351]	1180	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
	1181
	1182	}
[2847]	1183	}
	1184
[17026]	1185	sub get_base_OID {
	1186	my $self = shift(@_);
	1187	my ($doc_obj) = @_;
[2847]	1188
[17026]	1189	if ($self->{'OIDtype'} eq "message_id") {
	1190	# temporarily set OIDtype to hash to get a base id
	1191	$self->{'OIDtype'} = "hash_on_ga_xml";
[19282]	1192	my $id = $self->SUPER::get_base_OID(@_);
[17026]	1193	$self->{'OIDtype'} = "message_id";
	1194	return $id;
	1195	}
	1196	return $self->SUPER::get_base_OID(@_);
	1197	}
	1198
	1199
	1200	sub add_OID {
[7703]	1201	my $self = shift (@_);
	1202	my ($doc_obj, $id, $segment_number) = @_;
[17026]	1203	if ($self->{'OIDtype'} eq "message_id" && exists $doc_obj->{'msgid'} ) {
[7703]	1204	$doc_obj->set_OID($doc_obj->{'msgid'});
[17026]	1205	}
	1206	else {
[7703]	1207	$doc_obj->set_OID("$id\_$segment_number");
	1208	}
	1209	}
	1210
	1211
[1206]	1212	# Perl packages have to return true if they are run.
	1213	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: