Context Navigation

source: gsdl/trunk/perllib/plugins/EmailPlugin.pm@ 19282

Last change on this file since 19282 was 19282, checked in by kjdon, 15 years ago
call super get_base_OID in get_base_OID, not self->add_OID, as id and segment are not defined, which is the whole point of calling this.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 37.9 KB

Rev	Line
[638]	1	###########################################################################
	2	#
[15872]	3	# EmailPlugin.pm - a plugin for parsing email files
[638]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[3630]	9	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[638]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27
[1206]	28
[15872]	29	# EmailPlugin
[638]	30	#
[1206]	31	# by Gordon Paynter ([email protected])
[638]	32	#
[1206]	33	# Email plug reads email files. These are named with a simple
[2096]	34	# number (i.e. as they appear in maildir folders) or with the
	35	# extension .mbx (for mbox mail file format)
[638]	36	#
	37	# Document text:
[1206]	38	# The document text consists of all the text
	39	# after the first blank line in the document.
[638]	40	#
[2730]	41	# Metadata (not Dublin Core!):
[6916]	42	# $Headers All the header content (optional, not stored by default)
[638]	43	# $Subject Subject: header
	44	# $To To: header
[2730]	45	# $From From: header
[2630]	46	# $FromName Name of sender (where available)
	47	# $FromAddr E-mail address of sender
[638]	48	# $DateText Date: header
	49	# $Date Date: header in GSDL format (eg: 19990924)
[2630]	50	#
[2918]	51	# $Title made up of Subject, Date and Sender (for default formatting)
[19281]	52	# $InReplyTo Message id of the one this replies to
[2630]	53	#
	54	# John McPherson - June/July 2001
	55	# added (basic) MIME support and quoted-printable and base64 decodings.
	56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
	57	#
	58	# See: * RFC 822 - ARPA Internet Text Messages
	59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
	60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
[2730]	61	# * RFC 2047 - MIME (part 3) Message Header Extensions
[2630]	62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
[3540]	63
[16341]	64
[15872]	65	package EmailPlugin;
[3540]	66
[7830]	67	use strict;
	68	no strict "refs"; # so we can use a variable as a filehandle for print $out
	69
[638]	70
[15872]	71	use SplitTextFile;
[7830]	72	use unicode; # gs conv functions
	73	use gsprintf 'gsprintf'; # translations
[2730]	74
[638]	75	use sorttools;
	76	use util;
	77
# Set up inheritance at compile time: EmailPlugin derives from SplitTextFile.
BEGIN {
    @EmailPlugin::ISA = qw(SplitTextFile);
}
[638]	81
# OID schemes selectable for this plugin: every entry from
# $BasePlugin::oidtype_list first, followed by the email-specific
# "message_id" scheme.
my $extended_oidtype_list = [
    @{$BasePlugin::oidtype_list},
    {
        'name' => "message_id",
        'desc' => "{EmailPlugin.OIDtype.message_id}",
    },
];

# Command-line/option descriptors for the plugin.  The 'desc' values are
# resource-bundle keys, not literal text.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name' => "no_attachments",
        'desc' => "{EmailPlugin.no_attachments}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "headers",
        'desc' => "{EmailPlugin.headers}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "OIDtype",
        'desc' => "{import.OIDtype}",
        'type' => "enum",
        'list' => $extended_oidtype_list,
        'deft' => "message_id",
        'reqd' => "no",
    },
    {
        'name' => "OIDmetadata",
        'desc' => "{import.OIDmetadata}",
        'type' => "metadata",
        'deft' => "dc.Identifier",
        'reqd' => "no",
    },
    {
        'name' => "split_exp",
        'desc' => "{EmailPlugin.split_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_split_exp(),
    },
];

# Top-level plugin description handed to the option-list machinery in new().
my $options = {
    'name'     => "EmailPlugin",
    'desc'     => "{EmailPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
[3540]	127
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to SplitTextFile->new
#   $hashArgOptLists - hash ref; this plugin's argument descriptors are
#                      appended to {"ArgList"} and its option block to
#                      {"OptList"}
#
# Returns the object produced by SplitTextFile->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # attachment names already used, so later ones don't clobber them
    $self->{'assoc_filenames'} = {};

    # tmp files to delete after processing is finished.  This must be an
    # array reference: assigning "()" to a scalar slot stores undef.
    $self->{'tmp_file_paths'} = [];

    # this might not actually be true at read-time, but after processing
    # it should all be utf8.
    $self->{'input_encoding'} = "utf8";

    return bless $self, $class;
}
	146
# Return the default regular expression (as a string) selecting the files
# this plugin handles:
#   - a path whose last component is purely numeric (maildir: each message
#     is in a separate file with a unique number for a filename), or
#   - a name ending in .mbx, .email or .eml (mozilla and IE save individual
#     messages with a ".eml" extension).
#
# Takes no arguments that it uses; it is called both as a plain function
# and could be called as a method.
sub get_default_process_exp {
    # The alternation bars must be unescaped; "\|" would only ever match
    # a literal pipe character in the filename.
    return q@([\\/]\d+|\.(mbx|email|eml))$@;
}
	154
[1895]	155	# This plugin splits the mbox mail files at lines starting with From<sp>
[3111]	156	# It is supposed to be "\n\nFrom ", but this isn't always used.
[9971]	157	# add \d{4} so that the line ends in a year (in case the text has an
	158	# unescaped "From " at the start of a line).
# Return the default regular expression (as a string) used to split an mbox
# file into messages: a line beginning "From " and ending in a four-digit
# year.
sub get_default_split_exp {
    my $mbox_separator = '\nFrom .*\d{4}\n';
    return $mbox_separator;
}
	163
# Decide whether this plugin should handle $filename.
#
# Directories are always rejected, so that a directory whose name matches
# \d+ is not mistaken for a maildir message file.  Otherwise the file is
# accepted when 'process_exp' is non-empty and matches the filename.
#
# Returns 1 (can process) or 0.
sub can_process_this_file {
    my ($self, $filename) = @_;

    return 0 if (-d $filename);

    my $wanted = $self->{'process_exp'};
    return 0 if ($wanted eq "");

    return ($filename =~ /$wanted/) ? 1 : 0;
}
	179
	180
[1244]	181	# do plugin specific processing of doc_obj
	182	sub process {
[2630]	183
[638]	184	my $self = shift (@_);
[6332]	185	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1424]	186	my $outhandle = $self->{'outhandle'};
	187
[1244]	188	# Check that we're dealing with a valid mail file
[3111]	189	# mbox message files start with "From "
	190	# maildir messages usually start with Return-Path and Delivered-To
	191	# mh is very similar to maildir
	192	my $startoffile=substr($$textref,0,256);
	193	if (($startoffile !~ /^(From )/) &&
	194	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
	195	return undef;
	196	}
[638]	197
[1244]	198	my $cursection = $doc_obj->get_top_section();
[638]	199
[1206]	200	#
	201	# Parse the document's text and extract metadata
	202	#
[638]	203
[2652]	204	# Protect backslashes
	205	$$textref =~ s@\\@\\\\@g;
	206
[1206]	207	# Separate header from body of message
[1244]	208	my $Headers = $$textref;
[2630]	209	$Headers =~ s/\r?\n\r?\n(.*)$//s;
	210	$$textref = $1;
[4089]	211	$Headers .= "\n";
[2779]	212
[2630]	213	# Unfold headers - see rfc822
	214	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
[1206]	215	# Extract basic metadata from header
	216	my @headers = ("From", "To", "Subject", "Date");
	217	my %raw;
[1658]	218	foreach my $name (@headers) {
	219	$raw{$name} = "No $name value";
	220	}
[1206]	221
[3132]	222	# Get a default encoding for the header - RFC says should be ascii...
[6062]	223	my $default_header_encoding="iso_8859_1";
[3132]	224
	225	# We don't know what character set is the user's default...
	226	# We could use textcat to guess... for now we'll look at mime content-type
	227	# if ($Headers =~ /([[:^ascii:]])/) {
	228	# }
	229	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
	230	$default_header_encoding=$1;
	231	$default_header_encoding =~ s@\-@_@g;
	232	$default_header_encoding =~ tr/A-Z/a-z/;
	233	}
	234
	235
[1658]	236	# Examine each line of the headers
	237	my ($line, $name, $value);
	238	my @parts;
	239	foreach $line (split(/\n/, $Headers)) {
	240
	241	# Ignore lines with no content or which begin with whitespace
	242	next unless ($line =~ /:/);
	243	next if ($line =~ /^\s/);
	244
	245	# Find out what metadata is on this line
	246	@parts = split(/:/, $line);
	247	$name = shift @parts;
[3136]	248	# get fieldname in canonical form - first cap, then lower case.
[3134]	249	$name =~ tr/A-Z/a-z/;
[3136]	250	# uppercase the first character according to the current locale
[2630]	251	$name=~s/(.+)/\u$1/;
[1658]	252	next unless $name;
	253	next unless ($raw{$name});
	254
	255	# Find the value of that metadata
	256	$value = join(":", @parts);
	257	$value =~ s/^\s+//;
	258	$value =~ s/\s+$//;
[16341]	259	# decode header values, using either =?<charset>?[BQ]?<data>?= (rfc2047) or default_header_encoding
	260	$self->decode_header_value($default_header_encoding, \$value);
[2730]	261
[1658]	262	# Store the metadata
[6062]	263	$value =~ s@_@\\_@g; # protect against GS macro language
[1206]	264	$raw{$name} = $value;
	265	}
	266
[2630]	267	# Extract the name and e-mail address from the From metadata
[6062]	268	my $frommeta = $raw{"From"};
[2680]	269	my $fromnamemeta;
	270	my $fromaddrmeta;
	271
	272	$frommeta =~ s/\s*$//; # Remove trailing space, if any
	273
	274	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
	275	$fromnamemeta=$1;
	276	$fromaddrmeta=$2;
	277	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
	278	$fromnamemeta=$2;
	279	$fromaddrmeta=$1;
	280	}
[2630]	281	if (!defined($fromaddrmeta)) {
	282	$fromaddrmeta=$frommeta;
	283	}
[2680]	284	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
[2717]	285	# minor attempt to prevent spam-bots from harvesting addresses...
	286	$fromaddrmeta=~s/@/@/;
[6062]	287
[2630]	288	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
	289
[3328]	290	if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
[3215]	291	$fromnamemeta =~ s/\"//g; # remove quotes
	292	$fromnamemeta =~ s/\s+$//; # remove trailing whitespace
[2630]	293	}
	294	else {
	295	$fromnamemeta = $fromaddrmeta;
	296	}
	297	# if name is an address
	298	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
[2717]	299	$fromnamemeta=~s/@/&#64\;/;
[2630]	300	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
	301
	302	$raw{"From"}=$frommeta;
	303
[1206]	304	# Process Date information
[1658]	305	if ($raw{"Date"} !~ /No Date/) {
[1206]	306	$raw{"DateText"} = $raw{"Date"};
[638]	307
[1206]	308	# Convert the date text to internal date format
	309	$value = $raw{"Date"};
[15212]	310	# proper mbox format: Tue, 07 Jan 2003 17:27:42 +1300
[1206]	311	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
[15212]	312	if (!defined($day) \|\| !defined($month) \|\| !defined ($year)) {
	313	# try monthly archive format: Wed Apr 23 00:26:08 2008
	314	($month,$day, $year) = $value =~ /([A-Z][a-z][a-z])\s\s?(\d?\d)\s\d\d:\d\d:\d\d\s(\d\d\d\d)/;
	315	}
	316
[3143]	317	# make some assumptions about the year formatting...
	318	# some (old) software thinks 2001 is 101, some think 2001 is 01
	319	if ($year < 20) { $year += 2000; } # assume not really 1920...
	320	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
[1206]	321	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
[638]	322
[1206]	323	} else {
	324	# We have not extracted a date
	325	$raw{"DateText"} = "Unknown.";
	326	$raw{"Date"} = "19000000";
[638]	327	}
	328
[1244]	329	# Add extracted metadata to document object
[1206]	330	foreach my $name (keys %raw) {
	331	$value = $raw{$name};
	332	if ($value) {
[2730]	333	# assume subject, etc headers have no special HTML meaning.
[1206]	334	$value = &text_into_html($value);
[2730]	335	# escape [] so it isn't re-interpreted as metadata
	336	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
[1206]	337	} else {
	338	$value = "No $name field";
	339	}
[1244]	340	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[1206]	341	}
[638]	342
[7703]	343
	344	# extract a message ID from the headers, if there is one, and we'll use
	345	# that as the greenstone doc ID. Having a predictable ID means we can
	346	# link to other messages, eg from In-Reply-To or References headers...
	347	if ($Headers =~ m@^Message-ID:(.+)$@mi) {
	348	my $id=escape_msg_id($1);
	349	$doc_obj->{'msgid'}=$id;
	350	}
	351	# link to another message, if this is a reply
	352	if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
	353	my $id=escape_msg_id($1);
	354	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
	355	} elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
	356	# References can have multiple, get the last one
	357	my $id=escape_msg_id($1);
	358	# not necessarily in-reply-to, but same thread...
	359	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
	360	}
	361
	362
	363
[2630]	364	my $mimetype="text/plain";
	365	my $mimeinfo="";
[3132]	366	my $charset = $default_header_encoding;
[3073]	367	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
	368	# more than one parameter given to Content-type.
	369	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
[3630]	370	if ($Headers =~ m@^content\-type:\s([\w\.\-/]+)\s(\;\s.+)?\s$@mi)
[2630]	371	{
	372	$mimetype=$1;
	373	$mimetype =~ tr/[A-Z]/[a-z]/;
[3073]	374
	375	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
	376	$mimetype = "text/plain";
	377	}
	378
[2630]	379	$mimeinfo=$2;
[3073]	380	if (!defined $mimeinfo) {
	381	$mimeinfo="";
	382	} else { # strip leading and trailing stuff
	383	$mimeinfo =~ s/^\;\s*//;
	384	$mimeinfo =~ s/\s*$//;
	385	}
	386	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
[2847]	387	$charset = $1;
	388	}
[2630]	389	}
	390
[2680]	391	my $transfer_encoding="7bit";
	392	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
	393	$transfer_encoding=$1;
	394	}
[6062]	395
[2886]	396	if ($mimetype eq "text/html") {
[9971]	397	$$textref= $self->text_from_part($$textref, $Headers);
[2886]	398	} elsif ($mimetype ne "text/plain") {
[3630]	399	$self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
[16341]	400	$$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$default_header_encoding,$$textref);
[6062]	401	} else { # mimetype eq text/plain
	402
	403	if ($transfer_encoding =~ /quoted\-printable/) {
	404	$$textref=qp_decode($$textref);
	405	} elsif ($transfer_encoding =~ /base64/) {
	406	$$textref=base64_decode($$textref);
	407	}
[2886]	408	$self->convert2unicode($charset, $textref);
[8904]	409
	410	$$textref = &text_into_html($$textref);
	411	$$textref =~ s@_@\\_@g; # protect against GS macro language
	412
[2680]	413	}
[6062]	414
[2630]	415
[10218]	416	if ($self->{'headers'} && $self->{'headers'} == 1) {
[6916]	417	# Add "All headers" metadata
	418	$Headers = &text_into_html($Headers);
[2630]	419
[6916]	420	$Headers = "No headers" unless ($Headers =~ /\w/);
	421	$Headers =~ s/@/&#64\;/g;
	422	# escape [] so it isn't re-interpreted as metadata
	423	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
	424	$self->convert2unicode($charset, \$Headers);
[2754]	425
[6916]	426	$Headers =~ s@_@\\_@g; # protect against GS macro language
	427	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
	428	}
[6062]	429
	430
[2918]	431	# Add Title metadata
	432	my $Title = text_into_html($raw{'Subject'});
[7703]	433	$Title .= "<br>From: " . text_into_html($fromnamemeta);
[2918]	434	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
[3073]	435	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
[2918]	436
	437	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
	438
[8121]	439	# Add FileFormat metadata
	440	$doc_obj->add_metadata($cursection, "FileFormat", "EMAIL");
[2918]	441
[1244]	442	# Add text to document object
	443	$$textref = "No message" unless ($$textref =~ /\w/);
[6062]	444
[1244]	445	$doc_obj->add_utf8_text($cursection, $$textref);
[638]	446
[1244]	447	return 1;
[638]	448	}
	449
[16341]	450	# delete any temp files that we have created
	451	sub clean_up_after_doc_obj_processing {
	452	my $self = shift(@_);
	453
	454	foreach my $tmp_file_path (@{$self->{'tmp_file_paths'}}) {
	455	if (-e $tmp_file_path) {
	456	&util::rm($tmp_file_path);
	457	}
	458	}
	459
	460	}
[638]	461
	462	# Convert a text string into HTML.
	463	#
	464	# The HTML is going to be inserted into a GML file, so
	465	# we have to be careful not to use symbols like ">",
	466	# which occurs frequently in email messages (and use
	467	# &gt; instead).
	468	#
	469	# This function also turns links and email addresses into hyperlinks,
	470	# and replaces carriage returns with <BR> tags (and multiple carriage
	471	# returns with <P> tags).
	472
[1206]	473
# Convert a plain-text string into HTML.
#
#   - escapes &, <, > and " as HTML entities
#   - replaces the first "@" with &#64; (mild address munging)
#   - wraps http/https/ftp URIs in <a href="..."> links
#   - collapses runs of spaces, trims leading/trailing whitespace, turns
#     each newline into "\n<br>" and consecutive breaks into <p>
#
# Takes the text as its only argument and returns the converted copy.
sub text_into_html {
    my ($text) = @_;

    # Convert problem characters into HTML symbols.  '&' must go first so
    # the entities produced by the later substitutions are not re-escaped.
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/\"/&quot;/g;

    # convert email addresses and URIs into links
    # don't markup email addresses for now
    # $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;

    # try to munge email addresses a little bit...
    $text =~ s/\@/&#64;/;

    # assume hostnames are \.\w\- only, then might have a trailing '/.*'
    # assume URI doesn't finish with a '.'
    # The host-label and path groups repeat, so multi-label hosts and
    # multi-segment paths are linked in full.  '&' has already become
    # '&amp;' above, so that form is accepted as a query separator too.
    $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|&|\.|\%[a-f0-9]{2})?[\w\?\=\-_/~]+)*(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@gi;

    # Clean up whitespace and convert \n characters to <BR> or <P>
    $text =~ s/ +/ /g;
    $text =~ s/\s*$//g;
    $text =~ s/^\s*//g;
    $text =~ s/\n/\n<br>/g;
    $text =~ s/<br>\s*<br>/<p>/gi;

    return $text;
}
	503
	504
[2630]	505
	506
	507	#Process a MIME message.
	508	# the textref we are given DOES NOT include the header.
	509	sub text_from_mime_message {
[2847]	510	my $self = shift(@_);
[16341]	511	my ($mimetype,$mimeinfo,$default_header_encoding,$text)=(@_);
[6062]	512	my $outhandle=$self->{'outhandle'};
[2630]	513	# Check for multiparts - $mimeinfo will be a boundary
	514	if ($mimetype =~ /multipart/) {
[6062]	515	my $boundary="";
[2732]	516	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
[2630]	517	$boundary=$1;
[2732]	518	if ($boundary =~ m@^\"@) {
	519	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
	520	}
	521	} else {
[15872]	522	print $outhandle "EmailPlugin: (warning) couldn't parse MIME boundary\n";
[2630]	523	}
	524	# parts start with "--$boundary"
	525	# message ends with "--$boundary--"
	526	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
[2680]	527	# that perl might want to interpolate. Also allows spaces...
[2630]	528	$boundary=~s/\\/\\\\/g;
	529	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
[2681]	530	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
[2630]	531	# remove first "part" and last "part" (final --)
	532	shift @message_parts;
	533	my $last=pop @message_parts;
[2680]	534	# if our boundaries are a bit dodgy and we only found 1 part...
	535	if (!defined($last)) {$last="";}
[2630]	536	# make sure it is only -- and whitespace
	537	if ($last !~ /^\-\-\s*$/ms) {
[15872]	538	print $outhandle "EmailPlugin: (warning) last part of MIME message isn't empty\n";
[2630]	539	}
	540	foreach my $message_part (@message_parts) {
	541	# remove the leading newline left from split.
	542	$message_part=~s/^\r?\n//;
	543	}
	544	if ($mimetype eq "multipart/alternative") {
	545	# check for an HTML version first, then TEXT, otherwise use first.
	546	my $part_text="";
	547	foreach my $message_part (@message_parts) {
[10839]	548	if ($message_part =~ m@^content\-type:\s*text/html@i)
[2630]	549	{
	550	# Use the HTML version
[10834]	551	$part_text = $self->text_from_part($message_part);
[2630]	552	$mimetype="text/html";
	553	last;
	554	}
	555	}
	556	if ($part_text eq "") { # try getting a text part instead
	557	foreach my $message_part (@message_parts) {
[10839]	558	if ($message_part =~ m@^content\-type:\s*text/plain@i)
[2630]	559	{
	560	# Use the plain version
[10839]	561	$part_text = $self->text_from_part($message_part);
[2732]	562	if ($part_text =~/[^\s]/) {
[3721]	563	$part_text = text_into_html($part_text);
[2732]	564	}
[2630]	565	$mimetype="text/plain";
	566	last;
	567	}
	568	}
	569	}
[3721]	570	if ($part_text eq "") { #use first part (no html/text part found)
	571	$part_text = $self->text_from_part(shift @message_parts);
	572	$part_text = text_into_html($part_text);
[2630]	573	}
	574	if ($part_text eq "") { # we couldn't get anything!!!
	575	# or it was an empty message...
	576	# do nothing...
[16013]	577	gsprintf($outhandle, "{ReadTextFile.empty_file} - empty body?\n");
[2630]	578	} else {
[3721]	579	$text = $part_text;
[2630]	580	}
[3352]	581	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related\|signed)@) {
[3721]	582	$text = "";
[3352]	583	# signed is for PGP/GPG messages... the last part is a hash
	584	if ($mimetype =~ m@multipart/signed@) {
	585	pop @message_parts;
	586	}
[6062]	587	my $is_first_part=1;
[2630]	588	foreach my $message_part (@message_parts) {
[6062]	589	if ($is_first_part && $text ne "") {$is_first_part=0;}
	590
[2630]	591	if ($mimetype eq "multipart/digest") {
[6062]	592	# default type - RTFRFC!! Set if not already set
	593	$message_part =~ m@^(.*)\n\r?\n@s;
	594	my $part_header=$1;
	595	if ($part_header !~ m@^content-type@mi) {
	596	$message_part="Content-type: message/rfc822\n"
	597	. $message_part; # prepend default type
[4224]	598	}
[2630]	599	}
	600
[16341]	601	$text .= $self->process_multipart_part($default_header_encoding,
	602	$message_part,
[6062]	603	$is_first_part);
[2630]	604	} # foreach message part.
	605	} else {
	606	# we can't handle this multipart type (not mixed or alternative)
	607	# the RFC also mentions "parallel".
	608	}
[3627]	609	} # end of ($mimetype =~ multipart)
[2918]	610	elsif ($mimetype =~ m@message/rfc822@) {
	611	my $msg_header = $text;
	612	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
	613	$text = $1;
	614
[3630]	615	if ($msg_header =~ /^content\-type:\s([\w\.\-\/]+)\s\;?\s(.+?)\s$/mi)
[2918]	616	{
	617	$mimetype=$1;
[9971]	618	$mimeinfo=$2;
[2918]	619	$mimetype =~ tr/[A-Z]/[a-z]/;
[9971]	620
[2918]	621	my $msg_text;
	622	if ($mimetype =~ m@multipart/@) {
[16341]	623	$msg_text = $self->text_from_mime_message($mimetype, $mimeinfo,
	624	$default_header_encoding,
[6062]	625	$text);
[9823]	626	} else {
[9971]	627	$msg_text=$self->text_from_part($text,$msg_header);
[9823]	628	}
[2918]	629
[3630]	630	my $brief_header=text_into_html(get_brief_headers($msg_header));
[2918]	631	$text= "\n<b><<attached message>></b><br>";
	632	$text.= "<table><tr><td width=\"5%\"> </td>\n";
	633	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
	634	. "</td></tr></table>";
	635	}
	636	} else {
[2886]	637	# we don't do any processing of the content.
	638	}
	639
[2630]	640	return $text;
	641	}
	642
	643
[6062]	644
[7703]	645	# used for turning a message id into a more friendly string for greenstone
	646	sub escape_msg_id {
	647	#msgid
	648	my $id=shift;
	649	chomp $id; $id =~ s!\s!!g; # remove spaces
	650	$id =~ s![<>\[\]]!!g; # remove [ ] < and >
	651	$id =~ s![_&]!-!g; # replace symbols that might cause problems
[10834]	652	$id =~ s!\.!-!g; # . means section to greenstone doc ids!
[7703]	653	$id =~ s!@!-!g; # replace @ symbol, to avoid spambots
	654	return $id;
	655	}
[6062]	656
[7703]	657
	658
# Render one part of a multipart MIME message as HTML text.
#
# Arguments:
#   $default_header_encoding - used when decoding the attachment filename
#   $message_part            - the part, including its MIME part headers
#   $is_first_part           - true while no earlier part has produced text;
#                              later parts get an "attachment" banner
#
# text/* parts are rendered inline, message/rfc822 parts as a forwarded
# message, nested multiparts recursively.  Any other part that has a
# filename is written to a temporary file and associated with
# $self->{'doc_obj'}, unless the 'no_attachments' option is set.
#
# Returns the HTML fragment for this part.
sub process_multipart_part {
    my $self = shift;
    my $default_header_encoding = shift;
    my $message_part = shift;
    my $is_first_part = shift;

    my $return_text = "";
    my $part_header = $message_part;
    my $part_body;
    if ($message_part =~ /^\s*\n/) {
        # no header... use defaults
        $part_body = $message_part;
        $part_header = "Content-type: text/plain; charset=us-ascii";
    } elsif ($part_header =~ s/\r?\n\r?\n(.*)$//s) {
        $part_body = $1;
    } else {
        # something's gone wrong...
        $part_header = "";
        $part_body = $message_part;
    }

    $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
    my $part_content_type = "";
    my $part_content_info = "";

    # whitespace after the colon and before the parameters is optional
    if ($part_header =~ m@^content\-type:\s*([\w\.\-/]+)\s*(\;.*)?$@mi) {
        $part_content_type = $1; $part_content_type =~ tr/A-Z/a-z/;
        $part_content_info = $2;
        if (!defined($part_content_info)) {
            $part_content_info = "";
        } else {
            $part_content_info =~ s/^\;\s*//;
            $part_content_info =~ s/\s*$//;
        }
    }
    my $filename = "";
    if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
        $filename = $1;
        $filename =~ s@\r?\s*$@@; # remove trailing space, if any
        # decode the filename
        $self->decode_header_value($default_header_encoding, \$filename);
    }

    # disposition - either inline or attachment.
    # NOT CURRENTLY USED - we display all text types instead...

    # add <<attachment>> to each part except the first...
    if (!$is_first_part) {
        $return_text .= "\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;";
        # add part info header
        my $header_text = "<br>Type: $part_content_type<br>\n";
        if ($filename ne "") {
            $header_text .= "Filename: $filename\n";
        }
        $header_text =~ s@_@\\_@g;
        $return_text .= $header_text . "</strong></p>\n<p>\n";
    }

    if ($part_content_type =~ m@text/@)
    {
        # $message_part includes the mime part headers
        my $part_text = $self->text_from_part($message_part);
        if ($part_content_type !~ m@text/(ht|x)ml@) {
            $part_text = text_into_html($part_text);
        }
        if ($part_text eq "") {
            $part_text = ' ';
        }
        $return_text .= $part_text;
    } elsif ($part_content_type =~ m@message/rfc822@) {
        # This is a forwarded message
        my $message_part_headers = $part_body;
        my $message_part_body = "";
        # only read $1 when the header/body split actually matched
        if ($message_part_headers =~ s/\r?\n\r?\n(.*)$//s) {
            $message_part_body = $1;
        }
        $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold

        my $rfc822_formatted_body = ""; # put result in here
        if ($message_part_headers =~
            /^content\-type:\s*([\w\.\-\/]+)\s*(\;.*)?$/ims)
        {
            # The message header uses MIME flags
            my $message_content_type = $1;
            my $message_content_info = $2;
            if (!defined($message_content_info)) {
                $message_content_info = "";
            } else {
                $message_content_info =~ s/^\;\s*//;
                $message_content_info =~ s/\s*$//;
            }
            $message_content_type =~ tr/A-Z/a-z/;
            if ($message_content_type =~ /multipart/) {
                $rfc822_formatted_body =
                    $self->text_from_mime_message($message_content_type,
                                                  $message_content_info,
                                                  $default_header_encoding,
                                                  $message_part_body);
            } else {
                # NOTE(review): this passes $part_body (forwarded headers
                # plus body) rather than $message_part_body - confirm that
                # is intended; behaviour is kept as it was.
                $message_part_body = $self->text_from_part($part_body,
                                                           $message_part_headers);
                $rfc822_formatted_body = text_into_html($message_part_body);
            }
        } else {
            # message doesn't use MIME flags
            $rfc822_formatted_body = text_into_html($message_part_body);
            $rfc822_formatted_body =~ s@_@\\_@g;
        }
        # Add the returned text to the output
        # don't put all the headers...
        my $brief_headers = get_brief_headers($message_part_headers);
        $return_text .= text_into_html($brief_headers);
        $return_text .= "</p><p>\n";
        $return_text .= $rfc822_formatted_body;
        $return_text .= "</p>\n";
        # end of message/rfc822
    } elsif ($part_content_type =~ /multipart/) {
        # recurse again
        my $tmptext = $self->text_from_mime_message($part_content_type,
                                                    $part_content_info,
                                                    $default_header_encoding,
                                                    $part_body);
        $return_text .= $tmptext;
    } else {
        # this part isn't text/* or another message...
        if ($is_first_part) {
            # this is the first part of a multipart, or only part!
            $return_text = "\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;";
            # add part info header
            my $header_text = "<br>Type: $part_content_type<br>\n";
            $header_text .= "Filename: $filename</strong></p>\n<p>\n";
            $header_text =~ s@_@\\_@g;
            $return_text .= $header_text;
        }

        # save attachment by default
        if (!$self->{'no_attachments'}
            && $filename ne "") { # this part has a file...
            my $encoding = "8bit";
            # the hyphen is part of the token ("quoted-printable")
            if ($part_header =~
                /^content-transfer-encoding:\s*([\w\-]+)/mi ) {
                $encoding = $1; $encoding =~ tr/A-Z/a-z/;
            }
            my $tmpdir = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");

            # The name comes straight from the mail, so keep only its last
            # path component; otherwise "../" sequences could direct the
            # write outside the tmp directory.
            my $safe_name = $filename;
            $safe_name =~ s@^.*[\\/]@@s;
            if ($safe_name eq "" || $safe_name =~ /^\.+$/) {
                $safe_name = "attachment";
            }
            my $save_filename = $safe_name;

            # make sure we don't clobber files with same name;
            # need to keep state between .mbx files
            my $assoc_files = $self->{'assoc_filenames'};
            if ($assoc_files->{$safe_name}) { # it's been set...
                $assoc_files->{$safe_name}++;
                if ($safe_name =~ m/(.+)\.(\w+)$/) {
                    my ($filestem, $ext) = ($1, $2);
                    $save_filename = "${filestem}_"
                        . $assoc_files->{$safe_name} . ".$ext";
                } else {
                    # no extension to preserve
                    $save_filename = $safe_name . "_"
                        . $assoc_files->{$safe_name};
                }
            } else { # first file with this name
                $assoc_files->{$safe_name} = 1;
            }
            my $tmp_filename = &util::filename_cat($tmpdir, $save_filename);

            if (open(my $save_fh, '>', $tmp_filename)) {
                binmode($save_fh); # needed on Windows
                my $part_text = $message_part;
                $part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
                if ($encoding eq "base64") {
                    print {$save_fh} base64_decode($part_text);
                } elsif ($encoding eq "quoted-printable") {
                    print {$save_fh} qp_decode($part_text);
                } else { # 7bit, 8bit, binary, etc...
                    print {$save_fh} $part_text;
                }
                close($save_fh);

                my $doc_obj = $self->{'doc_obj'};
                $doc_obj->associate_file("$tmp_filename",
                                         "$save_filename",
                                         $part_content_type # mimetype
                                         );
                # add this file to the list of tmp files for deleting later
                push(@{$self->{'tmp_file_paths'}}, $tmp_filename);

                my $outhandle = $self->{'outhandle'};
                print $outhandle "EmailPlugin: saving attachment \"$filename\"\n";

                # be nice if "download" was a translatable macro :(
                $return_text .= "<a href=\"_httpdocimg_/$save_filename\">download</a>";
            } else {
                # Nothing was written, so neither associate a file nor
                # emit a download link for it.
                warn "EMAILPlug: Can't save attachment as $tmp_filename: $!";
            }
        } # end of save attachment
    } # end of !text/message part

    return $return_text;
}
	852
	853
# Pick out the headline fields of a block of message headers.
#
# Argument: the raw header text (one header per line).
# Returns:  a string holding the first From, To, Cc, Subject and Date
#           lines found (field names are matched case-insensitively),
#           each terminated by "\n".  Fields that are missing are left
#           out; if none are present the result is "".
sub get_brief_headers {
    my $msg_header = shift;

    # The output order is fixed by this list, not by the order in which
    # the fields happen to appear in the message.
    my @summary;
    foreach my $field ('From', 'To', 'Cc', 'Subject', 'Date') {
        push(@summary, "$1\n") if $msg_header =~ /^($field:.*)$/im;
    }

    return join('', @summary);
}
[2630]	868
	869
	870	# Process a MIME part. Return "" if we can't decode it.
[6062]	871	# should only be called for parts with type "text/*" ?
[9971]	872	# Either pass the entire mime part (including the part's header),
	873	# or pass the mime part's text and optionally the part's header.
[2630]	874	sub text_from_part {
[3132]	875	my $self = shift;
[3136]	876	my $text = shift \|\| '';
[9971]	877	my $part_header = shift;
[3136]	878
[10834]	879
[9971]	880	my $type="text/plain"; # default, overridden from part header
[10827]	881	my $charset=undef; # convert2unicode() will guess if necessary
[9971]	882
	883	if (! $part_header) { # no header argument was given. check the body
	884	$part_header = $text;
	885	# check for empty part header (leading blank line)
	886	if ($text =~ /^\s*\r?\n/) {
	887	$part_header="Content-type: text/plain; charset=us-ascii";
	888	} else {
	889	$part_header =~ s/\r?\n\r?\n(.*)$//s;
	890	$text=$1; if (!defined($text)) {$text="";}
	891	}
	892	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
[2681]	893	}
[9971]	894
	895	if ($part_header =~
	896	/content\-type:\s([\w\.\-\/]+).?charset=\"?([^\;\"\s]+)\"?/is) {
	897	$type=$1;
	898	$charset=$2;
	899	}
[2630]	900	my $encoding="";
[2638]	901	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
[2630]	902	$encoding=$1; $encoding=~tr/A-Z/a-z/;
	903	}
	904	# Content-Transfer-Encoding is per-part
	905	if ($encoding ne "") {
	906	if ($encoding =~ /quoted\-printable/) {
	907	$text=qp_decode($text);
	908	} elsif ($encoding =~ /base64/) {
	909	$text=base64_decode($text);
	910	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
	911	# rfc2045 also allows binary, which we ignore (for now).
[3630]	912	my $outhandle=$self->{'outhandle'};
[15872]	913	print $outhandle "EmailPlugin: unknown transfer encoding: $encoding\n";
[2630]	914	return "";
	915	}
	916	}
[10834]	917
[2630]	918	if ($type eq "text/html") {
	919	# only get stuff between <body> tags, or <html> tags.
[2730]	920	$text =~ s@^.<html[^>]>@@is;
	921	$text =~ s@</html>.*$@@is;
	922	$text =~ s/^.?<body[^>]>//si;
	923	$text =~ s/<\/body>.*$//si;
[2630]	924	}
	925	elsif ($type eq "text/xml") {
	926	$text=~s/</</g;$text=~s/>/>/g;
	927	$text="<pre>\n$text\n</pre>\n";
	928	}
[2730]	929	# convert to unicode
[2847]	930	$self->convert2unicode($charset, \$text);
[6062]	931	$text =~ s@_@\\_@g; # protect against GS macro language
[2630]	932	return $text;
	933	}
	934
	935
[6062]	936
	937
# decode quoted-printable text
#
# Argument: the encoded text.
# Returns:  the decoded text.
sub qp_decode {
    my $text = shift;

    # A line ending in "=" (optionally followed by blanks or tabs) is a
    # soft line break: drop the "=" and that one line ending.  Only
    # horizontal whitespace is allowed between the "=" and the newline;
    # matching \s there would also swallow any blank lines that follow,
    # losing paragraph breaks.  All other newlines are kept.
    $text =~ s/=[ \t]*\r?\n//g;

    # "=XX" is the byte with hexadecimal value XX
    $text =~ s/=([0-9A-Fa-f]{2})/chr(hex($1))/eg;

    return $text;
}
	949
	950	# decode base64 text. This is fairly slow (since it's interpreted perl rather
	951	# than compiled XS stuff like in the ::MIME modules, but this is more portable
	952	# for us at least).
	953	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
	954	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
[2638]	955	# from each encoded byte.
[2630]	956
	957
	958	sub base64_decode {
	959	my $enc_text = shift;
	960	# A=>0, B=>1, ..., '+'=>62, '/'=>63
	961	# also '=' is used for padding at the end, but we remove it anyway.
	962	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	963	# map each MIME char into it's value, for more efficient lookup.
	964	my %index;
	965	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
	966	# remove all non-base64 chars. eval to get variable in transliteration...
	967	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
	968	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
	969	my $decoded="";
	970	while (length ($enc_text)>3)
	971	{
	972	my $fourchars=substr($enc_text,0,4,"");
	973	my @chars=(split '',$fourchars);
	974	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
	975	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	976	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
	977	}
	978	# if there are any input chars left, there are either
	979	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
	980	my @chars=(split '',$enc_text);
	981	if (length($enc_text)) {
	982	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
	983	}
	984	if (length($enc_text)==3) {
	985	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	986	}
	987	return $decoded;
	988	}
	989
# returns 0 if valid utf-8, 1 if invalid
#
# NOTE the sense of the result: despite the name, a TRUE value means the
# text is NOT well-formed utf-8.
#
# Arguments: $textref - reference to the text to examine (the text is
#            not changed, but its pos() is used for the scan).
sub is_utf8 {
    my $self = shift;
    my $textref = shift;

    # a run of high bytes is acceptable only if it splits exactly into
    # multi-byte sequences of these shapes
    my $wellformed_run = qr{
        \A (?: [\xc0-\xdf][\x80-\xbf]    |   # 2 byte utf-8
               [\xe0-\xef][\x80-\xbf]{2} |   # 3 byte
               [\xf0-\xf7][\x80-\xbf]{3} |   # 4 byte
               [\xf8-\xfb][\x80-\xbf]{4} |   # 5 byte
               [\xfc-\xfd][\x80-\xbf]{5}     # 6 byte
           )* \z
    }x;

    $$textref =~ m/^/g; # to set \G
    # walk through the text one run of high (non-ascii) bytes at a time
    while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
        my $run = $1;
        if ($run !~ $wellformed_run) {
            return 1;   # found a byte that fits no sequence
        }
    }
    return 0;
}
	1017
# words with non ascii characters in header values must be encoded in the
# following manner =?<charset>?[BQ]?<data>?= (rfc2047)
#
# Decodes a header value in place.
#
# Arguments:
#   $default_header_encoding  charset assumed for any text that is not
#                             inside an encoded word
#   $textref                  reference to the header value; the decoded
#                             value is written back through it
#
# Nothing is done when the value is empty.
sub decode_header_value {
    my $self = shift(@_);
    my ($default_header_encoding, $textref) = @_;

    if (!$$textref) {
        # nothing to do!
        return;
    }
    my $value = $$textref;

    # In the absence of other charset information, assume the
    # header is the default (usually "iso_8859_1") and convert to unicode.
    if ($value !~ /=\?.*\?[BbQq]\?.*\?=/) {
        $self->convert2unicode($default_header_encoding, $textref);
        return;
    }

    # decode headers stored using =?<charset>?[BQ]?<data>?= (rfc2047)
    my $original_value = $value;
    my $encoded = $value;
    $value = "";

    # we should ignore spaces between consecutive encoded-texts
    $encoded =~ s/\?=\s+=\?/?==?/g;

    # peel encoded words off the front one at a time, together with any
    # plain text that precedes them
    while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=//i) {
        my ($leading_chars, $charset, $encoding, $data) = ($1, $2, $3, $4);
        my $decoded_data;

        $self->convert2unicode($default_header_encoding, \$leading_chars);
        $value .= $leading_chars;

        # strip whitespace from ends
        $data =~ s/^\s+//;
        $data =~ s/\s+$//;

        $encoding =~ tr/BQ/bq/;
        if ($encoding eq "q") { # quoted printable
            $data =~ s/_/ /g;   # from rfc2047 (sec 4.2.2)
            $decoded_data = qp_decode($data);
            # drop a trailing newline, which only makes sense in body text
            chomp($decoded_data);
        }
        else { # base 64
            $decoded_data = base64_decode($data);
        }
        $self->convert2unicode($charset, \$decoded_data);
        $value .= $decoded_data;
    } # end of while loop

    # get any trailing characters
    $self->convert2unicode($default_header_encoding, \$encoded);
    $value .= $encoded;

    if ($value =~ /^\s*$/) { # we couldn't extract anything...
        $self->convert2unicode($default_header_encoding, \$original_value);
        $value = $original_value;
    }
    $$textref = $value;
    return;
}
[10834]	1078
[16341]	1079
	1080
# Convert text, in place, from the given character set to utf-8.
#
# Arguments:
#   $charset  name of the text's character set, or undef to have it
#             guessed (well-formed utf-8 is taken as utf-8, anything else
#             as iso-8859-1)
#   $textref  reference to the text; the result is written back through it
#
# Nothing is done when the text is empty.  Diagnostics are printed to
# $self->{'outhandle'}.
sub convert2unicode {
    my $self = shift(@_);
    my ($charset, $textref) = @_;

    if (!$$textref) {
        # nothing to do!
        return;
    }

    if (!defined $charset) {
        # check if we have valid utf-8, otherwise default to latin.
        # NB is_utf8() returns 0 for well-formed utf-8 and 1 for anything
        # else, so a TRUE result means "not utf-8".
        $charset = $self->is_utf8($textref) ? "iso_8859_1" : "utf8";
    }

    # first get our character encoding name in the right form.
    $charset =~ tr/A-Z/a-z/; # lowercase
    $charset =~ s/\-/_/g;
    if ($charset =~ /gb_?2312/) { $charset = "gb" }
    # assumes EUC-KR, not ISO-2022 !?
    $charset =~ s/^ks_c_5601_1987/korean/;
    if ($charset eq 'utf_8') { $charset = 'utf8' }

    my $outhandle = $self->{'outhandle'};

    if ($charset eq "utf8") {
        # no conversion needed, but lets check that it's valid utf8
        # see utf-8 manpage for valid ranges.
        # Well-formed multi-byte sequences are put back unchanged; any
        # other high byte is replaced by "\xc2\x80" (U+0080 in utf-8).
        # Only the offending byte itself is replaced - its neighbours
        # must be left alone.
        my $badbytesfound = 0;
        $$textref =~ s{
            ( [\xc0-\xdf][\x80-\xbf]    |   # 2 byte utf-8
              [\xe0-\xef][\x80-\xbf]{2} |   # 3 byte
              [\xf0-\xf7][\x80-\xbf]{3} |   # 4 byte
              [\xf8-\xfb][\x80-\xbf]{4} |   # 5 byte
              [\xfc-\xfd][\x80-\xbf]{5}     # 6 byte
            )
            | [\x80-\xff]                   # a byte that fits no sequence
        }{
            defined $1 ? $1 : do { $badbytesfound++; "\xc2\x80" }
        }gex;

        if ($badbytesfound) {
            # claims to be utf8, but it isn't!
            print $outhandle "EmailPlugin: Headers claim utf-8 but bad bytes "
                . "detected and removed.\n";
        }
        return;
    }

    # It appears that we can't always trust ascii text so we'll treat it
    # as iso-8859-1 (letting characters above 0x80 through without
    # converting them to utf-8 will result in invalid XML documents
    # which can't be parsed at build time).
    $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");

    # test if the mailer lied, and it has win1252 chars in it...
    # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
    if ($charset eq "iso_8859_1" && $$textref =~ m/[\x80-\x9f]/) {
        print $outhandle "EmailPlugin: Headers claim ISO charset but MS ";
        print $outhandle "codepage 1252 detected.\n";
        $charset = "windows_1252";
    }

    my $utf8_text = &unicode::unicode2utf8(&unicode::convert2unicode($charset, $textref));

    if ($utf8_text ne "") {
        $$textref = $utf8_text;
    }
    else {
        # we didn't get any text... unsupported encoding perhaps? Or it is
        # empty anyway. We'll try to continue, assuming 8859-1. We could strip
        # characters out here if this causes problems...
        print $outhandle "EmailPlugin: falling back to iso-8859-1\n";
        $$textref = &unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $textref));
    }
    return;
}
	1180
# Return the base identifier for a document by asking the parent class.
#
# When this plugin's OIDtype is "message_id" the parent is asked with
# OIDtype temporarily switched to "hash_on_ga_xml"; for every other
# OIDtype the call is passed straight through.
#
# Arguments: the document object (handed on to the parent unchanged).
sub get_base_OID {
    my $self = shift(@_);

    if ($self->{'OIDtype'} eq "message_id") {
        # temporarily set OIDtype to hash to get a base id.  "local"
        # puts "message_id" back when this block is left, even if the
        # parent's method dies.
        local $self->{'OIDtype'} = "hash_on_ga_xml";
        my $id = $self->SUPER::get_base_OID(@_);
        return $id;
    }
    return $self->SUPER::get_base_OID(@_);
}
	1194
	1195
	1196	sub add_OID {
[7703]	1197	my $self = shift (@_);
	1198	my ($doc_obj, $id, $segment_number) = @_;
[17026]	1199	if ($self->{'OIDtype'} eq "message_id" && exists $doc_obj->{'msgid'} ) {
[7703]	1200	$doc_obj->set_OID($doc_obj->{'msgid'});
[17026]	1201	}
	1202	else {
[7703]	1203	$doc_obj->set_OID("$id\_$segment_number");
	1204	}
	1205	}
	1206
	1207
[1206]	1208	# Perl packages have to return true if they are run.
	1209	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: