Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 12610

Last change on this file since 12610 was 12169, checked in by mdewsnip, 18 years ago
Tidied up that horrible long line in the new() function of every plugin.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 35.3 KB

Rev	Line
[638]	1	###########################################################################
	2	#
	3	# EMAILPlug.pm - a plugin for parsing email files
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[3630]	9	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[638]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27
[1206]	28
	29	# EMAILPlug
[638]	30	#
[1206]	31	# by Gordon Paynter ([email protected])
[638]	32	#
[1206]	33	# Email plug reads email files. These are named with a simple
[2096]	34	# number (i.e. as they appear in maildir folders) or with the
	35	# extension .mbx (for mbox mail file format)
[638]	36	#
	37	# Document text:
[1206]	38	# The document text consists of all the text
	39	# after the first blank line in the document.
[638]	40	#
[2730]	41	# Metadata (not Dublin Core!):
[6916]	42	# $Headers All the header content (optional, not stored by default)
[638]	43	# $Subject Subject: header
	44	# $To To: header
[2730]	45	# $From From: header
[2630]	46	# $FromName Name of sender (where available)
	47	# $FromAddr E-mail address of sender
[638]	48	# $DateText Date: header
	49	# $Date Date: header in GSDL format (eg: 19990924)
[2630]	50	#
[2918]	51	# $Title made up of Subject, Date and Sender (for default formatting)
[2630]	52	#
[2918]	53	#
[2630]	54	# John McPherson - June/July 2001
	55	# added (basic) MIME support and quoted-printable and base64 decodings.
	56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
	57	#
	58	# See: * RFC 822 - ARPA Internet Text Messages
	59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
	60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
[2730]	61	# * RFC 2047 - MIME (part 3) Message Header Extensions
[2630]	62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
[3540]	63
	64	# 12/05/02 Added usage datastructure - John Thompson
[10254]	65	package EMAILPlug;
[3540]	66
[7830]	67	use strict;
	68	no strict "refs"; # so we can use a variable as a filehandle for print $out
	69
[638]	70
[1895]	71	use SplitPlug;
[7830]	72	use unicode; # gs conv functions
	73	use gsprintf 'gsprintf'; # translations
[2730]	74
[638]	75	use sorttools;
	76	use util;
	77
[10254]	78	sub BEGIN {
	79	@EMAILPlug::ISA = ('SplitPlug');
	80	}
[638]	81
	82
[3540]	83	my $arguments =
[4744]	84	[ { 'name' => "process_exp",
[4873]	85	'desc' => "{BasPlug.process_exp}",
[6408]	86	'type' => "regexp",
[3540]	87	'reqd' => "no",
[4744]	88	'deft' => &get_default_process_exp() },
	89	{ 'name' => "no_attachments",
[4873]	90	'desc' => "{EMAILPlug.no_attachments}",
[3630]	91	'type' => "flag",
[4744]	92	'reqd' => "no" },
[6916]	93	{ 'name' => "headers",
	94	'desc' => "{EMAILPlug.headers}",
	95	'type' => "flag",
	96	'reqd' => "no" },
[4744]	97	{ 'name' => "split_exp",
[4873]	98	'desc' => "{EMAILPlug.split_exp}",
[6408]	99	'type' => "regexp",
	100	'reqd' => "no",
	101	'deft' => &get_default_split_exp() }
	102	];
[3540]	103
[4744]	104	my $options = { 'name' => "EMAILPlug",
[5680]	105	'desc' => "{EMAILPlug.desc}",
[6408]	106	'abstract' => "no",
	107	'inherits' => "yes",
[4744]	108	'args' => $arguments };
[3540]	109
[638]	110	# Create a new EMAILPlug object with which to parse a file.
[1206]	111	# Accomplished by creating a new BasPlug and using bless to
[638]	112	# turn it into an EMAILPlug.
	113
	114	sub new {
[10218]	115	my ($class) = shift (@_);
	116	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	117	push(@$pluginlist, $class);
[3540]	118
[10218]	119	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
	120	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
	121
[12169]	122	my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
[10218]	123
[3630]	124	$self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
	125
[2730]	126	# this might not actually be true at read-time, but after processing
	127	# it should all be utf8.
	128	$self->{'input_encoding'}="utf8";
[638]	129	return bless $self, $class;
	130	}
	131
[1244]	132	sub get_default_process_exp {
	133	my $self = shift (@_);
[2096]	134	# mbx/email for mailbox file format, \d+ for maildir (each message is
	135	# in a separate file, with a unique number for filename)
[3111]	136	# mozilla and IE will save individual mbx format files with a ".eml" ext.
	137	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
[638]	138	}
	139
[1895]	140	# This plugin splits the mbox mail files at lines starting with From<sp>
[3111]	141	# It is supposed to be "\n\nFrom ", but this isn't always used.
[9971]	142	# add \d{4} so that the line ends in a year (in case the text has an
	143	# unescaped "From " at the start of a line).
[1895]	144	sub get_default_split_exp {
[9971]	145	return q^\nFrom .*\d{4}\n^;
[3523]	146
[1895]	147	}
	148
	149
[1244]	150	# do plugin specific processing of doc_obj
	151	sub process {
[2630]	152
[638]	153	my $self = shift (@_);
[6332]	154	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1424]	155	my $outhandle = $self->{'outhandle'};
	156
[1244]	157	# Check that we're dealing with a valid mail file
[3111]	158	# mbox message files start with "From "
	159	# maildir messages usually start with Return-Path and Delivered-To
	160	# mh is very similar to maildir
	161	my $startoffile=substr($$textref,0,256);
	162	if (($startoffile !~ /^(From )/) &&
	163	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
	164	return undef;
	165	}
[638]	166
[1757]	167
[6332]	168	print STDERR "<Processing n='$file' p='EMAILPlug'>\n" if ($gli);
	169
[7830]	170	gsprintf($outhandle, "EMAILPlug: {common.processing} $file\n")
[1244]	171	if $self->{'verbosity'} > 1;
[1206]	172
[1244]	173	my $cursection = $doc_obj->get_top_section();
[638]	174
[1206]	175	#
	176	# Parse the document's text and extract metadata
	177	#
[638]	178
[2652]	179	# Protect backslashes
	180	$$textref =~ s@\\@\\\\@g;
	181
[1206]	182	# Separate header from body of message
[1244]	183	my $Headers = $$textref;
[2630]	184	$Headers =~ s/\r?\n\r?\n(.*)$//s;
	185	$$textref = $1;
[4089]	186	$Headers .= "\n";
[2779]	187
[2630]	188	# Unfold headers - see rfc822
	189	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
[1206]	190	# Extract basic metadata from header
	191	my @headers = ("From", "To", "Subject", "Date");
	192	my %raw;
[1658]	193	foreach my $name (@headers) {
	194	$raw{$name} = "No $name value";
	195	}
[1206]	196
[3132]	197	# Get a default encoding for the header - RFC says should be ascii...
[6062]	198	my $default_header_encoding="iso_8859_1";
[3132]	199
	200	# We don't know what character set is the user's default...
	201	# We could use textcat to guess... for now we'll look at mime content-type
	202	# if ($Headers =~ /([[:^ascii:]])/) {
	203	# }
	204	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
	205	$default_header_encoding=$1;
	206	$default_header_encoding =~ s@\-@_@g;
	207	$default_header_encoding =~ tr/A-Z/a-z/;
	208	}
	209
	210
[1658]	211	# Examine each line of the headers
	212	my ($line, $name, $value);
	213	my @parts;
	214	foreach $line (split(/\n/, $Headers)) {
	215
	216	# Ignore lines with no content or which begin with whitespace
	217	next unless ($line =~ /:/);
	218	next if ($line =~ /^\s/);
	219
	220	# Find out what metadata is on this line
	221	@parts = split(/:/, $line);
	222	$name = shift @parts;
[3136]	223	# get fieldname in canonical form - first cap, then lower case.
[3134]	224	$name =~ tr/A-Z/a-z/;
[3136]	225	# uppercase the first character according to the current locale
[2630]	226	$name=~s/(.+)/\u$1/;
[1658]	227	next unless $name;
	228	next unless ($raw{$name});
	229
	230	# Find the value of that metadata
	231	$value = join(":", @parts);
	232	$value =~ s/^\s+//;
	233	$value =~ s/\s+$//;
[2730]	234	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
[3132]	235	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
[2730]	236	my $original_value=$value;
	237	my $encoded=$value;
	238	$value="";
[3215]	239	# we should ignore spaces between consecutive encoded-texts
	240	$encoded =~ s@\?=\s+=\?@\?==\?@g;
	241	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=//i) {
[2730]	242	my ($charset, $encoding, $data)=($2,$3,$4);
[2847]	243	my ($decoded_data);
[2730]	244	$value.="$1"; # any leading chars
	245	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
	246	chomp $data;
	247	$encoding =~ tr/BQ/bq/;
	248	if ($encoding eq "q") { # quoted printable
[2733]	249	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
[2730]	250	$decoded_data=qp_decode($data);
[3215]	251	# qp_decode adds \n, which is default for body text
	252	chomp($decoded_data);
[2730]	253	} else { # base 64
	254	$decoded_data=base64_decode($data);
	255	}
[2847]	256	$self->convert2unicode($charset, \$decoded_data);
	257	$value .= $decoded_data;
	258	} # end of while loop
[3132]	259
[2847]	260	# get any trailing characters
[3132]	261	$self->convert2unicode($default_header_encoding, \$encoded);
[2847]	262	$value.=$encoded;
	263
[2730]	264	if ($value =~ /^\s*$/) { # we couldn't extract anything...
[3132]	265	$self->convert2unicode($default_header_encoding,
	266	\$original_value);
[6062]	267	$value=$original_value;
[2730]	268	}
[3136]	269	} # end of if =?...?=
	270
	271	# In the absense of other charset information, assume the
[6062]	272	# header is the default (usually "iso_8859_1") and convert to unicode.
[3136]	273	else {
[3132]	274	$self->convert2unicode($default_header_encoding, \$value);
	275	}
[2730]	276
[1658]	277	# Store the metadata
[6062]	278	$value =~ s@_@\\_@g; # protect against GS macro language
[1206]	279	$raw{$name} = $value;
	280	}
	281
[2630]	282	# Extract the name and e-mail address from the From metadata
[6062]	283	my $frommeta = $raw{"From"};
[2680]	284	my $fromnamemeta;
	285	my $fromaddrmeta;
	286
	287	$frommeta =~ s/\s*$//; # Remove trailing space, if any
	288
	289	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
	290	$fromnamemeta=$1;
	291	$fromaddrmeta=$2;
	292	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
	293	$fromnamemeta=$2;
	294	$fromaddrmeta=$1;
	295	}
[2630]	296	if (!defined($fromaddrmeta)) {
	297	$fromaddrmeta=$frommeta;
	298	}
[2680]	299	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
[2717]	300	# minor attempt to prevent spam-bots from harvesting addresses...
	301	$fromaddrmeta=~s/@/@/;
[6062]	302
[2630]	303	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
	304
[3328]	305	if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
[3215]	306	$fromnamemeta =~ s/\"//g; # remove quotes
	307	$fromnamemeta =~ s/\s+$//; # remove trailing whitespace
[2630]	308	}
	309	else {
	310	$fromnamemeta = $fromaddrmeta;
	311	}
	312	# if name is an address
	313	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
[2717]	314	$fromnamemeta=~s/@/&#64\;/;
[2630]	315	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
	316
	317	$raw{"From"}=$frommeta;
	318
[1206]	319	# Process Date information
[1658]	320	if ($raw{"Date"} !~ /No Date/) {
[1206]	321	$raw{"DateText"} = $raw{"Date"};
[638]	322
[1206]	323	# Convert the date text to internal date format
	324	$value = $raw{"Date"};
	325	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
[3143]	326	# make some assumptions about the year formatting...
	327	# some (old) software thinks 2001 is 101, some think 2001 is 01
	328	if ($year < 20) { $year += 2000; } # assume not really 1920...
	329	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
[1206]	330	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
[638]	331
[1206]	332	} else {
	333	# We have not extracted a date
	334	$raw{"DateText"} = "Unknown.";
	335	$raw{"Date"} = "19000000";
[638]	336	}
	337
[1244]	338	# Add extracted metadata to document object
[1206]	339	foreach my $name (keys %raw) {
	340	$value = $raw{$name};
	341	if ($value) {
[2730]	342	# assume subject, etc headers have no special HTML meaning.
[1206]	343	$value = &text_into_html($value);
[2730]	344	# escape [] so it isn't re-interpreted as metadata
	345	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
[1206]	346	} else {
	347	$value = "No $name field";
	348	}
[1244]	349	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[1206]	350	}
[638]	351
[7703]	352
	353	# extract a message ID from the headers, if there is one, and we'll use
	354	# that as the greenstone doc ID. Having a predictable ID means we can
	355	# link to other messages, eg from In-Reply-To or References headers...
	356	if ($Headers =~ m@^Message-ID:(.+)$@mi) {
	357	my $id=escape_msg_id($1);
	358	$doc_obj->{'msgid'}=$id;
	359	}
	360	# link to another message, if this is a reply
	361	if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
	362	my $id=escape_msg_id($1);
	363	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
	364	} elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
	365	# References can have multiple, get the last one
	366	my $id=escape_msg_id($1);
	367	# not necessarily in-reply-to, but same thread...
	368	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
	369	}
	370
	371
	372
[2630]	373	my $mimetype="text/plain";
	374	my $mimeinfo="";
[3132]	375	my $charset = $default_header_encoding;
[3073]	376	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
	377	# more than one parameter given to Content-type.
	378	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
[3630]	379	if ($Headers =~ m@^content\-type:\s([\w\.\-/]+)\s(\;\s.+)?\s$@mi)
[2630]	380	{
	381	$mimetype=$1;
	382	$mimetype =~ tr/[A-Z]/[a-z]/;
[3073]	383
	384	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
	385	$mimetype = "text/plain";
	386	}
	387
[2630]	388	$mimeinfo=$2;
[3073]	389	if (!defined $mimeinfo) {
	390	$mimeinfo="";
	391	} else { # strip leading and trailing stuff
	392	$mimeinfo =~ s/^\;\s*//;
	393	$mimeinfo =~ s/\s*$//;
	394	}
	395	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
[2847]	396	$charset = $1;
	397	}
[2630]	398	}
	399
[2680]	400	my $transfer_encoding="7bit";
	401	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
	402	$transfer_encoding=$1;
	403	}
[6062]	404
[2886]	405	if ($mimetype eq "text/html") {
[9971]	406	$$textref= $self->text_from_part($$textref, $Headers);
[2886]	407	} elsif ($mimetype ne "text/plain") {
[3630]	408	$self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
[6062]	409	$$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$$textref);
	410	} else { # mimetype eq text/plain
	411
	412	if ($transfer_encoding =~ /quoted\-printable/) {
	413	$$textref=qp_decode($$textref);
	414	} elsif ($transfer_encoding =~ /base64/) {
	415	$$textref=base64_decode($$textref);
	416	}
[2886]	417	$self->convert2unicode($charset, $textref);
[8904]	418
	419	$$textref = &text_into_html($$textref);
	420	$$textref =~ s@_@\\_@g; # protect against GS macro language
	421
[2680]	422	}
[6062]	423
[2630]	424
[10218]	425	if ($self->{'headers'} && $self->{'headers'} == 1) {
[6916]	426	# Add "All headers" metadata
	427	$Headers = &text_into_html($Headers);
[2630]	428
[6916]	429	$Headers = "No headers" unless ($Headers =~ /\w/);
	430	$Headers =~ s/@/&#64\;/g;
	431	# escape [] so it isn't re-interpreted as metadata
	432	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
	433	$self->convert2unicode($charset, \$Headers);
[2754]	434
[6916]	435	$Headers =~ s@_@\\_@g; # protect against GS macro language
	436	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
	437	}
[6062]	438
	439
[2918]	440	# Add Title metadata
	441	my $Title = text_into_html($raw{'Subject'});
[7703]	442	$Title .= "<br>From: " . text_into_html($fromnamemeta);
[2918]	443	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
[3073]	444	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
[2918]	445
	446	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
	447
[8121]	448	# Add FileFormat metadata
	449	$doc_obj->add_metadata($cursection, "FileFormat", "EMAIL");
[2918]	450
[1244]	451	# Add text to document object
	452	$$textref = "No message" unless ($$textref =~ /\w/);
[6062]	453
[1244]	454	$doc_obj->add_utf8_text($cursection, $$textref);
[638]	455
[1244]	456	return 1;
[638]	457	}
	458
	459
	460	# Convert a text string into HTML.
	461	#
	462	# The HTML is going to be inserted into a GML file, so
	463	# we have to be careful not to use symbols like ">",
	464	# which ocurs frequently in email messages (and use
	465	# &gt instead.
	466	#
	467	# This function also turns links and email addresses into hyperlinks,
	468	# and replaces carriage returns with <BR> tags (and multiple carriage
	469	# returns with <P> tags).
	470
[1206]	471
[638]	472	sub text_into_html {
	473	my ($text) = @_;
	474
[1244]	475	# Convert problem characters into HTML symbols
[3132]	476	$text =~ s/&/&/g;
	477	$text =~ s/</</g;
	478	$text =~ s/>/>/g;
	479	$text =~ s/\"/"/g;
[638]	480
[2630]	481	# convert email addresses and URIs into links
	482	# don't markup email addresses for now
	483	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[638]	484
[2918]	485	# try to munge email addresses a little bit...
	486	$text =~ s/@/@/;
[2730]	487	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
	488	# assume URI doesn't finish with a '.'
[10827]	489	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.\|\%[a-f0-9]{2})?[\w\?\=\-_/~]+)(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@gi;
[2630]	490
	491
[638]	492	# Clean up whitespace and convert \n charaters to <BR> or <P>
[3132]	493	$text =~ s/ +/ /g;
	494	$text =~ s/\s*$//g;
	495	$text =~ s/^\s*//g;
	496	$text =~ s/\n/\n<br>/g;
	497	$text =~ s/<br>\s*<br>/<p>/gi;
[638]	498
	499	return $text;
	500	}
	501
	502
[2630]	503
	504
	505	#Process a MIME message.
	506	# the textref we are given DOES NOT include the header.
	507	sub text_from_mime_message {
[2847]	508	my $self = shift(@_);
[6062]	509	my ($mimetype,$mimeinfo,$text)=(@_);
	510	my $outhandle=$self->{'outhandle'};
[2630]	511	# Check for multiparts - $mimeinfo will be a boundary
	512	if ($mimetype =~ /multipart/) {
[6062]	513	my $boundary="";
[2732]	514	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
[2630]	515	$boundary=$1;
[2732]	516	if ($boundary =~ m@^\"@) {
	517	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
	518	}
	519	} else {
	520	print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
[2630]	521	}
	522	# parts start with "--$boundary"
	523	# message ends with "--$boundary--"
	524	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
[2680]	525	# that perl might want to interpolate. Also allows spaces...
[2630]	526	$boundary=~s/\\/\\\\/g;
	527	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
[2681]	528	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
[2630]	529	# remove first "part" and last "part" (final --)
	530	shift @message_parts;
	531	my $last=pop @message_parts;
[2680]	532	# if our boundaries are a bit dodgy and we only found 1 part...
	533	if (!defined($last)) {$last="";}
[2630]	534	# make sure it is only -- and whitespace
	535	if ($last !~ /^\-\-\s*$/ms) {
	536	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
	537	}
	538	foreach my $message_part (@message_parts) {
	539	# remove the leading newline left from split.
	540	$message_part=~s/^\r?\n//;
	541	}
	542	if ($mimetype eq "multipart/alternative") {
	543	# check for an HTML version first, then TEXT, otherwise use first.
	544	my $part_text="";
	545	foreach my $message_part (@message_parts) {
[10839]	546	if ($message_part =~ m@^content\-type:\s*text/html@i)
[2630]	547	{
	548	# Use the HTML version
[10834]	549	$part_text = $self->text_from_part($message_part);
[2630]	550	$mimetype="text/html";
	551	last;
	552	}
	553	}
	554	if ($part_text eq "") { # try getting a text part instead
	555	foreach my $message_part (@message_parts) {
[10839]	556	if ($message_part =~ m@^content\-type:\s*text/plain@i)
[2630]	557	{
	558	# Use the plain version
[10839]	559	$part_text = $self->text_from_part($message_part);
[2732]	560	if ($part_text =~/[^\s]/) {
[3721]	561	$part_text = text_into_html($part_text);
[2732]	562	}
[2630]	563	$mimetype="text/plain";
	564	last;
	565	}
	566	}
	567	}
[3721]	568	if ($part_text eq "") { #use first part (no html/text part found)
	569	$part_text = $self->text_from_part(shift @message_parts);
	570	$part_text = text_into_html($part_text);
[2630]	571	}
	572	if ($part_text eq "") { # we couldn't get anything!!!
	573	# or it was an empty message...
	574	# do nothing...
[7830]	575	gsprintf($outhandle, "{BasPlug.empty_file} - empty body?\n");
[2630]	576	} else {
[3721]	577	$text = $part_text;
[2630]	578	}
[3352]	579	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related\|signed)@) {
[3721]	580	$text = "";
[3352]	581	# signed is for PGP/GPG messages... the last part is a hash
	582	if ($mimetype =~ m@multipart/signed@) {
	583	pop @message_parts;
	584	}
[6062]	585	my $is_first_part=1;
[2630]	586	foreach my $message_part (@message_parts) {
[6062]	587	if ($is_first_part && $text ne "") {$is_first_part=0;}
	588
[2630]	589	if ($mimetype eq "multipart/digest") {
[6062]	590	# default type - RTFRFC!! Set if not already set
	591	$message_part =~ m@^(.*)\n\r?\n@s;
	592	my $part_header=$1;
	593	if ($part_header !~ m@^content-type@mi) {
	594	$message_part="Content-type: message/rfc822\n"
	595	. $message_part; # prepend default type
[4224]	596	}
[2630]	597	}
	598
[6062]	599	$text .= $self->process_multipart_part($message_part,
	600	$is_first_part);
[2630]	601	} # foreach message part.
	602	} else {
	603	# we can't handle this multipart type (not mixed or alternative)
	604	# the RFC also mentions "parallel".
	605	}
[3627]	606	} # end of ($mimetype =~ multipart)
[2918]	607	elsif ($mimetype =~ m@message/rfc822@) {
	608	my $msg_header = $text;
	609	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
	610	$text = $1;
	611
[3630]	612	if ($msg_header =~ /^content\-type:\s([\w\.\-\/]+)\s\;?\s(.+?)\s$/mi)
[2918]	613	{
	614	$mimetype=$1;
[9971]	615	$mimeinfo=$2;
[2918]	616	$mimetype =~ tr/[A-Z]/[a-z]/;
[9971]	617
[2918]	618	my $msg_text;
	619	if ($mimetype =~ m@multipart/@) {
	620	$msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
[6062]	621	$text);
[9823]	622	} else {
[9971]	623	$msg_text=$self->text_from_part($text,$msg_header);
[9823]	624	}
[2918]	625
[3630]	626	my $brief_header=text_into_html(get_brief_headers($msg_header));
[2918]	627	$text= "\n<b><<attached message>></b><br>";
	628	$text.= "<table><tr><td width=\"5%\"> </td>\n";
	629	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
	630	. "</td></tr></table>";
	631	}
	632	} else {
[2886]	633	# we don't do any processing of the content.
	634	}
	635
[2630]	636	return $text;
	637	}
	638
	639
[6062]	640
[7703]	641	# used for turning a message id into a more friendly string for greenstone
	642	sub escape_msg_id {
	643	#msgid
	644	my $id=shift;
	645	chomp $id; $id =~ s!\s!!g; # remove spaces
	646	$id =~ s![<>\[\]]!!g; # remove [ ] < and >
	647	$id =~ s![_&]!-!g; # replace symbols that might cause problems
[10834]	648	$id =~ s!\.!-!g; # . means section to greenstone doc ids!
[7703]	649	$id =~ s!@!-!g; # replace @ symbol, to avoid spambots
	650	return $id;
	651	}
[6062]	652
[7703]	653
	654
[6062]	655	sub process_multipart_part {
	656	my $self = shift;
	657	my $message_part = shift;
	658	my $is_first_part = shift;
	659
	660	my $return_text="";
	661	my $part_header=$message_part;
	662	my $part_body;
	663	if ($message_part=~ /^\s*\n/) {
	664	# no header... use defaults
	665	$part_body=$message_part;
	666	$part_header="Content-type: text/plain; charset=us-ascii";
	667	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
	668	$part_body=$1;
	669	} else {
	670	# something's gone wrong...
	671	$part_header="";
	672	$part_body=$message_part;
	673	}
	674
	675	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
	676	my $part_content_type="";
	677	my $part_content_info="";
	678
	679	if ($part_header =~ m@^content\-type:\s([\w\.\-/]+)\s(\;.*)?$@mi) {
	680	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
	681	$part_content_info=$2;
	682	if (!defined($part_content_info)) {
	683	$part_content_info="";
	684	} else {
	685	$part_content_info =~ s/^\;\s*//;
	686	$part_content_info =~ s/\s*$//;
	687	}
	688	}
	689	my $filename="";
[6079]	690	if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
[6062]	691	$filename=$1;
[6079]	692	$filename =~ s@\r?\s*$@@; # remove trailing space, if any
[6062]	693	}
	694
	695	# disposition - either inline or attachment.
	696	# NOT CURRENTLY USED - we display all text types instead...
	697	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
	698
	699	# add <<attachment>> to each part except the first...
	700	if (!$is_first_part) {
	701	$return_text.="\n<p><hr><strong><<attachment>>";
	702	# add part info header
[10827]	703	my $header_text = "<br>Type: $part_content_type<br>\n";
[6062]	704	if ($filename ne "") {
[10827]	705	$header_text .= "Filename: $filename\n";
[6062]	706	}
	707	$header_text =~ s@_@\\_@g;
[10827]	708	$return_text .= $header_text . "</strong></p>\n<p>\n";
[6062]	709	}
	710
	711	if ($part_content_type =~ m@text/@)
	712	{
[10827]	713	# $message_part includes the mime part headers
	714	my $part_text = $self->text_from_part($message_part);
[6062]	715	if ($part_content_type !~ m@text/(ht\|x)ml@) {
	716	$part_text = text_into_html($part_text);
	717	}
	718	if ($part_text eq "") {
	719	$part_text = ' ';
	720	}
	721	$return_text .= $part_text;
	722	} elsif ($part_content_type =~ m@message/rfc822@) {
	723	# This is a forwarded message
	724	my $message_part_headers=$part_body;
	725	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
	726	my $message_part_body=$1;
	727	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
	728
	729	my $rfc822_formatted_body=""; # put result in here
	730	if ($message_part_headers =~
	731	/^content\-type:\s([\w\.\-\/]+)\s(\;.*)?$/ims)
	732	{
	733	# The message header uses MIME flags
	734	my $message_content_type=$1;
	735	my $message_content_info=$2;
	736	if (!defined($message_content_info)) {
	737	$message_content_info="";
	738	} else {
	739	$message_content_info =~ s/^\;\s*//;
	740	$message_content_info =~ s/\s*$//;
	741	}
	742	$message_content_type =~ tr/A-Z/a-z/;
	743	if ($message_content_type =~ /multipart/) {
	744	$rfc822_formatted_body=
	745	$self->text_from_mime_message($message_content_type,
	746	$message_content_info,
	747	$message_part_body);
	748	} else {
[9971]	749	$message_part_body=$self->text_from_part($part_body,
	750	$message_part_headers);
[6062]	751	$rfc822_formatted_body=text_into_html($message_part_body);
	752	}
	753	} else {
	754	# message doesn't use MIME flags
	755	$rfc822_formatted_body=text_into_html($message_part_body);
	756	$rfc822_formatted_body =~ s@_@\\_@g;
	757	}
	758	# Add the returned text to the output
	759	# don't put all the headers...
	760	# $message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
	761	my $brief_headers=get_brief_headers($message_part_headers);
	762	$return_text.=text_into_html($brief_headers);
	763	$return_text.="</p><p>\n";
	764	$return_text.=$rfc822_formatted_body;
	765	$return_text.="</p>\n";
	766	# end of message/rfc822
	767	} elsif ($part_content_type =~ /multipart/) {
	768	# recurse again
	769
	770	my $tmptext= $self->text_from_mime_message($part_content_type,
	771	$part_content_info,
	772	$part_body);
	773	$return_text.=$tmptext;
	774	} else {
	775	# this part isn't text/* or another message...
	776	if ($is_first_part) {
	777	# this is the first part of a multipart, or only part!
	778	$return_text="\n<p><hr><strong><<attachment>>";
	779	# add part info header
	780	my $header_text="<br>Type: $part_content_type<br>\n";
	781	$header_text.="Filename: $filename</strong></p>\n<p>\n";
	782	$header_text =~ s@_@\\_@g;
	783	$return_text.=$header_text;
	784	}
	785
	786	# save attachment by default
[10218]	787	if (!$self->{'no_attachments'}
[6062]	788	&& $filename ne "") { # this part has a file...
	789	my $encoding="8bit";
	790	if ($part_header =~
	791	/^content-transfer-encoding:\s*(\w+)/mi ) {
	792	$encoding=$1; $encoding =~ tr/A-Z/a-z/;
	793	}
	794	my $tmpdir=$ENV{'GSDLHOME'} . "/tmp";
	795	my $save_filename=$filename;
	796
	797	# make sure we don't clobber files with same name;
	798	# need to keep state between .mbx files
	799	my $assoc_files=$self->{'assoc_filenames'};
	800	if ($assoc_files->{$filename}) { # it's been set...
	801	$assoc_files->{$filename}++;
	802	$filename =~ m/(.+)\.(\w+)$/;
	803	my ($filestem, $ext)=($1,$2);
	804	$save_filename="${filestem}_"
	805	. $assoc_files->{$filename} . ".$ext";
	806	} else { # first file with this name
	807	$assoc_files->{$filename}=1;
	808	}
	809	open (SAVE, ">$tmpdir/$save_filename") \|\|
	810	warn "EMAILPlug: Can't save attachment as $tmpdir/$save_filename: $!";
	811	my $part_text = $message_part;
	812	$part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
	813	if ($encoding eq "base64") {
	814	print SAVE base64_decode($part_text);
	815	} elsif ($encoding eq "quoted-printable") {
	816	print SAVE qp_decode($part_text);
	817	} else { # 7bit, 8bit, binary, etc...
	818	print SAVE $part_text;
	819	}
	820	close SAVE;
	821	my $doc_obj=$self->{'doc_obj'};
	822	$doc_obj->associate_file("$tmpdir/$save_filename",
	823	"$save_filename",
	824	$part_content_type # mimetype
	825	);
	826	# clean up tmp area...
	827	# Can't do this as it hasn't been copied/linked yet!!!
	828	# &util::rm("$tmpdir/$save_filename");
	829	my $outhandle=$self->{'outhandle'};
	830	print $outhandle "EMAILPlug: saving attachment \"$filename\"\n"; #
	831
	832	# be nice if "download" was a translatable macro :(
	833	$return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
	834	} # end of save attachment
	835	} # end of !text/message part
	836
	837
	838	return $return_text;
	839	}
	840
	841
[3630]	842	# Return only the "important" headers from a set of message headers
	843	sub get_brief_headers {
	844	my $msg_header = shift;
	845	my $brief_header = "";
[2630]	846
[3630]	847	# Order matters!
	848	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
	849	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
	850	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
	851	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
	852	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
[2630]	853
[3630]	854	return $brief_header;
	855	}
[2630]	856
	857
	858	# Process a MIME part. Return "" if we can't decode it.
[6062]	859	# should only be called for parts with type "text/*" ?
[9971]	860	# Either pass the entire mime part (including the part's header),
	861	# or pass the mime part's text and optionally the part's header.
[2630]	862	sub text_from_part {
[3132]	863	my $self = shift;
[3136]	864	my $text = shift \|\| '';
[9971]	865	my $part_header = shift;
[3136]	866
[10834]	867
[9971]	868	my $type="text/plain"; # default, overridden from part header
[10827]	869	my $charset=undef; # convert2unicode() will guess if necessary
[9971]	870
	871	if (! $part_header) { # no header argument was given. check the body
	872	$part_header = $text;
	873	# check for empty part header (leading blank line)
	874	if ($text =~ /^\s*\r?\n/) {
	875	$part_header="Content-type: text/plain; charset=us-ascii";
	876	} else {
	877	$part_header =~ s/\r?\n\r?\n(.*)$//s;
	878	$text=$1; if (!defined($text)) {$text="";}
	879	}
	880	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
[2681]	881	}
[9971]	882
	883	if ($part_header =~
	884	/content\-type:\s([\w\.\-\/]+).?charset=\"?([^\;\"\s]+)\"?/is) {
	885	$type=$1;
	886	$charset=$2;
	887	}
[2630]	888	my $encoding="";
[2638]	889	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
[2630]	890	$encoding=$1; $encoding=~tr/A-Z/a-z/;
	891	}
	892	# Content-Transfer-Encoding is per-part
	893	if ($encoding ne "") {
	894	if ($encoding =~ /quoted\-printable/) {
	895	$text=qp_decode($text);
	896	} elsif ($encoding =~ /base64/) {
	897	$text=base64_decode($text);
	898	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
	899	# rfc2045 also allows binary, which we ignore (for now).
[3630]	900	my $outhandle=$self->{'outhandle'};
	901	print $outhandle "EMAILPlug: unknown transfer encoding: $encoding\n";
[2630]	902	return "";
	903	}
	904	}
[10834]	905
[2630]	906	if ($type eq "text/html") {
	907	# only get stuff between <body> tags, or <html> tags.
[2730]	908	$text =~ s@^.<html[^>]>@@is;
	909	$text =~ s@</html>.*$@@is;
	910	$text =~ s/^.?<body[^>]>//si;
	911	$text =~ s/<\/body>.*$//si;
[2630]	912	}
	913	elsif ($type eq "text/xml") {
	914	$text=~s/</</g;$text=~s/>/>/g;
	915	$text="<pre>\n$text\n</pre>\n";
	916	}
[2730]	917	# convert to unicode
[2847]	918	$self->convert2unicode($charset, \$text);
[6062]	919	$text =~ s@_@\\_@g; # protect against GS macro language
[2630]	920	return $text;
	921	}
	922
	923
[6062]	924
	925
[2630]	926	# decode quoted-printable text
	927	sub qp_decode {
	928	my $text=shift;
	929
	930	# if a line ends with "=\s*", it is a soft line break, otherwise
	931	# keep in any newline characters.
	932
[3627]	933	$text =~ s/=\s*\r?\n//mg;
	934	$text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
[2630]	935	return $text;
	936	}
	937
	938	# decode base64 text. This is fairly slow (since it's interpreted perl rather
	939	# than compiled XS stuff like in the ::MIME modules, but this is more portable
	940	# for us at least).
	941	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
	942	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
[2638]	943	# from each encoded byte.
[2630]	944
	945
	946	sub base64_decode {
	947	my $enc_text = shift;
	948	# A=>0, B=>1, ..., '+'=>62, '/'=>63
	949	# also '=' is used for padding at the end, but we remove it anyway.
	950	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	951	# map each MIME char into it's value, for more efficient lookup.
	952	my %index;
	953	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
	954	# remove all non-base64 chars. eval to get variable in transliteration...
	955	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
	956	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
	957	my $decoded="";
	958	while (length ($enc_text)>3)
	959	{
	960	my $fourchars=substr($enc_text,0,4,"");
	961	my @chars=(split '',$fourchars);
	962	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
	963	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	964	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
	965	}
	966	# if there are any input chars left, there are either
	967	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
	968	my @chars=(split '',$enc_text);
	969	if (length($enc_text)) {
	970	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
	971	}
	972	if (length($enc_text)==3) {
	973	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	974	}
	975	return $decoded;
	976	}
	977
[10834]	978	# returns 0 if valid utf-8, 1 if invalid
	979	sub is_utf8 {
	980	my $self = shift;
	981	my $textref = shift;
	982
	983	$$textref =~ m/^/g; # to set \G
	984	my $badbytesfound=0;
	985	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
	986	my $highbytes=$1;
	987	my $highbyteslength=length($highbytes);
	988	# replace any non utf8 complaint bytes
	989	$highbytes =~ /^/g; # set pos()
	990	while ($highbytes =~
	991	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
	992	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
	993	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
	994	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
	995	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
	996	)*([\x80-\xff])? !xg
	997	) {
	998	my $badbyte=$1;
	999	if (!defined $badbyte) {next} # hit end of string
	1000	return 1;
	1001	}
	1002	}
	1003	return 0;
	1004	}
	1005
	1006
	1007
[2847]	1008	sub convert2unicode {
	1009	my $self = shift(@_);
	1010	my ($charset, $textref) = @_;
[2630]	1011
[7703]	1012	if (!$$textref) {
	1013	# nothing to do!
	1014	return;
	1015	}
	1016
[10827]	1017	if (! defined $charset) {
	1018	# check if we have valid utf-8
[10834]	1019	if ($self->is_utf8($textref)) { $charset = "utf8" }
[10827]	1020
	1021	# default to latin
	1022	$charset = "iso_8859_1" if ! defined($charset);
	1023	}
	1024
[2847]	1025	# first get our character encoding name in the right form.
[7703]	1026	$charset =~ tr/A-Z/a-z/; # lowercase
	1027	$charset =~ s/\-/_/g;
[8902]	1028	if ($charset =~ /gb_?2312/) { $charset="gb" }
[2847]	1029	# assumes EUC-KR, not ISO-2022 !?
[7703]	1030	$charset =~ s/^ks_c_5601_1987/korean/;
	1031	if ($charset eq 'utf_8') {$charset='utf8'}
[2847]	1032
[7703]	1033	my $outhandle = $self->{'outhandle'};
	1034
	1035	if ($charset eq "utf8") {
	1036	# no conversion needed, but lets check that it's valid utf8
	1037	# see utf-8 manpage for valid ranges
	1038	$$textref =~ m/^/g; # to set \G
	1039	my $badbytesfound=0;
	1040	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
	1041	my $highbytes=$1;
	1042	my $highbyteslength=length($highbytes);
	1043	# replace any non utf8 complaint bytes
	1044	$highbytes =~ /^/g; # set pos()
	1045	while ($highbytes =~
[8904]	1046	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
[7703]	1047	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
[8904]	1048	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
	1049	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
[7703]	1050	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
	1051	)*([\x80-\xff])? !xg
	1052	) {
	1053	my $badbyte=$1;
	1054	if (!defined $badbyte) {next} # hit end of string
	1055	my $pos=pos($highbytes);
	1056	substr($highbytes, $pos-1, 1, "\xc2\x80");
	1057	# update the position to continue searching (for \G)
	1058	pos($highbytes) = $pos+1; # set to just after the \x80
	1059	$badbytesfound=1;
	1060	}
	1061	if ($badbytesfound==1) {
	1062	# claims to be utf8, but it isn't!
	1063	print $outhandle "EMAILPlug: Headers claim utf-8 but bad bytes "
	1064	. "detected and removed.\n";
	1065
	1066	my $replength=length($highbytes);
	1067	my $textpos=pos($$textref);
	1068	# replace bad bytes with good bytes
	1069	substr( $$textref, $textpos-$replength,
	1070	$replength, $highbytes);
	1071	# update the position to continue searching (for \G)
	1072	pos($$textref)=$textpos+($replength-$highbyteslength);
	1073	}
	1074	}
[3206]	1075	return;
	1076	}
	1077
[2847]	1078	# It appears that we can't always trust ascii text so we'll treat it
	1079	# as iso-8859-1 (letting characters above 0x80 through without
	1080	# converting them to utf-8 will result in invalid XML documents
	1081	# which can't be parsed at build time).
	1082	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
	1083
[3073]	1084	if ($charset eq "iso_8859_1") {
	1085	# test if the mailer lied, and it has win1252 chars in it...
	1086	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
	1087	if ($$textref =~ m/[\x80-\x9f]/) {
	1088	print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
	1089	print $outhandle "codepage 1252 detected.\n";
	1090	$charset = "windows_1252";
	1091	}
	1092	}
[3351]	1093	my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
	1094
	1095	if ($utf8_text ne "") {
	1096	$$textref=$utf8_text;
	1097	} else {
	1098	# we didn't get any text... unsupported encoding perhaps? Or it is
	1099	# empty anyway. We'll try to continue, assuming 8859-1. We could strip
	1100	# characters out here if this causes problems...
[3726]	1101	my $outhandle=$self->{'outhandle'};
	1102	print $outhandle "EMAILPlug: falling back to iso-8859-1\n";
[3351]	1103	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
	1104
	1105	}
[2847]	1106	}
	1107
	1108
[7703]	1109	sub set_OID {
	1110	my $self = shift (@_);
	1111	my ($doc_obj, $id, $segment_number) = @_;
	1112
	1113	if ( exists $doc_obj->{'msgid'} ) {
	1114	$doc_obj->set_OID($doc_obj->{'msgid'});
	1115	} else {
	1116	$doc_obj->set_OID("$id\_$segment_number");
	1117	}
	1118	}
	1119
	1120
[1206]	1121	# Perl packages have to return true if they are run.
	1122	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: