Context Navigation

EmailPlugin.pm@ 32581

Last change on this file since 32581 was 31492, checked in by kjdon, 7 years ago
renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 38.1 KB

Rev	Line
[638]	1	###########################################################################
	2	#
[15872]	3	# EmailPlugin.pm - a plugin for parsing email files
[638]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[3630]	9	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[638]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27
[1206]	28
[15872]	29	# EmailPlugin
[638]	30	#
[1206]	31	# by Gordon Paynter ([email protected])
[638]	32	#
[1206]	33	# Email plug reads email files. These are named with a simple
[2096]	34	# number (i.e. as they appear in maildir folders) or with the
	35	# extension .mbx (for mbox mail file format)
[638]	36	#
	37	# Document text:
[1206]	38	# The document text consists of all the text
	39	# after the first blank line in the document.
[638]	40	#
[2730]	41	# Metadata (not Dublin Core!):
[6916]	42	# $Headers All the header content (optional, not stored by default)
[638]	43	# $Subject Subject: header
	44	# $To To: header
[2730]	45	# $From From: header
[2630]	46	# $FromName Name of sender (where available)
	47	# $FromAddr E-mail address of sender
[638]	48	# $DateText Date: header
	49	# $Date Date: header in GSDL format (eg: 19990924)
[2630]	50	#
[2918]	51	# $Title made up of Subject, Date and Sender (for default formatting)
[19281]	52	# $InReplyTo Message id of the one this replies to
[2630]	53	#
	54	# John McPherson - June/July 2001
	55	# added (basic) MIME support and quoted-printable and base64 decodings.
	56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
	57	#
	58	# See: * RFC 822 - ARPA Internet Text Messages
	59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
	60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
[2730]	61	# * RFC 2047 - MIME (part 3) Message Header Extensions
[2630]	62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
[3540]	63
[16341]	64
[15872]	65	package EmailPlugin;
[3540]	66
[7830]	67	use strict;
	68	no strict "refs"; # so we can use a variable as a filehandle for print $out
	69
[638]	70
[15872]	71	use SplitTextFile;
[7830]	72	use unicode; # gs conv functions
	73	use gsprintf 'gsprintf'; # translations
[2730]	74
[638]	75	use sorttools;
[28563]	76	use FileUtils;
[638]	77
[10254]	78	sub BEGIN {
[15872]	79	@EmailPlugin::ISA = ('SplitTextFile');
[10254]	80	}
[638]	81
[17026]	82	my $extended_oidtype_list =
	83	[ {'name' => "message_id",
	84	'desc' => "{EmailPlugin.OIDtype.message_id}" }
	85	];
[638]	86
[31492]	87	# add in all the standard options from BaseImporter
	88	unshift (@$extended_oidtype_list, @{$BaseImporter::oidtype_list});
[17026]	89
[3540]	90	my $arguments =
[4744]	91	[ { 'name' => "process_exp",
[31492]	92	'desc' => "{BaseImporter.process_exp}",
[6408]	93	'type' => "regexp",
[3540]	94	'reqd' => "no",
[4744]	95	'deft' => &get_default_process_exp() },
	96	{ 'name' => "no_attachments",
[15872]	97	'desc' => "{EmailPlugin.no_attachments}",
[3630]	98	'type' => "flag",
[4744]	99	'reqd' => "no" },
[6916]	100	{ 'name' => "headers",
[15872]	101	'desc' => "{EmailPlugin.headers}",
[6916]	102	'type' => "flag",
	103	'reqd' => "no" },
[17026]	104	{ 'name' => "OIDtype",
	105	'desc' => "{import.OIDtype}",
	106	'type' => "enum",
	107	'list' => $extended_oidtype_list,
[19281]	108	'deft' => "message_id",
[18591]	109	'reqd' => "no" },
[17026]	110	{ 'name' => "OIDmetadata",
	111	'desc' => "{import.OIDmetadata}",
	112	'type' => "metadata",
	113	'deft' => "dc.Identifier",
[18591]	114	'reqd' => "no" },
[4744]	115	{ 'name' => "split_exp",
[15872]	116	'desc' => "{EmailPlugin.split_exp}",
[6408]	117	'type' => "regexp",
	118	'reqd' => "no",
	119	'deft' => &get_default_split_exp() }
	120	];
[3540]	121
[15872]	122	my $options = { 'name' => "EmailPlugin",
	123	'desc' => "{EmailPlugin.desc}",
[6408]	124	'abstract' => "no",
	125	'inherits' => "yes",
[4744]	126	'args' => $arguments };
[3540]	127
[638]	128	sub new {
[10218]	129	my ($class) = shift (@_);
	130	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	131	push(@$pluginlist, $class);
[3540]	132
[15872]	133	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	134	push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]	135
[15872]	136	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]	137
[3630]	138	$self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
[16341]	139	$self->{'tmp_file_paths'} = (); # list of tmp files to delete after processing is finished
[3630]	140
[2730]	141	# this might not actually be true at read-time, but after processing
	142	# it should all be utf8.
	143	$self->{'input_encoding'}="utf8";
[638]	144	return bless $self, $class;
	145	}
	146
[1244]	147	sub get_default_process_exp {
	148	my $self = shift (@_);
[2096]	149	# mbx/email for mailbox file format, \d+ for maildir (each message is
	150	# in a separate file, with a unique number for filename)
[3111]	151	# mozilla and IE will save individual mbx format files with a ".eml" ext.
[25741]	152	return q@([\\/]\d+\|\.(mbo?x\|email\|eml))$@;
[638]	153	}
	154
[1895]	155	# This plugin splits the mbox mail files at lines starting with From<sp>
[3111]	156	# It is supposed to be "\n\nFrom ", but this isn't always used.
[9971]	157	# add \d{4} so that the line ends in a year (in case the text has an
	158	# unescaped "From " at the start of a line).
[1895]	159	sub get_default_split_exp {
[9971]	160	return q^\nFrom .*\d{4}\n^;
[3523]	161
[1895]	162	}
	163
[16677]	164	sub can_process_this_file {
	165	my $self = shift(@_);
	166	my ($filename) = @_;
[1895]	167
[16677]	168	# avoid any confusion between filenames matching \d+ (which are by default
	169	# matched by EmailPlugin) and directories that match \d+ (which should not)
	170
	171	return 0 if (-d $filename);
	172
	173	if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
	174	return 1;
	175	}
	176	return 0;
	177
	178	}
	179
	180
[1244]	181	# do plugin specific processing of doc_obj
	182	sub process {
[2630]	183
[638]	184	my $self = shift (@_);
[6332]	185	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1424]	186	my $outhandle = $self->{'outhandle'};
	187
[1244]	188	# Check that we're dealing with a valid mail file
[3111]	189	# mbox message files start with "From "
	190	# maildir messages usually start with Return-Path and Delivered-To
	191	# mh is very similar to maildir
	192	my $startoffile=substr($$textref,0,256);
	193	if (($startoffile !~ /^(From )/) &&
	194	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
	195	return undef;
	196	}
[25741]	197
[1244]	198	my $cursection = $doc_obj->get_top_section();
[638]	199
[1206]	200	#
	201	# Parse the document's text and extract metadata
	202	#
[638]	203
[2652]	204	# Protect backslashes
	205	$$textref =~ s@\\@\\\\@g;
	206
[1206]	207	# Separate header from body of message
[1244]	208	my $Headers = $$textref;
[2630]	209	$Headers =~ s/\r?\n\r?\n(.*)$//s;
	210	$$textref = $1;
[4089]	211	$Headers .= "\n";
[2779]	212
[2630]	213	# Unfold headers - see rfc822
	214	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
[1206]	215	# Extract basic metadata from header
	216	my @headers = ("From", "To", "Subject", "Date");
	217	my %raw;
[1658]	218	foreach my $name (@headers) {
	219	$raw{$name} = "No $name value";
	220	}
[1206]	221
[3132]	222	# Get a default encoding for the header - RFC says should be ascii...
[6062]	223	my $default_header_encoding="iso_8859_1";
[3132]	224
	225	# We don't know what character set is the user's default...
	226	# We could use textcat to guess... for now we'll look at mime content-type
	227	# if ($Headers =~ /([[:^ascii:]])/) {
	228	# }
	229	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
	230	$default_header_encoding=$1;
	231	$default_header_encoding =~ s@\-@_@g;
	232	$default_header_encoding =~ tr/A-Z/a-z/;
	233	}
	234
	235
[1658]	236	# Examine each line of the headers
	237	my ($line, $name, $value);
	238	my @parts;
	239	foreach $line (split(/\n/, $Headers)) {
	240
	241	# Ignore lines with no content or which begin with whitespace
	242	next unless ($line =~ /:/);
	243	next if ($line =~ /^\s/);
	244
	245	# Find out what metadata is on this line
	246	@parts = split(/:/, $line);
	247	$name = shift @parts;
[3136]	248	# get fieldname in canonical form - first cap, then lower case.
[3134]	249	$name =~ tr/A-Z/a-z/;
[3136]	250	# uppercase the first character according to the current locale
[2630]	251	$name=~s/(.+)/\u$1/;
[1658]	252	next unless $name;
	253	next unless ($raw{$name});
	254
	255	# Find the value of that metadata
	256	$value = join(":", @parts);
	257	$value =~ s/^\s+//;
	258	$value =~ s/\s+$//;
[16341]	259	# decode header values, using either =?<charset>?[BQ]?<data>?= (rfc2047) or default_header_encoding
	260	$self->decode_header_value($default_header_encoding, \$value);
[2730]	261
[1658]	262	# Store the metadata
[6062]	263	$value =~ s@_@\\_@g; # protect against GS macro language
[1206]	264	$raw{$name} = $value;
	265	}
	266
[2630]	267	# Extract the name and e-mail address from the From metadata
[6062]	268	my $frommeta = $raw{"From"};
[2680]	269	my $fromnamemeta;
	270	my $fromaddrmeta;
	271
	272	$frommeta =~ s/\s*$//; # Remove trailing space, if any
	273
	274	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
	275	$fromnamemeta=$1;
	276	$fromaddrmeta=$2;
	277	} elsif ($frommeta =~ m/(.+@.+)\s+$(.*)$/) {
	278	$fromnamemeta=$2;
	279	$fromaddrmeta=$1;
[19283]	280	} elsif ($frommeta =~ m/(.+)\s+at\s+(.+)\s+$(.*)$/) {
	281	$fromnamemeta=$3;
	282	$fromaddrmeta="$1@$2";
	283	}
	284
[2630]	285	if (!defined($fromaddrmeta)) {
	286	$fromaddrmeta=$frommeta;
	287	}
[2680]	288	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
[2717]	289	# minor attempt to prevent spam-bots from harvesting addresses...
	290	$fromaddrmeta=~s/@/@/;
[6062]	291
[2630]	292	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
	293
[3328]	294	if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
[3215]	295	$fromnamemeta =~ s/\"//g; # remove quotes
	296	$fromnamemeta =~ s/\s+$//; # remove trailing whitespace
[2630]	297	}
	298	else {
	299	$fromnamemeta = $fromaddrmeta;
	300	}
	301	# if name is an address
	302	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
[2717]	303	$fromnamemeta=~s/@/&#64\;/;
[2630]	304	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
	305
	306	$raw{"From"}=$frommeta;
	307
[1206]	308	# Process Date information
[1658]	309	if ($raw{"Date"} !~ /No Date/) {
[1206]	310	$raw{"DateText"} = $raw{"Date"};
[638]	311
[1206]	312	# Convert the date text to internal date format
	313	$value = $raw{"Date"};
[15212]	314	# proper mbox format: Tue, 07 Jan 2003 17:27:42 +1300
[1206]	315	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
[15212]	316	if (!defined($day) \|\| !defined($month) \|\| !defined ($year)) {
	317	# try monthly archive format: Wed Apr 23 00:26:08 2008
	318	($month,$day, $year) = $value =~ /([A-Z][a-z][a-z])\s\s?(\d?\d)\s\d\d:\d\d:\d\d\s(\d\d\d\d)/;
	319	}
	320
[3143]	321	# make some assumptions about the year formatting...
	322	# some (old) software thinks 2001 is 101, some think 2001 is 01
	323	if ($year < 20) { $year += 2000; } # assume not really 1920...
	324	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
[1206]	325	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
[638]	326
[1206]	327	} else {
	328	# We have not extracted a date
	329	$raw{"DateText"} = "Unknown.";
	330	$raw{"Date"} = "19000000";
[638]	331	}
	332
[1244]	333	# Add extracted metadata to document object
[1206]	334	foreach my $name (keys %raw) {
	335	$value = $raw{$name};
	336	if ($value) {
[2730]	337	# assume subject, etc headers have no special HTML meaning.
[1206]	338	$value = &text_into_html($value);
[2730]	339	# escape [] so it isn't re-interpreted as metadata
	340	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
[1206]	341	} else {
	342	$value = "No $name field";
	343	}
[1244]	344	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[1206]	345	}
[638]	346
[7703]	347
	348	# extract a message ID from the headers, if there is one, and we'll use
	349	# that as the greenstone doc ID. Having a predictable ID means we can
	350	# link to other messages, eg from In-Reply-To or References headers...
	351	if ($Headers =~ m@^Message-ID:(.+)$@mi) {
	352	my $id=escape_msg_id($1);
	353	$doc_obj->{'msgid'}=$id;
	354	}
	355	# link to another message, if this is a reply
	356	if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
	357	my $id=escape_msg_id($1);
	358	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
	359	} elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
	360	# References can have multiple, get the last one
	361	my $id=escape_msg_id($1);
	362	# not necessarily in-reply-to, but same thread...
	363	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
	364	}
	365
	366
	367
[2630]	368	my $mimetype="text/plain";
	369	my $mimeinfo="";
[3132]	370	my $charset = $default_header_encoding;
[3073]	371	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
	372	# more than one parameter given to Content-type.
	373	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
[3630]	374	if ($Headers =~ m@^content\-type:\s([\w\.\-/]+)\s(\;\s.+)?\s$@mi)
[2630]	375	{
	376	$mimetype=$1;
	377	$mimetype =~ tr/[A-Z]/[a-z]/;
[3073]	378
	379	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
	380	$mimetype = "text/plain";
	381	}
	382
[2630]	383	$mimeinfo=$2;
[3073]	384	if (!defined $mimeinfo) {
	385	$mimeinfo="";
	386	} else { # strip leading and trailing stuff
	387	$mimeinfo =~ s/^\;\s*//;
	388	$mimeinfo =~ s/\s*$//;
	389	}
	390	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
[2847]	391	$charset = $1;
	392	}
[2630]	393	}
	394
[2680]	395	my $transfer_encoding="7bit";
	396	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
	397	$transfer_encoding=$1;
	398	}
[6062]	399
[2886]	400	if ($mimetype eq "text/html") {
[9971]	401	$$textref= $self->text_from_part($$textref, $Headers);
[2886]	402	} elsif ($mimetype ne "text/plain") {
[3630]	403	$self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
[16341]	404	$$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$default_header_encoding,$$textref);
[6062]	405	} else { # mimetype eq text/plain
	406
	407	if ($transfer_encoding =~ /quoted\-printable/) {
	408	$$textref=qp_decode($$textref);
	409	} elsif ($transfer_encoding =~ /base64/) {
	410	$$textref=base64_decode($$textref);
	411	}
[2886]	412	$self->convert2unicode($charset, $textref);
[8904]	413
	414	$$textref = &text_into_html($$textref);
	415	$$textref =~ s@_@\\_@g; # protect against GS macro language
	416
[2680]	417	}
[6062]	418
[2630]	419
[10218]	420	if ($self->{'headers'} && $self->{'headers'} == 1) {
[6916]	421	# Add "All headers" metadata
	422	$Headers = &text_into_html($Headers);
[2630]	423
[6916]	424	$Headers = "No headers" unless ($Headers =~ /\w/);
	425	$Headers =~ s/@/&#64\;/g;
	426	# escape [] so it isn't re-interpreted as metadata
	427	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
	428	$self->convert2unicode($charset, \$Headers);
[2754]	429
[6916]	430	$Headers =~ s@_@\\_@g; # protect against GS macro language
	431	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
	432	}
[6062]	433
	434
[2918]	435	# Add Title metadata
	436	my $Title = text_into_html($raw{'Subject'});
[7703]	437	$Title .= "<br>From: " . text_into_html($fromnamemeta);
[2918]	438	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
[3073]	439	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
[2918]	440
	441	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
	442
[8121]	443	# Add FileFormat metadata
	444	$doc_obj->add_metadata($cursection, "FileFormat", "EMAIL");
[2918]	445
[1244]	446	# Add text to document object
	447	$$textref = "No message" unless ($$textref =~ /\w/);
[6062]	448
[1244]	449	$doc_obj->add_utf8_text($cursection, $$textref);
[638]	450
[1244]	451	return 1;
[638]	452	}
	453
[16341]	454	# delete any temp files that we have created
	455	sub clean_up_after_doc_obj_processing {
	456	my $self = shift(@_);
	457
	458	foreach my $tmp_file_path (@{$self->{'tmp_file_paths'}}) {
	459	if (-e $tmp_file_path) {
[28563]	460	&FileUtils::removeFiles($tmp_file_path);
[16341]	461	}
	462	}
	463
	464	}
[638]	465
	466	# Convert a text string into HTML.
	467	#
	468	# The HTML is going to be inserted into a GML file, so
	469	# we have to be careful not to use symbols like ">",
	470	# which ocurs frequently in email messages (and use
	471	# &gt instead.
	472	#
	473	# This function also turns links and email addresses into hyperlinks,
	474	# and replaces carriage returns with <BR> tags (and multiple carriage
	475	# returns with <P> tags).
	476
[1206]	477
[638]	478	sub text_into_html {
	479	my ($text) = @_;
	480
[1244]	481	# Convert problem characters into HTML symbols
[3132]	482	$text =~ s/&/&/g;
	483	$text =~ s/</</g;
	484	$text =~ s/>/>/g;
	485	$text =~ s/\"/"/g;
[638]	486
[2630]	487	# convert email addresses and URIs into links
	488	# don't markup email addresses for now
	489	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[638]	490
[2918]	491	# try to munge email addresses a little bit...
	492	$text =~ s/@/@/;
[2730]	493	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
	494	# assume URI doesn't finish with a '.'
[10827]	495	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.\|\%[a-f0-9]{2})?[\w\?\=\-_/~]+)(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@gi;
[2630]	496
	497
[638]	498	# Clean up whitespace and convert \n charaters to <BR> or <P>
[3132]	499	$text =~ s/ +/ /g;
	500	$text =~ s/\s*$//g;
	501	$text =~ s/^\s*//g;
	502	$text =~ s/\n/\n<br>/g;
	503	$text =~ s/<br>\s*<br>/<p>/gi;
[638]	504
	505	return $text;
	506	}
	507
	508
[2630]	509
	510
	511	#Process a MIME message.
	512	# the textref we are given DOES NOT include the header.
	513	sub text_from_mime_message {
[2847]	514	my $self = shift(@_);
[16341]	515	my ($mimetype,$mimeinfo,$default_header_encoding,$text)=(@_);
[6062]	516	my $outhandle=$self->{'outhandle'};
[2630]	517	# Check for multiparts - $mimeinfo will be a boundary
	518	if ($mimetype =~ /multipart/) {
[6062]	519	my $boundary="";
[2732]	520	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
[2630]	521	$boundary=$1;
[2732]	522	if ($boundary =~ m@^\"@) {
	523	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
	524	}
	525	} else {
[15872]	526	print $outhandle "EmailPlugin: (warning) couldn't parse MIME boundary\n";
[2630]	527	}
	528	# parts start with "--$boundary"
	529	# message ends with "--$boundary--"
	530	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
[2680]	531	# that perl might want to interpolate. Also allows spaces...
[2630]	532	$boundary=~s/\\/\\\\/g;
	533	$boundary=~s/([\?\+\.\:\/\'])/\\$1/g;
[2681]	534	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
[2630]	535	# remove first "part" and last "part" (final --)
	536	shift @message_parts;
	537	my $last=pop @message_parts;
[2680]	538	# if our boundaries are a bit dodgy and we only found 1 part...
	539	if (!defined($last)) {$last="";}
[2630]	540	# make sure it is only -- and whitespace
	541	if ($last !~ /^\-\-\s*$/ms) {
[15872]	542	print $outhandle "EmailPlugin: (warning) last part of MIME message isn't empty\n";
[2630]	543	}
	544	foreach my $message_part (@message_parts) {
	545	# remove the leading newline left from split.
	546	$message_part=~s/^\r?\n//;
	547	}
	548	if ($mimetype eq "multipart/alternative") {
	549	# check for an HTML version first, then TEXT, otherwise use first.
	550	my $part_text="";
	551	foreach my $message_part (@message_parts) {
[10839]	552	if ($message_part =~ m@^content\-type:\s*text/html@i)
[2630]	553	{
	554	# Use the HTML version
[10834]	555	$part_text = $self->text_from_part($message_part);
[2630]	556	$mimetype="text/html";
	557	last;
	558	}
	559	}
	560	if ($part_text eq "") { # try getting a text part instead
	561	foreach my $message_part (@message_parts) {
[10839]	562	if ($message_part =~ m@^content\-type:\s*text/plain@i)
[2630]	563	{
	564	# Use the plain version
[10839]	565	$part_text = $self->text_from_part($message_part);
[2732]	566	if ($part_text =~/[^\s]/) {
[3721]	567	$part_text = text_into_html($part_text);
[2732]	568	}
[2630]	569	$mimetype="text/plain";
	570	last;
	571	}
	572	}
	573	}
[3721]	574	if ($part_text eq "") { #use first part (no html/text part found)
	575	$part_text = $self->text_from_part(shift @message_parts);
	576	$part_text = text_into_html($part_text);
[2630]	577	}
	578	if ($part_text eq "") { # we couldn't get anything!!!
	579	# or it was an empty message...
	580	# do nothing...
[16013]	581	gsprintf($outhandle, "{ReadTextFile.empty_file} - empty body?\n");
[2630]	582	} else {
[3721]	583	$text = $part_text;
[2630]	584	}
[3352]	585	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related\|signed)@) {
[3721]	586	$text = "";
[3352]	587	# signed is for PGP/GPG messages... the last part is a hash
	588	if ($mimetype =~ m@multipart/signed@) {
	589	pop @message_parts;
	590	}
[6062]	591	my $is_first_part=1;
[2630]	592	foreach my $message_part (@message_parts) {
[6062]	593	if ($is_first_part && $text ne "") {$is_first_part=0;}
	594
[2630]	595	if ($mimetype eq "multipart/digest") {
[6062]	596	# default type - RTFRFC!! Set if not already set
	597	$message_part =~ m@^(.*)\n\r?\n@s;
	598	my $part_header=$1;
	599	if ($part_header !~ m@^content-type@mi) {
	600	$message_part="Content-type: message/rfc822\n"
	601	. $message_part; # prepend default type
[4224]	602	}
[2630]	603	}
	604
[16341]	605	$text .= $self->process_multipart_part($default_header_encoding,
	606	$message_part,
[6062]	607	$is_first_part);
[2630]	608	} # foreach message part.
	609	} else {
	610	# we can't handle this multipart type (not mixed or alternative)
	611	# the RFC also mentions "parallel".
	612	}
[3627]	613	} # end of ($mimetype =~ multipart)
[2918]	614	elsif ($mimetype =~ m@message/rfc822@) {
	615	my $msg_header = $text;
	616	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
	617	$text = $1;
	618
[3630]	619	if ($msg_header =~ /^content\-type:\s([\w\.\-\/]+)\s\;?\s(.+?)\s$/mi)
[2918]	620	{
	621	$mimetype=$1;
[9971]	622	$mimeinfo=$2;
[2918]	623	$mimetype =~ tr/[A-Z]/[a-z]/;
[9971]	624
[2918]	625	my $msg_text;
	626	if ($mimetype =~ m@multipart/@) {
[16341]	627	$msg_text = $self->text_from_mime_message($mimetype, $mimeinfo,
	628	$default_header_encoding,
[6062]	629	$text);
[9823]	630	} else {
[9971]	631	$msg_text=$self->text_from_part($text,$msg_header);
[9823]	632	}
[2918]	633
[3630]	634	my $brief_header=text_into_html(get_brief_headers($msg_header));
[2918]	635	$text= "\n<b><<attached message>></b><br>";
	636	$text.= "<table><tr><td width=\"5%\"> </td>\n";
	637	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
	638	. "</td></tr></table>";
	639	}
	640	} else {
[2886]	641	# we don't do any processing of the content.
	642	}
	643
[2630]	644	return $text;
	645	}
	646
	647
[6062]	648
[7703]	649	# used for turning a message id into a more friendly string for greenstone
	650	sub escape_msg_id {
	651	#msgid
	652	my $id=shift;
	653	chomp $id; $id =~ s!\s!!g; # remove spaces
	654	$id =~ s![<>\[\]]!!g; # remove [ ] < and >
	655	$id =~ s![_&]!-!g; # replace symbols that might cause problems
[10834]	656	$id =~ s!\.!-!g; # . means section to greenstone doc ids!
[7703]	657	$id =~ s!@!-!g; # replace @ symbol, to avoid spambots
	658	return $id;
	659	}
[6062]	660
[7703]	661
	662
[6062]	663	sub process_multipart_part {
	664	my $self = shift;
[16341]	665	my $default_header_encoding = shift;
[6062]	666	my $message_part = shift;
	667	my $is_first_part = shift;
	668
	669	my $return_text="";
	670	my $part_header=$message_part;
	671	my $part_body;
	672	if ($message_part=~ /^\s*\n/) {
	673	# no header... use defaults
	674	$part_body=$message_part;
	675	$part_header="Content-type: text/plain; charset=us-ascii";
	676	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
	677	$part_body=$1;
	678	} else {
	679	# something's gone wrong...
	680	$part_header="";
	681	$part_body=$message_part;
	682	}
	683
	684	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
	685	my $part_content_type="";
	686	my $part_content_info="";
	687
	688	if ($part_header =~ m@^content\-type:\s([\w\.\-/]+)\s(\;.*)?$@mi) {
	689	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
	690	$part_content_info=$2;
	691	if (!defined($part_content_info)) {
	692	$part_content_info="";
	693	} else {
	694	$part_content_info =~ s/^\;\s*//;
	695	$part_content_info =~ s/\s*$//;
	696	}
	697	}
	698	my $filename="";
[6079]	699	if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
[6062]	700	$filename=$1;
[6079]	701	$filename =~ s@\r?\s*$@@; # remove trailing space, if any
[16341]	702	# decode the filename
	703	$self->decode_header_value($default_header_encoding, \$filename);
	704
[6062]	705	}
	706
	707	# disposition - either inline or attachment.
	708	# NOT CURRENTLY USED - we display all text types instead...
	709	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
	710
	711	# add <<attachment>> to each part except the first...
	712	if (!$is_first_part) {
	713	$return_text.="\n<p><hr><strong><<attachment>>";
	714	# add part info header
[10827]	715	my $header_text = "<br>Type: $part_content_type<br>\n";
[6062]	716	if ($filename ne "") {
[10827]	717	$header_text .= "Filename: $filename\n";
[6062]	718	}
	719	$header_text =~ s@_@\\_@g;
[10827]	720	$return_text .= $header_text . "</strong></p>\n<p>\n";
[6062]	721	}
	722
	723	if ($part_content_type =~ m@text/@)
	724	{
[10827]	725	# $message_part includes the mime part headers
	726	my $part_text = $self->text_from_part($message_part);
[6062]	727	if ($part_content_type !~ m@text/(ht\|x)ml@) {
	728	$part_text = text_into_html($part_text);
	729	}
	730	if ($part_text eq "") {
	731	$part_text = ' ';
	732	}
	733	$return_text .= $part_text;
	734	} elsif ($part_content_type =~ m@message/rfc822@) {
	735	# This is a forwarded message
	736	my $message_part_headers=$part_body;
	737	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
	738	my $message_part_body=$1;
	739	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
	740
	741	my $rfc822_formatted_body=""; # put result in here
	742	if ($message_part_headers =~
	743	/^content\-type:\s([\w\.\-\/]+)\s(\;.*)?$/ims)
	744	{
	745	# The message header uses MIME flags
	746	my $message_content_type=$1;
	747	my $message_content_info=$2;
	748	if (!defined($message_content_info)) {
	749	$message_content_info="";
	750	} else {
	751	$message_content_info =~ s/^\;\s*//;
	752	$message_content_info =~ s/\s*$//;
	753	}
	754	$message_content_type =~ tr/A-Z/a-z/;
	755	if ($message_content_type =~ /multipart/) {
	756	$rfc822_formatted_body=
	757	$self->text_from_mime_message($message_content_type,
	758	$message_content_info,
[16341]	759	$default_header_encoding,
[6062]	760	$message_part_body);
	761	} else {
[9971]	762	$message_part_body=$self->text_from_part($part_body,
	763	$message_part_headers);
[6062]	764	$rfc822_formatted_body=text_into_html($message_part_body);
	765	}
	766	} else {
	767	# message doesn't use MIME flags
	768	$rfc822_formatted_body=text_into_html($message_part_body);
	769	$rfc822_formatted_body =~ s@_@\\_@g;
	770	}
	771	# Add the returned text to the output
	772	# don't put all the headers...
	773	# $message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
	774	my $brief_headers=get_brief_headers($message_part_headers);
	775	$return_text.=text_into_html($brief_headers);
	776	$return_text.="</p><p>\n";
	777	$return_text.=$rfc822_formatted_body;
	778	$return_text.="</p>\n";
	779	# end of message/rfc822
	780	} elsif ($part_content_type =~ /multipart/) {
	781	# recurse again
	782
	783	my $tmptext= $self->text_from_mime_message($part_content_type,
	784	$part_content_info,
[16341]	785	$default_header_encoding,
[6062]	786	$part_body);
	787	$return_text.=$tmptext;
	788	} else {
	789	# this part isn't text/* or another message...
	790	if ($is_first_part) {
	791	# this is the first part of a multipart, or only part!
	792	$return_text="\n<p><hr><strong><<attachment>>";
	793	# add part info header
	794	my $header_text="<br>Type: $part_content_type<br>\n";
	795	$header_text.="Filename: $filename</strong></p>\n<p>\n";
	796	$header_text =~ s@_@\\_@g;
	797	$return_text.=$header_text;
	798	}
	799
	800	# save attachment by default
[10218]	801	if (!$self->{'no_attachments'}
[6062]	802	&& $filename ne "") { # this part has a file...
	803	my $encoding="8bit";
	804	if ($part_header =~
	805	/^content-transfer-encoding:\s*(\w+)/mi ) {
	806	$encoding=$1; $encoding =~ tr/A-Z/a-z/;
	807	}
[28563]	808	my $tmpdir=&FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
[6062]	809	my $save_filename=$filename;
	810
	811	# make sure we don't clobber files with same name;
	812	# need to keep state between .mbx files
	813	my $assoc_files=$self->{'assoc_filenames'};
	814	if ($assoc_files->{$filename}) { # it's been set...
	815	$assoc_files->{$filename}++;
	816	$filename =~ m/(.+)\.(\w+)$/;
	817	my ($filestem, $ext)=($1,$2);
	818	$save_filename="${filestem}_"
	819	. $assoc_files->{$filename} . ".$ext";
	820	} else { # first file with this name
	821	$assoc_files->{$filename}=1;
	822	}
[28563]	823	my $tmp_filename = &FileUtils::filenameConcatenate($tmpdir, $save_filename);
[16341]	824	open (SAVE, ">$tmp_filename") \|\|
	825	warn "EMAILPlug: Can't save attachment as $tmp_filename: $!";
	826	binmode(SAVE); # needed on Windows
[6062]	827	my $part_text = $message_part;
	828	$part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
	829	if ($encoding eq "base64") {
	830	print SAVE base64_decode($part_text);
	831	} elsif ($encoding eq "quoted-printable") {
	832	print SAVE qp_decode($part_text);
	833	} else { # 7bit, 8bit, binary, etc...
	834	print SAVE $part_text;
	835	}
	836	close SAVE;
	837	my $doc_obj=$self->{'doc_obj'};
[16341]	838	$doc_obj->associate_file("$tmp_filename",
[6062]	839	"$save_filename",
	840	$part_content_type # mimetype
	841	);
[16341]	842	# add this file to the list of tmp files for deleting later
	843	push(@{$self->{'tmp_file_paths'}}, $tmp_filename);
	844
[6062]	845	my $outhandle=$self->{'outhandle'};
[15872]	846	print $outhandle "EmailPlugin: saving attachment \"$filename\"\n"; #
[6062]	847
	848	# be nice if "download" was a translatable macro :(
	849	$return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
	850	} # end of save attachment
	851	} # end of !text/message part
	852
	853
	854	return $return_text;
	855	}
	856
	857
[3630]	858	# Return only the "important" headers from a set of message headers
	859	sub get_brief_headers {
	860	my $msg_header = shift;
	861	my $brief_header = "";
[2630]	862
[3630]	863	# Order matters!
	864	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
	865	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
	866	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
	867	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
	868	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
[2630]	869
[3630]	870	return $brief_header;
	871	}
[2630]	872
	873
	874	# Process a MIME part. Return "" if we can't decode it.
[6062]	875	# should only be called for parts with type "text/*" ?
[9971]	876	# Either pass the entire mime part (including the part's header),
	877	# or pass the mime part's text and optionally the part's header.
[2630]	878	sub text_from_part {
[3132]	879	my $self = shift;
[3136]	880	my $text = shift \|\| '';
[9971]	881	my $part_header = shift;
[3136]	882
[10834]	883
[9971]	884	my $type="text/plain"; # default, overridden from part header
[10827]	885	my $charset=undef; # convert2unicode() will guess if necessary
[9971]	886
	887	if (! $part_header) { # no header argument was given. check the body
	888	$part_header = $text;
	889	# check for empty part header (leading blank line)
	890	if ($text =~ /^\s*\r?\n/) {
	891	$part_header="Content-type: text/plain; charset=us-ascii";
	892	} else {
	893	$part_header =~ s/\r?\n\r?\n(.*)$//s;
	894	$text=$1; if (!defined($text)) {$text="";}
	895	}
	896	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
[2681]	897	}
[9971]	898
	899	if ($part_header =~
	900	/content\-type:\s([\w\.\-\/]+).?charset=\"?([^\;\"\s]+)\"?/is) {
	901	$type=$1;
	902	$charset=$2;
	903	}
[2630]	904	my $encoding="";
[2638]	905	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
[2630]	906	$encoding=$1; $encoding=~tr/A-Z/a-z/;
	907	}
	908	# Content-Transfer-Encoding is per-part
	909	if ($encoding ne "") {
	910	if ($encoding =~ /quoted\-printable/) {
	911	$text=qp_decode($text);
	912	} elsif ($encoding =~ /base64/) {
	913	$text=base64_decode($text);
	914	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
	915	# rfc2045 also allows binary, which we ignore (for now).
[3630]	916	my $outhandle=$self->{'outhandle'};
[15872]	917	print $outhandle "EmailPlugin: unknown transfer encoding: $encoding\n";
[2630]	918	return "";
	919	}
	920	}
[10834]	921
[2630]	922	if ($type eq "text/html") {
	923	# only get stuff between <body> tags, or <html> tags.
[2730]	924	$text =~ s@^.<html[^>]>@@is;
	925	$text =~ s@</html>.*$@@is;
	926	$text =~ s/^.?<body[^>]>//si;
	927	$text =~ s/<\/body>.*$//si;
[2630]	928	}
	929	elsif ($type eq "text/xml") {
	930	$text=~s/</</g;$text=~s/>/>/g;
	931	$text="<pre>\n$text\n</pre>\n";
	932	}
[2730]	933	# convert to unicode
[2847]	934	$self->convert2unicode($charset, \$text);
[6062]	935	$text =~ s@_@\\_@g; # protect against GS macro language
[2630]	936	return $text;
	937	}
	938
	939
[6062]	940
	941
[2630]	942	# decode quoted-printable text
	943	sub qp_decode {
	944	my $text=shift;
	945
	946	# if a line ends with "=\s*", it is a soft line break, otherwise
	947	# keep in any newline characters.
	948
[3627]	949	$text =~ s/=\s*\r?\n//mg;
	950	$text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
[2630]	951	return $text;
	952	}
	953
	954	# decode base64 text. This is fairly slow (since it's interpreted perl rather
	955	# than compiled XS stuff like in the ::MIME modules, but this is more portable
	956	# for us at least).
	957	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
	958	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
[2638]	959	# from each encoded byte.
[2630]	960
	961
	962	sub base64_decode {
	963	my $enc_text = shift;
	964	# A=>0, B=>1, ..., '+'=>62, '/'=>63
	965	# also '=' is used for padding at the end, but we remove it anyway.
	966	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	967	# map each MIME char into it's value, for more efficient lookup.
	968	my %index;
	969	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
	970	# remove all non-base64 chars. eval to get variable in transliteration...
	971	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
	972	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
	973	my $decoded="";
	974	while (length ($enc_text)>3)
	975	{
	976	my $fourchars=substr($enc_text,0,4,"");
	977	my @chars=(split '',$fourchars);
	978	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
	979	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	980	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
	981	}
	982	# if there are any input chars left, there are either
	983	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
	984	my @chars=(split '',$enc_text);
	985	if (length($enc_text)) {
	986	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
	987	}
	988	if (length($enc_text)==3) {
	989	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
	990	}
	991	return $decoded;
	992	}
	993
[10834]	994	# returns 0 if valid utf-8, 1 if invalid
	995	sub is_utf8 {
	996	my $self = shift;
	997	my $textref = shift;
	998
	999	$$textref =~ m/^/g; # to set \G
	1000	my $badbytesfound=0;
	1001	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
	1002	my $highbytes=$1;
	1003	my $highbyteslength=length($highbytes);
	1004	# replace any non utf8 complaint bytes
	1005	$highbytes =~ /^/g; # set pos()
	1006	while ($highbytes =~
	1007	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
	1008	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
	1009	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
	1010	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
	1011	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
	1012	)*([\x80-\xff])? !xg
	1013	) {
	1014	my $badbyte=$1;
	1015	if (!defined $badbyte) {next} # hit end of string
	1016	return 1;
	1017	}
	1018	}
	1019	return 0;
	1020	}
	1021
[16341]	1022	# words with non ascii characters in header values must be encoded in the
	1023	# following manner =?<charset>?[BQ]?<data>?= (rfc2047)
[10834]	1024
[16341]	1025	sub decode_header_value {
	1026	my $self = shift(@_);
	1027	my ($default_header_encoding, $textref) = @_;
	1028
	1029	if (!$$textref) {
	1030	# nothing to do!
	1031	return;
	1032	}
	1033	my $value = $$textref;
	1034	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
	1035	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
	1036	my $original_value=$value;
	1037	my $encoded=$value;
	1038	$value="";
	1039	# we should ignore spaces between consecutive encoded-texts
	1040	$encoded =~ s@\?=\s+=\?@\?==\?@g;
	1041	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=//i) {
	1042	my ($charset, $encoding, $data)=($2,$3,$4);
	1043	my ($decoded_data);
	1044	my $leading_chars = "$1";
	1045	$self->convert2unicode($default_header_encoding, \$leading_chars);
	1046	$value.=$leading_chars;
	1047
	1048	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
	1049	chomp $data;
	1050	$encoding =~ tr/BQ/bq/;
	1051	if ($encoding eq "q") { # quoted printable
	1052	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
	1053	$decoded_data=qp_decode($data);
	1054	# qp_decode adds \n, which is default for body text
	1055	chomp($decoded_data);
	1056	} else { # base 64
	1057	$decoded_data=base64_decode($data);
	1058	}
	1059	$self->convert2unicode($charset, \$decoded_data);
	1060	$value .= $decoded_data;
	1061	} # end of while loop
	1062
	1063	# get any trailing characters
	1064	$self->convert2unicode($default_header_encoding, \$encoded);
	1065	$value.=$encoded;
	1066
	1067	if ($value =~ /^\s*$/) { # we couldn't extract anything...
	1068	$self->convert2unicode($default_header_encoding,
	1069	\$original_value);
	1070	$value=$original_value;
	1071	}
	1072	$$textref = $value;
	1073	} # end of if =?...?=
	1074
	1075	# In the absense of other charset information, assume the
	1076	# header is the default (usually "iso_8859_1") and convert to unicode.
	1077	else {
	1078	$self->convert2unicode($default_header_encoding, $textref);
	1079	}
	1080
	1081	}
[10834]	1082
[16341]	1083
	1084
[2847]	1085	sub convert2unicode {
	1086	my $self = shift(@_);
	1087	my ($charset, $textref) = @_;
[2630]	1088
[7703]	1089	if (!$$textref) {
	1090	# nothing to do!
	1091	return;
	1092	}
	1093
[10827]	1094	if (! defined $charset) {
	1095	# check if we have valid utf-8
[10834]	1096	if ($self->is_utf8($textref)) { $charset = "utf8" }
[10827]	1097
	1098	# default to latin
	1099	$charset = "iso_8859_1" if ! defined($charset);
	1100	}
	1101
[2847]	1102	# first get our character encoding name in the right form.
[7703]	1103	$charset =~ tr/A-Z/a-z/; # lowercase
	1104	$charset =~ s/\-/_/g;
[8902]	1105	if ($charset =~ /gb_?2312/) { $charset="gb" }
[2847]	1106	# assumes EUC-KR, not ISO-2022 !?
[7703]	1107	$charset =~ s/^ks_c_5601_1987/korean/;
	1108	if ($charset eq 'utf_8') {$charset='utf8'}
[2847]	1109
[7703]	1110	my $outhandle = $self->{'outhandle'};
	1111
	1112	if ($charset eq "utf8") {
	1113	# no conversion needed, but lets check that it's valid utf8
	1114	# see utf-8 manpage for valid ranges
	1115	$$textref =~ m/^/g; # to set \G
	1116	my $badbytesfound=0;
	1117	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
	1118	my $highbytes=$1;
	1119	my $highbyteslength=length($highbytes);
	1120	# replace any non utf8 complaint bytes
	1121	$highbytes =~ /^/g; # set pos()
	1122	while ($highbytes =~
[8904]	1123	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
[7703]	1124	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
[8904]	1125	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
	1126	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
[7703]	1127	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
	1128	)*([\x80-\xff])? !xg
	1129	) {
	1130	my $badbyte=$1;
	1131	if (!defined $badbyte) {next} # hit end of string
	1132	my $pos=pos($highbytes);
	1133	substr($highbytes, $pos-1, 1, "\xc2\x80");
	1134	# update the position to continue searching (for \G)
	1135	pos($highbytes) = $pos+1; # set to just after the \x80
	1136	$badbytesfound=1;
	1137	}
	1138	if ($badbytesfound==1) {
	1139	# claims to be utf8, but it isn't!
[15872]	1140	print $outhandle "EmailPlugin: Headers claim utf-8 but bad bytes "
[7703]	1141	. "detected and removed.\n";
	1142
	1143	my $replength=length($highbytes);
	1144	my $textpos=pos($$textref);
	1145	# replace bad bytes with good bytes
	1146	substr( $$textref, $textpos-$replength,
	1147	$replength, $highbytes);
	1148	# update the position to continue searching (for \G)
	1149	pos($$textref)=$textpos+($replength-$highbyteslength);
	1150	}
	1151	}
[3206]	1152	return;
	1153	}
	1154
[2847]	1155	# It appears that we can't always trust ascii text so we'll treat it
	1156	# as iso-8859-1 (letting characters above 0x80 through without
	1157	# converting them to utf-8 will result in invalid XML documents
	1158	# which can't be parsed at build time).
	1159	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
	1160
[3073]	1161	if ($charset eq "iso_8859_1") {
	1162	# test if the mailer lied, and it has win1252 chars in it...
	1163	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
	1164	if ($$textref =~ m/[\x80-\x9f]/) {
[15872]	1165	print $outhandle "EmailPlugin: Headers claim ISO charset but MS ";
[3073]	1166	print $outhandle "codepage 1252 detected.\n";
	1167	$charset = "windows_1252";
	1168	}
	1169	}
[3351]	1170	my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
	1171
	1172	if ($utf8_text ne "") {
	1173	$$textref=$utf8_text;
	1174	} else {
	1175	# we didn't get any text... unsupported encoding perhaps? Or it is
	1176	# empty anyway. We'll try to continue, assuming 8859-1. We could strip
	1177	# characters out here if this causes problems...
[3726]	1178	my $outhandle=$self->{'outhandle'};
[15872]	1179	print $outhandle "EmailPlugin: falling back to iso-8859-1\n";
[3351]	1180	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
	1181
	1182	}
[2847]	1183	}
	1184
[17026]	1185	sub get_base_OID {
	1186	my $self = shift(@_);
	1187	my ($doc_obj) = @_;
[2847]	1188
[17026]	1189	if ($self->{'OIDtype'} eq "message_id") {
	1190	# temporarily set OIDtype to hash to get a base id
	1191	$self->{'OIDtype'} = "hash_on_ga_xml";
[19282]	1192	my $id = $self->SUPER::get_base_OID(@_);
[17026]	1193	$self->{'OIDtype'} = "message_id";
	1194	return $id;
	1195	}
	1196	return $self->SUPER::get_base_OID(@_);
	1197	}
	1198
	1199
	1200	sub add_OID {
[7703]	1201	my $self = shift (@_);
	1202	my ($doc_obj, $id, $segment_number) = @_;
[17026]	1203	if ($self->{'OIDtype'} eq "message_id" && exists $doc_obj->{'msgid'} ) {
[7703]	1204	$doc_obj->set_OID($doc_obj->{'msgid'});
[17026]	1205	}
	1206	else {
[7703]	1207	$doc_obj->set_OID("$id\_$segment_number");
	1208	}
	1209	}
	1210
	1211
[1206]	1212	# Perl packages have to return true if they are run.
	1213	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/EmailPlugin.pm@ 32581

Download in other formats: