Context Navigation

source: gsdl/trunk/perllib/plugins/EmailPlugin.pm@ 18327

Last change on this file since 18327 was 17053, checked in by kjdon, 16 years ago
made the default OIDtype auto, add message_id at end instead of start of list
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 38.0 KB

Line
1	###########################################################################
2	#
3	# EmailPlugin.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999-2002 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EmailPlugin
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata (not Dublin Core!):
42	# $Headers All the header content (optional, not stored by default)
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header
46	# $FromName Name of sender (where available)
47	# $FromAddr E-mail address of sender
48	# $DateText Date: header
49	# $Date Date: header in GSDL format (eg: 19990924)
50	#
51	# $Title made up of Subject, Date and Sender (for default formatting)
52	#
53	#
54	# John McPherson - June/July 2001
55	# added (basic) MIME support and quoted-printable and base64 decodings.
56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
57	#
58	# See: * RFC 822 - ARPA Internet Text Messages
59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61	# * RFC 2047 - MIME (part 3) Message Header Extensions
62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
63
64
65	package EmailPlugin;
66
67	use strict;
68	no strict "refs"; # so we can use a variable as a filehandle for print $out
69
70
71	use SplitTextFile;
72	use unicode; # gs conv functions
73	use gsprintf 'gsprintf'; # translations
74
75	use sorttools;
76	use util;
77
78	sub BEGIN {
79	@EmailPlugin::ISA = ('SplitTextFile');
80	}
81
82	my $extended_oidtype_list =
83	[ {'name' => "message_id",
84	'desc' => "{EmailPlugin.OIDtype.message_id}" }
85	];
86
87	# add in all the standard options from BasePlugin
88	unshift (@$extended_oidtype_list, @{$BasePlugin::oidtype_list});
89
90	my $arguments =
91	[ { 'name' => "process_exp",
92	'desc' => "{BasePlugin.process_exp}",
93	'type' => "regexp",
94	'reqd' => "no",
95	'deft' => &get_default_process_exp() },
96	{ 'name' => "no_attachments",
97	'desc' => "{EmailPlugin.no_attachments}",
98	'type' => "flag",
99	'reqd' => "no" },
100	{ 'name' => "headers",
101	'desc' => "{EmailPlugin.headers}",
102	'type' => "flag",
103	'reqd' => "no" },
104	{ 'name' => "OIDtype",
105	'desc' => "{import.OIDtype}",
106	'type' => "enum",
107	'list' => $extended_oidtype_list,
108	'deft' => "auto",
109	'reqd' => "no",
110	'modegli' => "2" },
111	{ 'name' => "OIDmetadata",
112	'desc' => "{import.OIDmetadata}",
113	'type' => "metadata",
114	'deft' => "dc.Identifier",
115	'reqd' => "no",
116	'modegli' => "2" },
117	{ 'name' => "split_exp",
118	'desc' => "{EmailPlugin.split_exp}",
119	'type' => "regexp",
120	'reqd' => "no",
121	'deft' => &get_default_split_exp() }
122	];
123
124	my $options = { 'name' => "EmailPlugin",
125	'desc' => "{EmailPlugin.desc}",
126	'abstract' => "no",
127	'inherits' => "yes",
128	'args' => $arguments };
129
130	sub new {
131	my ($class) = shift (@_);
132	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
133	push(@$pluginlist, $class);
134
135	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
136	push(@{$hashArgOptLists->{"OptList"}},$options);
137
138	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
139
140	$self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
141	$self->{'tmp_file_paths'} = (); # list of tmp files to delete after processing is finished
142
143	# this might not actually be true at read-time, but after processing
144	# it should all be utf8.
145	$self->{'input_encoding'}="utf8";
146	return bless $self, $class;
147	}
148
149	sub get_default_process_exp {
150	my $self = shift (@_);
151	# mbx/email for mailbox file format, \d+ for maildir (each message is
152	# in a separate file, with a unique number for filename)
153	# mozilla and IE will save individual mbx format files with a ".eml" ext.
154	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
155	}
156
157	# This plugin splits the mbox mail files at lines starting with From<sp>
158	# It is supposed to be "\n\nFrom ", but this isn't always used.
159	# add \d{4} so that the line ends in a year (in case the text has an
160	# unescaped "From " at the start of a line).
161	sub get_default_split_exp {
162	return q^\nFrom .*\d{4}\n^;
163
164	}
165
166	sub can_process_this_file {
167	my $self = shift(@_);
168	my ($filename) = @_;
169
170	# avoid any confusion between filenames matching \d+ (which are by default
171	# matched by EmailPlugin) and directories that match \d+ (which should not)
172
173	return 0 if (-d $filename);
174
175	if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
176	return 1;
177	}
178	return 0;
179
180	}
181
182
183	# do plugin specific processing of doc_obj
184	sub process {
185
186	my $self = shift (@_);
187	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
188	my $outhandle = $self->{'outhandle'};
189
190	# Check that we're dealing with a valid mail file
191	# mbox message files start with "From "
192	# maildir messages usually start with Return-Path and Delivered-To
193	# mh is very similar to maildir
194	my $startoffile=substr($$textref,0,256);
195	if (($startoffile !~ /^(From )/) &&
196	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
197	return undef;
198	}
199
200	my $cursection = $doc_obj->get_top_section();
201
202	#
203	# Parse the document's text and extract metadata
204	#
205
206	# Protect backslashes
207	$$textref =~ s@\\@\\\\@g;
208
209	# Separate header from body of message
210	my $Headers = $$textref;
211	$Headers =~ s/\r?\n\r?\n(.*)$//s;
212	$$textref = $1;
213	$Headers .= "\n";
214
215	# Unfold headers - see rfc822
216	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
217	# Extract basic metadata from header
218	my @headers = ("From", "To", "Subject", "Date");
219	my %raw;
220	foreach my $name (@headers) {
221	$raw{$name} = "No $name value";
222	}
223
224	# Get a default encoding for the header - RFC says should be ascii...
225	my $default_header_encoding="iso_8859_1";
226
227	# We don't know what character set is the user's default...
228	# We could use textcat to guess... for now we'll look at mime content-type
229	# if ($Headers =~ /([[:^ascii:]])/) {
230	# }
231	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
232	$default_header_encoding=$1;
233	$default_header_encoding =~ s@\-@_@g;
234	$default_header_encoding =~ tr/A-Z/a-z/;
235	}
236
237
238	# Examine each line of the headers
239	my ($line, $name, $value);
240	my @parts;
241	foreach $line (split(/\n/, $Headers)) {
242
243	# Ignore lines with no content or which begin with whitespace
244	next unless ($line =~ /:/);
245	next if ($line =~ /^\s/);
246
247	# Find out what metadata is on this line
248	@parts = split(/:/, $line);
249	$name = shift @parts;
250	# get fieldname in canonical form - first cap, then lower case.
251	$name =~ tr/A-Z/a-z/;
252	# uppercase the first character according to the current locale
253	$name=~s/(.+)/\u$1/;
254	next unless $name;
255	next unless ($raw{$name});
256
257	# Find the value of that metadata
258	$value = join(":", @parts);
259	$value =~ s/^\s+//;
260	$value =~ s/\s+$//;
261	# decode header values, using either =?<charset>?[BQ]?<data>?= (rfc2047) or default_header_encoding
262	$self->decode_header_value($default_header_encoding, \$value);
263
264	# Store the metadata
265	$value =~ s@_@\\_@g; # protect against GS macro language
266	$raw{$name} = $value;
267	}
268
269	# Extract the name and e-mail address from the From metadata
270	my $frommeta = $raw{"From"};
271	my $fromnamemeta;
272	my $fromaddrmeta;
273
274	$frommeta =~ s/\s*$//; # Remove trailing space, if any
275
276	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
277	$fromnamemeta=$1;
278	$fromaddrmeta=$2;
279	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
280	$fromnamemeta=$2;
281	$fromaddrmeta=$1;
282	}
283	if (!defined($fromaddrmeta)) {
284	$fromaddrmeta=$frommeta;
285	}
286	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
287	# minor attempt to prevent spam-bots from harvesting addresses...
288	$fromaddrmeta=~s/@/@/;
289
290	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
291
292	if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
293	$fromnamemeta =~ s/\"//g; # remove quotes
294	$fromnamemeta =~ s/\s+$//; # remove trailing whitespace
295	}
296	else {
297	$fromnamemeta = $fromaddrmeta;
298	}
299	# if name is an address
300	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
301	$fromnamemeta=~s/@/&#64\;/;
302	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
303
304	$raw{"From"}=$frommeta;
305
306	# Process Date information
307	if ($raw{"Date"} !~ /No Date/) {
308	$raw{"DateText"} = $raw{"Date"};
309
310	# Convert the date text to internal date format
311	$value = $raw{"Date"};
312	# proper mbox format: Tue, 07 Jan 2003 17:27:42 +1300
313	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
314	if (!defined($day) \|\| !defined($month) \|\| !defined ($year)) {
315	# try monthly archive format: Wed Apr 23 00:26:08 2008
316	($month,$day, $year) = $value =~ /([A-Z][a-z][a-z])\s\s?(\d?\d)\s\d\d:\d\d:\d\d\s(\d\d\d\d)/;
317	}
318
319	# make some assumptions about the year formatting...
320	# some (old) software thinks 2001 is 101, some think 2001 is 01
321	if ($year < 20) { $year += 2000; } # assume not really 1920...
322	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
323	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
324
325	} else {
326	# We have not extracted a date
327	$raw{"DateText"} = "Unknown.";
328	$raw{"Date"} = "19000000";
329	}
330
331	# Add extracted metadata to document object
332	foreach my $name (keys %raw) {
333	$value = $raw{$name};
334	if ($value) {
335	# assume subject, etc headers have no special HTML meaning.
336	$value = &text_into_html($value);
337	# escape [] so it isn't re-interpreted as metadata
338	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
339	} else {
340	$value = "No $name field";
341	}
342	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
343	}
344
345
346	# extract a message ID from the headers, if there is one, and we'll use
347	# that as the greenstone doc ID. Having a predictable ID means we can
348	# link to other messages, eg from In-Reply-To or References headers...
349	if ($Headers =~ m@^Message-ID:(.+)$@mi) {
350	my $id=escape_msg_id($1);
351	$doc_obj->{'msgid'}=$id;
352	}
353	# link to another message, if this is a reply
354	if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
355	my $id=escape_msg_id($1);
356	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
357	} elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
358	# References can have multiple, get the last one
359	my $id=escape_msg_id($1);
360	# not necessarily in-reply-to, but same thread...
361	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
362	}
363
364
365
366	my $mimetype="text/plain";
367	my $mimeinfo="";
368	my $charset = $default_header_encoding;
369	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
370	# more than one parameter given to Content-type.
371	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
372	if ($Headers =~ m@^content\-type:\s([\w\.\-/]+)\s(\;\s.+)?\s$@mi)
373	{
374	$mimetype=$1;
375	$mimetype =~ tr/[A-Z]/[a-z]/;
376
377	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
378	$mimetype = "text/plain";
379	}
380
381	$mimeinfo=$2;
382	if (!defined $mimeinfo) {
383	$mimeinfo="";
384	} else { # strip leading and trailing stuff
385	$mimeinfo =~ s/^\;\s*//;
386	$mimeinfo =~ s/\s*$//;
387	}
388	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
389	$charset = $1;
390	}
391	}
392
393	my $transfer_encoding="7bit";
394	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
395	$transfer_encoding=$1;
396	}
397
398	if ($mimetype eq "text/html") {
399	$$textref= $self->text_from_part($$textref, $Headers);
400	} elsif ($mimetype ne "text/plain") {
401	$self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
402	$$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$default_header_encoding,$$textref);
403	} else { # mimetype eq text/plain
404
405	if ($transfer_encoding =~ /quoted\-printable/) {
406	$$textref=qp_decode($$textref);
407	} elsif ($transfer_encoding =~ /base64/) {
408	$$textref=base64_decode($$textref);
409	}
410	$self->convert2unicode($charset, $textref);
411
412	$$textref = &text_into_html($$textref);
413	$$textref =~ s@_@\\_@g; # protect against GS macro language
414
415	}
416
417
418	if ($self->{'headers'} && $self->{'headers'} == 1) {
419	# Add "All headers" metadata
420	$Headers = &text_into_html($Headers);
421
422	$Headers = "No headers" unless ($Headers =~ /\w/);
423	$Headers =~ s/@/&#64\;/g;
424	# escape [] so it isn't re-interpreted as metadata
425	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
426	$self->convert2unicode($charset, \$Headers);
427
428	$Headers =~ s@_@\\_@g; # protect against GS macro language
429	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
430	}
431
432
433	# Add Title metadata
434	my $Title = text_into_html($raw{'Subject'});
435	$Title .= "<br>From: " . text_into_html($fromnamemeta);
436	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
437	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
438
439	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
440
441	# Add FileFormat metadata
442	$doc_obj->add_metadata($cursection, "FileFormat", "EMAIL");
443
444	# Add text to document object
445	$$textref = "No message" unless ($$textref =~ /\w/);
446
447	$doc_obj->add_utf8_text($cursection, $$textref);
448
449	return 1;
450	}
451
452	# delete any temp files that we have created
453	sub clean_up_after_doc_obj_processing {
454	my $self = shift(@_);
455
456	foreach my $tmp_file_path (@{$self->{'tmp_file_paths'}}) {
457	if (-e $tmp_file_path) {
458	&util::rm($tmp_file_path);
459	}
460	}
461
462	}
463
464	# Convert a text string into HTML.
465	#
466	# The HTML is going to be inserted into a GML file, so
467	# we have to be careful not to use symbols like ">",
468	# which ocurs frequently in email messages (and use
469	# &gt instead.
470	#
471	# This function also turns links and email addresses into hyperlinks,
472	# and replaces carriage returns with <BR> tags (and multiple carriage
473	# returns with <P> tags).
474
475
476	sub text_into_html {
477	my ($text) = @_;
478
479	# Convert problem characters into HTML symbols
480	$text =~ s/&/&/g;
481	$text =~ s/</</g;
482	$text =~ s/>/>/g;
483	$text =~ s/\"/"/g;
484
485	# convert email addresses and URIs into links
486	# don't markup email addresses for now
487	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
488
489	# try to munge email addresses a little bit...
490	$text =~ s/@/@/;
491	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
492	# assume URI doesn't finish with a '.'
493	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.\|\%[a-f0-9]{2})?[\w\?\=\-_/~]+)(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@gi;
494
495
496	# Clean up whitespace and convert \n charaters to <BR> or <P>
497	$text =~ s/ +/ /g;
498	$text =~ s/\s*$//g;
499	$text =~ s/^\s*//g;
500	$text =~ s/\n/\n<br>/g;
501	$text =~ s/<br>\s*<br>/<p>/gi;
502
503	return $text;
504	}
505
506
507
508
509	#Process a MIME message.
510	# the textref we are given DOES NOT include the header.
511	sub text_from_mime_message {
512	my $self = shift(@_);
513	my ($mimetype,$mimeinfo,$default_header_encoding,$text)=(@_);
514	my $outhandle=$self->{'outhandle'};
515	# Check for multiparts - $mimeinfo will be a boundary
516	if ($mimetype =~ /multipart/) {
517	my $boundary="";
518	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
519	$boundary=$1;
520	if ($boundary =~ m@^\"@) {
521	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
522	}
523	} else {
524	print $outhandle "EmailPlugin: (warning) couldn't parse MIME boundary\n";
525	}
526	# parts start with "--$boundary"
527	# message ends with "--$boundary--"
528	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
529	# that perl might want to interpolate. Also allows spaces...
530	$boundary=~s/\\/\\\\/g;
531	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
532	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
533	# remove first "part" and last "part" (final --)
534	shift @message_parts;
535	my $last=pop @message_parts;
536	# if our boundaries are a bit dodgy and we only found 1 part...
537	if (!defined($last)) {$last="";}
538	# make sure it is only -- and whitespace
539	if ($last !~ /^\-\-\s*$/ms) {
540	print $outhandle "EmailPlugin: (warning) last part of MIME message isn't empty\n";
541	}
542	foreach my $message_part (@message_parts) {
543	# remove the leading newline left from split.
544	$message_part=~s/^\r?\n//;
545	}
546	if ($mimetype eq "multipart/alternative") {
547	# check for an HTML version first, then TEXT, otherwise use first.
548	my $part_text="";
549	foreach my $message_part (@message_parts) {
550	if ($message_part =~ m@^content\-type:\s*text/html@i)
551	{
552	# Use the HTML version
553	$part_text = $self->text_from_part($message_part);
554	$mimetype="text/html";
555	last;
556	}
557	}
558	if ($part_text eq "") { # try getting a text part instead
559	foreach my $message_part (@message_parts) {
560	if ($message_part =~ m@^content\-type:\s*text/plain@i)
561	{
562	# Use the plain version
563	$part_text = $self->text_from_part($message_part);
564	if ($part_text =~/[^\s]/) {
565	$part_text = text_into_html($part_text);
566	}
567	$mimetype="text/plain";
568	last;
569	}
570	}
571	}
572	if ($part_text eq "") { #use first part (no html/text part found)
573	$part_text = $self->text_from_part(shift @message_parts);
574	$part_text = text_into_html($part_text);
575	}
576	if ($part_text eq "") { # we couldn't get anything!!!
577	# or it was an empty message...
578	# do nothing...
579	gsprintf($outhandle, "{ReadTextFile.empty_file} - empty body?\n");
580	} else {
581	$text = $part_text;
582	}
583	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related\|signed)@) {
584	$text = "";
585	# signed is for PGP/GPG messages... the last part is a hash
586	if ($mimetype =~ m@multipart/signed@) {
587	pop @message_parts;
588	}
589	my $is_first_part=1;
590	foreach my $message_part (@message_parts) {
591	if ($is_first_part && $text ne "") {$is_first_part=0;}
592
593	if ($mimetype eq "multipart/digest") {
594	# default type - RTFRFC!! Set if not already set
595	$message_part =~ m@^(.*)\n\r?\n@s;
596	my $part_header=$1;
597	if ($part_header !~ m@^content-type@mi) {
598	$message_part="Content-type: message/rfc822\n"
599	. $message_part; # prepend default type
600	}
601	}
602
603	$text .= $self->process_multipart_part($default_header_encoding,
604	$message_part,
605	$is_first_part);
606	} # foreach message part.
607	} else {
608	# we can't handle this multipart type (not mixed or alternative)
609	# the RFC also mentions "parallel".
610	}
611	} # end of ($mimetype =~ multipart)
612	elsif ($mimetype =~ m@message/rfc822@) {
613	my $msg_header = $text;
614	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
615	$text = $1;
616
617	if ($msg_header =~ /^content\-type:\s([\w\.\-\/]+)\s\;?\s(.+?)\s$/mi)
618	{
619	$mimetype=$1;
620	$mimeinfo=$2;
621	$mimetype =~ tr/[A-Z]/[a-z]/;
622
623	my $msg_text;
624	if ($mimetype =~ m@multipart/@) {
625	$msg_text = $self->text_from_mime_message($mimetype, $mimeinfo,
626	$default_header_encoding,
627	$text);
628	} else {
629	$msg_text=$self->text_from_part($text,$msg_header);
630	}
631
632	my $brief_header=text_into_html(get_brief_headers($msg_header));
633	$text= "\n<b><<attached message>></b><br>";
634	$text.= "<table><tr><td width=\"5%\"> </td>\n";
635	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
636	. "</td></tr></table>";
637	}
638	} else {
639	# we don't do any processing of the content.
640	}
641
642	return $text;
643	}
644
645
646
647	# used for turning a message id into a more friendly string for greenstone
648	sub escape_msg_id {
649	#msgid
650	my $id=shift;
651	chomp $id; $id =~ s!\s!!g; # remove spaces
652	$id =~ s![<>\[\]]!!g; # remove [ ] < and >
653	$id =~ s![_&]!-!g; # replace symbols that might cause problems
654	$id =~ s!\.!-!g; # . means section to greenstone doc ids!
655	$id =~ s!@!-!g; # replace @ symbol, to avoid spambots
656	return $id;
657	}
658
659
660
661	sub process_multipart_part {
662	my $self = shift;
663	my $default_header_encoding = shift;
664	my $message_part = shift;
665	my $is_first_part = shift;
666
667	my $return_text="";
668	my $part_header=$message_part;
669	my $part_body;
670	if ($message_part=~ /^\s*\n/) {
671	# no header... use defaults
672	$part_body=$message_part;
673	$part_header="Content-type: text/plain; charset=us-ascii";
674	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
675	$part_body=$1;
676	} else {
677	# something's gone wrong...
678	$part_header="";
679	$part_body=$message_part;
680	}
681
682	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
683	my $part_content_type="";
684	my $part_content_info="";
685
686	if ($part_header =~ m@^content\-type:\s([\w\.\-/]+)\s(\;.*)?$@mi) {
687	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
688	$part_content_info=$2;
689	if (!defined($part_content_info)) {
690	$part_content_info="";
691	} else {
692	$part_content_info =~ s/^\;\s*//;
693	$part_content_info =~ s/\s*$//;
694	}
695	}
696	my $filename="";
697	if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
698	$filename=$1;
699	$filename =~ s@\r?\s*$@@; # remove trailing space, if any
700	# decode the filename
701	$self->decode_header_value($default_header_encoding, \$filename);
702
703	}
704
705	# disposition - either inline or attachment.
706	# NOT CURRENTLY USED - we display all text types instead...
707	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
708
709	# add <<attachment>> to each part except the first...
710	if (!$is_first_part) {
711	$return_text.="\n<p><hr><strong><<attachment>>";
712	# add part info header
713	my $header_text = "<br>Type: $part_content_type<br>\n";
714	if ($filename ne "") {
715	$header_text .= "Filename: $filename\n";
716	}
717	$header_text =~ s@_@\\_@g;
718	$return_text .= $header_text . "</strong></p>\n<p>\n";
719	}
720
721	if ($part_content_type =~ m@text/@)
722	{
723	# $message_part includes the mime part headers
724	my $part_text = $self->text_from_part($message_part);
725	if ($part_content_type !~ m@text/(ht\|x)ml@) {
726	$part_text = text_into_html($part_text);
727	}
728	if ($part_text eq "") {
729	$part_text = ' ';
730	}
731	$return_text .= $part_text;
732	} elsif ($part_content_type =~ m@message/rfc822@) {
733	# This is a forwarded message
734	my $message_part_headers=$part_body;
735	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
736	my $message_part_body=$1;
737	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
738
739	my $rfc822_formatted_body=""; # put result in here
740	if ($message_part_headers =~
741	/^content\-type:\s([\w\.\-\/]+)\s(\;.*)?$/ims)
742	{
743	# The message header uses MIME flags
744	my $message_content_type=$1;
745	my $message_content_info=$2;
746	if (!defined($message_content_info)) {
747	$message_content_info="";
748	} else {
749	$message_content_info =~ s/^\;\s*//;
750	$message_content_info =~ s/\s*$//;
751	}
752	$message_content_type =~ tr/A-Z/a-z/;
753	if ($message_content_type =~ /multipart/) {
754	$rfc822_formatted_body=
755	$self->text_from_mime_message($message_content_type,
756	$message_content_info,
757	$default_header_encoding,
758	$message_part_body);
759	} else {
760	$message_part_body=$self->text_from_part($part_body,
761	$message_part_headers);
762	$rfc822_formatted_body=text_into_html($message_part_body);
763	}
764	} else {
765	# message doesn't use MIME flags
766	$rfc822_formatted_body=text_into_html($message_part_body);
767	$rfc822_formatted_body =~ s@_@\\_@g;
768	}
769	# Add the returned text to the output
770	# don't put all the headers...
771	# $message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
772	my $brief_headers=get_brief_headers($message_part_headers);
773	$return_text.=text_into_html($brief_headers);
774	$return_text.="</p><p>\n";
775	$return_text.=$rfc822_formatted_body;
776	$return_text.="</p>\n";
777	# end of message/rfc822
778	} elsif ($part_content_type =~ /multipart/) {
779	# recurse again
780
781	my $tmptext= $self->text_from_mime_message($part_content_type,
782	$part_content_info,
783	$default_header_encoding,
784	$part_body);
785	$return_text.=$tmptext;
786	} else {
787	# this part isn't text/* or another message...
788	if ($is_first_part) {
789	# this is the first part of a multipart, or only part!
790	$return_text="\n<p><hr><strong><<attachment>>";
791	# add part info header
792	my $header_text="<br>Type: $part_content_type<br>\n";
793	$header_text.="Filename: $filename</strong></p>\n<p>\n";
794	$header_text =~ s@_@\\_@g;
795	$return_text.=$header_text;
796	}
797
798	# save attachment by default
799	if (!$self->{'no_attachments'}
800	&& $filename ne "") { # this part has a file...
801	my $encoding="8bit";
802	if ($part_header =~
803	/^content-transfer-encoding:\s*(\w+)/mi ) {
804	$encoding=$1; $encoding =~ tr/A-Z/a-z/;
805	}
806	my $tmpdir=&util::filename_cat($ENV{'GSDLHOME'}, "tmp");
807	my $save_filename=$filename;
808
809	# make sure we don't clobber files with same name;
810	# need to keep state between .mbx files
811	my $assoc_files=$self->{'assoc_filenames'};
812	if ($assoc_files->{$filename}) { # it's been set...
813	$assoc_files->{$filename}++;
814	$filename =~ m/(.+)\.(\w+)$/;
815	my ($filestem, $ext)=($1,$2);
816	$save_filename="${filestem}_"
817	. $assoc_files->{$filename} . ".$ext";
818	} else { # first file with this name
819	$assoc_files->{$filename}=1;
820	}
821	my $tmp_filename = &util::filename_cat($tmpdir, $save_filename);
822	open (SAVE, ">$tmp_filename") \|\|
823	warn "EMAILPlug: Can't save attachment as $tmp_filename: $!";
824	binmode(SAVE); # needed on Windows
825	my $part_text = $message_part;
826	$part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
827	if ($encoding eq "base64") {
828	print SAVE base64_decode($part_text);
829	} elsif ($encoding eq "quoted-printable") {
830	print SAVE qp_decode($part_text);
831	} else { # 7bit, 8bit, binary, etc...
832	print SAVE $part_text;
833	}
834	close SAVE;
835	my $doc_obj=$self->{'doc_obj'};
836	$doc_obj->associate_file("$tmp_filename",
837	"$save_filename",
838	$part_content_type # mimetype
839	);
840	# add this file to the list of tmp files for deleting later
841	push(@{$self->{'tmp_file_paths'}}, $tmp_filename);
842
843	my $outhandle=$self->{'outhandle'};
844	print $outhandle "EmailPlugin: saving attachment \"$filename\"\n"; #
845
846	# be nice if "download" was a translatable macro :(
847	$return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
848	} # end of save attachment
849	} # end of !text/message part
850
851
852	return $return_text;
853	}
854
855
856	# Return only the "important" headers from a set of message headers
857	sub get_brief_headers {
858	my $msg_header = shift;
859	my $brief_header = "";
860
861	# Order matters!
862	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
863	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
864	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
865	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
866	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
867
868	return $brief_header;
869	}
870
871
872	# Process a MIME part. Return "" if we can't decode it.
873	# should only be called for parts with type "text/*" ?
874	# Either pass the entire mime part (including the part's header),
875	# or pass the mime part's text and optionally the part's header.
876	sub text_from_part {
877	my $self = shift;
878	my $text = shift \|\| '';
879	my $part_header = shift;
880
881
882	my $type="text/plain"; # default, overridden from part header
883	my $charset=undef; # convert2unicode() will guess if necessary
884
885	if (! $part_header) { # no header argument was given. check the body
886	$part_header = $text;
887	# check for empty part header (leading blank line)
888	if ($text =~ /^\s*\r?\n/) {
889	$part_header="Content-type: text/plain; charset=us-ascii";
890	} else {
891	$part_header =~ s/\r?\n\r?\n(.*)$//s;
892	$text=$1; if (!defined($text)) {$text="";}
893	}
894	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
895	}
896
897	if ($part_header =~
898	/content\-type:\s([\w\.\-\/]+).?charset=\"?([^\;\"\s]+)\"?/is) {
899	$type=$1;
900	$charset=$2;
901	}
902	my $encoding="";
903	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
904	$encoding=$1; $encoding=~tr/A-Z/a-z/;
905	}
906	# Content-Transfer-Encoding is per-part
907	if ($encoding ne "") {
908	if ($encoding =~ /quoted\-printable/) {
909	$text=qp_decode($text);
910	} elsif ($encoding =~ /base64/) {
911	$text=base64_decode($text);
912	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
913	# rfc2045 also allows binary, which we ignore (for now).
914	my $outhandle=$self->{'outhandle'};
915	print $outhandle "EmailPlugin: unknown transfer encoding: $encoding\n";
916	return "";
917	}
918	}
919
920	if ($type eq "text/html") {
921	# only get stuff between <body> tags, or <html> tags.
922	$text =~ s@^.<html[^>]>@@is;
923	$text =~ s@</html>.*$@@is;
924	$text =~ s/^.?<body[^>]>//si;
925	$text =~ s/<\/body>.*$//si;
926	}
927	elsif ($type eq "text/xml") {
928	$text=~s/</</g;$text=~s/>/>/g;
929	$text="<pre>\n$text\n</pre>\n";
930	}
931	# convert to unicode
932	$self->convert2unicode($charset, \$text);
933	$text =~ s@_@\\_@g; # protect against GS macro language
934	return $text;
935	}
936
937
938
939
940	# decode quoted-printable text
941	sub qp_decode {
942	my $text=shift;
943
944	# if a line ends with "=\s*", it is a soft line break, otherwise
945	# keep in any newline characters.
946
947	$text =~ s/=\s*\r?\n//mg;
948	$text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
949	return $text;
950	}
951
952	# decode base64 text. This is fairly slow (since it's interpreted perl rather
953	# than compiled XS stuff like in the ::MIME modules, but this is more portable
954	# for us at least).
955	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
956	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
957	# from each encoded byte.
958
959
960	sub base64_decode {
961	my $enc_text = shift;
962	# A=>0, B=>1, ..., '+'=>62, '/'=>63
963	# also '=' is used for padding at the end, but we remove it anyway.
964	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
965	# map each MIME char into it's value, for more efficient lookup.
966	my %index;
967	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
968	# remove all non-base64 chars. eval to get variable in transliteration...
969	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
970	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
971	my $decoded="";
972	while (length ($enc_text)>3)
973	{
974	my $fourchars=substr($enc_text,0,4,"");
975	my @chars=(split '',$fourchars);
976	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
977	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
978	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
979	}
980	# if there are any input chars left, there are either
981	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
982	my @chars=(split '',$enc_text);
983	if (length($enc_text)) {
984	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
985	}
986	if (length($enc_text)==3) {
987	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
988	}
989	return $decoded;
990	}
991
992	# returns 0 if valid utf-8, 1 if invalid
993	sub is_utf8 {
994	my $self = shift;
995	my $textref = shift;
996
997	$$textref =~ m/^/g; # to set \G
998	my $badbytesfound=0;
999	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
1000	my $highbytes=$1;
1001	my $highbyteslength=length($highbytes);
1002	# replace any non utf8 complaint bytes
1003	$highbytes =~ /^/g; # set pos()
1004	while ($highbytes =~
1005	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
1006	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
1007	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
1008	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
1009	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
1010	)*([\x80-\xff])? !xg
1011	) {
1012	my $badbyte=$1;
1013	if (!defined $badbyte) {next} # hit end of string
1014	return 1;
1015	}
1016	}
1017	return 0;
1018	}
1019
1020	# words with non ascii characters in header values must be encoded in the
1021	# following manner =?<charset>?[BQ]?<data>?= (rfc2047)
1022
1023	sub decode_header_value {
1024	my $self = shift(@_);
1025	my ($default_header_encoding, $textref) = @_;
1026
1027	if (!$$textref) {
1028	# nothing to do!
1029	return;
1030	}
1031	my $value = $$textref;
1032	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
1033	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
1034	my $original_value=$value;
1035	my $encoded=$value;
1036	$value="";
1037	# we should ignore spaces between consecutive encoded-texts
1038	$encoded =~ s@\?=\s+=\?@\?==\?@g;
1039	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=//i) {
1040	my ($charset, $encoding, $data)=($2,$3,$4);
1041	my ($decoded_data);
1042	my $leading_chars = "$1";
1043	$self->convert2unicode($default_header_encoding, \$leading_chars);
1044	$value.=$leading_chars;
1045
1046	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
1047	chomp $data;
1048	$encoding =~ tr/BQ/bq/;
1049	if ($encoding eq "q") { # quoted printable
1050	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
1051	$decoded_data=qp_decode($data);
1052	# qp_decode adds \n, which is default for body text
1053	chomp($decoded_data);
1054	} else { # base 64
1055	$decoded_data=base64_decode($data);
1056	}
1057	$self->convert2unicode($charset, \$decoded_data);
1058	$value .= $decoded_data;
1059	} # end of while loop
1060
1061	# get any trailing characters
1062	$self->convert2unicode($default_header_encoding, \$encoded);
1063	$value.=$encoded;
1064
1065	if ($value =~ /^\s*$/) { # we couldn't extract anything...
1066	$self->convert2unicode($default_header_encoding,
1067	\$original_value);
1068	$value=$original_value;
1069	}
1070	$$textref = $value;
1071	} # end of if =?...?=
1072
1073	# In the absense of other charset information, assume the
1074	# header is the default (usually "iso_8859_1") and convert to unicode.
1075	else {
1076	$self->convert2unicode($default_header_encoding, $textref);
1077	}
1078
1079	}
1080
1081
1082
1083	sub convert2unicode {
1084	my $self = shift(@_);
1085	my ($charset, $textref) = @_;
1086
1087	if (!$$textref) {
1088	# nothing to do!
1089	return;
1090	}
1091
1092	if (! defined $charset) {
1093	# check if we have valid utf-8
1094	if ($self->is_utf8($textref)) { $charset = "utf8" }
1095
1096	# default to latin
1097	$charset = "iso_8859_1" if ! defined($charset);
1098	}
1099
1100	# first get our character encoding name in the right form.
1101	$charset =~ tr/A-Z/a-z/; # lowercase
1102	$charset =~ s/\-/_/g;
1103	if ($charset =~ /gb_?2312/) { $charset="gb" }
1104	# assumes EUC-KR, not ISO-2022 !?
1105	$charset =~ s/^ks_c_5601_1987/korean/;
1106	if ($charset eq 'utf_8') {$charset='utf8'}
1107
1108	my $outhandle = $self->{'outhandle'};
1109
1110	if ($charset eq "utf8") {
1111	# no conversion needed, but lets check that it's valid utf8
1112	# see utf-8 manpage for valid ranges
1113	$$textref =~ m/^/g; # to set \G
1114	my $badbytesfound=0;
1115	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
1116	my $highbytes=$1;
1117	my $highbyteslength=length($highbytes);
1118	# replace any non utf8 complaint bytes
1119	$highbytes =~ /^/g; # set pos()
1120	while ($highbytes =~
1121	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
1122	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
1123	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
1124	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
1125	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
1126	)*([\x80-\xff])? !xg
1127	) {
1128	my $badbyte=$1;
1129	if (!defined $badbyte) {next} # hit end of string
1130	my $pos=pos($highbytes);
1131	substr($highbytes, $pos-1, 1, "\xc2\x80");
1132	# update the position to continue searching (for \G)
1133	pos($highbytes) = $pos+1; # set to just after the \x80
1134	$badbytesfound=1;
1135	}
1136	if ($badbytesfound==1) {
1137	# claims to be utf8, but it isn't!
1138	print $outhandle "EmailPlugin: Headers claim utf-8 but bad bytes "
1139	. "detected and removed.\n";
1140
1141	my $replength=length($highbytes);
1142	my $textpos=pos($$textref);
1143	# replace bad bytes with good bytes
1144	substr( $$textref, $textpos-$replength,
1145	$replength, $highbytes);
1146	# update the position to continue searching (for \G)
1147	pos($$textref)=$textpos+($replength-$highbyteslength);
1148	}
1149	}
1150	return;
1151	}
1152
1153	# It appears that we can't always trust ascii text so we'll treat it
1154	# as iso-8859-1 (letting characters above 0x80 through without
1155	# converting them to utf-8 will result in invalid XML documents
1156	# which can't be parsed at build time).
1157	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
1158
1159	if ($charset eq "iso_8859_1") {
1160	# test if the mailer lied, and it has win1252 chars in it...
1161	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
1162	if ($$textref =~ m/[\x80-\x9f]/) {
1163	print $outhandle "EmailPlugin: Headers claim ISO charset but MS ";
1164	print $outhandle "codepage 1252 detected.\n";
1165	$charset = "windows_1252";
1166	}
1167	}
1168	my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
1169
1170	if ($utf8_text ne "") {
1171	$$textref=$utf8_text;
1172	} else {
1173	# we didn't get any text... unsupported encoding perhaps? Or it is
1174	# empty anyway. We'll try to continue, assuming 8859-1. We could strip
1175	# characters out here if this causes problems...
1176	my $outhandle=$self->{'outhandle'};
1177	print $outhandle "EmailPlugin: falling back to iso-8859-1\n";
1178	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
1179
1180	}
1181	}
1182
1183	sub get_base_OID {
1184	my $self = shift(@_);
1185	my ($doc_obj) = @_;
1186
1187	## TODO: need a valid id in case there is no message id...
1188	if ($self->{'OIDtype'} eq "message_id") {
1189	# temporarily set OIDtype to hash to get a base id
1190	$self->{'OIDtype'} = "hash_on_ga_xml";
1191	$self->add_OID($doc_obj);
1192	my $id = $doc_obj->get_OID();
1193	$self->{'OIDtype'} = "message_id";
1194	return $id;
1195	}
1196	return $self->SUPER::get_base_OID(@_);
1197	}
1198
1199
1200	sub add_OID {
1201	my $self = shift (@_);
1202	my ($doc_obj, $id, $segment_number) = @_;
1203
1204	if ($self->{'OIDtype'} eq "message_id" && exists $doc_obj->{'msgid'} ) {
1205	$doc_obj->set_OID($doc_obj->{'msgid'});
1206	}
1207	else {
1208	$doc_obj->set_OID("$id\_$segment_number");
1209	}
1210	}
1211
1212
1213	# Perl packages have to return true if they are run.
1214	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: