Context Navigation

source: gsdl/trunk/perllib/plugins/EmailPlugin.pm@ 16677

Last change on this file since 16677 was 16677, checked in by davidb, 16 years ago
Minor tweak to EmailPlugin to avoid directories that match \d+ being confused as e-mail files
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 37.0 KB

Line
1	###########################################################################
2	#
3	# EmailPlugin.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999-2002 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EmailPlugin
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata (not Dublin Core!):
42	# $Headers All the header content (optional, not stored by default)
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header
46	# $FromName Name of sender (where available)
47	# $FromAddr E-mail address of sender
48	# $DateText Date: header
49	# $Date Date: header in GSDL format (eg: 19990924)
50	#
51	# $Title made up of Subject, Date and Sender (for default formatting)
52	#
53	#
54	# John McPherson - June/July 2001
55	# added (basic) MIME support and quoted-printable and base64 decodings.
56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
57	#
58	# See: * RFC 822 - ARPA Internet Text Messages
59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61	# * RFC 2047 - MIME (part 3) Message Header Extensions
62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
63
64
65	package EmailPlugin;
66
67	use strict;
68	no strict "refs"; # so we can use a variable as a filehandle for print $out
69
70
71	use SplitTextFile;
72	use unicode; # gs conv functions
73	use gsprintf 'gsprintf'; # translations
74
75	use sorttools;
76	use util;
77
78	sub BEGIN {
79	@EmailPlugin::ISA = ('SplitTextFile');
80	}
81
82
83	my $arguments =
84	[ { 'name' => "process_exp",
85	'desc' => "{BasePlugin.process_exp}",
86	'type' => "regexp",
87	'reqd' => "no",
88	'deft' => &get_default_process_exp() },
89	{ 'name' => "no_attachments",
90	'desc' => "{EmailPlugin.no_attachments}",
91	'type' => "flag",
92	'reqd' => "no" },
93	{ 'name' => "headers",
94	'desc' => "{EmailPlugin.headers}",
95	'type' => "flag",
96	'reqd' => "no" },
97	{ 'name' => "split_exp",
98	'desc' => "{EmailPlugin.split_exp}",
99	'type' => "regexp",
100	'reqd' => "no",
101	'deft' => &get_default_split_exp() }
102	];
103
104	my $options = { 'name' => "EmailPlugin",
105	'desc' => "{EmailPlugin.desc}",
106	'abstract' => "no",
107	'inherits' => "yes",
108	'args' => $arguments };
109
110	sub new {
111	my ($class) = shift (@_);
112	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
113	push(@$pluginlist, $class);
114
115	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
116	push(@{$hashArgOptLists->{"OptList"}},$options);
117
118	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
119
120	$self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
121	$self->{'tmp_file_paths'} = (); # list of tmp files to delete after processing is finished
122
123	# this might not actually be true at read-time, but after processing
124	# it should all be utf8.
125	$self->{'input_encoding'}="utf8";
126	return bless $self, $class;
127	}
128
129	sub get_default_process_exp {
130	my $self = shift (@_);
131	# mbx/email for mailbox file format, \d+ for maildir (each message is
132	# in a separate file, with a unique number for filename)
133	# mozilla and IE will save individual mbx format files with a ".eml" ext.
134	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
135	}
136
137	# This plugin splits the mbox mail files at lines starting with From<sp>
138	# It is supposed to be "\n\nFrom ", but this isn't always used.
139	# add \d{4} so that the line ends in a year (in case the text has an
140	# unescaped "From " at the start of a line).
141	sub get_default_split_exp {
142	return q^\nFrom .*\d{4}\n^;
143
144	}
145
146	sub can_process_this_file {
147	my $self = shift(@_);
148	my ($filename) = @_;
149
150	# avoid any confusion between filenames matching \d+ (which are by default
151	# matched by EmailPlugin) and directories that match \d+ (which should not)
152
153	return 0 if (-d $filename);
154
155	if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
156	return 1;
157	}
158	return 0;
159
160	}
161
162
163	# do plugin specific processing of doc_obj
164	sub process {
165
166	my $self = shift (@_);
167	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
168	my $outhandle = $self->{'outhandle'};
169
170	# Check that we're dealing with a valid mail file
171	# mbox message files start with "From "
172	# maildir messages usually start with Return-Path and Delivered-To
173	# mh is very similar to maildir
174	my $startoffile=substr($$textref,0,256);
175	if (($startoffile !~ /^(From )/) &&
176	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
177	return undef;
178	}
179
180	my $cursection = $doc_obj->get_top_section();
181
182	#
183	# Parse the document's text and extract metadata
184	#
185
186	# Protect backslashes
187	$$textref =~ s@\\@\\\\@g;
188
189	# Separate header from body of message
190	my $Headers = $$textref;
191	$Headers =~ s/\r?\n\r?\n(.*)$//s;
192	$$textref = $1;
193	$Headers .= "\n";
194
195	# Unfold headers - see rfc822
196	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
197	# Extract basic metadata from header
198	my @headers = ("From", "To", "Subject", "Date");
199	my %raw;
200	foreach my $name (@headers) {
201	$raw{$name} = "No $name value";
202	}
203
204	# Get a default encoding for the header - RFC says should be ascii...
205	my $default_header_encoding="iso_8859_1";
206
207	# We don't know what character set is the user's default...
208	# We could use textcat to guess... for now we'll look at mime content-type
209	# if ($Headers =~ /([[:^ascii:]])/) {
210	# }
211	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
212	$default_header_encoding=$1;
213	$default_header_encoding =~ s@\-@_@g;
214	$default_header_encoding =~ tr/A-Z/a-z/;
215	}
216
217
218	# Examine each line of the headers
219	my ($line, $name, $value);
220	my @parts;
221	foreach $line (split(/\n/, $Headers)) {
222
223	# Ignore lines with no content or which begin with whitespace
224	next unless ($line =~ /:/);
225	next if ($line =~ /^\s/);
226
227	# Find out what metadata is on this line
228	@parts = split(/:/, $line);
229	$name = shift @parts;
230	# get fieldname in canonical form - first cap, then lower case.
231	$name =~ tr/A-Z/a-z/;
232	# uppercase the first character according to the current locale
233	$name=~s/(.+)/\u$1/;
234	next unless $name;
235	next unless ($raw{$name});
236
237	# Find the value of that metadata
238	$value = join(":", @parts);
239	$value =~ s/^\s+//;
240	$value =~ s/\s+$//;
241	# decode header values, using either =?<charset>?[BQ]?<data>?= (rfc2047) or default_header_encoding
242	$self->decode_header_value($default_header_encoding, \$value);
243
244	# Store the metadata
245	$value =~ s@_@\\_@g; # protect against GS macro language
246	$raw{$name} = $value;
247	}
248
249	# Extract the name and e-mail address from the From metadata
250	my $frommeta = $raw{"From"};
251	my $fromnamemeta;
252	my $fromaddrmeta;
253
254	$frommeta =~ s/\s*$//; # Remove trailing space, if any
255
256	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
257	$fromnamemeta=$1;
258	$fromaddrmeta=$2;
259	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
260	$fromnamemeta=$2;
261	$fromaddrmeta=$1;
262	}
263	if (!defined($fromaddrmeta)) {
264	$fromaddrmeta=$frommeta;
265	}
266	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
267	# minor attempt to prevent spam-bots from harvesting addresses...
268	$fromaddrmeta=~s/@/@/;
269
270	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
271
272	if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
273	$fromnamemeta =~ s/\"//g; # remove quotes
274	$fromnamemeta =~ s/\s+$//; # remove trailing whitespace
275	}
276	else {
277	$fromnamemeta = $fromaddrmeta;
278	}
279	# if name is an address
280	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
281	$fromnamemeta=~s/@/&#64\;/;
282	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
283
284	$raw{"From"}=$frommeta;
285
286	# Process Date information
287	if ($raw{"Date"} !~ /No Date/) {
288	$raw{"DateText"} = $raw{"Date"};
289
290	# Convert the date text to internal date format
291	$value = $raw{"Date"};
292	# proper mbox format: Tue, 07 Jan 2003 17:27:42 +1300
293	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
294	if (!defined($day) \|\| !defined($month) \|\| !defined ($year)) {
295	# try monthly archive format: Wed Apr 23 00:26:08 2008
296	($month,$day, $year) = $value =~ /([A-Z][a-z][a-z])\s\s?(\d?\d)\s\d\d:\d\d:\d\d\s(\d\d\d\d)/;
297	}
298
299	# make some assumptions about the year formatting...
300	# some (old) software thinks 2001 is 101, some think 2001 is 01
301	if ($year < 20) { $year += 2000; } # assume not really 1920...
302	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
303	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
304
305	} else {
306	# We have not extracted a date
307	$raw{"DateText"} = "Unknown.";
308	$raw{"Date"} = "19000000";
309	}
310
311	# Add extracted metadata to document object
312	foreach my $name (keys %raw) {
313	$value = $raw{$name};
314	if ($value) {
315	# assume subject, etc headers have no special HTML meaning.
316	$value = &text_into_html($value);
317	# escape [] so it isn't re-interpreted as metadata
318	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
319	} else {
320	$value = "No $name field";
321	}
322	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
323	}
324
325
326	# extract a message ID from the headers, if there is one, and we'll use
327	# that as the greenstone doc ID. Having a predictable ID means we can
328	# link to other messages, eg from In-Reply-To or References headers...
329	if ($Headers =~ m@^Message-ID:(.+)$@mi) {
330	my $id=escape_msg_id($1);
331	$doc_obj->{'msgid'}=$id;
332	}
333	# link to another message, if this is a reply
334	if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
335	my $id=escape_msg_id($1);
336	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
337	} elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
338	# References can have multiple, get the last one
339	my $id=escape_msg_id($1);
340	# not necessarily in-reply-to, but same thread...
341	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
342	}
343
344
345
346	my $mimetype="text/plain";
347	my $mimeinfo="";
348	my $charset = $default_header_encoding;
349	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
350	# more than one parameter given to Content-type.
351	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
352	if ($Headers =~ m@^content\-type:\s([\w\.\-/]+)\s(\;\s.+)?\s$@mi)
353	{
354	$mimetype=$1;
355	$mimetype =~ tr/[A-Z]/[a-z]/;
356
357	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
358	$mimetype = "text/plain";
359	}
360
361	$mimeinfo=$2;
362	if (!defined $mimeinfo) {
363	$mimeinfo="";
364	} else { # strip leading and trailing stuff
365	$mimeinfo =~ s/^\;\s*//;
366	$mimeinfo =~ s/\s*$//;
367	}
368	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
369	$charset = $1;
370	}
371	}
372
373	my $transfer_encoding="7bit";
374	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
375	$transfer_encoding=$1;
376	}
377
378	if ($mimetype eq "text/html") {
379	$$textref= $self->text_from_part($$textref, $Headers);
380	} elsif ($mimetype ne "text/plain") {
381	$self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
382	$$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$default_header_encoding,$$textref);
383	} else { # mimetype eq text/plain
384
385	if ($transfer_encoding =~ /quoted\-printable/) {
386	$$textref=qp_decode($$textref);
387	} elsif ($transfer_encoding =~ /base64/) {
388	$$textref=base64_decode($$textref);
389	}
390	$self->convert2unicode($charset, $textref);
391
392	$$textref = &text_into_html($$textref);
393	$$textref =~ s@_@\\_@g; # protect against GS macro language
394
395	}
396
397
398	if ($self->{'headers'} && $self->{'headers'} == 1) {
399	# Add "All headers" metadata
400	$Headers = &text_into_html($Headers);
401
402	$Headers = "No headers" unless ($Headers =~ /\w/);
403	$Headers =~ s/@/&#64\;/g;
404	# escape [] so it isn't re-interpreted as metadata
405	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
406	$self->convert2unicode($charset, \$Headers);
407
408	$Headers =~ s@_@\\_@g; # protect against GS macro language
409	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
410	}
411
412
413	# Add Title metadata
414	my $Title = text_into_html($raw{'Subject'});
415	$Title .= "<br>From: " . text_into_html($fromnamemeta);
416	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
417	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
418
419	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
420
421	# Add FileFormat metadata
422	$doc_obj->add_metadata($cursection, "FileFormat", "EMAIL");
423
424	# Add text to document object
425	$$textref = "No message" unless ($$textref =~ /\w/);
426
427	$doc_obj->add_utf8_text($cursection, $$textref);
428
429	return 1;
430	}
431
432	# delete any temp files that we have created
433	sub clean_up_after_doc_obj_processing {
434	my $self = shift(@_);
435
436	foreach my $tmp_file_path (@{$self->{'tmp_file_paths'}}) {
437	if (-e $tmp_file_path) {
438	&util::rm($tmp_file_path);
439	}
440	}
441
442	}
443
444	# Convert a text string into HTML.
445	#
446	# The HTML is going to be inserted into a GML file, so
447	# we have to be careful not to use symbols like ">",
448	# which ocurs frequently in email messages (and use
449	# &gt instead.
450	#
451	# This function also turns links and email addresses into hyperlinks,
452	# and replaces carriage returns with <BR> tags (and multiple carriage
453	# returns with <P> tags).
454
455
456	sub text_into_html {
457	my ($text) = @_;
458
459	# Convert problem characters into HTML symbols
460	$text =~ s/&/&/g;
461	$text =~ s/</</g;
462	$text =~ s/>/>/g;
463	$text =~ s/\"/"/g;
464
465	# convert email addresses and URIs into links
466	# don't markup email addresses for now
467	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
468
469	# try to munge email addresses a little bit...
470	$text =~ s/@/@/;
471	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
472	# assume URI doesn't finish with a '.'
473	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.\|\%[a-f0-9]{2})?[\w\?\=\-_/~]+)(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@gi;
474
475
476	# Clean up whitespace and convert \n charaters to <BR> or <P>
477	$text =~ s/ +/ /g;
478	$text =~ s/\s*$//g;
479	$text =~ s/^\s*//g;
480	$text =~ s/\n/\n<br>/g;
481	$text =~ s/<br>\s*<br>/<p>/gi;
482
483	return $text;
484	}
485
486
487
488
489	#Process a MIME message.
490	# the textref we are given DOES NOT include the header.
491	sub text_from_mime_message {
492	my $self = shift(@_);
493	my ($mimetype,$mimeinfo,$default_header_encoding,$text)=(@_);
494	my $outhandle=$self->{'outhandle'};
495	# Check for multiparts - $mimeinfo will be a boundary
496	if ($mimetype =~ /multipart/) {
497	my $boundary="";
498	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
499	$boundary=$1;
500	if ($boundary =~ m@^\"@) {
501	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
502	}
503	} else {
504	print $outhandle "EmailPlugin: (warning) couldn't parse MIME boundary\n";
505	}
506	# parts start with "--$boundary"
507	# message ends with "--$boundary--"
508	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
509	# that perl might want to interpolate. Also allows spaces...
510	$boundary=~s/\\/\\\\/g;
511	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
512	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
513	# remove first "part" and last "part" (final --)
514	shift @message_parts;
515	my $last=pop @message_parts;
516	# if our boundaries are a bit dodgy and we only found 1 part...
517	if (!defined($last)) {$last="";}
518	# make sure it is only -- and whitespace
519	if ($last !~ /^\-\-\s*$/ms) {
520	print $outhandle "EmailPlugin: (warning) last part of MIME message isn't empty\n";
521	}
522	foreach my $message_part (@message_parts) {
523	# remove the leading newline left from split.
524	$message_part=~s/^\r?\n//;
525	}
526	if ($mimetype eq "multipart/alternative") {
527	# check for an HTML version first, then TEXT, otherwise use first.
528	my $part_text="";
529	foreach my $message_part (@message_parts) {
530	if ($message_part =~ m@^content\-type:\s*text/html@i)
531	{
532	# Use the HTML version
533	$part_text = $self->text_from_part($message_part);
534	$mimetype="text/html";
535	last;
536	}
537	}
538	if ($part_text eq "") { # try getting a text part instead
539	foreach my $message_part (@message_parts) {
540	if ($message_part =~ m@^content\-type:\s*text/plain@i)
541	{
542	# Use the plain version
543	$part_text = $self->text_from_part($message_part);
544	if ($part_text =~/[^\s]/) {
545	$part_text = text_into_html($part_text);
546	}
547	$mimetype="text/plain";
548	last;
549	}
550	}
551	}
552	if ($part_text eq "") { #use first part (no html/text part found)
553	$part_text = $self->text_from_part(shift @message_parts);
554	$part_text = text_into_html($part_text);
555	}
556	if ($part_text eq "") { # we couldn't get anything!!!
557	# or it was an empty message...
558	# do nothing...
559	gsprintf($outhandle, "{ReadTextFile.empty_file} - empty body?\n");
560	} else {
561	$text = $part_text;
562	}
563	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related\|signed)@) {
564	$text = "";
565	# signed is for PGP/GPG messages... the last part is a hash
566	if ($mimetype =~ m@multipart/signed@) {
567	pop @message_parts;
568	}
569	my $is_first_part=1;
570	foreach my $message_part (@message_parts) {
571	if ($is_first_part && $text ne "") {$is_first_part=0;}
572
573	if ($mimetype eq "multipart/digest") {
574	# default type - RTFRFC!! Set if not already set
575	$message_part =~ m@^(.*)\n\r?\n@s;
576	my $part_header=$1;
577	if ($part_header !~ m@^content-type@mi) {
578	$message_part="Content-type: message/rfc822\n"
579	. $message_part; # prepend default type
580	}
581	}
582
583	$text .= $self->process_multipart_part($default_header_encoding,
584	$message_part,
585	$is_first_part);
586	} # foreach message part.
587	} else {
588	# we can't handle this multipart type (not mixed or alternative)
589	# the RFC also mentions "parallel".
590	}
591	} # end of ($mimetype =~ multipart)
592	elsif ($mimetype =~ m@message/rfc822@) {
593	my $msg_header = $text;
594	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
595	$text = $1;
596
597	if ($msg_header =~ /^content\-type:\s([\w\.\-\/]+)\s\;?\s(.+?)\s$/mi)
598	{
599	$mimetype=$1;
600	$mimeinfo=$2;
601	$mimetype =~ tr/[A-Z]/[a-z]/;
602
603	my $msg_text;
604	if ($mimetype =~ m@multipart/@) {
605	$msg_text = $self->text_from_mime_message($mimetype, $mimeinfo,
606	$default_header_encoding,
607	$text);
608	} else {
609	$msg_text=$self->text_from_part($text,$msg_header);
610	}
611
612	my $brief_header=text_into_html(get_brief_headers($msg_header));
613	$text= "\n<b><<attached message>></b><br>";
614	$text.= "<table><tr><td width=\"5%\"> </td>\n";
615	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
616	. "</td></tr></table>";
617	}
618	} else {
619	# we don't do any processing of the content.
620	}
621
622	return $text;
623	}
624
625
626
627	# used for turning a message id into a more friendly string for greenstone
628	sub escape_msg_id {
629	#msgid
630	my $id=shift;
631	chomp $id; $id =~ s!\s!!g; # remove spaces
632	$id =~ s![<>\[\]]!!g; # remove [ ] < and >
633	$id =~ s![_&]!-!g; # replace symbols that might cause problems
634	$id =~ s!\.!-!g; # . means section to greenstone doc ids!
635	$id =~ s!@!-!g; # replace @ symbol, to avoid spambots
636	return $id;
637	}
638
639
640
641	sub process_multipart_part {
642	my $self = shift;
643	my $default_header_encoding = shift;
644	my $message_part = shift;
645	my $is_first_part = shift;
646
647	my $return_text="";
648	my $part_header=$message_part;
649	my $part_body;
650	if ($message_part=~ /^\s*\n/) {
651	# no header... use defaults
652	$part_body=$message_part;
653	$part_header="Content-type: text/plain; charset=us-ascii";
654	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
655	$part_body=$1;
656	} else {
657	# something's gone wrong...
658	$part_header="";
659	$part_body=$message_part;
660	}
661
662	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
663	my $part_content_type="";
664	my $part_content_info="";
665
666	if ($part_header =~ m@^content\-type:\s([\w\.\-/]+)\s(\;.*)?$@mi) {
667	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
668	$part_content_info=$2;
669	if (!defined($part_content_info)) {
670	$part_content_info="";
671	} else {
672	$part_content_info =~ s/^\;\s*//;
673	$part_content_info =~ s/\s*$//;
674	}
675	}
676	my $filename="";
677	if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
678	$filename=$1;
679	$filename =~ s@\r?\s*$@@; # remove trailing space, if any
680	# decode the filename
681	$self->decode_header_value($default_header_encoding, \$filename);
682
683	}
684
685	# disposition - either inline or attachment.
686	# NOT CURRENTLY USED - we display all text types instead...
687	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
688
689	# add <<attachment>> to each part except the first...
690	if (!$is_first_part) {
691	$return_text.="\n<p><hr><strong><<attachment>>";
692	# add part info header
693	my $header_text = "<br>Type: $part_content_type<br>\n";
694	if ($filename ne "") {
695	$header_text .= "Filename: $filename\n";
696	}
697	$header_text =~ s@_@\\_@g;
698	$return_text .= $header_text . "</strong></p>\n<p>\n";
699	}
700
701	if ($part_content_type =~ m@text/@)
702	{
703	# $message_part includes the mime part headers
704	my $part_text = $self->text_from_part($message_part);
705	if ($part_content_type !~ m@text/(ht\|x)ml@) {
706	$part_text = text_into_html($part_text);
707	}
708	if ($part_text eq "") {
709	$part_text = ' ';
710	}
711	$return_text .= $part_text;
712	} elsif ($part_content_type =~ m@message/rfc822@) {
713	# This is a forwarded message
714	my $message_part_headers=$part_body;
715	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
716	my $message_part_body=$1;
717	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
718
719	my $rfc822_formatted_body=""; # put result in here
720	if ($message_part_headers =~
721	/^content\-type:\s([\w\.\-\/]+)\s(\;.*)?$/ims)
722	{
723	# The message header uses MIME flags
724	my $message_content_type=$1;
725	my $message_content_info=$2;
726	if (!defined($message_content_info)) {
727	$message_content_info="";
728	} else {
729	$message_content_info =~ s/^\;\s*//;
730	$message_content_info =~ s/\s*$//;
731	}
732	$message_content_type =~ tr/A-Z/a-z/;
733	if ($message_content_type =~ /multipart/) {
734	$rfc822_formatted_body=
735	$self->text_from_mime_message($message_content_type,
736	$message_content_info,
737	$default_header_encoding,
738	$message_part_body);
739	} else {
740	$message_part_body=$self->text_from_part($part_body,
741	$message_part_headers);
742	$rfc822_formatted_body=text_into_html($message_part_body);
743	}
744	} else {
745	# message doesn't use MIME flags
746	$rfc822_formatted_body=text_into_html($message_part_body);
747	$rfc822_formatted_body =~ s@_@\\_@g;
748	}
749	# Add the returned text to the output
750	# don't put all the headers...
751	# $message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
752	my $brief_headers=get_brief_headers($message_part_headers);
753	$return_text.=text_into_html($brief_headers);
754	$return_text.="</p><p>\n";
755	$return_text.=$rfc822_formatted_body;
756	$return_text.="</p>\n";
757	# end of message/rfc822
758	} elsif ($part_content_type =~ /multipart/) {
759	# recurse again
760
761	my $tmptext= $self->text_from_mime_message($part_content_type,
762	$part_content_info,
763	$default_header_encoding,
764	$part_body);
765	$return_text.=$tmptext;
766	} else {
767	# this part isn't text/* or another message...
768	if ($is_first_part) {
769	# this is the first part of a multipart, or only part!
770	$return_text="\n<p><hr><strong><<attachment>>";
771	# add part info header
772	my $header_text="<br>Type: $part_content_type<br>\n";
773	$header_text.="Filename: $filename</strong></p>\n<p>\n";
774	$header_text =~ s@_@\\_@g;
775	$return_text.=$header_text;
776	}
777
778	# save attachment by default
779	if (!$self->{'no_attachments'}
780	&& $filename ne "") { # this part has a file...
781	my $encoding="8bit";
782	if ($part_header =~
783	/^content-transfer-encoding:\s*(\w+)/mi ) {
784	$encoding=$1; $encoding =~ tr/A-Z/a-z/;
785	}
786	my $tmpdir=&util::filename_cat($ENV{'GSDLHOME'}, "tmp");
787	my $save_filename=$filename;
788
789	# make sure we don't clobber files with same name;
790	# need to keep state between .mbx files
791	my $assoc_files=$self->{'assoc_filenames'};
792	if ($assoc_files->{$filename}) { # it's been set...
793	$assoc_files->{$filename}++;
794	$filename =~ m/(.+)\.(\w+)$/;
795	my ($filestem, $ext)=($1,$2);
796	$save_filename="${filestem}_"
797	. $assoc_files->{$filename} . ".$ext";
798	} else { # first file with this name
799	$assoc_files->{$filename}=1;
800	}
801	my $tmp_filename = &util::filename_cat($tmpdir, $save_filename);
802	open (SAVE, ">$tmp_filename") \|\|
803	warn "EMAILPlug: Can't save attachment as $tmp_filename: $!";
804	binmode(SAVE); # needed on Windows
805	my $part_text = $message_part;
806	$part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
807	if ($encoding eq "base64") {
808	print SAVE base64_decode($part_text);
809	} elsif ($encoding eq "quoted-printable") {
810	print SAVE qp_decode($part_text);
811	} else { # 7bit, 8bit, binary, etc...
812	print SAVE $part_text;
813	}
814	close SAVE;
815	my $doc_obj=$self->{'doc_obj'};
816	$doc_obj->associate_file("$tmp_filename",
817	"$save_filename",
818	$part_content_type # mimetype
819	);
820	# add this file to the list of tmp files for deleting later
821	push(@{$self->{'tmp_file_paths'}}, $tmp_filename);
822
823	my $outhandle=$self->{'outhandle'};
824	print $outhandle "EmailPlugin: saving attachment \"$filename\"\n"; #
825
826	# be nice if "download" was a translatable macro :(
827	$return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
828	} # end of save attachment
829	} # end of !text/message part
830
831
832	return $return_text;
833	}
834
835
836	# Return only the "important" headers from a set of message headers
837	sub get_brief_headers {
838	my $msg_header = shift;
839	my $brief_header = "";
840
841	# Order matters!
842	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
843	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
844	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
845	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
846	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
847
848	return $brief_header;
849	}
850
851
852	# Process a MIME part. Return "" if we can't decode it.
853	# should only be called for parts with type "text/*" ?
854	# Either pass the entire mime part (including the part's header),
855	# or pass the mime part's text and optionally the part's header.
856	sub text_from_part {
857	my $self = shift;
858	my $text = shift \|\| '';
859	my $part_header = shift;
860
861
862	my $type="text/plain"; # default, overridden from part header
863	my $charset=undef; # convert2unicode() will guess if necessary
864
865	if (! $part_header) { # no header argument was given. check the body
866	$part_header = $text;
867	# check for empty part header (leading blank line)
868	if ($text =~ /^\s*\r?\n/) {
869	$part_header="Content-type: text/plain; charset=us-ascii";
870	} else {
871	$part_header =~ s/\r?\n\r?\n(.*)$//s;
872	$text=$1; if (!defined($text)) {$text="";}
873	}
874	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
875	}
876
877	if ($part_header =~
878	/content\-type:\s([\w\.\-\/]+).?charset=\"?([^\;\"\s]+)\"?/is) {
879	$type=$1;
880	$charset=$2;
881	}
882	my $encoding="";
883	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
884	$encoding=$1; $encoding=~tr/A-Z/a-z/;
885	}
886	# Content-Transfer-Encoding is per-part
887	if ($encoding ne "") {
888	if ($encoding =~ /quoted\-printable/) {
889	$text=qp_decode($text);
890	} elsif ($encoding =~ /base64/) {
891	$text=base64_decode($text);
892	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
893	# rfc2045 also allows binary, which we ignore (for now).
894	my $outhandle=$self->{'outhandle'};
895	print $outhandle "EmailPlugin: unknown transfer encoding: $encoding\n";
896	return "";
897	}
898	}
899
900	if ($type eq "text/html") {
901	# only get stuff between <body> tags, or <html> tags.
902	$text =~ s@^.<html[^>]>@@is;
903	$text =~ s@</html>.*$@@is;
904	$text =~ s/^.?<body[^>]>//si;
905	$text =~ s/<\/body>.*$//si;
906	}
907	elsif ($type eq "text/xml") {
908	$text=~s/</</g;$text=~s/>/>/g;
909	$text="<pre>\n$text\n</pre>\n";
910	}
911	# convert to unicode
912	$self->convert2unicode($charset, \$text);
913	$text =~ s@_@\\_@g; # protect against GS macro language
914	return $text;
915	}
916
917
918
919
920	# decode quoted-printable text
921	sub qp_decode {
922	my $text=shift;
923
924	# if a line ends with "=\s*", it is a soft line break, otherwise
925	# keep in any newline characters.
926
927	$text =~ s/=\s*\r?\n//mg;
928	$text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
929	return $text;
930	}
931
932	# decode base64 text. This is fairly slow (since it's interpreted perl rather
933	# than compiled XS stuff like in the ::MIME modules, but this is more portable
934	# for us at least).
935	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
936	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
937	# from each encoded byte.
938
939
940	sub base64_decode {
941	my $enc_text = shift;
942	# A=>0, B=>1, ..., '+'=>62, '/'=>63
943	# also '=' is used for padding at the end, but we remove it anyway.
944	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
945	# map each MIME char into it's value, for more efficient lookup.
946	my %index;
947	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
948	# remove all non-base64 chars. eval to get variable in transliteration...
949	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
950	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
951	my $decoded="";
952	while (length ($enc_text)>3)
953	{
954	my $fourchars=substr($enc_text,0,4,"");
955	my @chars=(split '',$fourchars);
956	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
957	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
958	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
959	}
960	# if there are any input chars left, there are either
961	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
962	my @chars=(split '',$enc_text);
963	if (length($enc_text)) {
964	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
965	}
966	if (length($enc_text)==3) {
967	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
968	}
969	return $decoded;
970	}
971
972	# returns 0 if valid utf-8, 1 if invalid
973	sub is_utf8 {
974	my $self = shift;
975	my $textref = shift;
976
977	$$textref =~ m/^/g; # to set \G
978	my $badbytesfound=0;
979	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
980	my $highbytes=$1;
981	my $highbyteslength=length($highbytes);
982	# replace any non utf8 complaint bytes
983	$highbytes =~ /^/g; # set pos()
984	while ($highbytes =~
985	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
986	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
987	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
988	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
989	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
990	)*([\x80-\xff])? !xg
991	) {
992	my $badbyte=$1;
993	if (!defined $badbyte) {next} # hit end of string
994	return 1;
995	}
996	}
997	return 0;
998	}
999
1000	# words with non ascii characters in header values must be encoded in the
1001	# following manner =?<charset>?[BQ]?<data>?= (rfc2047)
1002
1003	sub decode_header_value {
1004	my $self = shift(@_);
1005	my ($default_header_encoding, $textref) = @_;
1006
1007	if (!$$textref) {
1008	# nothing to do!
1009	return;
1010	}
1011	my $value = $$textref;
1012	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
1013	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
1014	my $original_value=$value;
1015	my $encoded=$value;
1016	$value="";
1017	# we should ignore spaces between consecutive encoded-texts
1018	$encoded =~ s@\?=\s+=\?@\?==\?@g;
1019	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=//i) {
1020	my ($charset, $encoding, $data)=($2,$3,$4);
1021	my ($decoded_data);
1022	my $leading_chars = "$1";
1023	$self->convert2unicode($default_header_encoding, \$leading_chars);
1024	$value.=$leading_chars;
1025
1026	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
1027	chomp $data;
1028	$encoding =~ tr/BQ/bq/;
1029	if ($encoding eq "q") { # quoted printable
1030	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
1031	$decoded_data=qp_decode($data);
1032	# qp_decode adds \n, which is default for body text
1033	chomp($decoded_data);
1034	} else { # base 64
1035	$decoded_data=base64_decode($data);
1036	}
1037	$self->convert2unicode($charset, \$decoded_data);
1038	$value .= $decoded_data;
1039	} # end of while loop
1040
1041	# get any trailing characters
1042	$self->convert2unicode($default_header_encoding, \$encoded);
1043	$value.=$encoded;
1044
1045	if ($value =~ /^\s*$/) { # we couldn't extract anything...
1046	$self->convert2unicode($default_header_encoding,
1047	\$original_value);
1048	$value=$original_value;
1049	}
1050	$$textref = $value;
1051	} # end of if =?...?=
1052
1053	# In the absense of other charset information, assume the
1054	# header is the default (usually "iso_8859_1") and convert to unicode.
1055	else {
1056	$self->convert2unicode($default_header_encoding, $textref);
1057	}
1058
1059	}
1060
1061
1062
1063	sub convert2unicode {
1064	my $self = shift(@_);
1065	my ($charset, $textref) = @_;
1066
1067	if (!$$textref) {
1068	# nothing to do!
1069	return;
1070	}
1071
1072	if (! defined $charset) {
1073	# check if we have valid utf-8
1074	if ($self->is_utf8($textref)) { $charset = "utf8" }
1075
1076	# default to latin
1077	$charset = "iso_8859_1" if ! defined($charset);
1078	}
1079
1080	# first get our character encoding name in the right form.
1081	$charset =~ tr/A-Z/a-z/; # lowercase
1082	$charset =~ s/\-/_/g;
1083	if ($charset =~ /gb_?2312/) { $charset="gb" }
1084	# assumes EUC-KR, not ISO-2022 !?
1085	$charset =~ s/^ks_c_5601_1987/korean/;
1086	if ($charset eq 'utf_8') {$charset='utf8'}
1087
1088	my $outhandle = $self->{'outhandle'};
1089
1090	if ($charset eq "utf8") {
1091	# no conversion needed, but lets check that it's valid utf8
1092	# see utf-8 manpage for valid ranges
1093	$$textref =~ m/^/g; # to set \G
1094	my $badbytesfound=0;
1095	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
1096	my $highbytes=$1;
1097	my $highbyteslength=length($highbytes);
1098	# replace any non utf8 complaint bytes
1099	$highbytes =~ /^/g; # set pos()
1100	while ($highbytes =~
1101	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
1102	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
1103	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
1104	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
1105	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
1106	)*([\x80-\xff])? !xg
1107	) {
1108	my $badbyte=$1;
1109	if (!defined $badbyte) {next} # hit end of string
1110	my $pos=pos($highbytes);
1111	substr($highbytes, $pos-1, 1, "\xc2\x80");
1112	# update the position to continue searching (for \G)
1113	pos($highbytes) = $pos+1; # set to just after the \x80
1114	$badbytesfound=1;
1115	}
1116	if ($badbytesfound==1) {
1117	# claims to be utf8, but it isn't!
1118	print $outhandle "EmailPlugin: Headers claim utf-8 but bad bytes "
1119	. "detected and removed.\n";
1120
1121	my $replength=length($highbytes);
1122	my $textpos=pos($$textref);
1123	# replace bad bytes with good bytes
1124	substr( $$textref, $textpos-$replength,
1125	$replength, $highbytes);
1126	# update the position to continue searching (for \G)
1127	pos($$textref)=$textpos+($replength-$highbyteslength);
1128	}
1129	}
1130	return;
1131	}
1132
1133	# It appears that we can't always trust ascii text so we'll treat it
1134	# as iso-8859-1 (letting characters above 0x80 through without
1135	# converting them to utf-8 will result in invalid XML documents
1136	# which can't be parsed at build time).
1137	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
1138
1139	if ($charset eq "iso_8859_1") {
1140	# test if the mailer lied, and it has win1252 chars in it...
1141	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
1142	if ($$textref =~ m/[\x80-\x9f]/) {
1143	print $outhandle "EmailPlugin: Headers claim ISO charset but MS ";
1144	print $outhandle "codepage 1252 detected.\n";
1145	$charset = "windows_1252";
1146	}
1147	}
1148	my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
1149
1150	if ($utf8_text ne "") {
1151	$$textref=$utf8_text;
1152	} else {
1153	# we didn't get any text... unsupported encoding perhaps? Or it is
1154	# empty anyway. We'll try to continue, assuming 8859-1. We could strip
1155	# characters out here if this causes problems...
1156	my $outhandle=$self->{'outhandle'};
1157	print $outhandle "EmailPlugin: falling back to iso-8859-1\n";
1158	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
1159
1160	}
1161	}
1162
1163
1164	sub set_OID {
1165	my $self = shift (@_);
1166	my ($doc_obj, $id, $segment_number) = @_;
1167
1168	if ( exists $doc_obj->{'msgid'} ) {
1169	$doc_obj->set_OID($doc_obj->{'msgid'});
1170	} else {
1171	$doc_obj->set_OID("$id\_$segment_number");
1172	}
1173	}
1174
1175
1176	# Perl packages have to return true if they are run.
1177	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: