Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 10504

Last change on this file since 10504 was 10254, checked in by kjdon, 19 years ago
added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 34.4 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999-2002 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata (not Dublin Core!):
42	# $Headers All the header content (optional, not stored by default)
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header
46	# $FromName Name of sender (where available)
47	# $FromAddr E-mail address of sender
48	# $DateText Date: header
49	# $Date Date: header in GSDL format (eg: 19990924)
50	#
51	# $Title made up of Subject, Date and Sender (for default formatting)
52	#
53	#
54	# John McPherson - June/July 2001
55	# added (basic) MIME support and quoted-printable and base64 decodings.
56	# Minor fixes for names that are actually email addresses (ie <...> was lost)
57	#
58	# See: * RFC 822 - ARPA Internet Text Messages
59	# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60	# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61	# * RFC 2047 - MIME (part 3) Message Header Extensions
62	# * RFC 1806 - Content Dispositions (ie inline/attachment)
63
64	# 12/05/02 Added usage datastructure - John Thompson
65	package EMAILPlug;
66
67	use strict;
68	no strict "refs"; # so we can use a variable as a filehandle for print $out
69
70
71	use SplitPlug;
72	use unicode; # gs conv functions
73	use gsprintf 'gsprintf'; # translations
74
75	use sorttools;
76	use util;
77
78	sub BEGIN {
79	@EMAILPlug::ISA = ('SplitPlug');
80	}
81
82
83	my $arguments =
84	[ { 'name' => "process_exp",
85	'desc' => "{BasPlug.process_exp}",
86	'type' => "regexp",
87	'reqd' => "no",
88	'deft' => &get_default_process_exp() },
89	{ 'name' => "no_attachments",
90	'desc' => "{EMAILPlug.no_attachments}",
91	'type' => "flag",
92	'reqd' => "no" },
93	{ 'name' => "headers",
94	'desc' => "{EMAILPlug.headers}",
95	'type' => "flag",
96	'reqd' => "no" },
97	{ 'name' => "split_exp",
98	'desc' => "{EMAILPlug.split_exp}",
99	'type' => "regexp",
100	'reqd' => "no",
101	'deft' => &get_default_split_exp() }
102	];
103
104	my $options = { 'name' => "EMAILPlug",
105	'desc' => "{EMAILPlug.desc}",
106	'abstract' => "no",
107	'inherits' => "yes",
108	'args' => $arguments };
109
110	# Create a new EMAILPlug object with which to parse a file.
111	# Accomplished by creating a new BasPlug and using bless to
112	# turn it into an EMAILPlug.
113
114	sub new {
115	my ($class) = shift (@_);
116	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
117	push(@$pluginlist, $class);
118
119	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
120	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
121
122	my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
123
124	$self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
125
126	# this might not actually be true at read-time, but after processing
127	# it should all be utf8.
128	$self->{'input_encoding'}="utf8";
129	return bless $self, $class;
130	}
131
132	sub get_default_process_exp {
133	my $self = shift (@_);
134	# mbx/email for mailbox file format, \d+ for maildir (each message is
135	# in a separate file, with a unique number for filename)
136	# mozilla and IE will save individual mbx format files with a ".eml" ext.
137	return q@([\\/]\d+\|\.(mbx\|email\|eml))$@;
138	}
139
140	# This plugin splits the mbox mail files at lines starting with From<sp>
141	# It is supposed to be "\n\nFrom ", but this isn't always used.
142	# add \d{4} so that the line ends in a year (in case the text has an
143	# unescaped "From " at the start of a line).
144	sub get_default_split_exp {
145	return q^\nFrom .*\d{4}\n^;
146
147	}
148
149
150	# do plugin specific processing of doc_obj
151	sub process {
152
153	my $self = shift (@_);
154	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
155	my $outhandle = $self->{'outhandle'};
156
157	# Check that we're dealing with a valid mail file
158	# mbox message files start with "From "
159	# maildir messages usually start with Return-Path and Delivered-To
160	# mh is very similar to maildir
161	my $startoffile=substr($$textref,0,256);
162	if (($startoffile !~ /^(From )/) &&
163	($startoffile !~ /^(From\|To\|Envelope.\|Received\|Return-Path\|Date\|Subject\|Content\-.\|MIME-Version\|Forwarded):/im)) {
164	return undef;
165	}
166
167
168	print STDERR "<Processing n='$file' p='EMAILPlug'>\n" if ($gli);
169
170	gsprintf($outhandle, "EMAILPlug: {common.processing} $file\n")
171	if $self->{'verbosity'} > 1;
172
173	my $cursection = $doc_obj->get_top_section();
174
175	#
176	# Parse the document's text and extract metadata
177	#
178
179	# Protect backslashes
180	$$textref =~ s@\\@\\\\@g;
181
182	# Separate header from body of message
183	my $Headers = $$textref;
184	$Headers =~ s/\r?\n\r?\n(.*)$//s;
185	$$textref = $1;
186	$Headers .= "\n";
187
188	# Unfold headers - see rfc822
189	$Headers =~ s/\r?\n[\t\ ]+/ /gs;
190	# Extract basic metadata from header
191	my @headers = ("From", "To", "Subject", "Date");
192	my %raw;
193	foreach my $name (@headers) {
194	$raw{$name} = "No $name value";
195	}
196
197	# Get a default encoding for the header - RFC says should be ascii...
198	my $default_header_encoding="iso_8859_1";
199
200	# We don't know what character set is the user's default...
201	# We could use textcat to guess... for now we'll look at mime content-type
202	# if ($Headers =~ /([[:^ascii:]])/) {
203	# }
204	if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
205	$default_header_encoding=$1;
206	$default_header_encoding =~ s@\-@_@g;
207	$default_header_encoding =~ tr/A-Z/a-z/;
208	}
209
210
211	# Examine each line of the headers
212	my ($line, $name, $value);
213	my @parts;
214	foreach $line (split(/\n/, $Headers)) {
215
216	# Ignore lines with no content or which begin with whitespace
217	next unless ($line =~ /:/);
218	next if ($line =~ /^\s/);
219
220	# Find out what metadata is on this line
221	@parts = split(/:/, $line);
222	$name = shift @parts;
223	# get fieldname in canonical form - first cap, then lower case.
224	$name =~ tr/A-Z/a-z/;
225	# uppercase the first character according to the current locale
226	$name=~s/(.+)/\u$1/;
227	next unless $name;
228	next unless ($raw{$name});
229
230	# Find the value of that metadata
231	$value = join(":", @parts);
232	$value =~ s/^\s+//;
233	$value =~ s/\s+$//;
234	# decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
235	if ($value =~ /=\?.\?[BbQq]\?.\?=/) {
236	my $original_value=$value;
237	my $encoded=$value;
238	$value="";
239	# we should ignore spaces between consecutive encoded-texts
240	$encoded =~ s@\?=\s+=\?@\?==\?@g;
241	while ($encoded =~ s/(.?)=\?([^\?])\?([bq])\?([^\?]+)\?=//i) {
242	my ($charset, $encoding, $data)=($2,$3,$4);
243	my ($decoded_data);
244	$value.="$1"; # any leading chars
245	$data=~s/^\s//; $data=~s/\s$//; # strip whitespace from ends
246	chomp $data;
247	$encoding =~ tr/BQ/bq/;
248	if ($encoding eq "q") { # quoted printable
249	$data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
250	$decoded_data=qp_decode($data);
251	# qp_decode adds \n, which is default for body text
252	chomp($decoded_data);
253	} else { # base 64
254	$decoded_data=base64_decode($data);
255	}
256	$self->convert2unicode($charset, \$decoded_data);
257	$value .= $decoded_data;
258	} # end of while loop
259
260	# get any trailing characters
261	$self->convert2unicode($default_header_encoding, \$encoded);
262	$value.=$encoded;
263
264	if ($value =~ /^\s*$/) { # we couldn't extract anything...
265	$self->convert2unicode($default_header_encoding,
266	\$original_value);
267	$value=$original_value;
268	}
269	} # end of if =?...?=
270
271	# In the absense of other charset information, assume the
272	# header is the default (usually "iso_8859_1") and convert to unicode.
273	else {
274	$self->convert2unicode($default_header_encoding, \$value);
275	}
276
277	# Store the metadata
278	$value =~ s@_@\\_@g; # protect against GS macro language
279	$raw{$name} = $value;
280	}
281
282	# Extract the name and e-mail address from the From metadata
283	my $frommeta = $raw{"From"};
284	my $fromnamemeta;
285	my $fromaddrmeta;
286
287	$frommeta =~ s/\s*$//; # Remove trailing space, if any
288
289	if ($frommeta =~ m/(.+)\s*<(.+)>/) {
290	$fromnamemeta=$1;
291	$fromaddrmeta=$2;
292	} elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
293	$fromnamemeta=$2;
294	$fromaddrmeta=$1;
295	}
296	if (!defined($fromaddrmeta)) {
297	$fromaddrmeta=$frommeta;
298	}
299	$fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
300	# minor attempt to prevent spam-bots from harvesting addresses...
301	$fromaddrmeta=~s/@/@/;
302
303	$doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
304
305	if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
306	$fromnamemeta =~ s/\"//g; # remove quotes
307	$fromnamemeta =~ s/\s+$//; # remove trailing whitespace
308	}
309	else {
310	$fromnamemeta = $fromaddrmeta;
311	}
312	# if name is an address
313	$fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
314	$fromnamemeta=~s/@/&#64\;/;
315	$doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
316
317	$raw{"From"}=$frommeta;
318
319	# Process Date information
320	if ($raw{"Date"} !~ /No Date/) {
321	$raw{"DateText"} = $raw{"Date"};
322
323	# Convert the date text to internal date format
324	$value = $raw{"Date"};
325	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
326	# make some assumptions about the year formatting...
327	# some (old) software thinks 2001 is 101, some think 2001 is 01
328	if ($year < 20) { $year += 2000; } # assume not really 1920...
329	elsif ($year < 150) { $year += 1900; } # assume not really 2150...
330	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
331
332	} else {
333	# We have not extracted a date
334	$raw{"DateText"} = "Unknown.";
335	$raw{"Date"} = "19000000";
336	}
337
338	# Add extracted metadata to document object
339	foreach my $name (keys %raw) {
340	$value = $raw{$name};
341	if ($value) {
342	# assume subject, etc headers have no special HTML meaning.
343	$value = &text_into_html($value);
344	# escape [] so it isn't re-interpreted as metadata
345	$value =~ s/\[/[/g; $value =~ s/\]/]/g;
346	} else {
347	$value = "No $name field";
348	}
349	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
350	}
351
352
353	# extract a message ID from the headers, if there is one, and we'll use
354	# that as the greenstone doc ID. Having a predictable ID means we can
355	# link to other messages, eg from In-Reply-To or References headers...
356	if ($Headers =~ m@^Message-ID:(.+)$@mi) {
357	my $id=escape_msg_id($1);
358	$doc_obj->{'msgid'}=$id;
359	}
360	# link to another message, if this is a reply
361	if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
362	my $id=escape_msg_id($1);
363	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
364	} elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
365	# References can have multiple, get the last one
366	my $id=escape_msg_id($1);
367	# not necessarily in-reply-to, but same thread...
368	$doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
369	}
370
371
372
373	my $mimetype="text/plain";
374	my $mimeinfo="";
375	my $charset = $default_header_encoding;
376	# Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
377	# more than one parameter given to Content-type.
378	# eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
379	if ($Headers =~ m@^content\-type:\s([\w\.\-/]+)\s(\;\s.+)?\s$@mi)
380	{
381	$mimetype=$1;
382	$mimetype =~ tr/[A-Z]/[a-z]/;
383
384	if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
385	$mimetype = "text/plain";
386	}
387
388	$mimeinfo=$2;
389	if (!defined $mimeinfo) {
390	$mimeinfo="";
391	} else { # strip leading and trailing stuff
392	$mimeinfo =~ s/^\;\s*//;
393	$mimeinfo =~ s/\s*$//;
394	}
395	if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
396	$charset = $1;
397	}
398	}
399
400	my $transfer_encoding="7bit";
401	if ($Headers =~ /^content-transfer-encoding:\s([^\s]+)\s$/mi) {
402	$transfer_encoding=$1;
403	}
404
405	if ($mimetype eq "text/html") {
406	$$textref= $self->text_from_part($$textref, $Headers);
407	} elsif ($mimetype ne "text/plain") {
408	$self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
409	$$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$$textref);
410	} else { # mimetype eq text/plain
411
412	if ($transfer_encoding =~ /quoted\-printable/) {
413	$$textref=qp_decode($$textref);
414	} elsif ($transfer_encoding =~ /base64/) {
415	$$textref=base64_decode($$textref);
416	}
417	$self->convert2unicode($charset, $textref);
418
419	$$textref = &text_into_html($$textref);
420	$$textref =~ s@_@\\_@g; # protect against GS macro language
421
422	}
423
424
425	if ($self->{'headers'} && $self->{'headers'} == 1) {
426	# Add "All headers" metadata
427	$Headers = &text_into_html($Headers);
428
429	$Headers = "No headers" unless ($Headers =~ /\w/);
430	$Headers =~ s/@/&#64\;/g;
431	# escape [] so it isn't re-interpreted as metadata
432	$Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
433	$self->convert2unicode($charset, \$Headers);
434
435	$Headers =~ s@_@\\_@g; # protect against GS macro language
436	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
437	}
438
439
440	# Add Title metadata
441	my $Title = text_into_html($raw{'Subject'});
442	$Title .= "<br>From: " . text_into_html($fromnamemeta);
443	$Title .= "<br>Date: " . text_into_html($raw{'DateText'});
444	$Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
445
446	$doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
447
448	# Add FileFormat metadata
449	$doc_obj->add_metadata($cursection, "FileFormat", "EMAIL");
450
451	# Add text to document object
452	$$textref = "No message" unless ($$textref =~ /\w/);
453
454	$doc_obj->add_utf8_text($cursection, $$textref);
455
456	return 1;
457	}
458
459
460	# Convert a text string into HTML.
461	#
462	# The HTML is going to be inserted into a GML file, so
463	# we have to be careful not to use symbols like ">",
464	# which ocurs frequently in email messages (and use
465	# &gt instead.
466	#
467	# This function also turns links and email addresses into hyperlinks,
468	# and replaces carriage returns with <BR> tags (and multiple carriage
469	# returns with <P> tags).
470
471
472	sub text_into_html {
473	my ($text) = @_;
474
475	# Convert problem characters into HTML symbols
476	$text =~ s/&/&/g;
477	$text =~ s/</</g;
478	$text =~ s/>/>/g;
479	$text =~ s/\"/"/g;
480
481	# convert email addresses and URIs into links
482	# don't markup email addresses for now
483	# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
484
485	# try to munge email addresses a little bit...
486	$text =~ s/@/@/;
487	# assume hostnames are \.\w\- only, then might have a trailing '/.*'
488	# assume URI doesn't finish with a '.'
489	$text =~ s@((http\|ftp\|https)://[\w\-]+(\.[\w\-]+)/?((&\|\.)?[\w\?\=\-_/~]+)(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@g;
490
491
492	# Clean up whitespace and convert \n charaters to <BR> or <P>
493	$text =~ s/ +/ /g;
494	$text =~ s/\s*$//g;
495	$text =~ s/^\s*//g;
496	$text =~ s/\n/\n<br>/g;
497	$text =~ s/<br>\s*<br>/<p>/gi;
498
499	return $text;
500	}
501
502
503
504
505	#Process a MIME message.
506	# the textref we are given DOES NOT include the header.
507	sub text_from_mime_message {
508	my $self = shift(@_);
509	my ($mimetype,$mimeinfo,$text)=(@_);
510	my $outhandle=$self->{'outhandle'};
511	# Check for multiparts - $mimeinfo will be a boundary
512	if ($mimetype =~ /multipart/) {
513	my $boundary="";
514	if ($mimeinfo =~ m@boundary=(\"[^\"]+\"\|[^\s]+)\s*$@im) {
515	$boundary=$1;
516	if ($boundary =~ m@^\"@) {
517	$boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
518	}
519	} else {
520	print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
521	}
522	# parts start with "--$boundary"
523	# message ends with "--$boundary--"
524	# RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
525	# that perl might want to interpolate. Also allows spaces...
526	$boundary=~s/\\/\\\\/g;
527	$boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
528	my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
529	# remove first "part" and last "part" (final --)
530	shift @message_parts;
531	my $last=pop @message_parts;
532	# if our boundaries are a bit dodgy and we only found 1 part...
533	if (!defined($last)) {$last="";}
534	# make sure it is only -- and whitespace
535	if ($last !~ /^\-\-\s*$/ms) {
536	print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
537	}
538	foreach my $message_part (@message_parts) {
539	# remove the leading newline left from split.
540	$message_part=~s/^\r?\n//;
541	}
542	if ($mimetype eq "multipart/alternative") {
543	# check for an HTML version first, then TEXT, otherwise use first.
544	my $part_text="";
545	foreach my $message_part (@message_parts) {
546	if ($message_part =~ m@\scontent\-type:\stext/html@mis)
547	{
548	# Use the HTML version
549	$part_text= $self->text_from_part($message_part);
550	$mimetype="text/html";
551	last;
552	}
553	}
554	if ($part_text eq "") { # try getting a text part instead
555	foreach my $message_part (@message_parts) {
556	if ($message_part =~ m@^content\-type:\s*text/plain@mis)
557	{
558	# Use the plain version
559	$part_text= $self->text_from_part($message_part);
560	if ($part_text =~/[^\s]/) {
561	$part_text = text_into_html($part_text);
562	}
563	$mimetype="text/plain";
564	last;
565	}
566	}
567	}
568	if ($part_text eq "") { #use first part (no html/text part found)
569	$part_text = $self->text_from_part(shift @message_parts);
570	$part_text = text_into_html($part_text);
571	}
572	if ($part_text eq "") { # we couldn't get anything!!!
573	# or it was an empty message...
574	# do nothing...
575	gsprintf($outhandle, "{BasPlug.empty_file} - empty body?\n");
576	} else {
577	$text = $part_text;
578	}
579	} elsif ($mimetype =~ m@multipart/(mixed\|digest\|related\|signed)@) {
580	$text = "";
581	# signed is for PGP/GPG messages... the last part is a hash
582	if ($mimetype =~ m@multipart/signed@) {
583	pop @message_parts;
584	}
585	my $is_first_part=1;
586	foreach my $message_part (@message_parts) {
587	if ($is_first_part && $text ne "") {$is_first_part=0;}
588
589	if ($mimetype eq "multipart/digest") {
590	# default type - RTFRFC!! Set if not already set
591	$message_part =~ m@^(.*)\n\r?\n@s;
592	my $part_header=$1;
593	if ($part_header !~ m@^content-type@mi) {
594	$message_part="Content-type: message/rfc822\n"
595	. $message_part; # prepend default type
596	}
597	}
598
599	$text .= $self->process_multipart_part($message_part,
600	$is_first_part);
601	} # foreach message part.
602	} else {
603	# we can't handle this multipart type (not mixed or alternative)
604	# the RFC also mentions "parallel".
605	}
606	} # end of ($mimetype =~ multipart)
607	elsif ($mimetype =~ m@message/rfc822@) {
608	my $msg_header = $text;
609	$msg_header =~ s/\r?\n\r?\n(.*)$//s;
610	$text = $1;
611
612	if ($msg_header =~ /^content\-type:\s([\w\.\-\/]+)\s\;?\s(.+?)\s$/mi)
613	{
614	$mimetype=$1;
615	$mimeinfo=$2;
616	$mimetype =~ tr/[A-Z]/[a-z]/;
617
618	my $msg_text;
619	if ($mimetype =~ m@multipart/@) {
620	$msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
621	$text);
622	} else {
623	$msg_text=$self->text_from_part($text,$msg_header);
624	}
625
626	my $brief_header=text_into_html(get_brief_headers($msg_header));
627	$text= "\n<b><<attached message>></b><br>";
628	$text.= "<table><tr><td width=\"5%\"> </td>\n";
629	$text.="<td>" . $brief_header . "\n</p>" . $msg_text
630	. "</td></tr></table>";
631	}
632	} else {
633	# we don't do any processing of the content.
634	}
635
636	return $text;
637	}
638
639
640
641	# used for turning a message id into a more friendly string for greenstone
642	sub escape_msg_id {
643	#msgid
644	my $id=shift;
645	chomp $id; $id =~ s!\s!!g; # remove spaces
646	$id =~ s![<>\[\]]!!g; # remove [ ] < and >
647	$id =~ s![_&]!-!g; # replace symbols that might cause problems
648	$id =~ s!@!-!g; # replace @ symbol, to avoid spambots
649	return $id;
650	}
651
652
653
654	sub process_multipart_part {
655	my $self = shift;
656	my $message_part = shift;
657	my $is_first_part = shift;
658
659	my $return_text="";
660	my $part_header=$message_part;
661	my $part_body;
662	if ($message_part=~ /^\s*\n/) {
663	# no header... use defaults
664	$part_body=$message_part;
665	$part_header="Content-type: text/plain; charset=us-ascii";
666	} elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
667	$part_body=$1;
668	} else {
669	# something's gone wrong...
670	$part_header="";
671	$part_body=$message_part;
672	}
673
674	$part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
675	my $part_content_type="";
676	my $part_content_info="";
677
678	if ($part_header =~ m@^content\-type:\s([\w\.\-/]+)\s(\;.*)?$@mi) {
679	$part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
680	$part_content_info=$2;
681	if (!defined($part_content_info)) {
682	$part_content_info="";
683	} else {
684	$part_content_info =~ s/^\;\s*//;
685	$part_content_info =~ s/\s*$//;
686	}
687	}
688	my $filename="";
689	if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
690	$filename=$1;
691	$filename =~ s@\r?\s*$@@; # remove trailing space, if any
692	}
693
694	# disposition - either inline or attachment.
695	# NOT CURRENTLY USED - we display all text types instead...
696	# $part_header =~ /^content\-disposition:\s*([\w+])/mis;
697
698	# add <<attachment>> to each part except the first...
699	if (!$is_first_part) {
700	$return_text.="\n<p><hr><strong><<attachment>>";
701	# add part info header
702	my $header_text="<br>Type: $part_content_type<br>\n";
703	if ($filename ne "") {
704	$header_text.="Filename: $filename\n";
705	}
706	$header_text =~ s@_@\\_@g;
707	$return_text.=$header_text . "</strong></p>\n<p>\n";
708	}
709
710	if ($part_content_type =~ m@text/@)
711	{
712	my $part_text= $self->text_from_part($message_part);
713	if ($part_content_type !~ m@text/(ht\|x)ml@) {
714	$part_text = text_into_html($part_text);
715	}
716	if ($part_text eq "") {
717	$part_text = ' ';
718	}
719	$return_text .= $part_text;
720	} elsif ($part_content_type =~ m@message/rfc822@) {
721	# This is a forwarded message
722	my $message_part_headers=$part_body;
723	$message_part_headers=~s/\r?\n\r?\n(.*)$//s;
724	my $message_part_body=$1;
725	$message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
726
727	my $rfc822_formatted_body=""; # put result in here
728	if ($message_part_headers =~
729	/^content\-type:\s([\w\.\-\/]+)\s(\;.*)?$/ims)
730	{
731	# The message header uses MIME flags
732	my $message_content_type=$1;
733	my $message_content_info=$2;
734	if (!defined($message_content_info)) {
735	$message_content_info="";
736	} else {
737	$message_content_info =~ s/^\;\s*//;
738	$message_content_info =~ s/\s*$//;
739	}
740	$message_content_type =~ tr/A-Z/a-z/;
741	if ($message_content_type =~ /multipart/) {
742	$rfc822_formatted_body=
743	$self->text_from_mime_message($message_content_type,
744	$message_content_info,
745	$message_part_body);
746	} else {
747	$message_part_body=$self->text_from_part($part_body,
748	$message_part_headers);
749	$rfc822_formatted_body=text_into_html($message_part_body);
750	}
751	} else {
752	# message doesn't use MIME flags
753	$rfc822_formatted_body=text_into_html($message_part_body);
754	$rfc822_formatted_body =~ s@_@\\_@g;
755	}
756	# Add the returned text to the output
757	# don't put all the headers...
758	# $message_part_headers =~ s/^(X\-.\|received\|message\-id\|return\-path):.\n//img;
759	my $brief_headers=get_brief_headers($message_part_headers);
760	$return_text.=text_into_html($brief_headers);
761	$return_text.="</p><p>\n";
762	$return_text.=$rfc822_formatted_body;
763	$return_text.="</p>\n";
764	# end of message/rfc822
765	} elsif ($part_content_type =~ /multipart/) {
766	# recurse again
767
768	my $tmptext= $self->text_from_mime_message($part_content_type,
769	$part_content_info,
770	$part_body);
771	$return_text.=$tmptext;
772	} else {
773	# this part isn't text/* or another message...
774	if ($is_first_part) {
775	# this is the first part of a multipart, or only part!
776	$return_text="\n<p><hr><strong><<attachment>>";
777	# add part info header
778	my $header_text="<br>Type: $part_content_type<br>\n";
779	$header_text.="Filename: $filename</strong></p>\n<p>\n";
780	$header_text =~ s@_@\\_@g;
781	$return_text.=$header_text;
782	}
783
784	# save attachment by default
785	if (!$self->{'no_attachments'}
786	&& $filename ne "") { # this part has a file...
787	my $encoding="8bit";
788	if ($part_header =~
789	/^content-transfer-encoding:\s*(\w+)/mi ) {
790	$encoding=$1; $encoding =~ tr/A-Z/a-z/;
791	}
792	my $tmpdir=$ENV{'GSDLHOME'} . "/tmp";
793	my $save_filename=$filename;
794
795	# make sure we don't clobber files with same name;
796	# need to keep state between .mbx files
797	my $assoc_files=$self->{'assoc_filenames'};
798	if ($assoc_files->{$filename}) { # it's been set...
799	$assoc_files->{$filename}++;
800	$filename =~ m/(.+)\.(\w+)$/;
801	my ($filestem, $ext)=($1,$2);
802	$save_filename="${filestem}_"
803	. $assoc_files->{$filename} . ".$ext";
804	} else { # first file with this name
805	$assoc_files->{$filename}=1;
806	}
807	open (SAVE, ">$tmpdir/$save_filename") \|\|
808	warn "EMAILPlug: Can't save attachment as $tmpdir/$save_filename: $!";
809	my $part_text = $message_part;
810	$part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
811	if ($encoding eq "base64") {
812	print SAVE base64_decode($part_text);
813	} elsif ($encoding eq "quoted-printable") {
814	print SAVE qp_decode($part_text);
815	} else { # 7bit, 8bit, binary, etc...
816	print SAVE $part_text;
817	}
818	close SAVE;
819	my $doc_obj=$self->{'doc_obj'};
820	$doc_obj->associate_file("$tmpdir/$save_filename",
821	"$save_filename",
822	$part_content_type # mimetype
823	);
824	# clean up tmp area...
825	# Can't do this as it hasn't been copied/linked yet!!!
826	# &util::rm("$tmpdir/$save_filename");
827	my $outhandle=$self->{'outhandle'};
828	print $outhandle "EMAILPlug: saving attachment \"$filename\"\n"; #
829
830	# be nice if "download" was a translatable macro :(
831	$return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
832	} # end of save attachment
833	} # end of !text/message part
834
835
836	return $return_text;
837	}
838
839
840	# Return only the "important" headers from a set of message headers
841	sub get_brief_headers {
842	my $msg_header = shift;
843	my $brief_header = "";
844
845	# Order matters!
846	if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
847	if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
848	if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
849	if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
850	if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
851
852	return $brief_header;
853	}
854
855
856	# Process a MIME part. Return "" if we can't decode it.
857	# should only be called for parts with type "text/*" ?
858	# Either pass the entire mime part (including the part's header),
859	# or pass the mime part's text and optionally the part's header.
860	sub text_from_part {
861	my $self = shift;
862	my $text = shift \|\| '';
863	my $part_header = shift;
864
865	my $type="text/plain"; # default, overridden from part header
866	my $charset="ascii"; # default, overridden from part header
867
868	if (! $part_header) { # no header argument was given. check the body
869	$part_header = $text;
870	# check for empty part header (leading blank line)
871	if ($text =~ /^\s*\r?\n/) {
872	$part_header="Content-type: text/plain; charset=us-ascii";
873	} else {
874	$part_header =~ s/\r?\n\r?\n(.*)$//s;
875	$text=$1; if (!defined($text)) {$text="";}
876	}
877	$part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
878	}
879
880	if ($part_header =~
881	/content\-type:\s([\w\.\-\/]+).?charset=\"?([^\;\"\s]+)\"?/is) {
882	$type=$1;
883	$charset=$2;
884	}
885	my $encoding="";
886	if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
887	$encoding=$1; $encoding=~tr/A-Z/a-z/;
888	}
889	# Content-Transfer-Encoding is per-part
890	if ($encoding ne "") {
891	if ($encoding =~ /quoted\-printable/) {
892	$text=qp_decode($text);
893	} elsif ($encoding =~ /base64/) {
894	$text=base64_decode($text);
895	} elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
896	# rfc2045 also allows binary, which we ignore (for now).
897	my $outhandle=$self->{'outhandle'};
898	print $outhandle "EMAILPlug: unknown transfer encoding: $encoding\n";
899	return "";
900	}
901	}
902	if ($type eq "text/html") {
903	# only get stuff between <body> tags, or <html> tags.
904	$text =~ s@^.<html[^>]>@@is;
905	$text =~ s@</html>.*$@@is;
906	$text =~ s/^.?<body[^>]>//si;
907	$text =~ s/<\/body>.*$//si;
908	}
909	elsif ($type eq "text/xml") {
910	$text=~s/</</g;$text=~s/>/>/g;
911	$text="<pre>\n$text\n</pre>\n";
912	}
913	# convert to unicode
914	$self->convert2unicode($charset, \$text);
915
916	$text =~ s@_@\\_@g; # protect against GS macro language
917	return $text;
918	}
919
920
921
922
923	# decode quoted-printable text
924	sub qp_decode {
925	my $text=shift;
926
927	# if a line ends with "=\s*", it is a soft line break, otherwise
928	# keep in any newline characters.
929
930	$text =~ s/=\s*\r?\n//mg;
931	$text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
932	return $text;
933	}
934
935	# decode base64 text. This is fairly slow (since it's interpreted perl rather
936	# than compiled XS stuff like in the ::MIME modules, but this is more portable
937	# for us at least).
938	# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
939	# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
940	# from each encoded byte.
941
942
943	sub base64_decode {
944	my $enc_text = shift;
945	# A=>0, B=>1, ..., '+'=>62, '/'=>63
946	# also '=' is used for padding at the end, but we remove it anyway.
947	my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
948	# map each MIME char into it's value, for more efficient lookup.
949	my %index;
950	map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
951	# remove all non-base64 chars. eval to get variable in transliteration...
952	# also remove '=' - we'll assume (!!) that there are no errors in the encoding
953	eval "\$enc_text =~ tr\|$mimechars\|\|cd";
954	my $decoded="";
955	while (length ($enc_text)>3)
956	{
957	my $fourchars=substr($enc_text,0,4,"");
958	my @chars=(split '',$fourchars);
959	$decoded.=chr( $index{$chars[0]} << 2 \| $index{$chars[1]} >> 4);
960	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
961	$decoded.=chr( ($index{$chars[2]} & 3 ) << 6 \| $index{$chars[3]});
962	}
963	# if there are any input chars left, there are either
964	# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
965	my @chars=(split '',$enc_text);
966	if (length($enc_text)) {
967	$decoded.=chr($index{$chars[0]} << 2 \| (int $index{$chars[1]} >> 4));
968	}
969	if (length($enc_text)==3) {
970	$decoded.=chr( ($index{$chars[1]} & 15) << 4 \| $index{$chars[2]} >> 2);
971	}
972	return $decoded;
973	}
974
975	sub convert2unicode {
976	my $self = shift(@_);
977	my ($charset, $textref) = @_;
978
979	if (!$$textref) {
980	# nothing to do!
981	return;
982	}
983
984	# first get our character encoding name in the right form.
985	$charset = "iso_8859_1" unless defined $charset;
986	$charset =~ tr/A-Z/a-z/; # lowercase
987	$charset =~ s/\-/_/g;
988	if ($charset =~ /gb_?2312/) { $charset="gb" }
989	# assumes EUC-KR, not ISO-2022 !?
990	$charset =~ s/^ks_c_5601_1987/korean/;
991	if ($charset eq 'utf_8') {$charset='utf8'}
992
993	my $outhandle = $self->{'outhandle'};
994
995	if ($charset eq "utf8") {
996	# no conversion needed, but lets check that it's valid utf8
997	# see utf-8 manpage for valid ranges
998	$$textref =~ m/^/g; # to set \G
999	my $badbytesfound=0;
1000	while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
1001	my $highbytes=$1;
1002	my $highbyteslength=length($highbytes);
1003	# replace any non utf8 complaint bytes
1004	$highbytes =~ /^/g; # set pos()
1005	while ($highbytes =~
1006	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
1007	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
1008	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
1009	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
1010	[\xfc-\xfd][\x80-\xbf]{5} # 6 byte
1011	)*([\x80-\xff])? !xg
1012	) {
1013	my $badbyte=$1;
1014	if (!defined $badbyte) {next} # hit end of string
1015	my $pos=pos($highbytes);
1016	substr($highbytes, $pos-1, 1, "\xc2\x80");
1017	# update the position to continue searching (for \G)
1018	pos($highbytes) = $pos+1; # set to just after the \x80
1019	$badbytesfound=1;
1020	}
1021	if ($badbytesfound==1) {
1022	# claims to be utf8, but it isn't!
1023	print $outhandle "EMAILPlug: Headers claim utf-8 but bad bytes "
1024	. "detected and removed.\n";
1025
1026	my $replength=length($highbytes);
1027	my $textpos=pos($$textref);
1028	# replace bad bytes with good bytes
1029	substr( $$textref, $textpos-$replength,
1030	$replength, $highbytes);
1031	# update the position to continue searching (for \G)
1032	pos($$textref)=$textpos+($replength-$highbyteslength);
1033	}
1034	}
1035	return;
1036	}
1037
1038	# It appears that we can't always trust ascii text so we'll treat it
1039	# as iso-8859-1 (letting characters above 0x80 through without
1040	# converting them to utf-8 will result in invalid XML documents
1041	# which can't be parsed at build time).
1042	$charset = "iso_8859_1" if ($charset eq "us_ascii" \|\| $charset eq "ascii");
1043
1044	if ($charset eq "iso_8859_1") {
1045	# test if the mailer lied, and it has win1252 chars in it...
1046	# 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
1047	if ($$textref =~ m/[\x80-\x9f]/) {
1048	print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
1049	print $outhandle "codepage 1252 detected.\n";
1050	$charset = "windows_1252";
1051	}
1052	}
1053	my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
1054
1055	if ($utf8_text ne "") {
1056	$$textref=$utf8_text;
1057	} else {
1058	# we didn't get any text... unsupported encoding perhaps? Or it is
1059	# empty anyway. We'll try to continue, assuming 8859-1. We could strip
1060	# characters out here if this causes problems...
1061	my $outhandle=$self->{'outhandle'};
1062	print $outhandle "EMAILPlug: falling back to iso-8859-1\n";
1063	$$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
1064
1065	}
1066	}
1067
1068
1069	sub set_OID {
1070	my $self = shift (@_);
1071	my ($doc_obj, $id, $segment_number) = @_;
1072
1073	if ( exists $doc_obj->{'msgid'} ) {
1074	$doc_obj->set_OID($doc_obj->{'msgid'});
1075	} else {
1076	$doc_obj->set_OID("$id\_$segment_number");
1077	}
1078	}
1079
1080
1081	# Perl packages have to return true if they are run.
1082	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: