source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 2630

Last change on this file since 2630 was 2630, checked in by jrm21, 23 years ago

Mime support for multipart messages. Doesn't extract attachments yet...
Also made sure we don't use textcat to guess language - it runs over the
whole file, and grows really big really quickly.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.6 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter ([email protected])
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata:
42# $Headers All the header content
43# $Subject Subject: header
44# $To To: header
45# $From From: header - this will be stored as Creator
46# $FromName Name of sender (where available)
47# $FromAddr E-mail address of sender
48# $DateText Date: header
49# $Date Date: header in GSDL format (eg: 19990924)
50#
51#
52# John McPherson - June/July 2001
53# added (basic) MIME support and quoted-printable and base64 decodings.
54# Minor fixes for names that are actually email addresses (ie <...> was lost)
55#
56# See: * RFC 822 - ARPA Internet Text Messages
57# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
58# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
59# * RFC 1806 - Content Dispositions (ie inline/attachment)
60package EMAILPlug;
61
62use SplitPlug;
63
64use sorttools;
65use util;
66
67
68# EMAILPlug is a sub-class of SplitPlug.
69
70sub BEGIN {
71 @ISA = ('SplitPlug');
72}
73
74# Create a new EMAILPlug object with which to parse a file.
75# Accomplished by creating a new BasPlug and using bless to
76# turn it into an EMAILPlug.
77
78sub new {
79 my ($class) = @_;
80 my $self = new BasPlug ("EMAILPlug", @_);
81 # make sure we don't run textcat (defaults to "auto");
82 $self->{'input_encoding'}="ascii";
83 return bless $self, $class;
84}
85
86sub get_default_process_exp {
87 my $self = shift (@_);
88 # mbx/email for mailbox file format, \d+ for maildir (each message is
89 # in a separate file, with a unique number for filename)
90 return q@([\\/]\d+|\.(mbx|email))$@;
91}
92
93# This plugin splits the mbox mail files at lines starting with From<sp>
94sub get_default_split_exp {
95 return q^\nFrom .*\n^;
96}
97
98
99# do plugin specific processing of doc_obj
100sub process {
101
102 my $self = shift (@_);
103 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
104 my $outhandle = $self->{'outhandle'};
105
106 # Check that we're dealing with a valid mail file
107 return undef unless (($$textref =~ /From:/m) || ($$textref =~ /To:/m));
108
109 # slightly more strict validity check, to prevent us from matching
110 # .so.x files ...
111 return undef unless (($$textref =~ /^From /) ||
112 ($$textref =~ /^[-A-Za-z]{2,100}:/m));
113
114 print $outhandle "EMAILPlug: processing $file\n"
115 if $self->{'verbosity'} > 1;
116
117 my $cursection = $doc_obj->get_top_section();
118
119 #
120 # Parse the document's text and extract metadata
121 #
122
123 # Separate header from body of message
124 my $Headers = $$textref;
125 $Headers =~ s/\r?\n\r?\n(.*)$//s;
126 $$textref = $1;
127
128 # Unfold headers - see rfc822
129 $Headers =~ s/\r?\n[\t\ ]+/ /gs;
130 # Extract basic metadata from header
131 my @headers = ("From", "To", "Subject", "Date");
132 my %raw;
133 foreach my $name (@headers) {
134 $raw{$name} = "No $name value";
135 }
136
137 # Examine each line of the headers
138 my ($line, $name, $value);
139 my @parts;
140 foreach $line (split(/\n/, $Headers)) {
141
142 # Ignore lines with no content or which begin with whitespace
143 next unless ($line =~ /:/);
144 next if ($line =~ /^\s/);
145
146 # Find out what metadata is on this line
147 @parts = split(/:/, $line);
148 $name = shift @parts;
149# uppercase the first character according to the current locale
150 $name=~s/(.+)/\u$1/;
151 next unless $name;
152 next unless ($raw{$name});
153
154 # Find the value of that metadata
155 $value = join(":", @parts);
156 $value =~ s/^\s+//;
157 $value =~ s/\s+$//;
158
159 # Store the metadata
160 $raw{$name} = $value;
161 }
162
163 # Extract the name and e-mail address from the From metadata
164 $frommeta = $raw{"From"};
165 $frommeta =~ m/(.*)<(.*)>/;
166 my $fromnamemeta=$1;
167 my $fromaddrmeta=$2;
168 if (!defined($fromaddrmeta)) {
169 $fromaddrmeta=$frommeta;
170 }
171 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
172
173 if (defined($fromnameneta)) {
174 $fromnamemeta =~ s/\"//g;
175 $fromnamemeta =~ s/(.*) /$1/; # Remove trailing space
176 }
177 else {
178 $fromnamemeta = $fromaddrmeta;
179 }
180 # if name is an address
181 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
182 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
183
184 # Escape < and > in the whole From field;
185 $frommeta =~ s/</&lt;/g; $frommeta =~ s/>/&gt;/g;
186 $raw{"From"}=$frommeta;
187
188 # Process Date information
189 if ($raw{"Date"} !~ /No Date/) {
190 $raw{"DateText"} = $raw{"Date"};
191
192 # Convert the date text to internal date format
193 $value = $raw{"Date"};
194 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
195 if ($year < 100) { $year += 1900; }
196 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
197
198 } else {
199 # We have not extracted a date
200 $raw{"DateText"} = "Unknown.";
201 $raw{"Date"} = "19000000";
202 }
203
204
205 # Add extracted metadata to document object
206 foreach my $name (keys %raw) {
207 $value = $raw{$name};
208 if ($value) {
209 $value = &text_into_html($value);
210 } else {
211 $value = "No $name field";
212 }
213 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
214 }
215
216 my $mimetype="text/plain";
217 my $mimeinfo="";
218 # Do MIME and encoding stuff
219 if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*([^\s]+)\s*$/mi)
220 {
221 $mimetype=$1;
222 $mimetype =~ tr/[A-Z]/[a-z]/;
223 $mimeinfo=$2;
224 }
225
226 if ($mimetype ne "text/plain") {
227 $$textref=text_from_mime_message($mimetype,$mimeinfo,$$textref);
228 } # end of not text/plain
229
230
231 # Add "All headers" metadata
232 $Headers = &text_into_html($Headers);
233 $Headers = "No headers" unless ($Headers =~ /\w/);
234 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
235
236 # Add text to document object
237 if ($mimetype eq "text/plain") {
238 $$textref = &text_into_html($$textref);
239 }
240 $$textref = "No message" unless ($$textref =~ /\w/);
241 $doc_obj->add_utf8_text($cursection, $$textref);
242
243 return 1;
244}
245
246
247# Convert a text string into HTML.
248#
249# The HTML is going to be inserted into a GML file, so
250# we have to be careful not to use symbols like ">",
251# which ocurs frequently in email messages (and use
252# &gt instead.
253#
254# This function also turns links and email addresses into hyperlinks,
255# and replaces carriage returns with <BR> tags (and multiple carriage
256# returns with <P> tags).
257
258
259sub text_into_html {
260 my ($text) = @_;
261
262 # Convert problem characters into HTML symbols
263 $text =~ s/&/&amp;/go;
264 $text =~ s/</&lt;/go;
265 $text =~ s/>/&gt;/go;
266 $text =~ s/\"/&quot;/go;
267
268 # convert email addresses and URIs into links
269# don't markup email addresses for now
270# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
271
272 # assume hostnames are \.\w\d\- only, then might have a trailing '/.*'
273 # URI can't finish with a '.'
274 $text =~ s/((http|ftp|https):\/\/[\w\d\-]+(\.[\w\d\-]+)*\/?((&amp;|\.)?[\w\d\?\=\-_\/~]+)*)/<a href=\"$1\">$1<\/a>/g;
275
276
277 # Clean up whitespace and convert \n charaters to <BR> or <P>
278 $text =~ s/ +/ /go;
279 $text =~ s/\s*$//o;
280 $text =~ s/^\s*//o;
281 $text =~ s/\n/\n<BR>/go;
282 $text =~ s/<BR>\s*<BR>/<P>/go;
283
284 return $text;
285}
286
287
288
289
290#Process a MIME message.
291# the textref we are given DOES NOT include the header.
292sub text_from_mime_message {
293 my ($mimetype,$mimeinfo,$text)=(@_);
294
295 # Check for multiparts - $mimeinfo will be a boundary
296 if ($mimetype =~ /multipart/) {
297 $boundary="";
298 if ($mimeinfo =~ /boundary="?([^\s]+?)"?\s*$/im) {
299 $boundary=$1;
300 }
301 # parts start with "--$boundary"
302 # message ends with "--$boundary--"
303 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
304 # that perl might want to interpolate.
305
306 $boundary=~s/\\/\\\\/g;
307 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
308 my @message_parts = split("\r?\n\-\-$boundary", $text);
309 # remove first "part" and last "part" (final --)
310 shift @message_parts;
311 my $last=pop @message_parts;
312 # make sure it is only -- and whitespace
313 if ($last !~ /^\-\-\s*$/ms) {
314 print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
315 }
316 foreach my $message_part (@message_parts) {
317 # remove the leading newline left from split.
318 $message_part=~s/^\r?\n//;
319 }
320 if ($mimetype eq "multipart/alternative") {
321 # check for an HTML version first, then TEXT, otherwise use first.
322 my $part_text="";
323 foreach my $message_part (@message_parts) {
324 if ($message_part =~ m@\s*content\-type:\s*text/html@mis)
325 {
326 # Use the HTML version
327 $part_text=text_from_part($message_part);
328 $mimetype="text/html";
329 last;
330 }
331 }
332 if ($part_text eq "") { # try getting a text part instead
333 foreach my $message_part (@message_parts) {
334 if ($message_part =~ m@^content\-type:\s*text/plain@mis)
335 {
336 # Use the plain version
337 $part_text=text_from_part($message_part);
338 $mimetype="text/plain";
339 last;
340 }
341 }
342 }
343 if ($part_text eq "") { # use first part
344 $part_text=text_from_part(shift @message_parts);
345 }
346 if ($part_text eq "") { # we couldn't get anything!!!
347 # or it was an empty message...
348 # do nothing...
349 print $outhandle "EMAILPlug: no text - empty body?\n";
350 } else {
351 $text=$part_text;
352 }
353 } elsif ($mimetype eq "multipart/mixed" ||
354 $mimetype eq "multipart/digest") {
355 $text="";
356 foreach my $message_part (@message_parts) {
357 my $part_header=$message_part;
358 $part_header=~s/\r?\n\r?\n(.*)$//sg;
359 my $part_body=$1;
360 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
361 my $part_content_type="";
362 my $part_content_info="";
363 if ($mimetype eq "multipart/digest") {
364 # default type - RTFRFC!!
365 $part_content_type="message/rfc822";
366 }
367 if ($part_header =~ m@^content\-type:\s*([\w+/\-]+)\s*\;?\s*([^\s]+)@mi) {
368 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
369 $part_content_info=$2;
370 }
371 my $filename="";
372 if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
373 $filename=$1;
374 }
375
376 # disposition - either inline or attachment.
377 # NOT CURRENTLY USED - we display all text types instead...
378 # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
379
380 # add <<attachment>> to each part except the first...
381 if ($text ne "") {
382 $text.="<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
383 # add part info header
384 $text.="<br>Type: $part_content_type<br>\n";
385 if ($filename ne "") {
386 $text.="Filename: $filename\n";
387 }
388 $text.="</strong></p>\n";
389 }
390
391 if ($part_content_type =~ m@text/@)
392 {
393 my $part_text=text_from_part($message_part);
394 if ($part_content_type !~ m@text/(ht|x)ml@) {
395 $part_text=text_into_html($part_text);
396 }
397 if ($part_text eq "") {
398 $part_text='&lt;&lt;empty message&gt;&gt;';
399 }
400 $text.=$part_text;
401 } elsif ($part_content_type =~ m@message/rfc822@) {
402 # This is a forwarded message
403 my $message_part_headers=$part_body;
404 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
405 $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
406 my $message_part_body=$1;
407
408 my $rfc822_formatted_body=""; # put result in here
409 if ($message_part_headers =~
410 /^content\-type:\s*([\w\/\-]+)\s*\;?\s*?([^\s]+)?\s*$/ims)
411 {
412 # The message header uses MIME flags
413 my $message_content_type=$1;
414 my $message_content_info=$2;
415 if (!defined($message_content_info)) {
416 $message_content_info="";
417 }
418 $message_content_type =~ tr/A-Z/a-z/;
419 if ($message_content_type =~ /multipart/) {
420 $rfc822_formatted_body=
421 text_from_mime_message($message_content_type,
422 $message_content_info,
423 $message_part_body);
424 } else {
425 $message_part_body=text_from_part($part_body);
426 $rfc822_formatted_body=text_into_html($message_part_body);
427 }
428 } else {
429 # message doesn't use MIME flags
430 $rfc822_formatted_body=text_into_html($message_part_body);
431 }
432 # Add the returned text to the output
433 # don't put all the headers...
434 $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
435 $text.=text_into_html($message_part_headers);
436 $text.="<p>\n";
437 $text.=$rfc822_formatted_body;
438 # end of message/rfc822
439 } elsif ($part_content_type =~ /multipart/) {
440 # recurse again
441 $tmptext=text_from_mime_message($part_content_type,
442 $part_content_info,
443 $part_body);
444 $text.=$tmptext;
445 } elsif ($text eq "") {
446 # we can't do anything with this part, but if it's the first
447 # part then make sure it is mentioned..
448
449 $text.="<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
450 # add part info header
451 $text.="<br>Type: $part_content_type<br>\n";
452 if ($filename ne "") {
453 $text.="Filename: $filename\n";
454 }
455 $text.="</strong></p>\n";
456 }
457 } # foreach message part.
458 } else {
459 # we can't handle this multipart type (not mixed or alternative)
460 # the RFC also mentions "parallel".
461 }
462 } # end of multipart
463 return $text;
464}
465
466
467
468
469
470
471# Process a MIME part. Return "" if we can't decode it.
472sub text_from_part {
473 my $text=shift;
474 my $part_header=$text;
475 $part_header =~ s/\r?\n\r?\n(.*)$//s;
476 $text=$1;
477 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
478 $part_header =~ /content\-type:\s*([\w\/]+)/is;
479 my $type=$1;
480 my $encoding="";
481 if ($part_header =~ /^content\-transfer\-encoding:\s*([^s]+)/mis) {
482 $encoding=$1; $encoding=~tr/A-Z/a-z/;
483 }
484 # Content-Transfer-Encoding is per-part
485 if ($encoding ne "") {
486 if ($encoding =~ /quoted\-printable/) {
487 $text=qp_decode($text);
488 } elsif ($encoding =~ /base64/) {
489 $text=base64_decode($text);
490 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
491 # rfc2045 also allows binary, which we ignore (for now).
492 # maybe this shouldn't go to stderr, but anyway...
493 print STDERR "EMAILPlug: unknown encoding: $encoding\n";
494 return "";
495 }
496 }
497
498 if ($type eq "text/html") {
499 # only get stuff between <body> tags, or <html> tags.
500 $text =~ s/^.*?<(html|HTML)[^>]*>//s;
501 $text =~ s/<\/(html|HTML)>.*$//s;
502
503 $text =~ s/^.*?<(body|BODY)[^>]*>//s;
504 $text =~ s/<\/(body|BODY)>.*$//s;
505 }
506 elsif ($type eq "text/xml") {
507 $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
508 $text="<pre>\n$text\n</pre>\n";
509 }
510 return $text;
511}
512
513
514# decode quoted-printable text
515sub qp_decode {
516 my $text=shift;
517
518 my @lines=split('\n', $text);
519
520 # if a line ends with "=\s*", it is a soft line break, otherwise
521 # keep in any newline characters.
522 foreach my $line (@lines) {
523 if ($line =~ s/=\s*$//) {}
524 else {$line.="\n";}
525
526 if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
527 my @hexcode_segments=split('=',$line);
528 shift @hexcode_segments;
529 my @hexcodes;
530 foreach my $hexcode (@hexcode_segments) {
531 $hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
532 chomp($hexcode); # just in case...
533 my $char=chr (hex "0x$hexcode");
534 $line =~ s/=$hexcode/$char/g;
535 }
536 }
537 }
538 $text= join('', @lines);
539 return $text;
540}
541
542# decode base64 text. This is fairly slow (since it's interpreted perl rather
543# than compiled XS stuff like in the ::MIME modules, but this is more portable
544# for us at least).
545# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
546# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
547# from each byte.
548
549
550sub base64_decode {
551 my $enc_text = shift;
552# A=>0, B=>1, ..., '+'=>62, '/'=>63
553# also '=' is used for padding at the end, but we remove it anyway.
554 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
555# map each MIME char into it's value, for more efficient lookup.
556 my %index;
557 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
558# remove all non-base64 chars. eval to get variable in transliteration...
559# also remove '=' - we'll assume (!!) that there are no errors in the encoding
560 eval "\$enc_text =~ tr|$mimechars||cd";
561 my $decoded="";
562 while (length ($enc_text)>3)
563 {
564 my $fourchars=substr($enc_text,0,4,"");
565 my @chars=(split '',$fourchars);
566 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
567 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
568 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
569 }
570# if there are any input chars left, there are either
571# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
572 my @chars=(split '',$enc_text);
573 if (length($enc_text)) {
574 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
575 }
576 if (length($enc_text)==3) {
577 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
578 }
579 return $decoded;
580}
581
582
583# Perl packages have to return true if they are run.
5841;
Note: See TracBrowser for help on using the repository browser.