source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 7703

Last change on this file since 7703 was 7703, checked in by jrm21, 20 years ago

1) use the email's message ID instead of document hash for Identifier.

2) if a message claims to be utf8, actually check it for bad chars.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 34.1 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999-2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter ([email protected])
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata (not Dublin Core!):
42# $Headers All the header content (optional, not stored by default)
43# $Subject Subject: header
44# $To To: header
45# $From From: header
46# $FromName Name of sender (where available)
47# $FromAddr E-mail address of sender
48# $DateText Date: header
49# $Date Date: header in GSDL format (eg: 19990924)
50#
51# $Title made up of Subject, Date and Sender (for default formatting)
52#
53#
54# John McPherson - June/July 2001
55# added (basic) MIME support and quoted-printable and base64 decodings.
56# Minor fixes for names that are actually email addresses (ie <...> was lost)
57#
58# See: * RFC 822 - ARPA Internet Text Messages
59# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61# * RFC 2047 - MIME (part 3) Message Header Extensions
62# * RFC 1806 - Content Dispositions (ie inline/attachment)
63
64# 12/05/02 Added usage datastructure - John Thompson
65
66package EMAILPlug;
67
68use SplitPlug;
69
70use unicode;
71
72use sorttools;
73use util;
74
75
76# EMAILPlug is a sub-class of SplitPlug.
77
78sub BEGIN {
79 @ISA = ('SplitPlug');
80}
81
82use strict;
83no strict "refs"; # so we can use a variable as a filehandle for print $out
84
85my $arguments =
86 [ { 'name' => "process_exp",
87 'desc' => "{BasPlug.process_exp}",
88 'type' => "regexp",
89 'reqd' => "no",
90 'deft' => &get_default_process_exp() },
91 { 'name' => "no_attachments",
92 'desc' => "{EMAILPlug.no_attachments}",
93 'type' => "flag",
94 'reqd' => "no" },
95 { 'name' => "headers",
96 'desc' => "{EMAILPlug.headers}",
97 'type' => "flag",
98 'reqd' => "no" },
99 { 'name' => "split_exp",
100 'desc' => "{EMAILPlug.split_exp}",
101 'type' => "regexp",
102 'reqd' => "no",
103 'deft' => &get_default_split_exp() }
104 ];
105
106my $options = { 'name' => "EMAILPlug",
107 'desc' => "{EMAILPlug.desc}",
108 'abstract' => "no",
109 'inherits' => "yes",
110 'args' => $arguments };
111
112# Create a new EMAILPlug object with which to parse a file.
113# Accomplished by creating a new BasPlug and using bless to
114# turn it into an EMAILPlug.
115
116sub new {
117 my ($class) = @_;
118 my $self = new BasPlug ($class, @_);
119 $self->{'plugin_type'} = "EMAILPlug";
120 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
121 my $option_list = $self->{'option_list'};
122 push( @{$option_list}, $options );
123
124 if (!parsargv::parse(\@_,
125 q^split_exp/.*/^, \$self->{'split_exp'},
126 q^no_attachments^, \$self->{'ignore_attachments'},
127 q^headers^, \$self->{'header_metadata'},
128 "allow_extra_options")) {
129 print STDERR "\nIncorrect options passed to $class.";
130 print STDERR "\nCheck your collect.cfg configuration file\n";
131 $self->print_txt_usage(""); # Use default resource bundle
132 die "\n";
133 }
134 $self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
135
136 # this might not actually be true at read-time, but after processing
137 # it should all be utf8.
138 $self->{'input_encoding'}="utf8";
139 return bless $self, $class;
140}
141
142sub get_default_process_exp {
143 my $self = shift (@_);
144 # mbx/email for mailbox file format, \d+ for maildir (each message is
145 # in a separate file, with a unique number for filename)
146 # mozilla and IE will save individual mbx format files with a ".eml" ext.
147 return q@([\\/]\d+|\.(mbx|email|eml))$@;
148}
149
150# This plugin splits the mbox mail files at lines starting with From<sp>
151# It is supposed to be "\n\nFrom ", but this isn't always used.
152sub get_default_split_exp {
153 return q^\nFrom .*\n^;
154
155}
156
157
158# do plugin specific processing of doc_obj
159sub process {
160
161 my $self = shift (@_);
162 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
163 my $outhandle = $self->{'outhandle'};
164
165 # Check that we're dealing with a valid mail file
166 # mbox message files start with "From "
167 # maildir messages usually start with Return-Path and Delivered-To
168 # mh is very similar to maildir
169 my $startoffile=substr($$textref,0,256);
170 if (($startoffile !~ /^(From )/) &&
171 ($startoffile !~ /^(From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\-.*|MIME-Version|Forwarded):/im)) {
172 return undef;
173 }
174
175
176 print STDERR "<Processing n='$file' p='EMAILPlug'>\n" if ($gli);
177
178 print $outhandle "EMAILPlug: processing $file\n"
179 if $self->{'verbosity'} > 1;
180
181 my $cursection = $doc_obj->get_top_section();
182
183 #
184 # Parse the document's text and extract metadata
185 #
186
187 # Protect backslashes
188 $$textref =~ s@\\@\\\\@g;
189
190 # Separate header from body of message
191 my $Headers = $$textref;
192 $Headers =~ s/\r?\n\r?\n(.*)$//s;
193 $$textref = $1;
194 $Headers .= "\n";
195
196 # Unfold headers - see rfc822
197 $Headers =~ s/\r?\n[\t\ ]+/ /gs;
198 # Extract basic metadata from header
199 my @headers = ("From", "To", "Subject", "Date");
200 my %raw;
201 foreach my $name (@headers) {
202 $raw{$name} = "No $name value";
203 }
204
205 # Get a default encoding for the header - RFC says should be ascii...
206 my $default_header_encoding="iso_8859_1";
207
208 # We don't know what character set is the user's default...
209 # We could use textcat to guess... for now we'll look at mime content-type
210# if ($Headers =~ /([[:^ascii:]])/) {
211# }
212 if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
213 $default_header_encoding=$1;
214 $default_header_encoding =~ s@\-@_@g;
215 $default_header_encoding =~ tr/A-Z/a-z/;
216 }
217
218
219 # Examine each line of the headers
220 my ($line, $name, $value);
221 my @parts;
222 foreach $line (split(/\n/, $Headers)) {
223
224 # Ignore lines with no content or which begin with whitespace
225 next unless ($line =~ /:/);
226 next if ($line =~ /^\s/);
227
228 # Find out what metadata is on this line
229 @parts = split(/:/, $line);
230 $name = shift @parts;
231 # get fieldname in canonical form - first cap, then lower case.
232 $name =~ tr/A-Z/a-z/;
233 # uppercase the first character according to the current locale
234 $name=~s/(.+)/\u$1/;
235 next unless $name;
236 next unless ($raw{$name});
237
238 # Find the value of that metadata
239 $value = join(":", @parts);
240 $value =~ s/^\s+//;
241 $value =~ s/\s+$//;
242 # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
243 if ($value =~ /=\?.*\?[BbQq]\?.*\?=/) {
244 my $original_value=$value;
245 my $encoded=$value;
246 $value="";
247 # we should ignore spaces between consecutive encoded-texts
248 $encoded =~ s@\?=\s+=\?@\?==\?@g;
249 while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=//i) {
250 my ($charset, $encoding, $data)=($2,$3,$4);
251 my ($decoded_data);
252 $value.="$1"; # any leading chars
253 $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
254 chomp $data;
255 $encoding =~ tr/BQ/bq/;
256 if ($encoding eq "q") { # quoted printable
257 $data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
258 $decoded_data=qp_decode($data);
259 # qp_decode adds \n, which is default for body text
260 chomp($decoded_data);
261 } else { # base 64
262 $decoded_data=base64_decode($data);
263 }
264 $self->convert2unicode($charset, \$decoded_data);
265 $value .= $decoded_data;
266 } # end of while loop
267
268 # get any trailing characters
269 $self->convert2unicode($default_header_encoding, \$encoded);
270 $value.=$encoded;
271
272 if ($value =~ /^\s*$/) { # we couldn't extract anything...
273 $self->convert2unicode($default_header_encoding,
274 \$original_value);
275 $value=$original_value;
276 }
277 } # end of if =?...?=
278
279 # In the absense of other charset information, assume the
280 # header is the default (usually "iso_8859_1") and convert to unicode.
281 else {
282 $self->convert2unicode($default_header_encoding, \$value);
283 }
284
285 # Store the metadata
286 $value =~ s@_@\\_@g; # protect against GS macro language
287 $raw{$name} = $value;
288 }
289
290 # Extract the name and e-mail address from the From metadata
291 my $frommeta = $raw{"From"};
292 my $fromnamemeta;
293 my $fromaddrmeta;
294
295 $frommeta =~ s/\s*$//; # Remove trailing space, if any
296
297 if ($frommeta =~ m/(.+)\s*<(.+)>/) {
298 $fromnamemeta=$1;
299 $fromaddrmeta=$2;
300 } elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
301 $fromnamemeta=$2;
302 $fromaddrmeta=$1;
303 }
304 if (!defined($fromaddrmeta)) {
305 $fromaddrmeta=$frommeta;
306 }
307 $fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
308 # minor attempt to prevent spam-bots from harvesting addresses...
309 $fromaddrmeta=~s/@/&#64;/;
310
311 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
312
313 if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
314 $fromnamemeta =~ s/\"//g; # remove quotes
315 $fromnamemeta =~ s/\s+$//; # remove trailing whitespace
316 }
317 else {
318 $fromnamemeta = $fromaddrmeta;
319 }
320 # if name is an address
321 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
322 $fromnamemeta=~s/@/&#64\;/;
323 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
324
325 $raw{"From"}=$frommeta;
326
327 # Process Date information
328 if ($raw{"Date"} !~ /No Date/) {
329 $raw{"DateText"} = $raw{"Date"};
330
331 # Convert the date text to internal date format
332 $value = $raw{"Date"};
333 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
334 # make some assumptions about the year formatting...
335 # some (old) software thinks 2001 is 101, some think 2001 is 01
336 if ($year < 20) { $year += 2000; } # assume not really 1920...
337 elsif ($year < 150) { $year += 1900; } # assume not really 2150...
338 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
339
340 } else {
341 # We have not extracted a date
342 $raw{"DateText"} = "Unknown.";
343 $raw{"Date"} = "19000000";
344 }
345
346 # Add extracted metadata to document object
347 foreach my $name (keys %raw) {
348 $value = $raw{$name};
349 if ($value) {
350 # assume subject, etc headers have no special HTML meaning.
351 $value = &text_into_html($value);
352 # escape [] so it isn't re-interpreted as metadata
353 $value =~ s/\[/&#91;/g; $value =~ s/\]/&#93;/g;
354 } else {
355 $value = "No $name field";
356 }
357 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
358 }
359
360
361 # extract a message ID from the headers, if there is one, and we'll use
362 # that as the greenstone doc ID. Having a predictable ID means we can
363 # link to other messages, eg from In-Reply-To or References headers...
364 if ($Headers =~ m@^Message-ID:(.+)$@mi) {
365 my $id=escape_msg_id($1);
366 $doc_obj->{'msgid'}=$id;
367 }
368 # link to another message, if this is a reply
369 if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
370 my $id=escape_msg_id($1);
371 $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
372 } elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
373 # References can have multiple, get the last one
374 my $id=escape_msg_id($1);
375 # not necessarily in-reply-to, but same thread...
376 $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
377 }
378
379
380
381 my $mimetype="text/plain";
382 my $mimeinfo="";
383 my $charset = $default_header_encoding;
384 # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
385 # more than one parameter given to Content-type.
386 # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
387 if ($Headers =~ m@^content\-type:\s*([\w\.\-/]+)\s*(\;\s*.+)?\s*$@mi)
388 {
389 $mimetype=$1;
390 $mimetype =~ tr/[A-Z]/[a-z]/;
391
392 if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
393 $mimetype = "text/plain";
394 }
395
396 $mimeinfo=$2;
397 if (!defined $mimeinfo) {
398 $mimeinfo="";
399 } else { # strip leading and trailing stuff
400 $mimeinfo =~ s/^\;\s*//;
401 $mimeinfo =~ s/\s*$//;
402 }
403 if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
404 $charset = $1;
405 }
406 }
407
408 my $transfer_encoding="7bit";
409 if ($Headers =~ /^content-transfer-encoding:\s*([^\s]+)\s*$/mi) {
410 $transfer_encoding=$1;
411 }
412
413 if ($mimetype eq "text/html") {
414 $$textref= $self->text_from_part("$Headers\n$$textref");
415 } elsif ($mimetype ne "text/plain") {
416 $self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
417 $$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$$textref);
418 } else { # mimetype eq text/plain
419 $$textref = &text_into_html($$textref);
420 $$textref =~ s@_@\\_@g; # protect against GS macro language
421
422 if ($transfer_encoding =~ /quoted\-printable/) {
423 $$textref=qp_decode($$textref);
424 } elsif ($transfer_encoding =~ /base64/) {
425 $$textref=base64_decode($$textref);
426 }
427 $self->convert2unicode($charset, $textref);
428 }
429
430
431 if ($self->{'header_metadata'} && $self->{'header_metadata'} == 1) {
432 # Add "All headers" metadata
433 $Headers = &text_into_html($Headers);
434
435 $Headers = "No headers" unless ($Headers =~ /\w/);
436 $Headers =~ s/@/&#64\;/g;
437 # escape [] so it isn't re-interpreted as metadata
438 $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
439 $self->convert2unicode($charset, \$Headers);
440
441 $Headers =~ s@_@\\_@g; # protect against GS macro language
442 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
443 }
444
445
446 # Add Title metadata
447 my $Title = text_into_html($raw{'Subject'});
448 $Title .= "<br>From: " . text_into_html($fromnamemeta);
449 $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
450 $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
451
452 $doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
453
454
455 # Add text to document object
456 $$textref = "No message" unless ($$textref =~ /\w/);
457
458 $doc_obj->add_utf8_text($cursection, $$textref);
459
460 return 1;
461}
462
463
464# Convert a text string into HTML.
465#
466# The HTML is going to be inserted into a GML file, so
467# we have to be careful not to use symbols like ">",
468# which ocurs frequently in email messages (and use
469# &gt instead.
470#
471# This function also turns links and email addresses into hyperlinks,
472# and replaces carriage returns with <BR> tags (and multiple carriage
473# returns with <P> tags).
474
475
476sub text_into_html {
477 my ($text) = @_;
478
479 # Convert problem characters into HTML symbols
480 $text =~ s/&/&amp;/g;
481 $text =~ s/</&lt;/g;
482 $text =~ s/>/&gt;/g;
483 $text =~ s/\"/&quot;/g;
484
485 # convert email addresses and URIs into links
486# don't markup email addresses for now
487# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
488
489 # try to munge email addresses a little bit...
490 $text =~ s/@/&#64;/;
491 # assume hostnames are \.\w\- only, then might have a trailing '/.*'
492 # assume URI doesn't finish with a '.'
493 $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.)?[\w\?\=\-_/~]+)*(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@g;
494
495
496 # Clean up whitespace and convert \n charaters to <BR> or <P>
497 $text =~ s/ +/ /g;
498 $text =~ s/\s*$//g;
499 $text =~ s/^\s*//g;
500 $text =~ s/\n/\n<br>/g;
501 $text =~ s/<br>\s*<br>/<p>/gi;
502
503 return $text;
504}
505
506
507
508
509#Process a MIME message.
510# the textref we are given DOES NOT include the header.
511sub text_from_mime_message {
512 my $self = shift(@_);
513 my ($mimetype,$mimeinfo,$text)=(@_);
514 my $outhandle=$self->{'outhandle'};
515 # Check for multiparts - $mimeinfo will be a boundary
516 if ($mimetype =~ /multipart/) {
517 my $boundary="";
518 if ($mimeinfo =~ m@boundary=(\"[^\"]+\"|[^\s]+)\s*$@im) {
519 $boundary=$1;
520 if ($boundary =~ m@^\"@) {
521 $boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
522 }
523 } else {
524 print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
525 }
526 # parts start with "--$boundary"
527 # message ends with "--$boundary--"
528 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
529 # that perl might want to interpolate. Also allows spaces...
530 $boundary=~s/\\/\\\\/g;
531 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
532 my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
533 # remove first "part" and last "part" (final --)
534 shift @message_parts;
535 my $last=pop @message_parts;
536 # if our boundaries are a bit dodgy and we only found 1 part...
537 if (!defined($last)) {$last="";}
538 # make sure it is only -- and whitespace
539 if ($last !~ /^\-\-\s*$/ms) {
540 print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
541 }
542 foreach my $message_part (@message_parts) {
543 # remove the leading newline left from split.
544 $message_part=~s/^\r?\n//;
545 }
546 if ($mimetype eq "multipart/alternative") {
547 # check for an HTML version first, then TEXT, otherwise use first.
548 my $part_text="";
549 foreach my $message_part (@message_parts) {
550 if ($message_part =~ m@\s*content\-type:\s*text/html@mis)
551 {
552 # Use the HTML version
553 $part_text= $self->text_from_part($message_part);
554 $mimetype="text/html";
555 last;
556 }
557 }
558 if ($part_text eq "") { # try getting a text part instead
559 foreach my $message_part (@message_parts) {
560 if ($message_part =~ m@^content\-type:\s*text/plain@mis)
561 {
562 # Use the plain version
563 $part_text= $self->text_from_part($message_part);
564 if ($part_text =~/[^\s]/) {
565 $part_text = text_into_html($part_text);
566 }
567 $mimetype="text/plain";
568 last;
569 }
570 }
571 }
572 if ($part_text eq "") { #use first part (no html/text part found)
573 $part_text = $self->text_from_part(shift @message_parts);
574 $part_text = text_into_html($part_text);
575 }
576 if ($part_text eq "") { # we couldn't get anything!!!
577 # or it was an empty message...
578 # do nothing...
579 print $outhandle "EMAILPlug: no text - empty body?\n";
580 } else {
581 $text = $part_text;
582 }
583 } elsif ($mimetype =~ m@multipart/(mixed|digest|related|signed)@) {
584 $text = "";
585 # signed is for PGP/GPG messages... the last part is a hash
586 if ($mimetype =~ m@multipart/signed@) {
587 pop @message_parts;
588 }
589 my $is_first_part=1;
590 foreach my $message_part (@message_parts) {
591 if ($is_first_part && $text ne "") {$is_first_part=0;}
592
593 if ($mimetype eq "multipart/digest") {
594 # default type - RTFRFC!! Set if not already set
595 $message_part =~ m@^(.*)\n\r?\n@s;
596 my $part_header=$1;
597 if ($part_header !~ m@^content-type@mi) {
598 $message_part="Content-type: message/rfc822\n"
599 . $message_part; # prepend default type
600 }
601 }
602
603 $text .= $self->process_multipart_part($message_part,
604 $is_first_part);
605 } # foreach message part.
606 } else {
607 # we can't handle this multipart type (not mixed or alternative)
608 # the RFC also mentions "parallel".
609 }
610 } # end of ($mimetype =~ multipart)
611 elsif ($mimetype =~ m@message/rfc822@) {
612 my $msg_header = $text;
613 $msg_header =~ s/\r?\n\r?\n(.*)$//s;
614 $text = $1;
615
616 if ($msg_header =~ /^content\-type:\s*([\w\.\-\/]+)\s*\;?\s*(.+?)\s*$/mi)
617 {
618 $mimetype=$1;
619 $mimetype =~ tr/[A-Z]/[a-z]/;
620 $mimeinfo=$2;
621 #if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
622 # $charset = $1;
623 #}
624 my $msg_text;
625 if ($mimetype =~ m@multipart/@) {
626 $msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
627 $text);
628 } else {$msg_text=text_from_part($text);}
629
630 my $brief_header=text_into_html(get_brief_headers($msg_header));
631 $text= "\n<b>&lt;&lt;attached message&gt;&gt;</b><br>";
632 $text.= "<table><tr><td width=\"5%\"> </td>\n";
633 $text.="<td>" . $brief_header . "\n</p>" . $msg_text
634 . "</td></tr></table>";
635 }
636 } else {
637 # we don't do any processing of the content.
638 }
639
640 return $text;
641}
642
643
644
645# used for turning a message id into a more friendly string for greenstone
646sub escape_msg_id {
647#msgid
648 my $id=shift;
649 chomp $id; $id =~ s!\s!!g; # remove spaces
650 $id =~ s![<>\[\]]!!g; # remove [ ] < and >
651 $id =~ s![_&]!-!g; # replace symbols that might cause problems
652 $id =~ s!@!-!g; # replace @ symbol, to avoid spambots
653 return $id;
654}
655
656
657
658sub process_multipart_part {
659 my $self = shift;
660 my $message_part = shift;
661 my $is_first_part = shift;
662
663 my $return_text="";
664 my $part_header=$message_part;
665 my $part_body;
666 if ($message_part=~ /^\s*\n/) {
667 # no header... use defaults
668 $part_body=$message_part;
669 $part_header="Content-type: text/plain; charset=us-ascii";
670 } elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
671 $part_body=$1;
672 } else {
673 # something's gone wrong...
674 $part_header="";
675 $part_body=$message_part;
676 }
677
678 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
679 my $part_content_type="";
680 my $part_content_info="";
681
682 if ($part_header =~ m@^content\-type:\s*([\w\.\-/]+)\s*(\;.*)?$@mi) {
683 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
684 $part_content_info=$2;
685 if (!defined($part_content_info)) {
686 $part_content_info="";
687 } else {
688 $part_content_info =~ s/^\;\s*//;
689 $part_content_info =~ s/\s*$//;
690 }
691 }
692 my $filename="";
693 if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
694 $filename=$1;
695 $filename =~ s@\r?\s*$@@; # remove trailing space, if any
696 }
697
698 # disposition - either inline or attachment.
699 # NOT CURRENTLY USED - we display all text types instead...
700 # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
701
702 # add <<attachment>> to each part except the first...
703 if (!$is_first_part) {
704 $return_text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;";
705 # add part info header
706 my $header_text="<br>Type: $part_content_type<br>\n";
707 if ($filename ne "") {
708 $header_text.="Filename: $filename\n";
709 }
710 $header_text =~ s@_@\\_@g;
711 $return_text.=$header_text . "</strong></p>\n<p>\n";
712 }
713
714 if ($part_content_type =~ m@text/@)
715 {
716 my $part_text= $self->text_from_part($message_part);
717 if ($part_content_type !~ m@text/(ht|x)ml@) {
718 $part_text = text_into_html($part_text);
719 }
720 if ($part_text eq "") {
721 $part_text = ' ';
722 }
723 $return_text .= $part_text;
724 } elsif ($part_content_type =~ m@message/rfc822@) {
725 # This is a forwarded message
726 my $message_part_headers=$part_body;
727 $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
728 my $message_part_body=$1;
729 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
730
731 my $rfc822_formatted_body=""; # put result in here
732 if ($message_part_headers =~
733 /^content\-type:\s*([\w\.\-\/]+)\s*(\;.*)?$/ims)
734 {
735 # The message header uses MIME flags
736 my $message_content_type=$1;
737 my $message_content_info=$2;
738 if (!defined($message_content_info)) {
739 $message_content_info="";
740 } else {
741 $message_content_info =~ s/^\;\s*//;
742 $message_content_info =~ s/\s*$//;
743 }
744 $message_content_type =~ tr/A-Z/a-z/;
745 if ($message_content_type =~ /multipart/) {
746 $rfc822_formatted_body=
747 $self->text_from_mime_message($message_content_type,
748 $message_content_info,
749 $message_part_body);
750 } else {
751 $message_part_body= $self->text_from_part($part_body);
752 $rfc822_formatted_body=text_into_html($message_part_body);
753 }
754 } else {
755 # message doesn't use MIME flags
756 $rfc822_formatted_body=text_into_html($message_part_body);
757 $rfc822_formatted_body =~ s@_@\\_@g;
758 }
759 # Add the returned text to the output
760 # don't put all the headers...
761# $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
762 my $brief_headers=get_brief_headers($message_part_headers);
763 $return_text.=text_into_html($brief_headers);
764 $return_text.="</p><p>\n";
765 $return_text.=$rfc822_formatted_body;
766 $return_text.="</p>\n";
767 # end of message/rfc822
768 } elsif ($part_content_type =~ /multipart/) {
769 # recurse again
770
771 my $tmptext= $self->text_from_mime_message($part_content_type,
772 $part_content_info,
773 $part_body);
774 $return_text.=$tmptext;
775 } else {
776 # this part isn't text/* or another message...
777 if ($is_first_part) {
778 # this is the first part of a multipart, or only part!
779 $return_text="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;";
780 # add part info header
781 my $header_text="<br>Type: $part_content_type<br>\n";
782 $header_text.="Filename: $filename</strong></p>\n<p>\n";
783 $header_text =~ s@_@\\_@g;
784 $return_text.=$header_text;
785 }
786
787 # save attachment by default
788 if (!$self->{'ignore_attachments'}
789 && $filename ne "") { # this part has a file...
790 my $encoding="8bit";
791 if ($part_header =~
792 /^content-transfer-encoding:\s*(\w+)/mi ) {
793 $encoding=$1; $encoding =~ tr/A-Z/a-z/;
794 }
795 my $tmpdir=$ENV{'GSDLHOME'} . "/tmp";
796 my $save_filename=$filename;
797
798 # make sure we don't clobber files with same name;
799 # need to keep state between .mbx files
800 my $assoc_files=$self->{'assoc_filenames'};
801 if ($assoc_files->{$filename}) { # it's been set...
802 $assoc_files->{$filename}++;
803 $filename =~ m/(.+)\.(\w+)$/;
804 my ($filestem, $ext)=($1,$2);
805 $save_filename="${filestem}_"
806 . $assoc_files->{$filename} . ".$ext";
807 } else { # first file with this name
808 $assoc_files->{$filename}=1;
809 }
810 open (SAVE, ">$tmpdir/$save_filename") ||
811 warn "EMAILPlug: Can't save attachment as $tmpdir/$save_filename: $!";
812 my $part_text = $message_part;
813 $part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
814 if ($encoding eq "base64") {
815 print SAVE base64_decode($part_text);
816 } elsif ($encoding eq "quoted-printable") {
817 print SAVE qp_decode($part_text);
818 } else { # 7bit, 8bit, binary, etc...
819 print SAVE $part_text;
820 }
821 close SAVE;
822 my $doc_obj=$self->{'doc_obj'};
823 $doc_obj->associate_file("$tmpdir/$save_filename",
824 "$save_filename",
825 $part_content_type # mimetype
826 );
827 # clean up tmp area...
828 # Can't do this as it hasn't been copied/linked yet!!!
829# &util::rm("$tmpdir/$save_filename");
830 my $outhandle=$self->{'outhandle'};
831 print $outhandle "EMAILPlug: saving attachment \"$filename\"\n"; #
832
833 # be nice if "download" was a translatable macro :(
834 $return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
835 } # end of save attachment
836 } # end of !text/message part
837
838
839 return $return_text;
840}
841
842
843# Return only the "important" headers from a set of message headers
844sub get_brief_headers {
845 my $msg_header = shift;
846 my $brief_header = "";
847
848 # Order matters!
849 if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
850 if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
851 if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
852 if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
853 if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
854
855 return $brief_header;
856}
857
858
859# Process a MIME part. Return "" if we can't decode it.
860# should only be called for parts with type "text/*" ?
861sub text_from_part {
862 my $self = shift;
863 my $text = shift || '';
864 my $part_header = $text;
865
866 # check for empty part header (leading blank line)
867 if ($text =~ /^\s*\r?\n/) {
868 $part_header="Content-type: text/plain; charset=us-ascii";
869 } else {
870 $part_header =~ s/\r?\n\r?\n(.*)$//s;
871 $text=$1; if (!defined($text)) {$text="";}
872 }
873 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
874 $part_header =~ /content\-type:\s*([\w\.\-\/]+).*?charset=\"?([^\;\"\s]+)\"?/is;
875 my $type=$1;
876 my $charset=$2;
877 if (!defined($type)) {$type="";}
878 if (!defined($charset)) {$charset="ascii";}
879 my $encoding="";
880 if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
881 $encoding=$1; $encoding=~tr/A-Z/a-z/;
882 }
883 # Content-Transfer-Encoding is per-part
884 if ($encoding ne "") {
885 if ($encoding =~ /quoted\-printable/) {
886 $text=qp_decode($text);
887 } elsif ($encoding =~ /base64/) {
888 $text=base64_decode($text);
889 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
890 # rfc2045 also allows binary, which we ignore (for now).
891 my $outhandle=$self->{'outhandle'};
892 print $outhandle "EMAILPlug: unknown transfer encoding: $encoding\n";
893 return "";
894 }
895 }
896 if ($type eq "text/html") {
897 # only get stuff between <body> tags, or <html> tags.
898 $text =~ s@^.*<html[^>]*>@@is;
899 $text =~ s@</html>.*$@@is;
900 $text =~ s/^.*?<body[^>]*>//si;
901 $text =~ s/<\/body>.*$//si;
902 }
903 elsif ($type eq "text/xml") {
904 $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
905 $text="<pre>\n$text\n</pre>\n";
906 }
907 # convert to unicode
908 $self->convert2unicode($charset, \$text);
909
910 $text =~ s@_@\\_@g; # protect against GS macro language
911 return $text;
912}
913
914
915
916
917# decode quoted-printable text
918sub qp_decode {
919 my $text=shift;
920
921 # if a line ends with "=\s*", it is a soft line break, otherwise
922 # keep in any newline characters.
923
924 $text =~ s/=\s*\r?\n//mg;
925 $text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
926 return $text;
927}
928
929# decode base64 text. This is fairly slow (since it's interpreted perl rather
930# than compiled XS stuff like in the ::MIME modules, but this is more portable
931# for us at least).
932# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
933# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
934# from each encoded byte.
935
936
937sub base64_decode {
938 my $enc_text = shift;
939# A=>0, B=>1, ..., '+'=>62, '/'=>63
940# also '=' is used for padding at the end, but we remove it anyway.
941 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
942# map each MIME char into it's value, for more efficient lookup.
943 my %index;
944 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
945# remove all non-base64 chars. eval to get variable in transliteration...
946# also remove '=' - we'll assume (!!) that there are no errors in the encoding
947 eval "\$enc_text =~ tr|$mimechars||cd";
948 my $decoded="";
949 while (length ($enc_text)>3)
950 {
951 my $fourchars=substr($enc_text,0,4,"");
952 my @chars=(split '',$fourchars);
953 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
954 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
955 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
956 }
957# if there are any input chars left, there are either
958# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
959 my @chars=(split '',$enc_text);
960 if (length($enc_text)) {
961 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
962 }
963 if (length($enc_text)==3) {
964 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
965 }
966 return $decoded;
967}
968
969sub convert2unicode {
970 my $self = shift(@_);
971 my ($charset, $textref) = @_;
972
973 if (!$$textref) {
974 # nothing to do!
975 return;
976 }
977
978 # first get our character encoding name in the right form.
979 $charset = "iso_8859_1" unless defined $charset;
980 $charset =~ tr/A-Z/a-z/; # lowercase
981 $charset =~ s/\-/_/g;
982 $charset =~ s/gb2312/gb/;
983 # assumes EUC-KR, not ISO-2022 !?
984 $charset =~ s/^ks_c_5601_1987/korean/;
985 if ($charset eq 'utf_8') {$charset='utf8'}
986
987 my $outhandle = $self->{'outhandle'};
988
989 if ($charset eq "utf8") {
990 # no conversion needed, but lets check that it's valid utf8
991 # see utf-8 manpage for valid ranges
992 $$textref =~ m/^/g; # to set \G
993 my $badbytesfound=0;
994 while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
995 my $highbytes=$1;
996 my $highbyteslength=length($highbytes);
997 # replace any non utf8 complaint bytes
998 $highbytes =~ /^/g; # set pos()
999 while ($highbytes =~
1000 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
1001 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
1002 [\xf0-\xf7][\x80-\xbf]{3} # 4 byte
1003 [\xf8-\xfb][\x80-\xbf]{4} # 5 byte
1004 [\xfc-\xfd][\x80-\xbf]{5} # 6 byte
1005 )*([\x80-\xff])? !xg
1006 ) {
1007 my $badbyte=$1;
1008 if (!defined $badbyte) {next} # hit end of string
1009 my $pos=pos($highbytes);
1010 substr($highbytes, $pos-1, 1, "\xc2\x80");
1011 # update the position to continue searching (for \G)
1012 pos($highbytes) = $pos+1; # set to just after the \x80
1013 $badbytesfound=1;
1014 }
1015 if ($badbytesfound==1) {
1016 # claims to be utf8, but it isn't!
1017 print $outhandle "EMAILPlug: Headers claim utf-8 but bad bytes "
1018 . "detected and removed.\n";
1019
1020 my $replength=length($highbytes);
1021 my $textpos=pos($$textref);
1022 # replace bad bytes with good bytes
1023 substr( $$textref, $textpos-$replength,
1024 $replength, $highbytes);
1025 # update the position to continue searching (for \G)
1026 pos($$textref)=$textpos+($replength-$highbyteslength);
1027 }
1028 }
1029 return;
1030 }
1031
1032 # It appears that we can't always trust ascii text so we'll treat it
1033 # as iso-8859-1 (letting characters above 0x80 through without
1034 # converting them to utf-8 will result in invalid XML documents
1035 # which can't be parsed at build time).
1036 $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
1037
1038 if ($charset eq "iso_8859_1") {
1039 # test if the mailer lied, and it has win1252 chars in it...
1040 # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
1041 if ($$textref =~ m/[\x80-\x9f]/) {
1042 print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
1043 print $outhandle "codepage 1252 detected.\n";
1044 $charset = "windows_1252";
1045 }
1046 }
1047 my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
1048
1049 if ($utf8_text ne "") {
1050 $$textref=$utf8_text;
1051 } else {
1052 # we didn't get any text... unsupported encoding perhaps? Or it is
1053 # empty anyway. We'll try to continue, assuming 8859-1. We could strip
1054 # characters out here if this causes problems...
1055 my $outhandle=$self->{'outhandle'};
1056 print $outhandle "EMAILPlug: falling back to iso-8859-1\n";
1057 $$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
1058
1059 }
1060}
1061
1062
1063sub set_OID {
1064 my $self = shift (@_);
1065 my ($doc_obj, $id, $segment_number) = @_;
1066
1067 if ( exists $doc_obj->{'msgid'} ) {
1068 $doc_obj->set_OID($doc_obj->{'msgid'});
1069 } else {
1070 $doc_obj->set_OID("$id\_$segment_number");
1071 }
1072}
1073
1074
1075# Perl packages have to return true if they are run.
10761;
Note: See TracBrowser for help on using the repository browser.