source: gsdl/trunk/perllib/plugins/EmailPlugin.pm@ 16104

Last change on this file since 16104 was 16104, checked in by kjdon, 16 years ago

tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 35.5 KB
Line 
1###########################################################################
2#
3# EmailPlugin.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999-2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EmailPlugin
30#
31# by Gordon Paynter ([email protected])
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata (not Dublin Core!):
42# $Headers All the header content (optional, not stored by default)
43# $Subject Subject: header
44# $To To: header
45# $From From: header
46# $FromName Name of sender (where available)
47# $FromAddr E-mail address of sender
48# $DateText Date: header
49# $Date Date: header in GSDL format (eg: 19990924)
50#
51# $Title made up of Subject, Date and Sender (for default formatting)
52#
53#
54# John McPherson - June/July 2001
55# added (basic) MIME support and quoted-printable and base64 decodings.
56# Minor fixes for names that are actually email addresses (ie <...> was lost)
57#
58# See: * RFC 822 - ARPA Internet Text Messages
59# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61# * RFC 2047 - MIME (part 3) Message Header Extensions
62# * RFC 1806 - Content Dispositions (ie inline/attachment)
63
64# 12/05/02 Added usage datastructure - John Thompson
65package EmailPlugin;
66
67use strict;
68no strict "refs"; # so we can use a variable as a filehandle for print $out
69
70
71use SplitTextFile;
72use unicode; # gs conv functions
73use gsprintf 'gsprintf'; # translations
74
75use sorttools;
76use util;
77
78sub BEGIN {
79 @EmailPlugin::ISA = ('SplitTextFile');
80}
81
82
83my $arguments =
84 [ { 'name' => "process_exp",
85 'desc' => "{BasePlugin.process_exp}",
86 'type' => "regexp",
87 'reqd' => "no",
88 'deft' => &get_default_process_exp() },
89 { 'name' => "no_attachments",
90 'desc' => "{EmailPlugin.no_attachments}",
91 'type' => "flag",
92 'reqd' => "no" },
93 { 'name' => "headers",
94 'desc' => "{EmailPlugin.headers}",
95 'type' => "flag",
96 'reqd' => "no" },
97 { 'name' => "split_exp",
98 'desc' => "{EmailPlugin.split_exp}",
99 'type' => "regexp",
100 'reqd' => "no",
101 'deft' => &get_default_split_exp() }
102 ];
103
104my $options = { 'name' => "EmailPlugin",
105 'desc' => "{EmailPlugin.desc}",
106 'abstract' => "no",
107 'inherits' => "yes",
108 'args' => $arguments };
109
110# Create a new EmailPlugin object with which to parse a file.
111# Accomplished by creating a new SplitTextFile and using bless to
112# turn it into an EmailPlugin.
113
114sub new {
115 my ($class) = shift (@_);
116 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
117 push(@$pluginlist, $class);
118
119 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
120 push(@{$hashArgOptLists->{"OptList"}},$options);
121
122 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
123
124 $self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
125
126 # this might not actually be true at read-time, but after processing
127 # it should all be utf8.
128 $self->{'input_encoding'}="utf8";
129 return bless $self, $class;
130}
131
132sub get_default_process_exp {
133 my $self = shift (@_);
134 # mbx/email for mailbox file format, \d+ for maildir (each message is
135 # in a separate file, with a unique number for filename)
136 # mozilla and IE will save individual mbx format files with a ".eml" ext.
137 return q@([\\/]\d+|\.(mbx|email|eml))$@;
138}
139
140# This plugin splits the mbox mail files at lines starting with From<sp>
141# It is supposed to be "\n\nFrom ", but this isn't always used.
142# add \d{4} so that the line ends in a year (in case the text has an
143# unescaped "From " at the start of a line).
144sub get_default_split_exp {
145 return q^\nFrom .*\d{4}\n^;
146
147}
148
149
150# do plugin specific processing of doc_obj
151sub process {
152
153 my $self = shift (@_);
154 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
155 my $outhandle = $self->{'outhandle'};
156
157 # Check that we're dealing with a valid mail file
158 # mbox message files start with "From "
159 # maildir messages usually start with Return-Path and Delivered-To
160 # mh is very similar to maildir
161 my $startoffile=substr($$textref,0,256);
162 if (($startoffile !~ /^(From )/) &&
163 ($startoffile !~ /^(From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\-.*|MIME-Version|Forwarded):/im)) {
164 return undef;
165 }
166
167 my $cursection = $doc_obj->get_top_section();
168
169 #
170 # Parse the document's text and extract metadata
171 #
172
173 # Protect backslashes
174 $$textref =~ s@\\@\\\\@g;
175
176 # Separate header from body of message
177 my $Headers = $$textref;
178 $Headers =~ s/\r?\n\r?\n(.*)$//s;
179 $$textref = $1;
180 $Headers .= "\n";
181
182 # Unfold headers - see rfc822
183 $Headers =~ s/\r?\n[\t\ ]+/ /gs;
184 # Extract basic metadata from header
185 my @headers = ("From", "To", "Subject", "Date");
186 my %raw;
187 foreach my $name (@headers) {
188 $raw{$name} = "No $name value";
189 }
190
191 # Get a default encoding for the header - RFC says should be ascii...
192 my $default_header_encoding="iso_8859_1";
193
194 # We don't know what character set is the user's default...
195 # We could use textcat to guess... for now we'll look at mime content-type
196# if ($Headers =~ /([[:^ascii:]])/) {
197# }
198 if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
199 $default_header_encoding=$1;
200 $default_header_encoding =~ s@\-@_@g;
201 $default_header_encoding =~ tr/A-Z/a-z/;
202 }
203
204
205 # Examine each line of the headers
206 my ($line, $name, $value);
207 my @parts;
208 foreach $line (split(/\n/, $Headers)) {
209
210 # Ignore lines with no content or which begin with whitespace
211 next unless ($line =~ /:/);
212 next if ($line =~ /^\s/);
213
214 # Find out what metadata is on this line
215 @parts = split(/:/, $line);
216 $name = shift @parts;
217 # get fieldname in canonical form - first cap, then lower case.
218 $name =~ tr/A-Z/a-z/;
219 # uppercase the first character according to the current locale
220 $name=~s/(.+)/\u$1/;
221 next unless $name;
222 next unless ($raw{$name});
223
224 # Find the value of that metadata
225 $value = join(":", @parts);
226 $value =~ s/^\s+//;
227 $value =~ s/\s+$//;
228 # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
229 if ($value =~ /=\?.*\?[BbQq]\?.*\?=/) {
230 my $original_value=$value;
231 my $encoded=$value;
232 $value="";
233 # we should ignore spaces between consecutive encoded-texts
234 $encoded =~ s@\?=\s+=\?@\?==\?@g;
235 while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=//i) {
236 my ($charset, $encoding, $data)=($2,$3,$4);
237 my ($decoded_data);
238 $value.="$1"; # any leading chars
239 $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
240 chomp $data;
241 $encoding =~ tr/BQ/bq/;
242 if ($encoding eq "q") { # quoted printable
243 $data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
244 $decoded_data=qp_decode($data);
245 # qp_decode adds \n, which is default for body text
246 chomp($decoded_data);
247 } else { # base 64
248 $decoded_data=base64_decode($data);
249 }
250 $self->convert2unicode($charset, \$decoded_data);
251 $value .= $decoded_data;
252 } # end of while loop
253
254 # get any trailing characters
255 $self->convert2unicode($default_header_encoding, \$encoded);
256 $value.=$encoded;
257
258 if ($value =~ /^\s*$/) { # we couldn't extract anything...
259 $self->convert2unicode($default_header_encoding,
260 \$original_value);
261 $value=$original_value;
262 }
263 } # end of if =?...?=
264
265 # In the absense of other charset information, assume the
266 # header is the default (usually "iso_8859_1") and convert to unicode.
267 else {
268 $self->convert2unicode($default_header_encoding, \$value);
269 }
270
271 # Store the metadata
272 $value =~ s@_@\\_@g; # protect against GS macro language
273 $raw{$name} = $value;
274 }
275
276 # Extract the name and e-mail address from the From metadata
277 my $frommeta = $raw{"From"};
278 my $fromnamemeta;
279 my $fromaddrmeta;
280
281 $frommeta =~ s/\s*$//; # Remove trailing space, if any
282
283 if ($frommeta =~ m/(.+)\s*<(.+)>/) {
284 $fromnamemeta=$1;
285 $fromaddrmeta=$2;
286 } elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
287 $fromnamemeta=$2;
288 $fromaddrmeta=$1;
289 }
290 if (!defined($fromaddrmeta)) {
291 $fromaddrmeta=$frommeta;
292 }
293 $fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
294 # minor attempt to prevent spam-bots from harvesting addresses...
295 $fromaddrmeta=~s/@/&#64;/;
296
297 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
298
299 if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
300 $fromnamemeta =~ s/\"//g; # remove quotes
301 $fromnamemeta =~ s/\s+$//; # remove trailing whitespace
302 }
303 else {
304 $fromnamemeta = $fromaddrmeta;
305 }
306 # if name is an address
307 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
308 $fromnamemeta=~s/@/&#64\;/;
309 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
310
311 $raw{"From"}=$frommeta;
312
313 # Process Date information
314 if ($raw{"Date"} !~ /No Date/) {
315 $raw{"DateText"} = $raw{"Date"};
316
317 # Convert the date text to internal date format
318 $value = $raw{"Date"};
319 # proper mbox format: Tue, 07 Jan 2003 17:27:42 +1300
320 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
321 if (!defined($day) || !defined($month) || !defined ($year)) {
322 # try monthly archive format: Wed Apr 23 00:26:08 2008
323 ($month,$day, $year) = $value =~ /([A-Z][a-z][a-z])\s\s?(\d?\d)\s\d\d:\d\d:\d\d\s(\d\d\d\d)/;
324 }
325
326 # make some assumptions about the year formatting...
327 # some (old) software thinks 2001 is 101, some think 2001 is 01
328 if ($year < 20) { $year += 2000; } # assume not really 1920...
329 elsif ($year < 150) { $year += 1900; } # assume not really 2150...
330 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
331
332 } else {
333 # We have not extracted a date
334 $raw{"DateText"} = "Unknown.";
335 $raw{"Date"} = "19000000";
336 }
337
338 # Add extracted metadata to document object
339 foreach my $name (keys %raw) {
340 $value = $raw{$name};
341 if ($value) {
342 # assume subject, etc headers have no special HTML meaning.
343 $value = &text_into_html($value);
344 # escape [] so it isn't re-interpreted as metadata
345 $value =~ s/\[/&#91;/g; $value =~ s/\]/&#93;/g;
346 } else {
347 $value = "No $name field";
348 }
349 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
350 }
351
352
353 # extract a message ID from the headers, if there is one, and we'll use
354 # that as the greenstone doc ID. Having a predictable ID means we can
355 # link to other messages, eg from In-Reply-To or References headers...
356 if ($Headers =~ m@^Message-ID:(.+)$@mi) {
357 my $id=escape_msg_id($1);
358 $doc_obj->{'msgid'}=$id;
359 }
360 # link to another message, if this is a reply
361 if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
362 my $id=escape_msg_id($1);
363 $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
364 } elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
365 # References can have multiple, get the last one
366 my $id=escape_msg_id($1);
367 # not necessarily in-reply-to, but same thread...
368 $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
369 }
370
371
372
373 my $mimetype="text/plain";
374 my $mimeinfo="";
375 my $charset = $default_header_encoding;
376 # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
377 # more than one parameter given to Content-type.
378 # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
379 if ($Headers =~ m@^content\-type:\s*([\w\.\-/]+)\s*(\;\s*.+)?\s*$@mi)
380 {
381 $mimetype=$1;
382 $mimetype =~ tr/[A-Z]/[a-z]/;
383
384 if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
385 $mimetype = "text/plain";
386 }
387
388 $mimeinfo=$2;
389 if (!defined $mimeinfo) {
390 $mimeinfo="";
391 } else { # strip leading and trailing stuff
392 $mimeinfo =~ s/^\;\s*//;
393 $mimeinfo =~ s/\s*$//;
394 }
395 if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
396 $charset = $1;
397 }
398 }
399
400 my $transfer_encoding="7bit";
401 if ($Headers =~ /^content-transfer-encoding:\s*([^\s]+)\s*$/mi) {
402 $transfer_encoding=$1;
403 }
404
405 if ($mimetype eq "text/html") {
406 $$textref= $self->text_from_part($$textref, $Headers);
407 } elsif ($mimetype ne "text/plain") {
408 $self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
409 $$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$$textref);
410 } else { # mimetype eq text/plain
411
412 if ($transfer_encoding =~ /quoted\-printable/) {
413 $$textref=qp_decode($$textref);
414 } elsif ($transfer_encoding =~ /base64/) {
415 $$textref=base64_decode($$textref);
416 }
417 $self->convert2unicode($charset, $textref);
418
419 $$textref = &text_into_html($$textref);
420 $$textref =~ s@_@\\_@g; # protect against GS macro language
421
422 }
423
424
425 if ($self->{'headers'} && $self->{'headers'} == 1) {
426 # Add "All headers" metadata
427 $Headers = &text_into_html($Headers);
428
429 $Headers = "No headers" unless ($Headers =~ /\w/);
430 $Headers =~ s/@/&#64\;/g;
431 # escape [] so it isn't re-interpreted as metadata
432 $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
433 $self->convert2unicode($charset, \$Headers);
434
435 $Headers =~ s@_@\\_@g; # protect against GS macro language
436 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
437 }
438
439
440 # Add Title metadata
441 my $Title = text_into_html($raw{'Subject'});
442 $Title .= "<br>From: " . text_into_html($fromnamemeta);
443 $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
444 $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
445
446 $doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
447
448 # Add FileFormat metadata
449 $doc_obj->add_metadata($cursection, "FileFormat", "EMAIL");
450
451 # Add text to document object
452 $$textref = "No message" unless ($$textref =~ /\w/);
453
454 $doc_obj->add_utf8_text($cursection, $$textref);
455
456 return 1;
457}
458
459
460# Convert a text string into HTML.
461#
462# The HTML is going to be inserted into a GML file, so
463# we have to be careful not to use symbols like ">",
464# which ocurs frequently in email messages (and use
465# &gt instead.
466#
467# This function also turns links and email addresses into hyperlinks,
468# and replaces carriage returns with <BR> tags (and multiple carriage
469# returns with <P> tags).
470
471
472sub text_into_html {
473 my ($text) = @_;
474
475 # Convert problem characters into HTML symbols
476 $text =~ s/&/&amp;/g;
477 $text =~ s/</&lt;/g;
478 $text =~ s/>/&gt;/g;
479 $text =~ s/\"/&quot;/g;
480
481 # convert email addresses and URIs into links
482# don't markup email addresses for now
483# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
484
485 # try to munge email addresses a little bit...
486 $text =~ s/@/&#64;/;
487 # assume hostnames are \.\w\- only, then might have a trailing '/.*'
488 # assume URI doesn't finish with a '.'
489 $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.|\%[a-f0-9]{2})?[\w\?\=\-_/~]+)*(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@gi;
490
491
492 # Clean up whitespace and convert \n charaters to <BR> or <P>
493 $text =~ s/ +/ /g;
494 $text =~ s/\s*$//g;
495 $text =~ s/^\s*//g;
496 $text =~ s/\n/\n<br>/g;
497 $text =~ s/<br>\s*<br>/<p>/gi;
498
499 return $text;
500}
501
502
503
504
505#Process a MIME message.
506# the textref we are given DOES NOT include the header.
507sub text_from_mime_message {
508 my $self = shift(@_);
509 my ($mimetype,$mimeinfo,$text)=(@_);
510 my $outhandle=$self->{'outhandle'};
511 # Check for multiparts - $mimeinfo will be a boundary
512 if ($mimetype =~ /multipart/) {
513 my $boundary="";
514 if ($mimeinfo =~ m@boundary=(\"[^\"]+\"|[^\s]+)\s*$@im) {
515 $boundary=$1;
516 if ($boundary =~ m@^\"@) {
517 $boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
518 }
519 } else {
520 print $outhandle "EmailPlugin: (warning) couldn't parse MIME boundary\n";
521 }
522 # parts start with "--$boundary"
523 # message ends with "--$boundary--"
524 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
525 # that perl might want to interpolate. Also allows spaces...
526 $boundary=~s/\\/\\\\/g;
527 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
528 my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
529 # remove first "part" and last "part" (final --)
530 shift @message_parts;
531 my $last=pop @message_parts;
532 # if our boundaries are a bit dodgy and we only found 1 part...
533 if (!defined($last)) {$last="";}
534 # make sure it is only -- and whitespace
535 if ($last !~ /^\-\-\s*$/ms) {
536 print $outhandle "EmailPlugin: (warning) last part of MIME message isn't empty\n";
537 }
538 foreach my $message_part (@message_parts) {
539 # remove the leading newline left from split.
540 $message_part=~s/^\r?\n//;
541 }
542 if ($mimetype eq "multipart/alternative") {
543 # check for an HTML version first, then TEXT, otherwise use first.
544 my $part_text="";
545 foreach my $message_part (@message_parts) {
546 if ($message_part =~ m@^content\-type:\s*text/html@i)
547 {
548 # Use the HTML version
549 $part_text = $self->text_from_part($message_part);
550 $mimetype="text/html";
551 last;
552 }
553 }
554 if ($part_text eq "") { # try getting a text part instead
555 foreach my $message_part (@message_parts) {
556 if ($message_part =~ m@^content\-type:\s*text/plain@i)
557 {
558 # Use the plain version
559 $part_text = $self->text_from_part($message_part);
560 if ($part_text =~/[^\s]/) {
561 $part_text = text_into_html($part_text);
562 }
563 $mimetype="text/plain";
564 last;
565 }
566 }
567 }
568 if ($part_text eq "") { #use first part (no html/text part found)
569 $part_text = $self->text_from_part(shift @message_parts);
570 $part_text = text_into_html($part_text);
571 }
572 if ($part_text eq "") { # we couldn't get anything!!!
573 # or it was an empty message...
574 # do nothing...
575 gsprintf($outhandle, "{ReadTextFile.empty_file} - empty body?\n");
576 } else {
577 $text = $part_text;
578 }
579 } elsif ($mimetype =~ m@multipart/(mixed|digest|related|signed)@) {
580 $text = "";
581 # signed is for PGP/GPG messages... the last part is a hash
582 if ($mimetype =~ m@multipart/signed@) {
583 pop @message_parts;
584 }
585 my $is_first_part=1;
586 foreach my $message_part (@message_parts) {
587 if ($is_first_part && $text ne "") {$is_first_part=0;}
588
589 if ($mimetype eq "multipart/digest") {
590 # default type - RTFRFC!! Set if not already set
591 $message_part =~ m@^(.*)\n\r?\n@s;
592 my $part_header=$1;
593 if ($part_header !~ m@^content-type@mi) {
594 $message_part="Content-type: message/rfc822\n"
595 . $message_part; # prepend default type
596 }
597 }
598
599 $text .= $self->process_multipart_part($message_part,
600 $is_first_part);
601 } # foreach message part.
602 } else {
603 # we can't handle this multipart type (not mixed or alternative)
604 # the RFC also mentions "parallel".
605 }
606 } # end of ($mimetype =~ multipart)
607 elsif ($mimetype =~ m@message/rfc822@) {
608 my $msg_header = $text;
609 $msg_header =~ s/\r?\n\r?\n(.*)$//s;
610 $text = $1;
611
612 if ($msg_header =~ /^content\-type:\s*([\w\.\-\/]+)\s*\;?\s*(.+?)\s*$/mi)
613 {
614 $mimetype=$1;
615 $mimeinfo=$2;
616 $mimetype =~ tr/[A-Z]/[a-z]/;
617
618 my $msg_text;
619 if ($mimetype =~ m@multipart/@) {
620 $msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
621 $text);
622 } else {
623 $msg_text=$self->text_from_part($text,$msg_header);
624 }
625
626 my $brief_header=text_into_html(get_brief_headers($msg_header));
627 $text= "\n<b>&lt;&lt;attached message&gt;&gt;</b><br>";
628 $text.= "<table><tr><td width=\"5%\"> </td>\n";
629 $text.="<td>" . $brief_header . "\n</p>" . $msg_text
630 . "</td></tr></table>";
631 }
632 } else {
633 # we don't do any processing of the content.
634 }
635
636 return $text;
637}
638
639
640
641# used for turning a message id into a more friendly string for greenstone
642sub escape_msg_id {
643#msgid
644 my $id=shift;
645 chomp $id; $id =~ s!\s!!g; # remove spaces
646 $id =~ s![<>\[\]]!!g; # remove [ ] < and >
647 $id =~ s![_&]!-!g; # replace symbols that might cause problems
648 $id =~ s!\.!-!g; # . means section to greenstone doc ids!
649 $id =~ s!@!-!g; # replace @ symbol, to avoid spambots
650 return $id;
651}
652
653
654
655sub process_multipart_part {
656 my $self = shift;
657 my $message_part = shift;
658 my $is_first_part = shift;
659
660 my $return_text="";
661 my $part_header=$message_part;
662 my $part_body;
663 if ($message_part=~ /^\s*\n/) {
664 # no header... use defaults
665 $part_body=$message_part;
666 $part_header="Content-type: text/plain; charset=us-ascii";
667 } elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
668 $part_body=$1;
669 } else {
670 # something's gone wrong...
671 $part_header="";
672 $part_body=$message_part;
673 }
674
675 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
676 my $part_content_type="";
677 my $part_content_info="";
678
679 if ($part_header =~ m@^content\-type:\s*([\w\.\-/]+)\s*(\;.*)?$@mi) {
680 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
681 $part_content_info=$2;
682 if (!defined($part_content_info)) {
683 $part_content_info="";
684 } else {
685 $part_content_info =~ s/^\;\s*//;
686 $part_content_info =~ s/\s*$//;
687 }
688 }
689 my $filename="";
690 if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
691 $filename=$1;
692 $filename =~ s@\r?\s*$@@; # remove trailing space, if any
693 }
694
695 # disposition - either inline or attachment.
696 # NOT CURRENTLY USED - we display all text types instead...
697 # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
698
699 # add <<attachment>> to each part except the first...
700 if (!$is_first_part) {
701 $return_text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;";
702 # add part info header
703 my $header_text = "<br>Type: $part_content_type<br>\n";
704 if ($filename ne "") {
705 $header_text .= "Filename: $filename\n";
706 }
707 $header_text =~ s@_@\\_@g;
708 $return_text .= $header_text . "</strong></p>\n<p>\n";
709 }
710
711 if ($part_content_type =~ m@text/@)
712 {
713 # $message_part includes the mime part headers
714 my $part_text = $self->text_from_part($message_part);
715 if ($part_content_type !~ m@text/(ht|x)ml@) {
716 $part_text = text_into_html($part_text);
717 }
718 if ($part_text eq "") {
719 $part_text = ' ';
720 }
721 $return_text .= $part_text;
722 } elsif ($part_content_type =~ m@message/rfc822@) {
723 # This is a forwarded message
724 my $message_part_headers=$part_body;
725 $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
726 my $message_part_body=$1;
727 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
728
729 my $rfc822_formatted_body=""; # put result in here
730 if ($message_part_headers =~
731 /^content\-type:\s*([\w\.\-\/]+)\s*(\;.*)?$/ims)
732 {
733 # The message header uses MIME flags
734 my $message_content_type=$1;
735 my $message_content_info=$2;
736 if (!defined($message_content_info)) {
737 $message_content_info="";
738 } else {
739 $message_content_info =~ s/^\;\s*//;
740 $message_content_info =~ s/\s*$//;
741 }
742 $message_content_type =~ tr/A-Z/a-z/;
743 if ($message_content_type =~ /multipart/) {
744 $rfc822_formatted_body=
745 $self->text_from_mime_message($message_content_type,
746 $message_content_info,
747 $message_part_body);
748 } else {
749 $message_part_body=$self->text_from_part($part_body,
750 $message_part_headers);
751 $rfc822_formatted_body=text_into_html($message_part_body);
752 }
753 } else {
754 # message doesn't use MIME flags
755 $rfc822_formatted_body=text_into_html($message_part_body);
756 $rfc822_formatted_body =~ s@_@\\_@g;
757 }
758 # Add the returned text to the output
759 # don't put all the headers...
760# $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
761 my $brief_headers=get_brief_headers($message_part_headers);
762 $return_text.=text_into_html($brief_headers);
763 $return_text.="</p><p>\n";
764 $return_text.=$rfc822_formatted_body;
765 $return_text.="</p>\n";
766 # end of message/rfc822
767 } elsif ($part_content_type =~ /multipart/) {
768 # recurse again
769
770 my $tmptext= $self->text_from_mime_message($part_content_type,
771 $part_content_info,
772 $part_body);
773 $return_text.=$tmptext;
774 } else {
775 # this part isn't text/* or another message...
776 if ($is_first_part) {
777 # this is the first part of a multipart, or only part!
778 $return_text="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;";
779 # add part info header
780 my $header_text="<br>Type: $part_content_type<br>\n";
781 $header_text.="Filename: $filename</strong></p>\n<p>\n";
782 $header_text =~ s@_@\\_@g;
783 $return_text.=$header_text;
784 }
785
786 # save attachment by default
787 if (!$self->{'no_attachments'}
788 && $filename ne "") { # this part has a file...
789 my $encoding="8bit";
790 if ($part_header =~
791 /^content-transfer-encoding:\s*(\w+)/mi ) {
792 $encoding=$1; $encoding =~ tr/A-Z/a-z/;
793 }
794 my $tmpdir=$ENV{'GSDLHOME'} . "/tmp";
795 my $save_filename=$filename;
796
797 # make sure we don't clobber files with same name;
798 # need to keep state between .mbx files
799 my $assoc_files=$self->{'assoc_filenames'};
800 if ($assoc_files->{$filename}) { # it's been set...
801 $assoc_files->{$filename}++;
802 $filename =~ m/(.+)\.(\w+)$/;
803 my ($filestem, $ext)=($1,$2);
804 $save_filename="${filestem}_"
805 . $assoc_files->{$filename} . ".$ext";
806 } else { # first file with this name
807 $assoc_files->{$filename}=1;
808 }
809 open (SAVE, ">$tmpdir/$save_filename") ||
810 warn "EmailPlugin: Can't save attachment as $tmpdir/$save_filename: $!";
811 my $part_text = $message_part;
812 $part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
813 if ($encoding eq "base64") {
814 print SAVE base64_decode($part_text);
815 } elsif ($encoding eq "quoted-printable") {
816 print SAVE qp_decode($part_text);
817 } else { # 7bit, 8bit, binary, etc...
818 print SAVE $part_text;
819 }
820 close SAVE;
821 my $doc_obj=$self->{'doc_obj'};
822 $doc_obj->associate_file("$tmpdir/$save_filename",
823 "$save_filename",
824 $part_content_type # mimetype
825 );
826 # clean up tmp area...
827 # Can't do this as it hasn't been copied/linked yet!!!
828# &util::rm("$tmpdir/$save_filename");
829 my $outhandle=$self->{'outhandle'};
830 print $outhandle "EmailPlugin: saving attachment \"$filename\"\n"; #
831
832 # be nice if "download" was a translatable macro :(
833 $return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
834 } # end of save attachment
835 } # end of !text/message part
836
837
838 return $return_text;
839}
840
841
842# Return only the "important" headers from a set of message headers
843sub get_brief_headers {
844 my $msg_header = shift;
845 my $brief_header = "";
846
847 # Order matters!
848 if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
849 if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
850 if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
851 if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
852 if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
853
854 return $brief_header;
855}
856
857
858# Process a MIME part. Return "" if we can't decode it.
859# should only be called for parts with type "text/*" ?
860# Either pass the entire mime part (including the part's header),
861# or pass the mime part's text and optionally the part's header.
862sub text_from_part {
863 my $self = shift;
864 my $text = shift || '';
865 my $part_header = shift;
866
867
868 my $type="text/plain"; # default, overridden from part header
869 my $charset=undef; # convert2unicode() will guess if necessary
870
871 if (! $part_header) { # no header argument was given. check the body
872 $part_header = $text;
873 # check for empty part header (leading blank line)
874 if ($text =~ /^\s*\r?\n/) {
875 $part_header="Content-type: text/plain; charset=us-ascii";
876 } else {
877 $part_header =~ s/\r?\n\r?\n(.*)$//s;
878 $text=$1; if (!defined($text)) {$text="";}
879 }
880 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
881 }
882
883 if ($part_header =~
884 /content\-type:\s*([\w\.\-\/]+).*?charset=\"?([^\;\"\s]+)\"?/is) {
885 $type=$1;
886 $charset=$2;
887 }
888 my $encoding="";
889 if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
890 $encoding=$1; $encoding=~tr/A-Z/a-z/;
891 }
892 # Content-Transfer-Encoding is per-part
893 if ($encoding ne "") {
894 if ($encoding =~ /quoted\-printable/) {
895 $text=qp_decode($text);
896 } elsif ($encoding =~ /base64/) {
897 $text=base64_decode($text);
898 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
899 # rfc2045 also allows binary, which we ignore (for now).
900 my $outhandle=$self->{'outhandle'};
901 print $outhandle "EmailPlugin: unknown transfer encoding: $encoding\n";
902 return "";
903 }
904 }
905
906 if ($type eq "text/html") {
907 # only get stuff between <body> tags, or <html> tags.
908 $text =~ s@^.*<html[^>]*>@@is;
909 $text =~ s@</html>.*$@@is;
910 $text =~ s/^.*?<body[^>]*>//si;
911 $text =~ s/<\/body>.*$//si;
912 }
913 elsif ($type eq "text/xml") {
914 $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
915 $text="<pre>\n$text\n</pre>\n";
916 }
917 # convert to unicode
918 $self->convert2unicode($charset, \$text);
919 $text =~ s@_@\\_@g; # protect against GS macro language
920 return $text;
921}
922
923
924
925
926# decode quoted-printable text
927sub qp_decode {
928 my $text=shift;
929
930 # if a line ends with "=\s*", it is a soft line break, otherwise
931 # keep in any newline characters.
932
933 $text =~ s/=\s*\r?\n//mg;
934 $text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
935 return $text;
936}
937
938# decode base64 text. This is fairly slow (since it's interpreted perl rather
939# than compiled XS stuff like in the ::MIME modules, but this is more portable
940# for us at least).
941# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
942# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
943# from each encoded byte.
944
945
946sub base64_decode {
947 my $enc_text = shift;
948# A=>0, B=>1, ..., '+'=>62, '/'=>63
949# also '=' is used for padding at the end, but we remove it anyway.
950 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
951# map each MIME char into it's value, for more efficient lookup.
952 my %index;
953 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
954# remove all non-base64 chars. eval to get variable in transliteration...
955# also remove '=' - we'll assume (!!) that there are no errors in the encoding
956 eval "\$enc_text =~ tr|$mimechars||cd";
957 my $decoded="";
958 while (length ($enc_text)>3)
959 {
960 my $fourchars=substr($enc_text,0,4,"");
961 my @chars=(split '',$fourchars);
962 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
963 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
964 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
965 }
966# if there are any input chars left, there are either
967# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
968 my @chars=(split '',$enc_text);
969 if (length($enc_text)) {
970 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
971 }
972 if (length($enc_text)==3) {
973 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
974 }
975 return $decoded;
976}
977
978# returns 0 if valid utf-8, 1 if invalid
979sub is_utf8 {
980 my $self = shift;
981 my $textref = shift;
982
983 $$textref =~ m/^/g; # to set \G
984 my $badbytesfound=0;
985 while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
986 my $highbytes=$1;
987 my $highbyteslength=length($highbytes);
988 # replace any non utf8 complaint bytes
989 $highbytes =~ /^/g; # set pos()
990 while ($highbytes =~
991 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
992 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
993 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
994 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
995 [\xfc-\xfd][\x80-\xbf]{5} # 6 byte
996 )*([\x80-\xff])? !xg
997 ) {
998 my $badbyte=$1;
999 if (!defined $badbyte) {next} # hit end of string
1000 return 1;
1001 }
1002 }
1003 return 0;
1004}
1005
1006
1007
1008sub convert2unicode {
1009 my $self = shift(@_);
1010 my ($charset, $textref) = @_;
1011
1012 if (!$$textref) {
1013 # nothing to do!
1014 return;
1015 }
1016
1017 if (! defined $charset) {
1018 # check if we have valid utf-8
1019 if ($self->is_utf8($textref)) { $charset = "utf8" }
1020
1021 # default to latin
1022 $charset = "iso_8859_1" if ! defined($charset);
1023 }
1024
1025 # first get our character encoding name in the right form.
1026 $charset =~ tr/A-Z/a-z/; # lowercase
1027 $charset =~ s/\-/_/g;
1028 if ($charset =~ /gb_?2312/) { $charset="gb" }
1029 # assumes EUC-KR, not ISO-2022 !?
1030 $charset =~ s/^ks_c_5601_1987/korean/;
1031 if ($charset eq 'utf_8') {$charset='utf8'}
1032
1033 my $outhandle = $self->{'outhandle'};
1034
1035 if ($charset eq "utf8") {
1036 # no conversion needed, but lets check that it's valid utf8
1037 # see utf-8 manpage for valid ranges
1038 $$textref =~ m/^/g; # to set \G
1039 my $badbytesfound=0;
1040 while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
1041 my $highbytes=$1;
1042 my $highbyteslength=length($highbytes);
1043 # replace any non utf8 complaint bytes
1044 $highbytes =~ /^/g; # set pos()
1045 while ($highbytes =~
1046 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
1047 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
1048 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
1049 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
1050 [\xfc-\xfd][\x80-\xbf]{5} # 6 byte
1051 )*([\x80-\xff])? !xg
1052 ) {
1053 my $badbyte=$1;
1054 if (!defined $badbyte) {next} # hit end of string
1055 my $pos=pos($highbytes);
1056 substr($highbytes, $pos-1, 1, "\xc2\x80");
1057 # update the position to continue searching (for \G)
1058 pos($highbytes) = $pos+1; # set to just after the \x80
1059 $badbytesfound=1;
1060 }
1061 if ($badbytesfound==1) {
1062 # claims to be utf8, but it isn't!
1063 print $outhandle "EmailPlugin: Headers claim utf-8 but bad bytes "
1064 . "detected and removed.\n";
1065
1066 my $replength=length($highbytes);
1067 my $textpos=pos($$textref);
1068 # replace bad bytes with good bytes
1069 substr( $$textref, $textpos-$replength,
1070 $replength, $highbytes);
1071 # update the position to continue searching (for \G)
1072 pos($$textref)=$textpos+($replength-$highbyteslength);
1073 }
1074 }
1075 return;
1076 }
1077
1078 # It appears that we can't always trust ascii text so we'll treat it
1079 # as iso-8859-1 (letting characters above 0x80 through without
1080 # converting them to utf-8 will result in invalid XML documents
1081 # which can't be parsed at build time).
1082 $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
1083
1084 if ($charset eq "iso_8859_1") {
1085 # test if the mailer lied, and it has win1252 chars in it...
1086 # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
1087 if ($$textref =~ m/[\x80-\x9f]/) {
1088 print $outhandle "EmailPlugin: Headers claim ISO charset but MS ";
1089 print $outhandle "codepage 1252 detected.\n";
1090 $charset = "windows_1252";
1091 }
1092 }
1093 my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
1094
1095 if ($utf8_text ne "") {
1096 $$textref=$utf8_text;
1097 } else {
1098 # we didn't get any text... unsupported encoding perhaps? Or it is
1099 # empty anyway. We'll try to continue, assuming 8859-1. We could strip
1100 # characters out here if this causes problems...
1101 my $outhandle=$self->{'outhandle'};
1102 print $outhandle "EmailPlugin: falling back to iso-8859-1\n";
1103 $$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
1104
1105 }
1106}
1107
1108
1109sub set_OID {
1110 my $self = shift (@_);
1111 my ($doc_obj, $id, $segment_number) = @_;
1112
1113 if ( exists $doc_obj->{'msgid'} ) {
1114 $doc_obj->set_OID($doc_obj->{'msgid'});
1115 } else {
1116 $doc_obj->set_OID("$id\_$segment_number");
1117 }
1118}
1119
1120
1121# Perl packages have to return true if they are run.
11221;
Note: See TracBrowser for help on using the repository browser.