source: gsdl/trunk/perllib/plugins/EMAILPlug.pm@ 15212

Last change on this file since 15212 was 15212, checked in by kjdon, 16 years ago

for the greenstone archive collection, we are now using monthly archives rather than raw mbox. The date format is slightly different in this case. I have added code to deal with this format too as it may be useful for other people.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 35.6 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999-2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter ([email protected])
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata (not Dublin Core!):
42# $Headers All the header content (optional, not stored by default)
43# $Subject Subject: header
44# $To To: header
45# $From From: header
46# $FromName Name of sender (where available)
47# $FromAddr E-mail address of sender
48# $DateText Date: header
49# $Date Date: header in GSDL format (eg: 19990924)
50#
51# $Title made up of Subject, Date and Sender (for default formatting)
52#
53#
54# John McPherson - June/July 2001
55# added (basic) MIME support and quoted-printable and base64 decodings.
56# Minor fixes for names that are actually email addresses (ie <...> was lost)
57#
58# See: * RFC 822 - ARPA Internet Text Messages
59# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61# * RFC 2047 - MIME (part 3) Message Header Extensions
62# * RFC 1806 - Content Dispositions (ie inline/attachment)
63
64# 12/05/02 Added usage datastructure - John Thompson
65package EMAILPlug;
66
67use strict;
68no strict "refs"; # so we can use a variable as a filehandle for print $out
69
70
71use SplitPlug;
72use unicode; # gs conv functions
73use gsprintf 'gsprintf'; # translations
74
75use sorttools;
76use util;
77
78sub BEGIN {
79 @EMAILPlug::ISA = ('SplitPlug');
80}
81
82
83my $arguments =
84 [ { 'name' => "process_exp",
85 'desc' => "{BasPlug.process_exp}",
86 'type' => "regexp",
87 'reqd' => "no",
88 'deft' => &get_default_process_exp() },
89 { 'name' => "no_attachments",
90 'desc' => "{EMAILPlug.no_attachments}",
91 'type' => "flag",
92 'reqd' => "no" },
93 { 'name' => "headers",
94 'desc' => "{EMAILPlug.headers}",
95 'type' => "flag",
96 'reqd' => "no" },
97 { 'name' => "split_exp",
98 'desc' => "{EMAILPlug.split_exp}",
99 'type' => "regexp",
100 'reqd' => "no",
101 'deft' => &get_default_split_exp() }
102 ];
103
104my $options = { 'name' => "EMAILPlug",
105 'desc' => "{EMAILPlug.desc}",
106 'abstract' => "no",
107 'inherits' => "yes",
108 'args' => $arguments };
109
110# Create a new EMAILPlug object with which to parse a file.
111# Accomplished by creating a new BasPlug and using bless to
112# turn it into an EMAILPlug.
113
114sub new {
115 my ($class) = shift (@_);
116 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
117 push(@$pluginlist, $class);
118
119 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
120 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
121
122 my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
123
124 $self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
125
126 # this might not actually be true at read-time, but after processing
127 # it should all be utf8.
128 $self->{'input_encoding'}="utf8";
129 return bless $self, $class;
130}
131
132sub get_default_process_exp {
133 my $self = shift (@_);
134 # mbx/email for mailbox file format, \d+ for maildir (each message is
135 # in a separate file, with a unique number for filename)
136 # mozilla and IE will save individual mbx format files with a ".eml" ext.
137 return q@([\\/]\d+|\.(mbx|email|eml))$@;
138}
139
140# This plugin splits the mbox mail files at lines starting with From<sp>
141# It is supposed to be "\n\nFrom ", but this isn't always used.
142# add \d{4} so that the line ends in a year (in case the text has an
143# unescaped "From " at the start of a line).
144sub get_default_split_exp {
145 return q^\nFrom .*\d{4}\n^;
146
147}
148
149
150# do plugin specific processing of doc_obj
151sub process {
152
153 my $self = shift (@_);
154 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
155 my $outhandle = $self->{'outhandle'};
156
157 # Check that we're dealing with a valid mail file
158 # mbox message files start with "From "
159 # maildir messages usually start with Return-Path and Delivered-To
160 # mh is very similar to maildir
161 my $startoffile=substr($$textref,0,256);
162 if (($startoffile !~ /^(From )/) &&
163 ($startoffile !~ /^(From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\-.*|MIME-Version|Forwarded):/im)) {
164 return undef;
165 }
166
167
168 print STDERR "<Processing n='$file' p='EMAILPlug'>\n" if ($gli);
169
170 gsprintf($outhandle, "EMAILPlug: {common.processing} $file\n")
171 if $self->{'verbosity'} > 1;
172
173 my $cursection = $doc_obj->get_top_section();
174
175 #
176 # Parse the document's text and extract metadata
177 #
178
179 # Protect backslashes
180 $$textref =~ s@\\@\\\\@g;
181
182 # Separate header from body of message
183 my $Headers = $$textref;
184 $Headers =~ s/\r?\n\r?\n(.*)$//s;
185 $$textref = $1;
186 $Headers .= "\n";
187
188 # Unfold headers - see rfc822
189 $Headers =~ s/\r?\n[\t\ ]+/ /gs;
190 # Extract basic metadata from header
191 my @headers = ("From", "To", "Subject", "Date");
192 my %raw;
193 foreach my $name (@headers) {
194 $raw{$name} = "No $name value";
195 }
196
197 # Get a default encoding for the header - RFC says should be ascii...
198 my $default_header_encoding="iso_8859_1";
199
200 # We don't know what character set is the user's default...
201 # We could use textcat to guess... for now we'll look at mime content-type
202# if ($Headers =~ /([[:^ascii:]])/) {
203# }
204 if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
205 $default_header_encoding=$1;
206 $default_header_encoding =~ s@\-@_@g;
207 $default_header_encoding =~ tr/A-Z/a-z/;
208 }
209
210
211 # Examine each line of the headers
212 my ($line, $name, $value);
213 my @parts;
214 foreach $line (split(/\n/, $Headers)) {
215
216 # Ignore lines with no content or which begin with whitespace
217 next unless ($line =~ /:/);
218 next if ($line =~ /^\s/);
219
220 # Find out what metadata is on this line
221 @parts = split(/:/, $line);
222 $name = shift @parts;
223 # get fieldname in canonical form - first cap, then lower case.
224 $name =~ tr/A-Z/a-z/;
225 # uppercase the first character according to the current locale
226 $name=~s/(.+)/\u$1/;
227 next unless $name;
228 next unless ($raw{$name});
229
230 # Find the value of that metadata
231 $value = join(":", @parts);
232 $value =~ s/^\s+//;
233 $value =~ s/\s+$//;
234 # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
235 if ($value =~ /=\?.*\?[BbQq]\?.*\?=/) {
236 my $original_value=$value;
237 my $encoded=$value;
238 $value="";
239 # we should ignore spaces between consecutive encoded-texts
240 $encoded =~ s@\?=\s+=\?@\?==\?@g;
241 while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=//i) {
242 my ($charset, $encoding, $data)=($2,$3,$4);
243 my ($decoded_data);
244 $value.="$1"; # any leading chars
245 $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
246 chomp $data;
247 $encoding =~ tr/BQ/bq/;
248 if ($encoding eq "q") { # quoted printable
249 $data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
250 $decoded_data=qp_decode($data);
251 # qp_decode adds \n, which is default for body text
252 chomp($decoded_data);
253 } else { # base 64
254 $decoded_data=base64_decode($data);
255 }
256 $self->convert2unicode($charset, \$decoded_data);
257 $value .= $decoded_data;
258 } # end of while loop
259
260 # get any trailing characters
261 $self->convert2unicode($default_header_encoding, \$encoded);
262 $value.=$encoded;
263
264 if ($value =~ /^\s*$/) { # we couldn't extract anything...
265 $self->convert2unicode($default_header_encoding,
266 \$original_value);
267 $value=$original_value;
268 }
269 } # end of if =?...?=
270
271 # In the absense of other charset information, assume the
272 # header is the default (usually "iso_8859_1") and convert to unicode.
273 else {
274 $self->convert2unicode($default_header_encoding, \$value);
275 }
276
277 # Store the metadata
278 $value =~ s@_@\\_@g; # protect against GS macro language
279 $raw{$name} = $value;
280 }
281
282 # Extract the name and e-mail address from the From metadata
283 my $frommeta = $raw{"From"};
284 my $fromnamemeta;
285 my $fromaddrmeta;
286
287 $frommeta =~ s/\s*$//; # Remove trailing space, if any
288
289 if ($frommeta =~ m/(.+)\s*<(.+)>/) {
290 $fromnamemeta=$1;
291 $fromaddrmeta=$2;
292 } elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
293 $fromnamemeta=$2;
294 $fromaddrmeta=$1;
295 }
296 if (!defined($fromaddrmeta)) {
297 $fromaddrmeta=$frommeta;
298 }
299 $fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
300 # minor attempt to prevent spam-bots from harvesting addresses...
301 $fromaddrmeta=~s/@/&#64;/;
302
303 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
304
305 if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
306 $fromnamemeta =~ s/\"//g; # remove quotes
307 $fromnamemeta =~ s/\s+$//; # remove trailing whitespace
308 }
309 else {
310 $fromnamemeta = $fromaddrmeta;
311 }
312 # if name is an address
313 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
314 $fromnamemeta=~s/@/&#64\;/;
315 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
316
317 $raw{"From"}=$frommeta;
318
319 # Process Date information
320 if ($raw{"Date"} !~ /No Date/) {
321 $raw{"DateText"} = $raw{"Date"};
322
323 # Convert the date text to internal date format
324 $value = $raw{"Date"};
325 # proper mbox format: Tue, 07 Jan 2003 17:27:42 +1300
326 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
327 if (!defined($day) || !defined($month) || !defined ($year)) {
328 # try monthly archive format: Wed Apr 23 00:26:08 2008
329 ($month,$day, $year) = $value =~ /([A-Z][a-z][a-z])\s\s?(\d?\d)\s\d\d:\d\d:\d\d\s(\d\d\d\d)/;
330 }
331
332 # make some assumptions about the year formatting...
333 # some (old) software thinks 2001 is 101, some think 2001 is 01
334 if ($year < 20) { $year += 2000; } # assume not really 1920...
335 elsif ($year < 150) { $year += 1900; } # assume not really 2150...
336 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
337
338 } else {
339 # We have not extracted a date
340 $raw{"DateText"} = "Unknown.";
341 $raw{"Date"} = "19000000";
342 }
343
344 # Add extracted metadata to document object
345 foreach my $name (keys %raw) {
346 $value = $raw{$name};
347 if ($value) {
348 # assume subject, etc headers have no special HTML meaning.
349 $value = &text_into_html($value);
350 # escape [] so it isn't re-interpreted as metadata
351 $value =~ s/\[/&#91;/g; $value =~ s/\]/&#93;/g;
352 } else {
353 $value = "No $name field";
354 }
355 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
356 }
357
358
359 # extract a message ID from the headers, if there is one, and we'll use
360 # that as the greenstone doc ID. Having a predictable ID means we can
361 # link to other messages, eg from In-Reply-To or References headers...
362 if ($Headers =~ m@^Message-ID:(.+)$@mi) {
363 my $id=escape_msg_id($1);
364 $doc_obj->{'msgid'}=$id;
365 }
366 # link to another message, if this is a reply
367 if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
368 my $id=escape_msg_id($1);
369 $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
370 } elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
371 # References can have multiple, get the last one
372 my $id=escape_msg_id($1);
373 # not necessarily in-reply-to, but same thread...
374 $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
375 }
376
377
378
379 my $mimetype="text/plain";
380 my $mimeinfo="";
381 my $charset = $default_header_encoding;
382 # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
383 # more than one parameter given to Content-type.
384 # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
385 if ($Headers =~ m@^content\-type:\s*([\w\.\-/]+)\s*(\;\s*.+)?\s*$@mi)
386 {
387 $mimetype=$1;
388 $mimetype =~ tr/[A-Z]/[a-z]/;
389
390 if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
391 $mimetype = "text/plain";
392 }
393
394 $mimeinfo=$2;
395 if (!defined $mimeinfo) {
396 $mimeinfo="";
397 } else { # strip leading and trailing stuff
398 $mimeinfo =~ s/^\;\s*//;
399 $mimeinfo =~ s/\s*$//;
400 }
401 if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
402 $charset = $1;
403 }
404 }
405
406 my $transfer_encoding="7bit";
407 if ($Headers =~ /^content-transfer-encoding:\s*([^\s]+)\s*$/mi) {
408 $transfer_encoding=$1;
409 }
410
411 if ($mimetype eq "text/html") {
412 $$textref= $self->text_from_part($$textref, $Headers);
413 } elsif ($mimetype ne "text/plain") {
414 $self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
415 $$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$$textref);
416 } else { # mimetype eq text/plain
417
418 if ($transfer_encoding =~ /quoted\-printable/) {
419 $$textref=qp_decode($$textref);
420 } elsif ($transfer_encoding =~ /base64/) {
421 $$textref=base64_decode($$textref);
422 }
423 $self->convert2unicode($charset, $textref);
424
425 $$textref = &text_into_html($$textref);
426 $$textref =~ s@_@\\_@g; # protect against GS macro language
427
428 }
429
430
431 if ($self->{'headers'} && $self->{'headers'} == 1) {
432 # Add "All headers" metadata
433 $Headers = &text_into_html($Headers);
434
435 $Headers = "No headers" unless ($Headers =~ /\w/);
436 $Headers =~ s/@/&#64\;/g;
437 # escape [] so it isn't re-interpreted as metadata
438 $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
439 $self->convert2unicode($charset, \$Headers);
440
441 $Headers =~ s@_@\\_@g; # protect against GS macro language
442 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
443 }
444
445
446 # Add Title metadata
447 my $Title = text_into_html($raw{'Subject'});
448 $Title .= "<br>From: " . text_into_html($fromnamemeta);
449 $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
450 $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
451
452 $doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
453
454 # Add FileFormat metadata
455 $doc_obj->add_metadata($cursection, "FileFormat", "EMAIL");
456
457 # Add text to document object
458 $$textref = "No message" unless ($$textref =~ /\w/);
459
460 $doc_obj->add_utf8_text($cursection, $$textref);
461
462 return 1;
463}
464
465
466# Convert a text string into HTML.
467#
468# The HTML is going to be inserted into a GML file, so
469# we have to be careful not to use symbols like ">",
470# which ocurs frequently in email messages (and use
471# &gt instead.
472#
473# This function also turns links and email addresses into hyperlinks,
474# and replaces carriage returns with <BR> tags (and multiple carriage
475# returns with <P> tags).
476
477
478sub text_into_html {
479 my ($text) = @_;
480
481 # Convert problem characters into HTML symbols
482 $text =~ s/&/&amp;/g;
483 $text =~ s/</&lt;/g;
484 $text =~ s/>/&gt;/g;
485 $text =~ s/\"/&quot;/g;
486
487 # convert email addresses and URIs into links
488# don't markup email addresses for now
489# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
490
491 # try to munge email addresses a little bit...
492 $text =~ s/@/&#64;/;
493 # assume hostnames are \.\w\- only, then might have a trailing '/.*'
494 # assume URI doesn't finish with a '.'
495 $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.|\%[a-f0-9]{2})?[\w\?\=\-_/~]+)*(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@gi;
496
497
498 # Clean up whitespace and convert \n charaters to <BR> or <P>
499 $text =~ s/ +/ /g;
500 $text =~ s/\s*$//g;
501 $text =~ s/^\s*//g;
502 $text =~ s/\n/\n<br>/g;
503 $text =~ s/<br>\s*<br>/<p>/gi;
504
505 return $text;
506}
507
508
509
510
511#Process a MIME message.
512# the textref we are given DOES NOT include the header.
513sub text_from_mime_message {
514 my $self = shift(@_);
515 my ($mimetype,$mimeinfo,$text)=(@_);
516 my $outhandle=$self->{'outhandle'};
517 # Check for multiparts - $mimeinfo will be a boundary
518 if ($mimetype =~ /multipart/) {
519 my $boundary="";
520 if ($mimeinfo =~ m@boundary=(\"[^\"]+\"|[^\s]+)\s*$@im) {
521 $boundary=$1;
522 if ($boundary =~ m@^\"@) {
523 $boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
524 }
525 } else {
526 print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
527 }
528 # parts start with "--$boundary"
529 # message ends with "--$boundary--"
530 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
531 # that perl might want to interpolate. Also allows spaces...
532 $boundary=~s/\\/\\\\/g;
533 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
534 my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
535 # remove first "part" and last "part" (final --)
536 shift @message_parts;
537 my $last=pop @message_parts;
538 # if our boundaries are a bit dodgy and we only found 1 part...
539 if (!defined($last)) {$last="";}
540 # make sure it is only -- and whitespace
541 if ($last !~ /^\-\-\s*$/ms) {
542 print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
543 }
544 foreach my $message_part (@message_parts) {
545 # remove the leading newline left from split.
546 $message_part=~s/^\r?\n//;
547 }
548 if ($mimetype eq "multipart/alternative") {
549 # check for an HTML version first, then TEXT, otherwise use first.
550 my $part_text="";
551 foreach my $message_part (@message_parts) {
552 if ($message_part =~ m@^content\-type:\s*text/html@i)
553 {
554 # Use the HTML version
555 $part_text = $self->text_from_part($message_part);
556 $mimetype="text/html";
557 last;
558 }
559 }
560 if ($part_text eq "") { # try getting a text part instead
561 foreach my $message_part (@message_parts) {
562 if ($message_part =~ m@^content\-type:\s*text/plain@i)
563 {
564 # Use the plain version
565 $part_text = $self->text_from_part($message_part);
566 if ($part_text =~/[^\s]/) {
567 $part_text = text_into_html($part_text);
568 }
569 $mimetype="text/plain";
570 last;
571 }
572 }
573 }
574 if ($part_text eq "") { #use first part (no html/text part found)
575 $part_text = $self->text_from_part(shift @message_parts);
576 $part_text = text_into_html($part_text);
577 }
578 if ($part_text eq "") { # we couldn't get anything!!!
579 # or it was an empty message...
580 # do nothing...
581 gsprintf($outhandle, "{BasPlug.empty_file} - empty body?\n");
582 } else {
583 $text = $part_text;
584 }
585 } elsif ($mimetype =~ m@multipart/(mixed|digest|related|signed)@) {
586 $text = "";
587 # signed is for PGP/GPG messages... the last part is a hash
588 if ($mimetype =~ m@multipart/signed@) {
589 pop @message_parts;
590 }
591 my $is_first_part=1;
592 foreach my $message_part (@message_parts) {
593 if ($is_first_part && $text ne "") {$is_first_part=0;}
594
595 if ($mimetype eq "multipart/digest") {
596 # default type - RTFRFC!! Set if not already set
597 $message_part =~ m@^(.*)\n\r?\n@s;
598 my $part_header=$1;
599 if ($part_header !~ m@^content-type@mi) {
600 $message_part="Content-type: message/rfc822\n"
601 . $message_part; # prepend default type
602 }
603 }
604
605 $text .= $self->process_multipart_part($message_part,
606 $is_first_part);
607 } # foreach message part.
608 } else {
609 # we can't handle this multipart type (not mixed or alternative)
610 # the RFC also mentions "parallel".
611 }
612 } # end of ($mimetype =~ multipart)
613 elsif ($mimetype =~ m@message/rfc822@) {
614 my $msg_header = $text;
615 $msg_header =~ s/\r?\n\r?\n(.*)$//s;
616 $text = $1;
617
618 if ($msg_header =~ /^content\-type:\s*([\w\.\-\/]+)\s*\;?\s*(.+?)\s*$/mi)
619 {
620 $mimetype=$1;
621 $mimeinfo=$2;
622 $mimetype =~ tr/[A-Z]/[a-z]/;
623
624 my $msg_text;
625 if ($mimetype =~ m@multipart/@) {
626 $msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
627 $text);
628 } else {
629 $msg_text=$self->text_from_part($text,$msg_header);
630 }
631
632 my $brief_header=text_into_html(get_brief_headers($msg_header));
633 $text= "\n<b>&lt;&lt;attached message&gt;&gt;</b><br>";
634 $text.= "<table><tr><td width=\"5%\"> </td>\n";
635 $text.="<td>" . $brief_header . "\n</p>" . $msg_text
636 . "</td></tr></table>";
637 }
638 } else {
639 # we don't do any processing of the content.
640 }
641
642 return $text;
643}
644
645
646
647# used for turning a message id into a more friendly string for greenstone
648sub escape_msg_id {
649#msgid
650 my $id=shift;
651 chomp $id; $id =~ s!\s!!g; # remove spaces
652 $id =~ s![<>\[\]]!!g; # remove [ ] < and >
653 $id =~ s![_&]!-!g; # replace symbols that might cause problems
654 $id =~ s!\.!-!g; # . means section to greenstone doc ids!
655 $id =~ s!@!-!g; # replace @ symbol, to avoid spambots
656 return $id;
657}
658
659
660
661sub process_multipart_part {
662 my $self = shift;
663 my $message_part = shift;
664 my $is_first_part = shift;
665
666 my $return_text="";
667 my $part_header=$message_part;
668 my $part_body;
669 if ($message_part=~ /^\s*\n/) {
670 # no header... use defaults
671 $part_body=$message_part;
672 $part_header="Content-type: text/plain; charset=us-ascii";
673 } elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
674 $part_body=$1;
675 } else {
676 # something's gone wrong...
677 $part_header="";
678 $part_body=$message_part;
679 }
680
681 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
682 my $part_content_type="";
683 my $part_content_info="";
684
685 if ($part_header =~ m@^content\-type:\s*([\w\.\-/]+)\s*(\;.*)?$@mi) {
686 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
687 $part_content_info=$2;
688 if (!defined($part_content_info)) {
689 $part_content_info="";
690 } else {
691 $part_content_info =~ s/^\;\s*//;
692 $part_content_info =~ s/\s*$//;
693 }
694 }
695 my $filename="";
696 if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
697 $filename=$1;
698 $filename =~ s@\r?\s*$@@; # remove trailing space, if any
699 }
700
701 # disposition - either inline or attachment.
702 # NOT CURRENTLY USED - we display all text types instead...
703 # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
704
705 # add <<attachment>> to each part except the first...
706 if (!$is_first_part) {
707 $return_text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;";
708 # add part info header
709 my $header_text = "<br>Type: $part_content_type<br>\n";
710 if ($filename ne "") {
711 $header_text .= "Filename: $filename\n";
712 }
713 $header_text =~ s@_@\\_@g;
714 $return_text .= $header_text . "</strong></p>\n<p>\n";
715 }
716
717 if ($part_content_type =~ m@text/@)
718 {
719 # $message_part includes the mime part headers
720 my $part_text = $self->text_from_part($message_part);
721 if ($part_content_type !~ m@text/(ht|x)ml@) {
722 $part_text = text_into_html($part_text);
723 }
724 if ($part_text eq "") {
725 $part_text = ' ';
726 }
727 $return_text .= $part_text;
728 } elsif ($part_content_type =~ m@message/rfc822@) {
729 # This is a forwarded message
730 my $message_part_headers=$part_body;
731 $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
732 my $message_part_body=$1;
733 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
734
735 my $rfc822_formatted_body=""; # put result in here
736 if ($message_part_headers =~
737 /^content\-type:\s*([\w\.\-\/]+)\s*(\;.*)?$/ims)
738 {
739 # The message header uses MIME flags
740 my $message_content_type=$1;
741 my $message_content_info=$2;
742 if (!defined($message_content_info)) {
743 $message_content_info="";
744 } else {
745 $message_content_info =~ s/^\;\s*//;
746 $message_content_info =~ s/\s*$//;
747 }
748 $message_content_type =~ tr/A-Z/a-z/;
749 if ($message_content_type =~ /multipart/) {
750 $rfc822_formatted_body=
751 $self->text_from_mime_message($message_content_type,
752 $message_content_info,
753 $message_part_body);
754 } else {
755 $message_part_body=$self->text_from_part($part_body,
756 $message_part_headers);
757 $rfc822_formatted_body=text_into_html($message_part_body);
758 }
759 } else {
760 # message doesn't use MIME flags
761 $rfc822_formatted_body=text_into_html($message_part_body);
762 $rfc822_formatted_body =~ s@_@\\_@g;
763 }
764 # Add the returned text to the output
765 # don't put all the headers...
766# $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
767 my $brief_headers=get_brief_headers($message_part_headers);
768 $return_text.=text_into_html($brief_headers);
769 $return_text.="</p><p>\n";
770 $return_text.=$rfc822_formatted_body;
771 $return_text.="</p>\n";
772 # end of message/rfc822
773 } elsif ($part_content_type =~ /multipart/) {
774 # recurse again
775
776 my $tmptext= $self->text_from_mime_message($part_content_type,
777 $part_content_info,
778 $part_body);
779 $return_text.=$tmptext;
780 } else {
781 # this part isn't text/* or another message...
782 if ($is_first_part) {
783 # this is the first part of a multipart, or only part!
784 $return_text="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;";
785 # add part info header
786 my $header_text="<br>Type: $part_content_type<br>\n";
787 $header_text.="Filename: $filename</strong></p>\n<p>\n";
788 $header_text =~ s@_@\\_@g;
789 $return_text.=$header_text;
790 }
791
792 # save attachment by default
793 if (!$self->{'no_attachments'}
794 && $filename ne "") { # this part has a file...
795 my $encoding="8bit";
796 if ($part_header =~
797 /^content-transfer-encoding:\s*(\w+)/mi ) {
798 $encoding=$1; $encoding =~ tr/A-Z/a-z/;
799 }
800 my $tmpdir=$ENV{'GSDLHOME'} . "/tmp";
801 my $save_filename=$filename;
802
803 # make sure we don't clobber files with same name;
804 # need to keep state between .mbx files
805 my $assoc_files=$self->{'assoc_filenames'};
806 if ($assoc_files->{$filename}) { # it's been set...
807 $assoc_files->{$filename}++;
808 $filename =~ m/(.+)\.(\w+)$/;
809 my ($filestem, $ext)=($1,$2);
810 $save_filename="${filestem}_"
811 . $assoc_files->{$filename} . ".$ext";
812 } else { # first file with this name
813 $assoc_files->{$filename}=1;
814 }
815 open (SAVE, ">$tmpdir/$save_filename") ||
816 warn "EMAILPlug: Can't save attachment as $tmpdir/$save_filename: $!";
817 my $part_text = $message_part;
818 $part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
819 if ($encoding eq "base64") {
820 print SAVE base64_decode($part_text);
821 } elsif ($encoding eq "quoted-printable") {
822 print SAVE qp_decode($part_text);
823 } else { # 7bit, 8bit, binary, etc...
824 print SAVE $part_text;
825 }
826 close SAVE;
827 my $doc_obj=$self->{'doc_obj'};
828 $doc_obj->associate_file("$tmpdir/$save_filename",
829 "$save_filename",
830 $part_content_type # mimetype
831 );
832 # clean up tmp area...
833 # Can't do this as it hasn't been copied/linked yet!!!
834# &util::rm("$tmpdir/$save_filename");
835 my $outhandle=$self->{'outhandle'};
836 print $outhandle "EMAILPlug: saving attachment \"$filename\"\n"; #
837
838 # be nice if "download" was a translatable macro :(
839 $return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
840 } # end of save attachment
841 } # end of !text/message part
842
843
844 return $return_text;
845}
846
847
848# Return only the "important" headers from a set of message headers
849sub get_brief_headers {
850 my $msg_header = shift;
851 my $brief_header = "";
852
853 # Order matters!
854 if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
855 if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
856 if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
857 if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
858 if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
859
860 return $brief_header;
861}
862
863
864# Process a MIME part. Return "" if we can't decode it.
865# should only be called for parts with type "text/*" ?
866# Either pass the entire mime part (including the part's header),
867# or pass the mime part's text and optionally the part's header.
868sub text_from_part {
869 my $self = shift;
870 my $text = shift || '';
871 my $part_header = shift;
872
873
874 my $type="text/plain"; # default, overridden from part header
875 my $charset=undef; # convert2unicode() will guess if necessary
876
877 if (! $part_header) { # no header argument was given. check the body
878 $part_header = $text;
879 # check for empty part header (leading blank line)
880 if ($text =~ /^\s*\r?\n/) {
881 $part_header="Content-type: text/plain; charset=us-ascii";
882 } else {
883 $part_header =~ s/\r?\n\r?\n(.*)$//s;
884 $text=$1; if (!defined($text)) {$text="";}
885 }
886 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
887 }
888
889 if ($part_header =~
890 /content\-type:\s*([\w\.\-\/]+).*?charset=\"?([^\;\"\s]+)\"?/is) {
891 $type=$1;
892 $charset=$2;
893 }
894 my $encoding="";
895 if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
896 $encoding=$1; $encoding=~tr/A-Z/a-z/;
897 }
898 # Content-Transfer-Encoding is per-part
899 if ($encoding ne "") {
900 if ($encoding =~ /quoted\-printable/) {
901 $text=qp_decode($text);
902 } elsif ($encoding =~ /base64/) {
903 $text=base64_decode($text);
904 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
905 # rfc2045 also allows binary, which we ignore (for now).
906 my $outhandle=$self->{'outhandle'};
907 print $outhandle "EMAILPlug: unknown transfer encoding: $encoding\n";
908 return "";
909 }
910 }
911
912 if ($type eq "text/html") {
913 # only get stuff between <body> tags, or <html> tags.
914 $text =~ s@^.*<html[^>]*>@@is;
915 $text =~ s@</html>.*$@@is;
916 $text =~ s/^.*?<body[^>]*>//si;
917 $text =~ s/<\/body>.*$//si;
918 }
919 elsif ($type eq "text/xml") {
920 $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
921 $text="<pre>\n$text\n</pre>\n";
922 }
923 # convert to unicode
924 $self->convert2unicode($charset, \$text);
925 $text =~ s@_@\\_@g; # protect against GS macro language
926 return $text;
927}
928
929
930
931
932# decode quoted-printable text
933sub qp_decode {
934 my $text=shift;
935
936 # if a line ends with "=\s*", it is a soft line break, otherwise
937 # keep in any newline characters.
938
939 $text =~ s/=\s*\r?\n//mg;
940 $text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
941 return $text;
942}
943
944# decode base64 text. This is fairly slow (since it's interpreted perl rather
945# than compiled XS stuff like in the ::MIME modules, but this is more portable
946# for us at least).
947# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
948# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
949# from each encoded byte.
950
951
952sub base64_decode {
953 my $enc_text = shift;
954# A=>0, B=>1, ..., '+'=>62, '/'=>63
955# also '=' is used for padding at the end, but we remove it anyway.
956 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
957# map each MIME char into it's value, for more efficient lookup.
958 my %index;
959 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
960# remove all non-base64 chars. eval to get variable in transliteration...
961# also remove '=' - we'll assume (!!) that there are no errors in the encoding
962 eval "\$enc_text =~ tr|$mimechars||cd";
963 my $decoded="";
964 while (length ($enc_text)>3)
965 {
966 my $fourchars=substr($enc_text,0,4,"");
967 my @chars=(split '',$fourchars);
968 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
969 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
970 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
971 }
972# if there are any input chars left, there are either
973# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
974 my @chars=(split '',$enc_text);
975 if (length($enc_text)) {
976 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
977 }
978 if (length($enc_text)==3) {
979 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
980 }
981 return $decoded;
982}
983
984# returns 0 if valid utf-8, 1 if invalid
985sub is_utf8 {
986 my $self = shift;
987 my $textref = shift;
988
989 $$textref =~ m/^/g; # to set \G
990 my $badbytesfound=0;
991 while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
992 my $highbytes=$1;
993 my $highbyteslength=length($highbytes);
994 # replace any non utf8 complaint bytes
995 $highbytes =~ /^/g; # set pos()
996 while ($highbytes =~
997 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
998 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
999 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
1000 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
1001 [\xfc-\xfd][\x80-\xbf]{5} # 6 byte
1002 )*([\x80-\xff])? !xg
1003 ) {
1004 my $badbyte=$1;
1005 if (!defined $badbyte) {next} # hit end of string
1006 return 1;
1007 }
1008 }
1009 return 0;
1010}
1011
1012
1013
1014sub convert2unicode {
1015 my $self = shift(@_);
1016 my ($charset, $textref) = @_;
1017
1018 if (!$$textref) {
1019 # nothing to do!
1020 return;
1021 }
1022
1023 if (! defined $charset) {
1024 # check if we have valid utf-8
1025 if ($self->is_utf8($textref)) { $charset = "utf8" }
1026
1027 # default to latin
1028 $charset = "iso_8859_1" if ! defined($charset);
1029 }
1030
1031 # first get our character encoding name in the right form.
1032 $charset =~ tr/A-Z/a-z/; # lowercase
1033 $charset =~ s/\-/_/g;
1034 if ($charset =~ /gb_?2312/) { $charset="gb" }
1035 # assumes EUC-KR, not ISO-2022 !?
1036 $charset =~ s/^ks_c_5601_1987/korean/;
1037 if ($charset eq 'utf_8') {$charset='utf8'}
1038
1039 my $outhandle = $self->{'outhandle'};
1040
1041 if ($charset eq "utf8") {
1042 # no conversion needed, but lets check that it's valid utf8
1043 # see utf-8 manpage for valid ranges
1044 $$textref =~ m/^/g; # to set \G
1045 my $badbytesfound=0;
1046 while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
1047 my $highbytes=$1;
1048 my $highbyteslength=length($highbytes);
1049 # replace any non utf8 complaint bytes
1050 $highbytes =~ /^/g; # set pos()
1051 while ($highbytes =~
1052 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
1053 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
1054 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
1055 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
1056 [\xfc-\xfd][\x80-\xbf]{5} # 6 byte
1057 )*([\x80-\xff])? !xg
1058 ) {
1059 my $badbyte=$1;
1060 if (!defined $badbyte) {next} # hit end of string
1061 my $pos=pos($highbytes);
1062 substr($highbytes, $pos-1, 1, "\xc2\x80");
1063 # update the position to continue searching (for \G)
1064 pos($highbytes) = $pos+1; # set to just after the \x80
1065 $badbytesfound=1;
1066 }
1067 if ($badbytesfound==1) {
1068 # claims to be utf8, but it isn't!
1069 print $outhandle "EMAILPlug: Headers claim utf-8 but bad bytes "
1070 . "detected and removed.\n";
1071
1072 my $replength=length($highbytes);
1073 my $textpos=pos($$textref);
1074 # replace bad bytes with good bytes
1075 substr( $$textref, $textpos-$replength,
1076 $replength, $highbytes);
1077 # update the position to continue searching (for \G)
1078 pos($$textref)=$textpos+($replength-$highbyteslength);
1079 }
1080 }
1081 return;
1082 }
1083
1084 # It appears that we can't always trust ascii text so we'll treat it
1085 # as iso-8859-1 (letting characters above 0x80 through without
1086 # converting them to utf-8 will result in invalid XML documents
1087 # which can't be parsed at build time).
1088 $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
1089
1090 if ($charset eq "iso_8859_1") {
1091 # test if the mailer lied, and it has win1252 chars in it...
1092 # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
1093 if ($$textref =~ m/[\x80-\x9f]/) {
1094 print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
1095 print $outhandle "codepage 1252 detected.\n";
1096 $charset = "windows_1252";
1097 }
1098 }
1099 my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
1100
1101 if ($utf8_text ne "") {
1102 $$textref=$utf8_text;
1103 } else {
1104 # we didn't get any text... unsupported encoding perhaps? Or it is
1105 # empty anyway. We'll try to continue, assuming 8859-1. We could strip
1106 # characters out here if this causes problems...
1107 my $outhandle=$self->{'outhandle'};
1108 print $outhandle "EMAILPlug: falling back to iso-8859-1\n";
1109 $$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
1110
1111 }
1112}
1113
1114
1115sub set_OID {
1116 my $self = shift (@_);
1117 my ($doc_obj, $id, $segment_number) = @_;
1118
1119 if ( exists $doc_obj->{'msgid'} ) {
1120 $doc_obj->set_OID($doc_obj->{'msgid'});
1121 } else {
1122 $doc_obj->set_OID("$id\_$segment_number");
1123 }
1124}
1125
1126
1127# Perl packages have to return true if they are run.
11281;
Note: See TracBrowser for help on using the repository browser.