1 | ###########################################################################
|
---|
2 | #
|
---|
3 | # EMAILPlug.pm - a plugin for parsing email files
|
---|
4 | #
|
---|
5 | # A component of the Greenstone digital library software
|
---|
6 | # from the New Zealand Digital Library Project at the
|
---|
7 | # University of Waikato, New Zealand.
|
---|
8 | #
|
---|
9 | # Copyright (C) 1999-2002 New Zealand Digital Library Project
|
---|
10 | #
|
---|
11 | # This program is free software; you can redistribute it and/or modify
|
---|
12 | # it under the terms of the GNU General Public License as published by
|
---|
13 | # the Free Software Foundation; either version 2 of the License, or
|
---|
14 | # (at your option) any later version.
|
---|
15 | #
|
---|
16 | # This program is distributed in the hope that it will be useful,
|
---|
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
19 | # GNU General Public License for more details.
|
---|
20 | #
|
---|
21 | # You should have received a copy of the GNU General Public License
|
---|
22 | # along with this program; if not, write to the Free Software
|
---|
23 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
24 | #
|
---|
25 | ###########################################################################
|
---|
26 |
|
---|
27 |
|
---|
28 |
|
---|
29 | # EMAILPlug
|
---|
30 | #
|
---|
31 | # by Gordon Paynter ([email protected])
|
---|
32 | #
|
---|
33 | # Email plug reads email files. These are named with a simple
|
---|
34 | # number (i.e. as they appear in maildir folders) or with the
|
---|
35 | # extension .mbx (for mbox mail file format)
|
---|
36 | #
|
---|
37 | # Document text:
|
---|
38 | # The document text consists of all the text
|
---|
39 | # after the first blank line in the document.
|
---|
40 | #
|
---|
41 | # Metadata (not Dublin Core!):
|
---|
42 | # $Headers All the header content
|
---|
43 | # $Subject Subject: header
|
---|
44 | # $To To: header
|
---|
45 | # $From From: header
|
---|
46 | # $FromName Name of sender (where available)
|
---|
47 | # $FromAddr E-mail address of sender
|
---|
48 | # $DateText Date: header
|
---|
49 | # $Date Date: header in GSDL format (eg: 19990924)
|
---|
50 | #
|
---|
51 | # $Title made up of Subject, Date and Sender (for default formatting)
|
---|
52 | #
|
---|
53 | #
|
---|
54 | # John McPherson - June/July 2001
|
---|
55 | # added (basic) MIME support and quoted-printable and base64 decodings.
|
---|
56 | # Minor fixes for names that are actually email addresses (ie <...> was lost)
|
---|
57 | #
|
---|
58 | # See: * RFC 822 - ARPA Internet Text Messages
|
---|
59 | # * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
|
---|
60 | # * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
|
---|
61 | # * RFC 2047 - MIME (part 3) Message Header Extensions
|
---|
62 | # * RFC 1806 - Content Dispositions (ie inline/attachment)
|
---|
63 |
|
---|
64 | # 12/05/02 Added usage datastructure - John Thompson
|
---|
65 |
|
---|
66 | package EMAILPlug;
|
---|
67 |
|
---|
68 | use SplitPlug;
|
---|
69 |
|
---|
70 | use unicode;
|
---|
71 |
|
---|
72 | use sorttools;
|
---|
73 | use util;
|
---|
74 |
|
---|
75 |
|
---|
76 | # EMAILPlug is a sub-class of SplitPlug.
|
---|
77 |
|
---|
78 | sub BEGIN {
|
---|
79 | @ISA = ('SplitPlug');
|
---|
80 | }
|
---|
81 |
|
---|
82 | sub print_usage {
|
---|
83 | print STDERR "\n usage: plugin EMAILPlug [options]\n\n";
|
---|
84 | print STDERR " options:\n";
|
---|
85 | print STDERR " -split_exp A perl regular expression used to split files\n";
|
---|
86 | print STDERR " containing many messages into individual documents.\n\n";
|
---|
87 | print STDERR " -no_attachments Do not save message attachments.\n\n";
|
---|
88 |
|
---|
89 | }
|
---|
90 |
|
---|
91 | my $arguments =
|
---|
92 | [ { 'name' => "process_exp",
|
---|
93 | 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
|
---|
94 | 'type' => "string",
|
---|
95 | 'reqd' => "no",
|
---|
96 | 'deft' => q@([\\/]\d+|\.(mbx|email|eml))$@
|
---|
97 | },
|
---|
98 | { 'name' => "no_attachments",
|
---|
99 | 'desc' => "Do not save message attachments.",
|
---|
100 | 'type' => "flag",
|
---|
101 | 'reqd' => "no"
|
---|
102 | },
|
---|
103 | { 'name' => "block_exp",
|
---|
104 | 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
|
---|
105 | 'type' => "string",
|
---|
106 | 'reqd' => "no",
|
---|
107 | 'deft' => q^^}
|
---|
108 | ];
|
---|
109 |
|
---|
110 | my $options =
|
---|
111 | { 'name' => "EMAILPlug",
|
---|
112 | 'desc' => "Email plug reads email files. These are named with a simple number (i.e. as they appear in maildir folders) or with the extension .mbx (for mbox mail file format).\nDocument text: The document text consists of all the text after the first blank line in the document.\nMetadata (not Dublin Core!):\n\t\$Headers All the header content\n\t\$Subject Subject: header\n\t\$To To: header\n\t\$From From: header\n\t\$FromName Name of sender (where available)\n\t\$FromAddr E-mail address of sender\n\t\$DateText Date: header\n\t\$Date Date: header in GSDL format (eg: 19990924)",
|
---|
113 | 'inherits' => "Yes",
|
---|
114 | 'args' => $arguments };
|
---|
115 |
|
---|
116 | # Create a new EMAILPlug object with which to parse a file.
|
---|
117 | # Accomplished by creating a new BasPlug and using bless to
|
---|
118 | # turn it into an EMAILPlug.
|
---|
119 |
|
---|
120 | sub new {
|
---|
121 | my ($class) = @_;
|
---|
122 | my $self = new BasPlug ($class, @_);
|
---|
123 |
|
---|
124 | # 14-05-02 To allow for proper inheritance of arguments - John Thompson
|
---|
125 | my $option_list = $self->{'option_list'};
|
---|
126 | push( @{$option_list}, $options );
|
---|
127 |
|
---|
128 | if (!parsargv::parse(\@_,
|
---|
129 | q^split_exp/.*/^, \$self->{'split_exp'},
|
---|
130 | q^no_attachments^, \$self->{'ignore_attachments'},
|
---|
131 | "allow_extra_options")) {
|
---|
132 | print STDERR "\nIncorrect options passed to $class.";
|
---|
133 | print STDERR "\nCheck your collect.cfg configuration file\n";
|
---|
134 | die "\n";
|
---|
135 | }
|
---|
136 | $self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
|
---|
137 |
|
---|
138 | # this might not actually be true at read-time, but after processing
|
---|
139 | # it should all be utf8.
|
---|
140 | $self->{'input_encoding'}="utf8";
|
---|
141 | return bless $self, $class;
|
---|
142 | }
|
---|
143 |
|
---|
144 | sub get_default_process_exp {
|
---|
145 | my $self = shift (@_);
|
---|
146 | # mbx/email for mailbox file format, \d+ for maildir (each message is
|
---|
147 | # in a separate file, with a unique number for filename)
|
---|
148 | # mozilla and IE will save individual mbx format files with a ".eml" ext.
|
---|
149 | return q@([\\/]\d+|\.(mbx|email|eml))$@;
|
---|
150 | }
|
---|
151 |
|
---|
152 | # This plugin splits the mbox mail files at lines starting with From<sp>
|
---|
153 | # It is supposed to be "\n\nFrom ", but this isn't always used.
|
---|
154 | sub get_default_split_exp {
|
---|
155 | return q^\nFrom .*\n^;
|
---|
156 |
|
---|
157 | }
|
---|
158 |
|
---|
159 |
|
---|
160 | # do plugin specific processing of doc_obj
|
---|
161 | sub process {
|
---|
162 |
|
---|
163 | my $self = shift (@_);
|
---|
164 | my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
|
---|
165 | my $outhandle = $self->{'outhandle'};
|
---|
166 |
|
---|
167 | # Check that we're dealing with a valid mail file
|
---|
168 | # mbox message files start with "From "
|
---|
169 | # maildir messages usually start with Return-Path and Delivered-To
|
---|
170 | # mh is very similar to maildir
|
---|
171 | my $startoffile=substr($$textref,0,256);
|
---|
172 | if (($startoffile !~ /^(From )/) &&
|
---|
173 | ($startoffile !~ /^(From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\-.*|MIME-Version|Forwarded):/im)) {
|
---|
174 | return undef;
|
---|
175 | }
|
---|
176 |
|
---|
177 |
|
---|
178 | print $outhandle "EMAILPlug: processing $file\n"
|
---|
179 | if $self->{'verbosity'} > 1;
|
---|
180 |
|
---|
181 | my $cursection = $doc_obj->get_top_section();
|
---|
182 |
|
---|
183 | #
|
---|
184 | # Parse the document's text and extract metadata
|
---|
185 | #
|
---|
186 |
|
---|
187 | # Protect backslashes
|
---|
188 | $$textref =~ s@\\@\\\\@g;
|
---|
189 |
|
---|
190 | # Separate header from body of message
|
---|
191 | my $Headers = $$textref;
|
---|
192 | $Headers =~ s/\r?\n\r?\n(.*)$//s;
|
---|
193 | $$textref = $1;
|
---|
194 |
|
---|
195 | # Unfold headers - see rfc822
|
---|
196 | $Headers =~ s/\r?\n[\t\ ]+/ /gs;
|
---|
197 | # Extract basic metadata from header
|
---|
198 | my @headers = ("From", "To", "Subject", "Date");
|
---|
199 | my %raw;
|
---|
200 | foreach my $name (@headers) {
|
---|
201 | $raw{$name} = "No $name value";
|
---|
202 | }
|
---|
203 |
|
---|
204 | # Get a default encoding for the header - RFC says should be ascii...
|
---|
205 | my $default_heading_encoding="iso_8859_1";
|
---|
206 |
|
---|
207 | # We don't know what character set is the user's default...
|
---|
208 | # We could use textcat to guess... for now we'll look at mime content-type
|
---|
209 | # if ($Headers =~ /([[:^ascii:]])/) {
|
---|
210 | # }
|
---|
211 | if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
|
---|
212 | $default_header_encoding=$1;
|
---|
213 | $default_header_encoding =~ s@\-@_@g;
|
---|
214 | $default_header_encoding =~ tr/A-Z/a-z/;
|
---|
215 | }
|
---|
216 |
|
---|
217 |
|
---|
218 | # Examine each line of the headers
|
---|
219 | my ($line, $name, $value);
|
---|
220 | my @parts;
|
---|
221 | foreach $line (split(/\n/, $Headers)) {
|
---|
222 |
|
---|
223 | # Ignore lines with no content or which begin with whitespace
|
---|
224 | next unless ($line =~ /:/);
|
---|
225 | next if ($line =~ /^\s/);
|
---|
226 |
|
---|
227 | # Find out what metadata is on this line
|
---|
228 | @parts = split(/:/, $line);
|
---|
229 | $name = shift @parts;
|
---|
230 | # get fieldname in canonical form - first cap, then lower case.
|
---|
231 | $name =~ tr/A-Z/a-z/;
|
---|
232 | # uppercase the first character according to the current locale
|
---|
233 | $name=~s/(.+)/\u$1/;
|
---|
234 | next unless $name;
|
---|
235 | next unless ($raw{$name});
|
---|
236 |
|
---|
237 | # Find the value of that metadata
|
---|
238 | $value = join(":", @parts);
|
---|
239 | $value =~ s/^\s+//;
|
---|
240 | $value =~ s/\s+$//;
|
---|
241 | # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
|
---|
242 | if ($value =~ /=\?.*\?[BbQq]\?.*\?=/) {
|
---|
243 | my $original_value=$value;
|
---|
244 | my $encoded=$value;
|
---|
245 | $value="";
|
---|
246 | # we should ignore spaces between consecutive encoded-texts
|
---|
247 | $encoded =~ s@\?=\s+=\?@\?==\?@g;
|
---|
248 | while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=//i) {
|
---|
249 | my ($charset, $encoding, $data)=($2,$3,$4);
|
---|
250 | my ($decoded_data);
|
---|
251 | $value.="$1"; # any leading chars
|
---|
252 | $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
|
---|
253 | chomp $data;
|
---|
254 | $encoding =~ tr/BQ/bq/;
|
---|
255 | if ($encoding eq "q") { # quoted printable
|
---|
256 | $data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
|
---|
257 | $decoded_data=qp_decode($data);
|
---|
258 | # qp_decode adds \n, which is default for body text
|
---|
259 | chomp($decoded_data);
|
---|
260 | } else { # base 64
|
---|
261 | $decoded_data=base64_decode($data);
|
---|
262 | }
|
---|
263 | $self->convert2unicode($charset, \$decoded_data);
|
---|
264 | $value .= $decoded_data;
|
---|
265 | } # end of while loop
|
---|
266 |
|
---|
267 | # get any trailing characters
|
---|
268 | $self->convert2unicode($default_header_encoding, \$encoded);
|
---|
269 | $value.=$encoded;
|
---|
270 |
|
---|
271 | if ($value =~ /^\s*$/) { # we couldn't extract anything...
|
---|
272 | $self->convert2unicode($default_header_encoding,
|
---|
273 | \$original_value);
|
---|
274 | $value=original_value;
|
---|
275 | }
|
---|
276 | } # end of if =?...?=
|
---|
277 |
|
---|
278 | # In the absense of other charset information, assume the
|
---|
279 | # header is the default (usually "iso_8859_1") and convert it to unicode.
|
---|
280 | else {
|
---|
281 | $self->convert2unicode($default_header_encoding, \$value);
|
---|
282 | }
|
---|
283 |
|
---|
284 | # Store the metadata
|
---|
285 | $raw{$name} = $value;
|
---|
286 | }
|
---|
287 |
|
---|
288 | # Extract the name and e-mail address from the From metadata
|
---|
289 | $frommeta = $raw{"From"};
|
---|
290 | my $fromnamemeta;
|
---|
291 | my $fromaddrmeta;
|
---|
292 |
|
---|
293 | $frommeta =~ s/\s*$//; # Remove trailing space, if any
|
---|
294 |
|
---|
295 | if ($frommeta =~ m/(.+)\s*<(.+)>/) {
|
---|
296 | $fromnamemeta=$1;
|
---|
297 | $fromaddrmeta=$2;
|
---|
298 | } elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
|
---|
299 | $fromnamemeta=$2;
|
---|
300 | $fromaddrmeta=$1;
|
---|
301 | }
|
---|
302 | if (!defined($fromaddrmeta)) {
|
---|
303 | $fromaddrmeta=$frommeta;
|
---|
304 | }
|
---|
305 | $fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
|
---|
306 | # minor attempt to prevent spam-bots from harvesting addresses...
|
---|
307 | $fromaddrmeta=~s/@/@/;
|
---|
308 | $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
|
---|
309 |
|
---|
310 | if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
|
---|
311 | $fromnamemeta =~ s/\"//g; # remove quotes
|
---|
312 | $fromnamemeta =~ s/\s+$//; # remove trailing whitespace
|
---|
313 | }
|
---|
314 | else {
|
---|
315 | $fromnamemeta = $fromaddrmeta;
|
---|
316 | }
|
---|
317 | # if name is an address
|
---|
318 | $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
|
---|
319 | $fromnamemeta=~s/@/@\;/;
|
---|
320 | $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
|
---|
321 |
|
---|
322 | $raw{"From"}=$frommeta;
|
---|
323 |
|
---|
324 | # Process Date information
|
---|
325 | if ($raw{"Date"} !~ /No Date/) {
|
---|
326 | $raw{"DateText"} = $raw{"Date"};
|
---|
327 |
|
---|
328 | # Convert the date text to internal date format
|
---|
329 | $value = $raw{"Date"};
|
---|
330 | my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
|
---|
331 | # make some assumptions about the year formatting...
|
---|
332 | # some (old) software thinks 2001 is 101, some think 2001 is 01
|
---|
333 | if ($year < 20) { $year += 2000; } # assume not really 1920...
|
---|
334 | elsif ($year < 150) { $year += 1900; } # assume not really 2150...
|
---|
335 | $raw{"Date"} = &sorttools::format_date($day, $month, $year);
|
---|
336 |
|
---|
337 | } else {
|
---|
338 | # We have not extracted a date
|
---|
339 | $raw{"DateText"} = "Unknown.";
|
---|
340 | $raw{"Date"} = "19000000";
|
---|
341 | }
|
---|
342 |
|
---|
343 | # Add extracted metadata to document object
|
---|
344 | foreach my $name (keys %raw) {
|
---|
345 | $value = $raw{$name};
|
---|
346 | if ($value) {
|
---|
347 | # assume subject, etc headers have no special HTML meaning.
|
---|
348 | $value = &text_into_html($value);
|
---|
349 | # escape [] so it isn't re-interpreted as metadata
|
---|
350 | $value =~ s/\[/[/g; $value =~ s/\]/]/g;
|
---|
351 | } else {
|
---|
352 | $value = "No $name field";
|
---|
353 | }
|
---|
354 | $doc_obj->add_utf8_metadata ($cursection, $name, $value);
|
---|
355 | }
|
---|
356 |
|
---|
357 | my $mimetype="text/plain";
|
---|
358 | my $mimeinfo="";
|
---|
359 | my $charset = $default_header_encoding;
|
---|
360 | # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
|
---|
361 | # more than one parameter given to Content-type.
|
---|
362 | # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
|
---|
363 | if ($Headers =~ m@^content\-type:\s*([\w\.\-/]+)\s*(\;\s*.+)?\s*$@mi)
|
---|
364 | {
|
---|
365 | $mimetype=$1;
|
---|
366 | $mimetype =~ tr/[A-Z]/[a-z]/;
|
---|
367 |
|
---|
368 | if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
|
---|
369 | $mimetype = "text/plain";
|
---|
370 | }
|
---|
371 |
|
---|
372 | $mimeinfo=$2;
|
---|
373 | if (!defined $mimeinfo) {
|
---|
374 | $mimeinfo="";
|
---|
375 | } else { # strip leading and trailing stuff
|
---|
376 | $mimeinfo =~ s/^\;\s*//;
|
---|
377 | $mimeinfo =~ s/\s*$//;
|
---|
378 | }
|
---|
379 | if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
|
---|
380 | $charset = $1;
|
---|
381 | }
|
---|
382 | }
|
---|
383 |
|
---|
384 | my $transfer_encoding="7bit";
|
---|
385 | if ($Headers =~ /^content-transfer-encoding:\s*([^\s]+)\s*$/mi) {
|
---|
386 | $transfer_encoding=$1;
|
---|
387 | }
|
---|
388 | if ($mimetype eq "text/html") {
|
---|
389 | $$textref= $self->text_from_part("$Headers\n$$textref");
|
---|
390 | } elsif ($mimetype ne "text/plain") {
|
---|
391 | $self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
|
---|
392 | $$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
|
---|
393 | $outhandle);
|
---|
394 | } elsif ($transfer_encoding =~ /quoted\-printable/) {
|
---|
395 | $$textref=qp_decode($$textref);
|
---|
396 | $self->convert2unicode($charset, $textref);
|
---|
397 | } elsif ($transfer_encoding =~ /base64/) {
|
---|
398 | $$textref=base64_decode($$textref);
|
---|
399 | $self->convert2unicode($charset, $textref);
|
---|
400 | } else {
|
---|
401 | $self->convert2unicode($charset, $textref);
|
---|
402 | }
|
---|
403 |
|
---|
404 |
|
---|
405 | # Add "All headers" metadata
|
---|
406 | $Headers = &text_into_html($Headers);
|
---|
407 |
|
---|
408 | $Headers = "No headers" unless ($Headers =~ /\w/);
|
---|
409 | $Headers =~ s/@/@\;/g;
|
---|
410 | # escape [] so it isn't re-interpreted as metadata
|
---|
411 | $Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;
|
---|
412 | $self->convert2unicode($charset, \$Headers);
|
---|
413 | $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
|
---|
414 |
|
---|
415 |
|
---|
416 | # Add Title metadata
|
---|
417 | my $Title = text_into_html($raw{'Subject'});
|
---|
418 | $Title .= "<br>From: " . text_into_html($raw{'From'});
|
---|
419 | $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
|
---|
420 | $Title =~ s/\[/[/g; $Title =~ s/\]/]/g;
|
---|
421 |
|
---|
422 | $doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
|
---|
423 |
|
---|
424 |
|
---|
425 | # Add text to document object
|
---|
426 | if ($mimetype eq "text/plain") {
|
---|
427 | $$textref = &text_into_html($$textref);
|
---|
428 | }
|
---|
429 | $$textref = "No message" unless ($$textref =~ /\w/);
|
---|
430 | $doc_obj->add_utf8_text($cursection, $$textref);
|
---|
431 |
|
---|
432 | return 1;
|
---|
433 | }
|
---|
434 |
|
---|
435 |
|
---|
436 | # Convert a text string into HTML.
|
---|
437 | #
|
---|
438 | # The HTML is going to be inserted into a GML file, so
|
---|
439 | # we have to be careful not to use symbols like ">",
|
---|
440 | # which ocurs frequently in email messages (and use
|
---|
441 | # > instead.
|
---|
442 | #
|
---|
443 | # This function also turns links and email addresses into hyperlinks,
|
---|
444 | # and replaces carriage returns with <BR> tags (and multiple carriage
|
---|
445 | # returns with <P> tags).
|
---|
446 |
|
---|
447 |
|
---|
448 | sub text_into_html {
|
---|
449 | my ($text) = @_;
|
---|
450 |
|
---|
451 | # Convert problem characters into HTML symbols
|
---|
452 | $text =~ s/&/&/g;
|
---|
453 | $text =~ s/</</g;
|
---|
454 | $text =~ s/>/>/g;
|
---|
455 | $text =~ s/\"/"/g;
|
---|
456 |
|
---|
457 | $text =~ s@_@\\_@g; # protect against greenstone macros...
|
---|
458 |
|
---|
459 | # convert email addresses and URIs into links
|
---|
460 | # don't markup email addresses for now
|
---|
461 | # $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
|
---|
462 |
|
---|
463 | # try to munge email addresses a little bit...
|
---|
464 | $text =~ s/@/@/;
|
---|
465 | # assume hostnames are \.\w\- only, then might have a trailing '/.*'
|
---|
466 | # assume URI doesn't finish with a '.'
|
---|
467 | $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&|\.)?[\w\?\=\-_/~]+)*(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@g;
|
---|
468 |
|
---|
469 |
|
---|
470 | # Clean up whitespace and convert \n charaters to <BR> or <P>
|
---|
471 | $text =~ s/ +/ /g;
|
---|
472 | $text =~ s/\s*$//g;
|
---|
473 | $text =~ s/^\s*//g;
|
---|
474 | $text =~ s/\n/\n<br>/g;
|
---|
475 | $text =~ s/<br>\s*<br>/<p>/gi;
|
---|
476 |
|
---|
477 | return $text;
|
---|
478 | }
|
---|
479 |
|
---|
480 |
|
---|
481 |
|
---|
482 |
|
---|
483 | #Process a MIME message.
|
---|
484 | # the textref we are given DOES NOT include the header.
|
---|
485 | sub text_from_mime_message {
|
---|
486 | my $self = shift(@_);
|
---|
487 | my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
|
---|
488 |
|
---|
489 | # Check for multiparts - $mimeinfo will be a boundary
|
---|
490 | if ($mimetype =~ /multipart/) {
|
---|
491 | $boundary="";
|
---|
492 | if ($mimeinfo =~ m@boundary=(\"[^\"]+\"|[^\s]+)\s*$@im) {
|
---|
493 | $boundary=$1;
|
---|
494 | if ($boundary =~ m@^\"@) {
|
---|
495 | $boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
|
---|
496 | }
|
---|
497 | } else {
|
---|
498 | print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
|
---|
499 | }
|
---|
500 | # parts start with "--$boundary"
|
---|
501 | # message ends with "--$boundary--"
|
---|
502 | # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
|
---|
503 | # that perl might want to interpolate. Also allows spaces...
|
---|
504 | $boundary=~s/\\/\\\\/g;
|
---|
505 | $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
|
---|
506 | my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
|
---|
507 | # remove first "part" and last "part" (final --)
|
---|
508 | shift @message_parts;
|
---|
509 | my $last=pop @message_parts;
|
---|
510 | # if our boundaries are a bit dodgy and we only found 1 part...
|
---|
511 | if (!defined($last)) {$last="";}
|
---|
512 | # make sure it is only -- and whitespace
|
---|
513 | if ($last !~ /^\-\-\s*$/ms) {
|
---|
514 | print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
|
---|
515 | }
|
---|
516 | foreach my $message_part (@message_parts) {
|
---|
517 | # remove the leading newline left from split.
|
---|
518 | $message_part=~s/^\r?\n//;
|
---|
519 | }
|
---|
520 | if ($mimetype eq "multipart/alternative") {
|
---|
521 | # check for an HTML version first, then TEXT, otherwise use first.
|
---|
522 | my $part_text="";
|
---|
523 | foreach my $message_part (@message_parts) {
|
---|
524 | if ($message_part =~ m@\s*content\-type:\s*text/html@mis)
|
---|
525 | {
|
---|
526 | # Use the HTML version
|
---|
527 | $part_text= $self->text_from_part($message_part);
|
---|
528 | $mimetype="text/html";
|
---|
529 | last;
|
---|
530 | }
|
---|
531 | }
|
---|
532 | if ($part_text eq "") { # try getting a text part instead
|
---|
533 | foreach my $message_part (@message_parts) {
|
---|
534 | if ($message_part =~ m@^content\-type:\s*text/plain@mis)
|
---|
535 | {
|
---|
536 | # Use the plain version
|
---|
537 | $part_text= $self->text_from_part($message_part);
|
---|
538 | if ($part_text =~/[^\s]/) {
|
---|
539 | $part_text = text_into_html($part_text);
|
---|
540 | }
|
---|
541 | $mimetype="text/plain";
|
---|
542 | last;
|
---|
543 | }
|
---|
544 | }
|
---|
545 | }
|
---|
546 | if ($part_text eq "") { #use first part (no html/text part found)
|
---|
547 | $part_text = $self->text_from_part(shift @message_parts);
|
---|
548 | $part_text = text_into_html($part_text);
|
---|
549 | }
|
---|
550 | if ($part_text eq "") { # we couldn't get anything!!!
|
---|
551 | # or it was an empty message...
|
---|
552 | # do nothing...
|
---|
553 | print $outhandle "EMAILPlug: no text - empty body?\n";
|
---|
554 | } else {
|
---|
555 | $text = $part_text;
|
---|
556 | }
|
---|
557 | } elsif ($mimetype =~ m@multipart/(mixed|digest|related|signed)@) {
|
---|
558 | $text = "";
|
---|
559 | # signed is for PGP/GPG messages... the last part is a hash
|
---|
560 | if ($mimetype =~ m@multipart/signed@) {
|
---|
561 | pop @message_parts;
|
---|
562 | }
|
---|
563 | foreach my $message_part (@message_parts) {
|
---|
564 | my $part_header=$message_part;
|
---|
565 | my $part_body;
|
---|
566 | if ($message_part=~ /^\s*\n/) {
|
---|
567 | # no header... use defaults
|
---|
568 | $part_body=$message_part;
|
---|
569 | $part_header="Content-type: text/plain; charset=us-ascii";
|
---|
570 | } elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
|
---|
571 | $part_body=$1;
|
---|
572 | } else {
|
---|
573 | # something's gone wrong...
|
---|
574 | $part_header="";
|
---|
575 | $part_body=$message_part;
|
---|
576 | }
|
---|
577 |
|
---|
578 | $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
|
---|
579 | my $part_content_type="";
|
---|
580 | my $part_content_info="";
|
---|
581 | if ($mimetype eq "multipart/digest") {
|
---|
582 | # default type - RTFRFC!!
|
---|
583 | $part_content_type="message/rfc822";
|
---|
584 | }
|
---|
585 | if ($part_header =~ m@^content\-type:\s*([\w\.\-/]+)\s*\;?\s*(.*?)\s*$@mi) {
|
---|
586 | $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
|
---|
587 | $part_content_info=$2;
|
---|
588 | }
|
---|
589 | my $filename="";
|
---|
590 | if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
|
---|
591 | $filename=$1;
|
---|
592 | }
|
---|
593 |
|
---|
594 | # disposition - either inline or attachment.
|
---|
595 | # NOT CURRENTLY USED - we display all text types instead...
|
---|
596 | # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
|
---|
597 |
|
---|
598 | # add <<attachment>> to each part except the first...
|
---|
599 | if ($text ne "") {
|
---|
600 | $text.="\n<p><hr><strong><<attachment>></>";
|
---|
601 | # add part info header
|
---|
602 | $text.="<br>Type: $part_content_type<br>\n";
|
---|
603 | if ($filename ne "") {
|
---|
604 | $text.="Filename: $filename\n";
|
---|
605 | }
|
---|
606 | $text.="</strong></p>\n";
|
---|
607 | }
|
---|
608 | if ($part_content_type =~ m@text/@)
|
---|
609 | {
|
---|
610 | my $part_text= $self->text_from_part($message_part);
|
---|
611 | if ($part_content_type !~ m@text/(ht|x)ml@) {
|
---|
612 | $part_text = text_into_html($part_text);
|
---|
613 | }
|
---|
614 | if ($part_text eq "") {
|
---|
615 | $part_text = '<<empty message>>';
|
---|
616 | }
|
---|
617 | $text .= $part_text;
|
---|
618 | } elsif ($part_content_type =~ m@message/rfc822@) {
|
---|
619 | # This is a forwarded message
|
---|
620 | my $message_part_headers=$part_body;
|
---|
621 | $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
|
---|
622 | my $message_part_body=$1;
|
---|
623 | $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
|
---|
624 |
|
---|
625 | my $rfc822_formatted_body=""; # put result in here
|
---|
626 | if ($message_part_headers =~
|
---|
627 | /^content\-type:\s*([\w\.\-\/]+)\s*\;?\s*(.*?)\s*$/ims)
|
---|
628 | {
|
---|
629 | # The message header uses MIME flags
|
---|
630 | my $message_content_type=$1;
|
---|
631 | my $message_content_info=$2;
|
---|
632 | if (!defined($message_content_info)) {
|
---|
633 | $message_content_info="";
|
---|
634 | }
|
---|
635 | $message_content_type =~ tr/A-Z/a-z/;
|
---|
636 | if ($message_content_type =~ /multipart/) {
|
---|
637 | $rfc822_formatted_body=
|
---|
638 | $self->text_from_mime_message($message_content_type,
|
---|
639 | $message_content_info,
|
---|
640 | $message_part_body,
|
---|
641 | $outhandle);
|
---|
642 | } else {
|
---|
643 | $message_part_body= $self->text_from_part($part_body);
|
---|
644 | $rfc822_formatted_body=text_into_html($message_part_body);
|
---|
645 | }
|
---|
646 | } else {
|
---|
647 | # message doesn't use MIME flags
|
---|
648 | $rfc822_formatted_body=text_into_html($message_part_body);
|
---|
649 | }
|
---|
650 | # Add the returned text to the output
|
---|
651 | # don't put all the headers...
|
---|
652 | # $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
|
---|
653 | my $brief_headers=get_brief_headers($message_part_headers);
|
---|
654 | $text.=text_into_html($brief_headers);
|
---|
655 | $text.="<p>\n";
|
---|
656 | $text.=$rfc822_formatted_body;
|
---|
657 | $text.="</p>\n";
|
---|
658 | # end of message/rfc822
|
---|
659 | } elsif ($part_content_type =~ /multipart/) {
|
---|
660 | # recurse again
|
---|
661 |
|
---|
662 | $tmptext= $self->text_from_mime_message($part_content_type,
|
---|
663 | $part_content_info,
|
---|
664 | $part_body,
|
---|
665 | $outhandle);
|
---|
666 | $text.=$tmptext;
|
---|
667 | } else {
|
---|
668 | # this part isn't text/* or another message...
|
---|
669 | if ($text eq "") {
|
---|
670 | # this is the first part of a multipart, or only part!
|
---|
671 | $text="\n<p><hr><strong><<attachment>></>";
|
---|
672 | # add part info header
|
---|
673 | $text.="<br>Type: $part_content_type<br>\n";
|
---|
674 | $text.="Filename: $filename</strong></p>\n";
|
---|
675 | }
|
---|
676 |
|
---|
677 | # save attachment by default
|
---|
678 | if (!$self->{'ignore_attachments'}
|
---|
679 | && $filename ne "") { # this part has a file...
|
---|
680 | my $encoding="8bit";
|
---|
681 | if ($part_header =~
|
---|
682 | /^content-transfer-encoding:\s*(\w+)/mi ) {
|
---|
683 | $encoding=$1; $encoding =~ tr/A-Z/a-z/;
|
---|
684 | }
|
---|
685 | my $tmpdir=$ENV{'GSDLHOME'} . "/tmp";
|
---|
686 | my $save_filename="$filename";
|
---|
687 |
|
---|
688 | # make sure we don't clobber files with same name;
|
---|
689 | # need to keep state between .mbx files
|
---|
690 | my $assoc_files=$self->{'assoc_filenames'};
|
---|
691 | if ($assoc_files{$filename}) { # it's been set...
|
---|
692 | $assoc_files{$filename}++;
|
---|
693 | $filename =~ m/(.+)\.(\w+)$/;
|
---|
694 | my ($filestem, $ext)=($1,$2);
|
---|
695 | $save_filename="${filestem}_"
|
---|
696 | . $assoc_files{$filename} . ".$ext";
|
---|
697 | } else { # first file with this name
|
---|
698 | $assoc_files{$filename}=1;
|
---|
699 | }
|
---|
700 | open (SAVE, ">$tmpdir/$save_filename") ||
|
---|
701 | warn "EMAILPlug: Can't save attachment as $tmpdir/$save_filename: $!";
|
---|
702 | $part_text = $message_part;
|
---|
703 | $part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
|
---|
704 | if ($encoding eq "base64") {
|
---|
705 | print SAVE base64_decode($part_text);
|
---|
706 | } elsif ($encoding eq "quoted-printable") {
|
---|
707 | print SAVE qp_decode($part_text);
|
---|
708 | } else { # 7bit, 8bit, binary, etc...
|
---|
709 | print SAVE $part_text;
|
---|
710 | }
|
---|
711 | close SAVE;
|
---|
712 | my $doc_obj=$self->{'doc_obj'};
|
---|
713 | $doc_obj->associate_file("$tmpdir/$save_filename",
|
---|
714 | "$save_filename",
|
---|
715 | $part_content_type # mimetype
|
---|
716 | );
|
---|
717 | # clean up tmp area...
|
---|
718 | # Can't do this as it hasn't been copied/linked yet!!!
|
---|
719 | # &util::rm("$tmpdir/$save_filename");
|
---|
720 | print $outhandle "EMAILPlug: saving attachment \"$filename\"\n"; #
|
---|
721 |
|
---|
722 | # be nice if "download" was a translatable macro :(
|
---|
723 | $text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
|
---|
724 | } # end of save attachment
|
---|
725 | } # end of !text/message part
|
---|
726 | } # foreach message part.
|
---|
727 | } else {
|
---|
728 | # we can't handle this multipart type (not mixed or alternative)
|
---|
729 | # the RFC also mentions "parallel".
|
---|
730 | }
|
---|
731 | } # end of ($mimetype =~ multipart)
|
---|
732 | elsif ($mimetype =~ m@message/rfc822@) {
|
---|
733 | my $msg_header = $text;
|
---|
734 | $msg_header =~ s/\r?\n\r?\n(.*)$//s;
|
---|
735 | $text = $1;
|
---|
736 |
|
---|
737 | if ($msg_header =~ /^content\-type:\s*([\w\.\-\/]+)\s*\;?\s*(.+?)\s*$/mi)
|
---|
738 | {
|
---|
739 | $mimetype=$1;
|
---|
740 | $mimetype =~ tr/[A-Z]/[a-z]/;
|
---|
741 | $mimeinfo=$2;
|
---|
742 | if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
|
---|
743 | $charset = $1;
|
---|
744 | }
|
---|
745 | my $msg_text;
|
---|
746 | if ($mimetype =~ m@multipart/@) {
|
---|
747 | $msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
|
---|
748 | $text, $outhandle);
|
---|
749 | } else {$msg_text=$text;}
|
---|
750 |
|
---|
751 | my $brief_header=text_into_html(get_brief_headers($msg_header));
|
---|
752 | $text= "\n<b><<attached message>></b><br>";
|
---|
753 | $text.= "<table><tr><td width=\"5%\"> </td>\n";
|
---|
754 | $text.="<td>" . $brief_header . "\n</p>" . $msg_text
|
---|
755 | . "</td></tr></table>";
|
---|
756 | }
|
---|
757 | } else {
|
---|
758 | # we don't do any processing of the content.
|
---|
759 | }
|
---|
760 |
|
---|
761 | return $text;
|
---|
762 | }
|
---|
763 |
|
---|
764 |
|
---|
765 | # Return only the "important" headers from a set of message headers
|
---|
766 | sub get_brief_headers {
|
---|
767 | my $msg_header = shift;
|
---|
768 | my $brief_header = "";
|
---|
769 |
|
---|
770 | # Order matters!
|
---|
771 | if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
|
---|
772 | if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
|
---|
773 | if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
|
---|
774 | if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
|
---|
775 | if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
|
---|
776 |
|
---|
777 | return $brief_header;
|
---|
778 | }
|
---|
779 |
|
---|
780 |
|
---|
781 | # Process a MIME part. Return "" if we can't decode it.
|
---|
782 | sub text_from_part {
|
---|
783 | my $self = shift;
|
---|
784 | my $text = shift || '';
|
---|
785 | my $part_header = $text;
|
---|
786 |
|
---|
787 | # check for empty part header (leading blank line)
|
---|
788 | if ($text =~ /^\s*\r?\n/) {
|
---|
789 | $part_header="Content-type: text/plain; charset=us-ascii";
|
---|
790 | } else {
|
---|
791 | $part_header =~ s/\r?\n\r?\n(.*)$//s;
|
---|
792 | $text=$1; if (!defined($text)) {$text="";}
|
---|
793 | }
|
---|
794 | $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
|
---|
795 | $part_header =~ /content\-type:\s*([\w\.\-\/]+).*?charset=\"?([^\;\"\s]+)\"?/is;
|
---|
796 | my $type=$1;
|
---|
797 | my $charset=$2;
|
---|
798 | if (!defined($type)) {$type="";}
|
---|
799 | if (!defined($charset)) {$charset="ascii";}
|
---|
800 | my $encoding="";
|
---|
801 | if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
|
---|
802 | $encoding=$1; $encoding=~tr/A-Z/a-z/;
|
---|
803 | }
|
---|
804 | # Content-Transfer-Encoding is per-part
|
---|
805 | if ($encoding ne "") {
|
---|
806 | if ($encoding =~ /quoted\-printable/) {
|
---|
807 | $text=qp_decode($text);
|
---|
808 | } elsif ($encoding =~ /base64/) {
|
---|
809 | $text=base64_decode($text);
|
---|
810 | } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
|
---|
811 | # rfc2045 also allows binary, which we ignore (for now).
|
---|
812 | my $outhandle=$self->{'outhandle'};
|
---|
813 | print $outhandle "EMAILPlug: unknown transfer encoding: $encoding\n";
|
---|
814 | return "";
|
---|
815 | }
|
---|
816 | }
|
---|
817 | if ($type eq "text/html") {
|
---|
818 | # only get stuff between <body> tags, or <html> tags.
|
---|
819 | $text =~ s@^.*<html[^>]*>@@is;
|
---|
820 | $text =~ s@</html>.*$@@is;
|
---|
821 | $text =~ s/^.*?<body[^>]*>//si;
|
---|
822 | $text =~ s/<\/body>.*$//si;
|
---|
823 | }
|
---|
824 | elsif ($type eq "text/xml") {
|
---|
825 | $text=~s/</</g;$text=~s/>/>/g;
|
---|
826 | $text="<pre>\n$text\n</pre>\n";
|
---|
827 | }
|
---|
828 | # convert to unicode
|
---|
829 | $self->convert2unicode($charset, \$text);
|
---|
830 | return $text;
|
---|
831 | }
|
---|
832 |
|
---|
833 |
|
---|
834 | # decode quoted-printable text
|
---|
835 | sub qp_decode {
|
---|
836 | my $text=shift;
|
---|
837 |
|
---|
838 | # if a line ends with "=\s*", it is a soft line break, otherwise
|
---|
839 | # keep in any newline characters.
|
---|
840 |
|
---|
841 | $text =~ s/=\s*\r?\n//mg;
|
---|
842 | $text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
|
---|
843 | return $text;
|
---|
844 | }
|
---|
845 |
|
---|
846 | # decode base64 text. This is fairly slow (since it's interpreted perl rather
|
---|
847 | # than compiled XS stuff like in the ::MIME modules, but this is more portable
|
---|
848 | # for us at least).
|
---|
849 | # see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
|
---|
850 | # 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
|
---|
851 | # from each encoded byte.
|
---|
852 |
|
---|
853 |
|
---|
854 | sub base64_decode {
|
---|
855 | my $enc_text = shift;
|
---|
856 | # A=>0, B=>1, ..., '+'=>62, '/'=>63
|
---|
857 | # also '=' is used for padding at the end, but we remove it anyway.
|
---|
858 | my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
---|
859 | # map each MIME char into it's value, for more efficient lookup.
|
---|
860 | my %index;
|
---|
861 | map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
|
---|
862 | # remove all non-base64 chars. eval to get variable in transliteration...
|
---|
863 | # also remove '=' - we'll assume (!!) that there are no errors in the encoding
|
---|
864 | eval "\$enc_text =~ tr|$mimechars||cd";
|
---|
865 | my $decoded="";
|
---|
866 | while (length ($enc_text)>3)
|
---|
867 | {
|
---|
868 | my $fourchars=substr($enc_text,0,4,"");
|
---|
869 | my @chars=(split '',$fourchars);
|
---|
870 | $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
|
---|
871 | $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
|
---|
872 | $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
|
---|
873 | }
|
---|
874 | # if there are any input chars left, there are either
|
---|
875 | # 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
|
---|
876 | my @chars=(split '',$enc_text);
|
---|
877 | if (length($enc_text)) {
|
---|
878 | $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
|
---|
879 | }
|
---|
880 | if (length($enc_text)==3) {
|
---|
881 | $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
|
---|
882 | }
|
---|
883 | return $decoded;
|
---|
884 | }
|
---|
885 |
|
---|
886 | sub convert2unicode {
|
---|
887 | my $self = shift(@_);
|
---|
888 | my ($charset, $textref) = @_;
|
---|
889 |
|
---|
890 | # first get our character encoding name in the right form.
|
---|
891 | $charset = "iso_8859_1" unless defined $charset;
|
---|
892 | $charset=~tr/A-Z/a-z/;
|
---|
893 | $charset=~s/\-/_/g;
|
---|
894 | $charset=~s/gb2312/gb/;
|
---|
895 | # assumes EUC-KR, not ISO-2022 !?
|
---|
896 | $charset=~s/ks_c_5601_1987/korean/;
|
---|
897 |
|
---|
898 | if ($charset eq "utf_8") {
|
---|
899 | # nothing to do!
|
---|
900 | return;
|
---|
901 | }
|
---|
902 |
|
---|
903 | # It appears that we can't always trust ascii text so we'll treat it
|
---|
904 | # as iso-8859-1 (letting characters above 0x80 through without
|
---|
905 | # converting them to utf-8 will result in invalid XML documents
|
---|
906 | # which can't be parsed at build time).
|
---|
907 | $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
|
---|
908 |
|
---|
909 | if ($charset eq "iso_8859_1") {
|
---|
910 | # test if the mailer lied, and it has win1252 chars in it...
|
---|
911 | # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
|
---|
912 | if ($$textref =~ m/[\x80-\x9f]/) {
|
---|
913 | my $outhandle = $self->{'outhandle'};
|
---|
914 | print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
|
---|
915 | print $outhandle "codepage 1252 detected.\n";
|
---|
916 | $charset = "windows_1252";
|
---|
917 | }
|
---|
918 | }
|
---|
919 | my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
|
---|
920 |
|
---|
921 | if ($utf8_text ne "") {
|
---|
922 | $$textref=$utf8_text;
|
---|
923 | } else {
|
---|
924 | # we didn't get any text... unsupported encoding perhaps? Or it is
|
---|
925 | # empty anyway. We'll try to continue, assuming 8859-1. We could strip
|
---|
926 | # characters out here if this causes problems...
|
---|
927 | $$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
|
---|
928 |
|
---|
929 | }
|
---|
930 | }
|
---|
931 |
|
---|
932 |
|
---|
933 | # Perl packages have to return true if they are run.
|
---|
934 | 1;
|
---|